diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15945 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5928147616874225, + "eval_steps": 500, + "global_step": 2273, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002608071982786725, + "grad_norm": 77.0, + "learning_rate": 0.0, + "loss": 17.314273834228516, + "step": 1 + }, + { + "epoch": 0.000521614396557345, + "grad_norm": 72.5, + "learning_rate": 3.773584905660378e-07, + "loss": 17.201396942138672, + "step": 2 + }, + { + "epoch": 0.0007824215948360175, + "grad_norm": 72.5, + "learning_rate": 7.547169811320755e-07, + "loss": 17.240808486938477, + "step": 3 + }, + { + "epoch": 0.00104322879311469, + "grad_norm": 77.0, + "learning_rate": 1.1320754716981133e-06, + "loss": 17.186172485351562, + "step": 4 + }, + { + "epoch": 0.0013040359913933624, + "grad_norm": 75.5, + "learning_rate": 1.509433962264151e-06, + "loss": 17.1783447265625, + "step": 5 + }, + { + "epoch": 0.001564843189672035, + "grad_norm": 91.0, + "learning_rate": 1.8867924528301889e-06, + "loss": 17.03378677368164, + "step": 6 + }, + { + "epoch": 0.0018256503879507074, + "grad_norm": 89.5, + "learning_rate": 2.2641509433962266e-06, + "loss": 17.054584503173828, + "step": 7 + }, + { + "epoch": 0.00208645758622938, + "grad_norm": 78.0, + "learning_rate": 2.6415094339622644e-06, + "loss": 16.92469596862793, + "step": 8 + }, + { + "epoch": 0.0023472647845080522, + "grad_norm": 78.0, + "learning_rate": 3.018867924528302e-06, + "loss": 16.677852630615234, + "step": 9 + }, + { + "epoch": 0.002608071982786725, + "grad_norm": 82.0, + "learning_rate": 3.3962264150943395e-06, + "loss": 16.411882400512695, + "step": 10 + }, + { + "epoch": 0.0028688791810653974, + "grad_norm": 86.5, + "learning_rate": 3.7735849056603777e-06, + "loss": 16.228927612304688, + "step": 11 + }, + { + "epoch": 0.00312968637934407, + "grad_norm": 89.5, + "learning_rate": 4.150943396226416e-06, + "loss": 15.959747314453125, + "step": 12 + }, + { + "epoch": 0.0033904935776227422, + "grad_norm": 96.5, + "learning_rate": 4.528301886792453e-06, + "loss": 15.387005805969238, + "step": 13 + }, + { + "epoch": 0.003651300775901415, + "grad_norm": 95.0, + "learning_rate": 4.905660377358491e-06, + "loss": 14.818863868713379, + "step": 14 + }, + { + "epoch": 0.003912107974180087, + "grad_norm": 97.0, + "learning_rate": 5.283018867924529e-06, + "loss": 14.189617156982422, + "step": 15 + }, + { + "epoch": 0.00417291517245876, + "grad_norm": 100.5, + "learning_rate": 5.660377358490566e-06, + "loss": 13.341421127319336, + "step": 16 + }, + { + "epoch": 0.004433722370737432, + "grad_norm": 55.25, + "learning_rate": 6.037735849056604e-06, + "loss": 12.826044082641602, + "step": 17 + }, + { + "epoch": 0.0046945295690161044, + "grad_norm": 70.0, + "learning_rate": 6.415094339622642e-06, + "loss": 12.598797798156738, + "step": 18 + }, + { + "epoch": 0.0049553367672947775, + "grad_norm": 41.75, + "learning_rate": 6.792452830188679e-06, + "loss": 12.168102264404297, + "step": 19 + }, + { + "epoch": 0.00521614396557345, + "grad_norm": 80.0, + "learning_rate": 7.169811320754717e-06, + "loss": 11.898005485534668, + "step": 20 + }, + { + "epoch": 0.005476951163852123, + "grad_norm": 49.25, + "learning_rate": 7.5471698113207555e-06, + "loss": 11.813962936401367, + "step": 21 + }, + { + "epoch": 0.005737758362130795, + "grad_norm": 40.0, + "learning_rate": 7.924528301886793e-06, + "loss": 11.574141502380371, + "step": 22 + }, + { + "epoch": 0.005998565560409467, + "grad_norm": 137.0, + "learning_rate": 8.301886792452832e-06, + "loss": 11.403704643249512, + "step": 23 + }, + { + "epoch": 0.00625937275868814, + "grad_norm": 40.75, + "learning_rate": 8.67924528301887e-06, + "loss": 11.084342956542969, + "step": 24 + }, + { + "epoch": 0.006520179956966812, + "grad_norm": 34.25, + "learning_rate": 9.056603773584907e-06, + "loss": 11.013508796691895, + "step": 25 + }, + { + "epoch": 0.0067809871552454845, + "grad_norm": 84.5, + "learning_rate": 9.433962264150944e-06, + "loss": 10.844001770019531, + "step": 26 + }, + { + "epoch": 0.0070417943535241575, + "grad_norm": 27.0, + "learning_rate": 9.811320754716981e-06, + "loss": 10.781389236450195, + "step": 27 + }, + { + "epoch": 0.00730260155180283, + "grad_norm": 25.75, + "learning_rate": 1.018867924528302e-05, + "loss": 10.518528938293457, + "step": 28 + }, + { + "epoch": 0.007563408750081502, + "grad_norm": 97.0, + "learning_rate": 1.0566037735849058e-05, + "loss": 10.529638290405273, + "step": 29 + }, + { + "epoch": 0.007824215948360174, + "grad_norm": 44.5, + "learning_rate": 1.0943396226415095e-05, + "loss": 10.512063980102539, + "step": 30 + }, + { + "epoch": 0.008085023146638847, + "grad_norm": 27.5, + "learning_rate": 1.1320754716981132e-05, + "loss": 10.42243766784668, + "step": 31 + }, + { + "epoch": 0.00834583034491752, + "grad_norm": 23.875, + "learning_rate": 1.169811320754717e-05, + "loss": 10.236409187316895, + "step": 32 + }, + { + "epoch": 0.008606637543196191, + "grad_norm": 54.75, + "learning_rate": 1.2075471698113209e-05, + "loss": 10.11730670928955, + "step": 33 + }, + { + "epoch": 0.008867444741474865, + "grad_norm": 26.875, + "learning_rate": 1.2452830188679246e-05, + "loss": 9.971153259277344, + "step": 34 + }, + { + "epoch": 0.009128251939753538, + "grad_norm": 24.125, + "learning_rate": 1.2830188679245283e-05, + "loss": 9.97641658782959, + "step": 35 + }, + { + "epoch": 0.009389059138032209, + "grad_norm": 31.75, + "learning_rate": 1.320754716981132e-05, + "loss": 9.677864074707031, + "step": 36 + }, + { + "epoch": 0.009649866336310882, + "grad_norm": 24.875, + "learning_rate": 1.3584905660377358e-05, + "loss": 9.680337905883789, + "step": 37 + }, + { + "epoch": 0.009910673534589555, + "grad_norm": 24.875, + "learning_rate": 1.3962264150943397e-05, + "loss": 9.397380828857422, + "step": 38 + }, + { + "epoch": 0.010171480732868228, + "grad_norm": 26.375, + "learning_rate": 1.4339622641509435e-05, + "loss": 9.12952709197998, + "step": 39 + }, + { + "epoch": 0.0104322879311469, + "grad_norm": 27.25, + "learning_rate": 1.4716981132075472e-05, + "loss": 9.214681625366211, + "step": 40 + }, + { + "epoch": 0.010693095129425572, + "grad_norm": 34.5, + "learning_rate": 1.5094339622641511e-05, + "loss": 8.994095802307129, + "step": 41 + }, + { + "epoch": 0.010953902327704245, + "grad_norm": 18.5, + "learning_rate": 1.547169811320755e-05, + "loss": 8.78971004486084, + "step": 42 + }, + { + "epoch": 0.011214709525982917, + "grad_norm": 23.625, + "learning_rate": 1.5849056603773586e-05, + "loss": 8.471336364746094, + "step": 43 + }, + { + "epoch": 0.01147551672426159, + "grad_norm": 30.5, + "learning_rate": 1.6226415094339625e-05, + "loss": 8.377541542053223, + "step": 44 + }, + { + "epoch": 0.011736323922540263, + "grad_norm": 28.5, + "learning_rate": 1.6603773584905664e-05, + "loss": 8.198575973510742, + "step": 45 + }, + { + "epoch": 0.011997131120818934, + "grad_norm": 26.75, + "learning_rate": 1.69811320754717e-05, + "loss": 7.972673416137695, + "step": 46 + }, + { + "epoch": 0.012257938319097607, + "grad_norm": 24.75, + "learning_rate": 1.735849056603774e-05, + "loss": 8.09756851196289, + "step": 47 + }, + { + "epoch": 0.01251874551737628, + "grad_norm": 20.0, + "learning_rate": 1.7735849056603774e-05, + "loss": 7.87360954284668, + "step": 48 + }, + { + "epoch": 0.012779552715654952, + "grad_norm": 32.75, + "learning_rate": 1.8113207547169813e-05, + "loss": 7.462000370025635, + "step": 49 + }, + { + "epoch": 0.013040359913933625, + "grad_norm": 29.0, + "learning_rate": 1.8490566037735852e-05, + "loss": 7.528225421905518, + "step": 50 + }, + { + "epoch": 0.013301167112212298, + "grad_norm": 22.25, + "learning_rate": 1.8867924528301888e-05, + "loss": 7.312280654907227, + "step": 51 + }, + { + "epoch": 0.013561974310490969, + "grad_norm": 26.375, + "learning_rate": 1.9245283018867927e-05, + "loss": 6.996704578399658, + "step": 52 + }, + { + "epoch": 0.013822781508769642, + "grad_norm": 22.5, + "learning_rate": 1.9622641509433963e-05, + "loss": 7.07768440246582, + "step": 53 + }, + { + "epoch": 0.014083588707048315, + "grad_norm": 23.125, + "learning_rate": 2e-05, + "loss": 6.913934230804443, + "step": 54 + }, + { + "epoch": 0.014344395905326986, + "grad_norm": 17.25, + "learning_rate": 2.037735849056604e-05, + "loss": 7.123829364776611, + "step": 55 + }, + { + "epoch": 0.01460520310360566, + "grad_norm": 23.5, + "learning_rate": 2.0754716981132076e-05, + "loss": 6.840793132781982, + "step": 56 + }, + { + "epoch": 0.014866010301884332, + "grad_norm": 23.875, + "learning_rate": 2.1132075471698115e-05, + "loss": 6.265410423278809, + "step": 57 + }, + { + "epoch": 0.015126817500163004, + "grad_norm": 19.875, + "learning_rate": 2.150943396226415e-05, + "loss": 6.487156391143799, + "step": 58 + }, + { + "epoch": 0.015387624698441677, + "grad_norm": 18.75, + "learning_rate": 2.188679245283019e-05, + "loss": 6.40866756439209, + "step": 59 + }, + { + "epoch": 0.015648431896720348, + "grad_norm": 26.25, + "learning_rate": 2.226415094339623e-05, + "loss": 6.230491638183594, + "step": 60 + }, + { + "epoch": 0.01590923909499902, + "grad_norm": 16.75, + "learning_rate": 2.2641509433962265e-05, + "loss": 5.978422164916992, + "step": 61 + }, + { + "epoch": 0.016170046293277694, + "grad_norm": 20.875, + "learning_rate": 2.3018867924528304e-05, + "loss": 5.85161828994751, + "step": 62 + }, + { + "epoch": 0.016430853491556367, + "grad_norm": 20.25, + "learning_rate": 2.339622641509434e-05, + "loss": 5.712477207183838, + "step": 63 + }, + { + "epoch": 0.01669166068983504, + "grad_norm": 17.625, + "learning_rate": 2.377358490566038e-05, + "loss": 5.904017448425293, + "step": 64 + }, + { + "epoch": 0.016952467888113713, + "grad_norm": 18.75, + "learning_rate": 2.4150943396226418e-05, + "loss": 5.851974010467529, + "step": 65 + }, + { + "epoch": 0.017213275086392383, + "grad_norm": 19.375, + "learning_rate": 2.4528301886792453e-05, + "loss": 5.791886806488037, + "step": 66 + }, + { + "epoch": 0.017474082284671056, + "grad_norm": 18.5, + "learning_rate": 2.4905660377358492e-05, + "loss": 5.528830528259277, + "step": 67 + }, + { + "epoch": 0.01773488948294973, + "grad_norm": 27.125, + "learning_rate": 2.5283018867924528e-05, + "loss": 5.4120564460754395, + "step": 68 + }, + { + "epoch": 0.017995696681228402, + "grad_norm": 15.1875, + "learning_rate": 2.5660377358490567e-05, + "loss": 5.542486190795898, + "step": 69 + }, + { + "epoch": 0.018256503879507075, + "grad_norm": 29.625, + "learning_rate": 2.6037735849056606e-05, + "loss": 5.326672554016113, + "step": 70 + }, + { + "epoch": 0.018517311077785748, + "grad_norm": 21.625, + "learning_rate": 2.641509433962264e-05, + "loss": 5.187875270843506, + "step": 71 + }, + { + "epoch": 0.018778118276064418, + "grad_norm": 17.875, + "learning_rate": 2.679245283018868e-05, + "loss": 5.226883888244629, + "step": 72 + }, + { + "epoch": 0.01903892547434309, + "grad_norm": 14.8125, + "learning_rate": 2.7169811320754716e-05, + "loss": 5.023570537567139, + "step": 73 + }, + { + "epoch": 0.019299732672621764, + "grad_norm": 26.375, + "learning_rate": 2.7547169811320755e-05, + "loss": 4.935462951660156, + "step": 74 + }, + { + "epoch": 0.019560539870900437, + "grad_norm": 14.0625, + "learning_rate": 2.7924528301886794e-05, + "loss": 5.175811290740967, + "step": 75 + }, + { + "epoch": 0.01982134706917911, + "grad_norm": 20.0, + "learning_rate": 2.830188679245283e-05, + "loss": 5.010772228240967, + "step": 76 + }, + { + "epoch": 0.020082154267457783, + "grad_norm": 16.375, + "learning_rate": 2.867924528301887e-05, + "loss": 4.9048967361450195, + "step": 77 + }, + { + "epoch": 0.020342961465736456, + "grad_norm": 16.0, + "learning_rate": 2.9056603773584905e-05, + "loss": 4.898214340209961, + "step": 78 + }, + { + "epoch": 0.020603768664015126, + "grad_norm": 20.0, + "learning_rate": 2.9433962264150944e-05, + "loss": 4.572073936462402, + "step": 79 + }, + { + "epoch": 0.0208645758622938, + "grad_norm": 15.0, + "learning_rate": 2.9811320754716983e-05, + "loss": 4.445930480957031, + "step": 80 + }, + { + "epoch": 0.02112538306057247, + "grad_norm": 14.9375, + "learning_rate": 3.0188679245283022e-05, + "loss": 4.540976524353027, + "step": 81 + }, + { + "epoch": 0.021386190258851145, + "grad_norm": 18.125, + "learning_rate": 3.0566037735849064e-05, + "loss": 4.4916791915893555, + "step": 82 + }, + { + "epoch": 0.021646997457129818, + "grad_norm": 12.9375, + "learning_rate": 3.09433962264151e-05, + "loss": 4.635715484619141, + "step": 83 + }, + { + "epoch": 0.02190780465540849, + "grad_norm": 20.25, + "learning_rate": 3.1320754716981136e-05, + "loss": 4.457133769989014, + "step": 84 + }, + { + "epoch": 0.02216861185368716, + "grad_norm": 15.875, + "learning_rate": 3.169811320754717e-05, + "loss": 4.446690082550049, + "step": 85 + }, + { + "epoch": 0.022429419051965833, + "grad_norm": 12.6875, + "learning_rate": 3.2075471698113214e-05, + "loss": 4.444411277770996, + "step": 86 + }, + { + "epoch": 0.022690226250244507, + "grad_norm": 16.5, + "learning_rate": 3.245283018867925e-05, + "loss": 4.506210803985596, + "step": 87 + }, + { + "epoch": 0.02295103344852318, + "grad_norm": 18.625, + "learning_rate": 3.2830188679245285e-05, + "loss": 4.7216081619262695, + "step": 88 + }, + { + "epoch": 0.023211840646801853, + "grad_norm": 17.125, + "learning_rate": 3.320754716981133e-05, + "loss": 4.35127067565918, + "step": 89 + }, + { + "epoch": 0.023472647845080526, + "grad_norm": 17.875, + "learning_rate": 3.358490566037736e-05, + "loss": 4.327524185180664, + "step": 90 + }, + { + "epoch": 0.023733455043359195, + "grad_norm": 15.6875, + "learning_rate": 3.39622641509434e-05, + "loss": 4.007119178771973, + "step": 91 + }, + { + "epoch": 0.02399426224163787, + "grad_norm": 14.0, + "learning_rate": 3.433962264150944e-05, + "loss": 4.097439765930176, + "step": 92 + }, + { + "epoch": 0.02425506943991654, + "grad_norm": 16.75, + "learning_rate": 3.471698113207548e-05, + "loss": 4.138132095336914, + "step": 93 + }, + { + "epoch": 0.024515876638195214, + "grad_norm": 13.5625, + "learning_rate": 3.509433962264151e-05, + "loss": 3.882037401199341, + "step": 94 + }, + { + "epoch": 0.024776683836473887, + "grad_norm": 14.875, + "learning_rate": 3.547169811320755e-05, + "loss": 4.11362886428833, + "step": 95 + }, + { + "epoch": 0.02503749103475256, + "grad_norm": 14.3125, + "learning_rate": 3.584905660377359e-05, + "loss": 4.373976230621338, + "step": 96 + }, + { + "epoch": 0.02529829823303123, + "grad_norm": 14.5, + "learning_rate": 3.6226415094339626e-05, + "loss": 3.847653865814209, + "step": 97 + }, + { + "epoch": 0.025559105431309903, + "grad_norm": 9.875, + "learning_rate": 3.660377358490566e-05, + "loss": 4.18071174621582, + "step": 98 + }, + { + "epoch": 0.025819912629588576, + "grad_norm": 12.8125, + "learning_rate": 3.6981132075471704e-05, + "loss": 3.893112897872925, + "step": 99 + }, + { + "epoch": 0.02608071982786725, + "grad_norm": 12.25, + "learning_rate": 3.735849056603774e-05, + "loss": 3.7350988388061523, + "step": 100 + }, + { + "epoch": 0.026341527026145922, + "grad_norm": 11.1875, + "learning_rate": 3.7735849056603776e-05, + "loss": 3.977013111114502, + "step": 101 + }, + { + "epoch": 0.026602334224424595, + "grad_norm": 15.0, + "learning_rate": 3.811320754716982e-05, + "loss": 3.959244728088379, + "step": 102 + }, + { + "epoch": 0.02686314142270327, + "grad_norm": 13.375, + "learning_rate": 3.8490566037735854e-05, + "loss": 3.6513144969940186, + "step": 103 + }, + { + "epoch": 0.027123948620981938, + "grad_norm": 14.6875, + "learning_rate": 3.886792452830189e-05, + "loss": 3.9482178688049316, + "step": 104 + }, + { + "epoch": 0.02738475581926061, + "grad_norm": 16.25, + "learning_rate": 3.9245283018867925e-05, + "loss": 3.997860908508301, + "step": 105 + }, + { + "epoch": 0.027645563017539284, + "grad_norm": 15.625, + "learning_rate": 3.962264150943397e-05, + "loss": 3.7535505294799805, + "step": 106 + }, + { + "epoch": 0.027906370215817957, + "grad_norm": 9.875, + "learning_rate": 4e-05, + "loss": 3.1479573249816895, + "step": 107 + }, + { + "epoch": 0.02816717741409663, + "grad_norm": 22.625, + "learning_rate": 3.999999467458553e-05, + "loss": 3.762944221496582, + "step": 108 + }, + { + "epoch": 0.028427984612375303, + "grad_norm": 12.5625, + "learning_rate": 3.999997869834493e-05, + "loss": 3.4817514419555664, + "step": 109 + }, + { + "epoch": 0.028688791810653973, + "grad_norm": 16.125, + "learning_rate": 3.999995207128673e-05, + "loss": 3.6522598266601562, + "step": 110 + }, + { + "epoch": 0.028949599008932646, + "grad_norm": 11.875, + "learning_rate": 3.9999914793425094e-05, + "loss": 3.936000347137451, + "step": 111 + }, + { + "epoch": 0.02921040620721132, + "grad_norm": 17.0, + "learning_rate": 3.999986686477989e-05, + "loss": 3.454770088195801, + "step": 112 + }, + { + "epoch": 0.029471213405489992, + "grad_norm": 10.75, + "learning_rate": 3.9999808285376626e-05, + "loss": 3.5592823028564453, + "step": 113 + }, + { + "epoch": 0.029732020603768665, + "grad_norm": 21.125, + "learning_rate": 3.999973905524651e-05, + "loss": 3.7373290061950684, + "step": 114 + }, + { + "epoch": 0.029992827802047338, + "grad_norm": 12.0, + "learning_rate": 3.9999659174426395e-05, + "loss": 3.6499617099761963, + "step": 115 + }, + { + "epoch": 0.030253635000326008, + "grad_norm": 14.6875, + "learning_rate": 3.999956864295883e-05, + "loss": 3.2987723350524902, + "step": 116 + }, + { + "epoch": 0.03051444219860468, + "grad_norm": 15.0, + "learning_rate": 3.999946746089204e-05, + "loss": 3.405784845352173, + "step": 117 + }, + { + "epoch": 0.030775249396883354, + "grad_norm": 12.75, + "learning_rate": 3.999935562827989e-05, + "loss": 3.4816479682922363, + "step": 118 + }, + { + "epoch": 0.031036056595162027, + "grad_norm": 14.1875, + "learning_rate": 3.999923314518194e-05, + "loss": 3.635232448577881, + "step": 119 + }, + { + "epoch": 0.031296863793440696, + "grad_norm": 13.625, + "learning_rate": 3.999910001166342e-05, + "loss": 3.6046676635742188, + "step": 120 + }, + { + "epoch": 0.03155767099171937, + "grad_norm": 12.4375, + "learning_rate": 3.999895622779523e-05, + "loss": 3.4561996459960938, + "step": 121 + }, + { + "epoch": 0.03181847818999804, + "grad_norm": 12.0, + "learning_rate": 3.999880179365393e-05, + "loss": 3.2301549911499023, + "step": 122 + }, + { + "epoch": 0.032079285388276715, + "grad_norm": 9.9375, + "learning_rate": 3.9998636709321774e-05, + "loss": 3.497526168823242, + "step": 123 + }, + { + "epoch": 0.03234009258655539, + "grad_norm": 9.8125, + "learning_rate": 3.999846097488668e-05, + "loss": 3.549671173095703, + "step": 124 + }, + { + "epoch": 0.03260089978483406, + "grad_norm": 10.125, + "learning_rate": 3.999827459044222e-05, + "loss": 3.203861951828003, + "step": 125 + }, + { + "epoch": 0.032861706983112735, + "grad_norm": 10.0, + "learning_rate": 3.999807755608767e-05, + "loss": 3.2915453910827637, + "step": 126 + }, + { + "epoch": 0.03312251418139141, + "grad_norm": 10.8125, + "learning_rate": 3.999786987192794e-05, + "loss": 2.9838616847991943, + "step": 127 + }, + { + "epoch": 0.03338332137967008, + "grad_norm": 9.0, + "learning_rate": 3.999765153807364e-05, + "loss": 3.060359001159668, + "step": 128 + }, + { + "epoch": 0.033644128577948754, + "grad_norm": 9.5, + "learning_rate": 3.999742255464103e-05, + "loss": 2.9777638912200928, + "step": 129 + }, + { + "epoch": 0.03390493577622743, + "grad_norm": 8.5625, + "learning_rate": 3.9997182921752076e-05, + "loss": 3.1705143451690674, + "step": 130 + }, + { + "epoch": 0.0341657429745061, + "grad_norm": 8.5625, + "learning_rate": 3.9996932639534376e-05, + "loss": 3.0397515296936035, + "step": 131 + }, + { + "epoch": 0.034426550172784766, + "grad_norm": 11.625, + "learning_rate": 3.9996671708121214e-05, + "loss": 3.138066530227661, + "step": 132 + }, + { + "epoch": 0.03468735737106344, + "grad_norm": 8.8125, + "learning_rate": 3.999640012765156e-05, + "loss": 3.1769144535064697, + "step": 133 + }, + { + "epoch": 0.03494816456934211, + "grad_norm": 9.3125, + "learning_rate": 3.999611789827003e-05, + "loss": 3.212942600250244, + "step": 134 + }, + { + "epoch": 0.035208971767620785, + "grad_norm": 10.125, + "learning_rate": 3.999582502012692e-05, + "loss": 3.2294867038726807, + "step": 135 + }, + { + "epoch": 0.03546977896589946, + "grad_norm": 9.0, + "learning_rate": 3.999552149337822e-05, + "loss": 3.4734344482421875, + "step": 136 + }, + { + "epoch": 0.03573058616417813, + "grad_norm": 9.25, + "learning_rate": 3.999520731818555e-05, + "loss": 3.1776041984558105, + "step": 137 + }, + { + "epoch": 0.035991393362456804, + "grad_norm": 9.875, + "learning_rate": 3.999488249471623e-05, + "loss": 3.1053857803344727, + "step": 138 + }, + { + "epoch": 0.03625220056073548, + "grad_norm": 9.1875, + "learning_rate": 3.9994547023143244e-05, + "loss": 3.068758726119995, + "step": 139 + }, + { + "epoch": 0.03651300775901415, + "grad_norm": 12.0625, + "learning_rate": 3.999420090364523e-05, + "loss": 2.9159774780273438, + "step": 140 + }, + { + "epoch": 0.03677381495729282, + "grad_norm": 10.375, + "learning_rate": 3.9993844136406535e-05, + "loss": 3.1062023639678955, + "step": 141 + }, + { + "epoch": 0.037034622155571496, + "grad_norm": 8.875, + "learning_rate": 3.999347672161713e-05, + "loss": 3.0980939865112305, + "step": 142 + }, + { + "epoch": 0.03729542935385017, + "grad_norm": 9.5625, + "learning_rate": 3.999309865947269e-05, + "loss": 3.071870803833008, + "step": 143 + }, + { + "epoch": 0.037556236552128835, + "grad_norm": 9.3125, + "learning_rate": 3.999270995017455e-05, + "loss": 3.085413932800293, + "step": 144 + }, + { + "epoch": 0.03781704375040751, + "grad_norm": 12.0, + "learning_rate": 3.999231059392971e-05, + "loss": 3.1918039321899414, + "step": 145 + }, + { + "epoch": 0.03807785094868618, + "grad_norm": 9.875, + "learning_rate": 3.9991900590950844e-05, + "loss": 3.2698490619659424, + "step": 146 + }, + { + "epoch": 0.038338658146964855, + "grad_norm": 13.5625, + "learning_rate": 3.999147994145629e-05, + "loss": 2.81750750541687, + "step": 147 + }, + { + "epoch": 0.03859946534524353, + "grad_norm": 9.25, + "learning_rate": 3.999104864567007e-05, + "loss": 2.9677963256835938, + "step": 148 + }, + { + "epoch": 0.0388602725435222, + "grad_norm": 10.9375, + "learning_rate": 3.999060670382187e-05, + "loss": 2.9043052196502686, + "step": 149 + }, + { + "epoch": 0.039121079741800874, + "grad_norm": 8.9375, + "learning_rate": 3.9990154116147024e-05, + "loss": 3.038243055343628, + "step": 150 + }, + { + "epoch": 0.03938188694007955, + "grad_norm": 13.5625, + "learning_rate": 3.998969088288657e-05, + "loss": 3.111212730407715, + "step": 151 + }, + { + "epoch": 0.03964269413835822, + "grad_norm": 8.5, + "learning_rate": 3.9989217004287206e-05, + "loss": 2.9221205711364746, + "step": 152 + }, + { + "epoch": 0.03990350133663689, + "grad_norm": 9.6875, + "learning_rate": 3.998873248060127e-05, + "loss": 3.0534908771514893, + "step": 153 + }, + { + "epoch": 0.040164308534915566, + "grad_norm": 8.75, + "learning_rate": 3.99882373120868e-05, + "loss": 3.266178607940674, + "step": 154 + }, + { + "epoch": 0.04042511573319424, + "grad_norm": 11.4375, + "learning_rate": 3.998773149900751e-05, + "loss": 2.951430082321167, + "step": 155 + }, + { + "epoch": 0.04068592293147291, + "grad_norm": 10.5, + "learning_rate": 3.9987215041632737e-05, + "loss": 3.2741990089416504, + "step": 156 + }, + { + "epoch": 0.04094673012975158, + "grad_norm": 8.0, + "learning_rate": 3.998668794023754e-05, + "loss": 2.9319162368774414, + "step": 157 + }, + { + "epoch": 0.04120753732803025, + "grad_norm": 9.5625, + "learning_rate": 3.9986150195102604e-05, + "loss": 2.9177396297454834, + "step": 158 + }, + { + "epoch": 0.041468344526308924, + "grad_norm": 9.25, + "learning_rate": 3.9985601806514315e-05, + "loss": 3.190408229827881, + "step": 159 + }, + { + "epoch": 0.0417291517245876, + "grad_norm": 9.25, + "learning_rate": 3.998504277476471e-05, + "loss": 3.000624179840088, + "step": 160 + }, + { + "epoch": 0.04198995892286627, + "grad_norm": 8.5, + "learning_rate": 3.998447310015149e-05, + "loss": 2.9762725830078125, + "step": 161 + }, + { + "epoch": 0.04225076612114494, + "grad_norm": 8.625, + "learning_rate": 3.998389278297804e-05, + "loss": 2.898078680038452, + "step": 162 + }, + { + "epoch": 0.042511573319423616, + "grad_norm": 7.53125, + "learning_rate": 3.9983301823553394e-05, + "loss": 2.8846335411071777, + "step": 163 + }, + { + "epoch": 0.04277238051770229, + "grad_norm": 9.125, + "learning_rate": 3.9982700222192266e-05, + "loss": 2.804777145385742, + "step": 164 + }, + { + "epoch": 0.04303318771598096, + "grad_norm": 7.375, + "learning_rate": 3.998208797921503e-05, + "loss": 2.8471713066101074, + "step": 165 + }, + { + "epoch": 0.043293994914259636, + "grad_norm": 9.3125, + "learning_rate": 3.998146509494774e-05, + "loss": 2.616511821746826, + "step": 166 + }, + { + "epoch": 0.04355480211253831, + "grad_norm": 7.84375, + "learning_rate": 3.99808315697221e-05, + "loss": 2.9040181636810303, + "step": 167 + }, + { + "epoch": 0.04381560931081698, + "grad_norm": 8.75, + "learning_rate": 3.9980187403875485e-05, + "loss": 2.8242077827453613, + "step": 168 + }, + { + "epoch": 0.04407641650909565, + "grad_norm": 8.3125, + "learning_rate": 3.997953259775095e-05, + "loss": 2.8351857662200928, + "step": 169 + }, + { + "epoch": 0.04433722370737432, + "grad_norm": 10.1875, + "learning_rate": 3.99788671516972e-05, + "loss": 3.20156192779541, + "step": 170 + }, + { + "epoch": 0.044598030905652994, + "grad_norm": 9.125, + "learning_rate": 3.9978191066068616e-05, + "loss": 3.055712938308716, + "step": 171 + }, + { + "epoch": 0.04485883810393167, + "grad_norm": 7.96875, + "learning_rate": 3.9977504341225236e-05, + "loss": 2.602215051651001, + "step": 172 + }, + { + "epoch": 0.04511964530221034, + "grad_norm": 8.875, + "learning_rate": 3.997680697753278e-05, + "loss": 2.794217348098755, + "step": 173 + }, + { + "epoch": 0.04538045250048901, + "grad_norm": 9.5, + "learning_rate": 3.997609897536261e-05, + "loss": 2.5247011184692383, + "step": 174 + }, + { + "epoch": 0.045641259698767686, + "grad_norm": 12.25, + "learning_rate": 3.9975380335091786e-05, + "loss": 2.9152863025665283, + "step": 175 + }, + { + "epoch": 0.04590206689704636, + "grad_norm": 8.25, + "learning_rate": 3.9974651057102985e-05, + "loss": 2.656716823577881, + "step": 176 + }, + { + "epoch": 0.04616287409532503, + "grad_norm": 7.25, + "learning_rate": 3.9973911141784605e-05, + "loss": 2.746725082397461, + "step": 177 + }, + { + "epoch": 0.046423681293603705, + "grad_norm": 7.21875, + "learning_rate": 3.9973160589530665e-05, + "loss": 2.4794938564300537, + "step": 178 + }, + { + "epoch": 0.04668448849188238, + "grad_norm": 7.625, + "learning_rate": 3.997239940074087e-05, + "loss": 2.6509430408477783, + "step": 179 + }, + { + "epoch": 0.04694529569016105, + "grad_norm": 7.40625, + "learning_rate": 3.997162757582058e-05, + "loss": 2.940674304962158, + "step": 180 + }, + { + "epoch": 0.047206102888439724, + "grad_norm": 7.59375, + "learning_rate": 3.997084511518083e-05, + "loss": 3.0152294635772705, + "step": 181 + }, + { + "epoch": 0.04746691008671839, + "grad_norm": 7.84375, + "learning_rate": 3.997005201923832e-05, + "loss": 2.9850046634674072, + "step": 182 + }, + { + "epoch": 0.047727717284997064, + "grad_norm": 6.96875, + "learning_rate": 3.996924828841539e-05, + "loss": 2.8081016540527344, + "step": 183 + }, + { + "epoch": 0.04798852448327574, + "grad_norm": 6.6875, + "learning_rate": 3.9968433923140076e-05, + "loss": 2.793598175048828, + "step": 184 + }, + { + "epoch": 0.04824933168155441, + "grad_norm": 7.65625, + "learning_rate": 3.9967608923846044e-05, + "loss": 2.6727943420410156, + "step": 185 + }, + { + "epoch": 0.04851013887983308, + "grad_norm": 7.03125, + "learning_rate": 3.9966773290972654e-05, + "loss": 2.6837079524993896, + "step": 186 + }, + { + "epoch": 0.048770946078111756, + "grad_norm": 8.1875, + "learning_rate": 3.996592702496491e-05, + "loss": 2.765864133834839, + "step": 187 + }, + { + "epoch": 0.04903175327639043, + "grad_norm": 6.59375, + "learning_rate": 3.996507012627348e-05, + "loss": 2.4099583625793457, + "step": 188 + }, + { + "epoch": 0.0492925604746691, + "grad_norm": 8.0, + "learning_rate": 3.99642025953547e-05, + "loss": 2.8211944103240967, + "step": 189 + }, + { + "epoch": 0.049553367672947775, + "grad_norm": 7.09375, + "learning_rate": 3.996332443267058e-05, + "loss": 2.359706163406372, + "step": 190 + }, + { + "epoch": 0.04981417487122645, + "grad_norm": 7.75, + "learning_rate": 3.996243563868876e-05, + "loss": 3.0262551307678223, + "step": 191 + }, + { + "epoch": 0.05007498206950512, + "grad_norm": 8.6875, + "learning_rate": 3.996153621388256e-05, + "loss": 2.709545373916626, + "step": 192 + }, + { + "epoch": 0.050335789267783794, + "grad_norm": 6.9375, + "learning_rate": 3.996062615873098e-05, + "loss": 2.4291229248046875, + "step": 193 + }, + { + "epoch": 0.05059659646606246, + "grad_norm": 6.9375, + "learning_rate": 3.995970547371864e-05, + "loss": 2.47729229927063, + "step": 194 + }, + { + "epoch": 0.05085740366434113, + "grad_norm": 7.5, + "learning_rate": 3.995877415933586e-05, + "loss": 2.773533582687378, + "step": 195 + }, + { + "epoch": 0.051118210862619806, + "grad_norm": 7.28125, + "learning_rate": 3.995783221607859e-05, + "loss": 2.86281418800354, + "step": 196 + }, + { + "epoch": 0.05137901806089848, + "grad_norm": 7.28125, + "learning_rate": 3.9956879644448456e-05, + "loss": 2.6186606884002686, + "step": 197 + }, + { + "epoch": 0.05163982525917715, + "grad_norm": 6.90625, + "learning_rate": 3.995591644495275e-05, + "loss": 2.787097215652466, + "step": 198 + }, + { + "epoch": 0.051900632457455825, + "grad_norm": 6.5, + "learning_rate": 3.995494261810441e-05, + "loss": 2.50883150100708, + "step": 199 + }, + { + "epoch": 0.0521614396557345, + "grad_norm": 8.1875, + "learning_rate": 3.995395816442204e-05, + "loss": 2.8412694931030273, + "step": 200 + }, + { + "epoch": 0.05242224685401317, + "grad_norm": 6.9375, + "learning_rate": 3.99529630844299e-05, + "loss": 2.518864870071411, + "step": 201 + }, + { + "epoch": 0.052683054052291844, + "grad_norm": 7.0625, + "learning_rate": 3.9951957378657916e-05, + "loss": 2.899658203125, + "step": 202 + }, + { + "epoch": 0.05294386125057052, + "grad_norm": 6.5, + "learning_rate": 3.995094104764167e-05, + "loss": 2.6643025875091553, + "step": 203 + }, + { + "epoch": 0.05320466844884919, + "grad_norm": 6.625, + "learning_rate": 3.9949914091922394e-05, + "loss": 2.6935393810272217, + "step": 204 + }, + { + "epoch": 0.053465475647127864, + "grad_norm": 6.6875, + "learning_rate": 3.994887651204698e-05, + "loss": 2.3257358074188232, + "step": 205 + }, + { + "epoch": 0.05372628284540654, + "grad_norm": 6.71875, + "learning_rate": 3.9947828308568e-05, + "loss": 2.544113874435425, + "step": 206 + }, + { + "epoch": 0.0539870900436852, + "grad_norm": 7.78125, + "learning_rate": 3.994676948204364e-05, + "loss": 2.5802993774414062, + "step": 207 + }, + { + "epoch": 0.054247897241963876, + "grad_norm": 7.1875, + "learning_rate": 3.9945700033037794e-05, + "loss": 2.469195604324341, + "step": 208 + }, + { + "epoch": 0.05450870444024255, + "grad_norm": 7.5, + "learning_rate": 3.994461996211998e-05, + "loss": 2.5232973098754883, + "step": 209 + }, + { + "epoch": 0.05476951163852122, + "grad_norm": 7.0, + "learning_rate": 3.9943529269865375e-05, + "loss": 2.701573371887207, + "step": 210 + }, + { + "epoch": 0.055030318836799895, + "grad_norm": 7.5, + "learning_rate": 3.994242795685482e-05, + "loss": 2.9642443656921387, + "step": 211 + }, + { + "epoch": 0.05529112603507857, + "grad_norm": 6.9375, + "learning_rate": 3.994131602367481e-05, + "loss": 2.6440212726593018, + "step": 212 + }, + { + "epoch": 0.05555193323335724, + "grad_norm": 7.3125, + "learning_rate": 3.99401934709175e-05, + "loss": 2.4167118072509766, + "step": 213 + }, + { + "epoch": 0.055812740431635914, + "grad_norm": 6.65625, + "learning_rate": 3.993906029918069e-05, + "loss": 2.6095056533813477, + "step": 214 + }, + { + "epoch": 0.05607354762991459, + "grad_norm": 6.9375, + "learning_rate": 3.9937916509067845e-05, + "loss": 2.439197063446045, + "step": 215 + }, + { + "epoch": 0.05633435482819326, + "grad_norm": 6.90625, + "learning_rate": 3.993676210118808e-05, + "loss": 2.5025954246520996, + "step": 216 + }, + { + "epoch": 0.05659516202647193, + "grad_norm": 7.40625, + "learning_rate": 3.993559707615616e-05, + "loss": 2.774198532104492, + "step": 217 + }, + { + "epoch": 0.056855969224750606, + "grad_norm": 6.5625, + "learning_rate": 3.993442143459251e-05, + "loss": 2.8386070728302, + "step": 218 + }, + { + "epoch": 0.05711677642302927, + "grad_norm": 7.15625, + "learning_rate": 3.993323517712322e-05, + "loss": 2.4885847568511963, + "step": 219 + }, + { + "epoch": 0.057377583621307945, + "grad_norm": 7.0625, + "learning_rate": 3.993203830438001e-05, + "loss": 2.4035756587982178, + "step": 220 + }, + { + "epoch": 0.05763839081958662, + "grad_norm": 7.59375, + "learning_rate": 3.993083081700026e-05, + "loss": 2.9204483032226562, + "step": 221 + }, + { + "epoch": 0.05789919801786529, + "grad_norm": 6.75, + "learning_rate": 3.992961271562702e-05, + "loss": 2.324254274368286, + "step": 222 + }, + { + "epoch": 0.058160005216143965, + "grad_norm": 7.46875, + "learning_rate": 3.9928384000908966e-05, + "loss": 2.451913833618164, + "step": 223 + }, + { + "epoch": 0.05842081241442264, + "grad_norm": 6.53125, + "learning_rate": 3.992714467350045e-05, + "loss": 2.5799005031585693, + "step": 224 + }, + { + "epoch": 0.05868161961270131, + "grad_norm": 7.0625, + "learning_rate": 3.9925894734061466e-05, + "loss": 2.316067934036255, + "step": 225 + }, + { + "epoch": 0.058942426810979984, + "grad_norm": 6.9375, + "learning_rate": 3.992463418325765e-05, + "loss": 2.6440839767456055, + "step": 226 + }, + { + "epoch": 0.05920323400925866, + "grad_norm": 6.21875, + "learning_rate": 3.99233630217603e-05, + "loss": 2.3150699138641357, + "step": 227 + }, + { + "epoch": 0.05946404120753733, + "grad_norm": 7.03125, + "learning_rate": 3.992208125024637e-05, + "loss": 2.774292469024658, + "step": 228 + }, + { + "epoch": 0.059724848405816, + "grad_norm": 6.0, + "learning_rate": 3.9920788869398445e-05, + "loss": 2.1743381023406982, + "step": 229 + }, + { + "epoch": 0.059985655604094676, + "grad_norm": 6.65625, + "learning_rate": 3.991948587990479e-05, + "loss": 2.6707546710968018, + "step": 230 + }, + { + "epoch": 0.06024646280237335, + "grad_norm": 6.90625, + "learning_rate": 3.9918172282459274e-05, + "loss": 2.5074210166931152, + "step": 231 + }, + { + "epoch": 0.060507270000652015, + "grad_norm": 6.53125, + "learning_rate": 3.9916848077761455e-05, + "loss": 2.515544891357422, + "step": 232 + }, + { + "epoch": 0.06076807719893069, + "grad_norm": 7.0, + "learning_rate": 3.991551326651653e-05, + "loss": 2.456247568130493, + "step": 233 + }, + { + "epoch": 0.06102888439720936, + "grad_norm": 6.0625, + "learning_rate": 3.9914167849435344e-05, + "loss": 2.4338889122009277, + "step": 234 + }, + { + "epoch": 0.061289691595488034, + "grad_norm": 6.90625, + "learning_rate": 3.991281182723438e-05, + "loss": 2.734100580215454, + "step": 235 + }, + { + "epoch": 0.06155049879376671, + "grad_norm": 6.4375, + "learning_rate": 3.9911445200635775e-05, + "loss": 2.555612325668335, + "step": 236 + }, + { + "epoch": 0.06181130599204538, + "grad_norm": 6.25, + "learning_rate": 3.9910067970367327e-05, + "loss": 2.738067150115967, + "step": 237 + }, + { + "epoch": 0.06207211319032405, + "grad_norm": 6.1875, + "learning_rate": 3.990868013716245e-05, + "loss": 2.395388603210449, + "step": 238 + }, + { + "epoch": 0.062332920388602726, + "grad_norm": 6.3125, + "learning_rate": 3.9907281701760235e-05, + "loss": 2.574742317199707, + "step": 239 + }, + { + "epoch": 0.06259372758688139, + "grad_norm": 6.65625, + "learning_rate": 3.99058726649054e-05, + "loss": 2.463757038116455, + "step": 240 + }, + { + "epoch": 0.06285453478516007, + "grad_norm": 6.5, + "learning_rate": 3.9904453027348324e-05, + "loss": 2.42558217048645, + "step": 241 + }, + { + "epoch": 0.06311534198343874, + "grad_norm": 6.34375, + "learning_rate": 3.990302278984502e-05, + "loss": 2.339848041534424, + "step": 242 + }, + { + "epoch": 0.06337614918171741, + "grad_norm": 7.25, + "learning_rate": 3.9901581953157135e-05, + "loss": 2.7260823249816895, + "step": 243 + }, + { + "epoch": 0.06363695637999608, + "grad_norm": 6.34375, + "learning_rate": 3.9900130518052e-05, + "loss": 2.346684694290161, + "step": 244 + }, + { + "epoch": 0.06389776357827476, + "grad_norm": 12.0625, + "learning_rate": 3.989866848530254e-05, + "loss": 2.3759613037109375, + "step": 245 + }, + { + "epoch": 0.06415857077655343, + "grad_norm": 7.0, + "learning_rate": 3.989719585568736e-05, + "loss": 2.450105905532837, + "step": 246 + }, + { + "epoch": 0.0644193779748321, + "grad_norm": 18.5, + "learning_rate": 3.98957126299907e-05, + "loss": 2.4428210258483887, + "step": 247 + }, + { + "epoch": 0.06468018517311078, + "grad_norm": 7.5625, + "learning_rate": 3.989421880900243e-05, + "loss": 2.590214252471924, + "step": 248 + }, + { + "epoch": 0.06494099237138945, + "grad_norm": 6.46875, + "learning_rate": 3.9892714393518073e-05, + "loss": 2.6804447174072266, + "step": 249 + }, + { + "epoch": 0.06520179956966812, + "grad_norm": 6.53125, + "learning_rate": 3.98911993843388e-05, + "loss": 2.209712266921997, + "step": 250 + }, + { + "epoch": 0.0654626067679468, + "grad_norm": 7.125, + "learning_rate": 3.98896737822714e-05, + "loss": 2.7148003578186035, + "step": 251 + }, + { + "epoch": 0.06572341396622547, + "grad_norm": 6.09375, + "learning_rate": 3.9888137588128345e-05, + "loss": 2.572610855102539, + "step": 252 + }, + { + "epoch": 0.06598422116450414, + "grad_norm": 6.5625, + "learning_rate": 3.98865908027277e-05, + "loss": 2.296384572982788, + "step": 253 + }, + { + "epoch": 0.06624502836278282, + "grad_norm": 6.59375, + "learning_rate": 3.98850334268932e-05, + "loss": 2.6526260375976562, + "step": 254 + }, + { + "epoch": 0.06650583556106149, + "grad_norm": 6.1875, + "learning_rate": 3.9883465461454215e-05, + "loss": 2.5138492584228516, + "step": 255 + }, + { + "epoch": 0.06676664275934016, + "grad_norm": 6.65625, + "learning_rate": 3.988188690724575e-05, + "loss": 2.2890474796295166, + "step": 256 + }, + { + "epoch": 0.06702744995761883, + "grad_norm": 6.28125, + "learning_rate": 3.9880297765108446e-05, + "loss": 2.328469753265381, + "step": 257 + }, + { + "epoch": 0.06728825715589751, + "grad_norm": 6.90625, + "learning_rate": 3.9878698035888585e-05, + "loss": 2.7269949913024902, + "step": 258 + }, + { + "epoch": 0.06754906435417618, + "grad_norm": 6.8125, + "learning_rate": 3.98770877204381e-05, + "loss": 2.258955955505371, + "step": 259 + }, + { + "epoch": 0.06780987155245485, + "grad_norm": 7.25, + "learning_rate": 3.987546681961455e-05, + "loss": 2.605950355529785, + "step": 260 + }, + { + "epoch": 0.06807067875073353, + "grad_norm": 6.78125, + "learning_rate": 3.987383533428111e-05, + "loss": 2.2461400032043457, + "step": 261 + }, + { + "epoch": 0.0683314859490122, + "grad_norm": 6.65625, + "learning_rate": 3.9872193265306645e-05, + "loss": 2.3346076011657715, + "step": 262 + }, + { + "epoch": 0.06859229314729086, + "grad_norm": 6.0, + "learning_rate": 3.987054061356561e-05, + "loss": 2.512078285217285, + "step": 263 + }, + { + "epoch": 0.06885310034556953, + "grad_norm": 6.5625, + "learning_rate": 3.986887737993811e-05, + "loss": 2.525033950805664, + "step": 264 + }, + { + "epoch": 0.0691139075438482, + "grad_norm": 6.5625, + "learning_rate": 3.986720356530988e-05, + "loss": 2.4972288608551025, + "step": 265 + }, + { + "epoch": 0.06937471474212688, + "grad_norm": 6.53125, + "learning_rate": 3.986551917057231e-05, + "loss": 2.5117955207824707, + "step": 266 + }, + { + "epoch": 0.06963552194040555, + "grad_norm": 6.46875, + "learning_rate": 3.98638241966224e-05, + "loss": 2.182558298110962, + "step": 267 + }, + { + "epoch": 0.06989632913868422, + "grad_norm": 6.5625, + "learning_rate": 3.986211864436279e-05, + "loss": 2.61698055267334, + "step": 268 + }, + { + "epoch": 0.0701571363369629, + "grad_norm": 6.4375, + "learning_rate": 3.986040251470177e-05, + "loss": 2.098867893218994, + "step": 269 + }, + { + "epoch": 0.07041794353524157, + "grad_norm": 6.28125, + "learning_rate": 3.985867580855324e-05, + "loss": 2.3573265075683594, + "step": 270 + }, + { + "epoch": 0.07067875073352024, + "grad_norm": 6.65625, + "learning_rate": 3.985693852683675e-05, + "loss": 2.5895185470581055, + "step": 271 + }, + { + "epoch": 0.07093955793179892, + "grad_norm": 6.46875, + "learning_rate": 3.985519067047747e-05, + "loss": 2.3642361164093018, + "step": 272 + }, + { + "epoch": 0.07120036513007759, + "grad_norm": 6.0625, + "learning_rate": 3.985343224040621e-05, + "loss": 2.739882707595825, + "step": 273 + }, + { + "epoch": 0.07146117232835626, + "grad_norm": 5.875, + "learning_rate": 3.985166323755939e-05, + "loss": 2.458867073059082, + "step": 274 + }, + { + "epoch": 0.07172197952663494, + "grad_norm": 5.96875, + "learning_rate": 3.98498836628791e-05, + "loss": 2.274116039276123, + "step": 275 + }, + { + "epoch": 0.07198278672491361, + "grad_norm": 5.9375, + "learning_rate": 3.9848093517313036e-05, + "loss": 2.380291700363159, + "step": 276 + }, + { + "epoch": 0.07224359392319228, + "grad_norm": 6.65625, + "learning_rate": 3.984629280181451e-05, + "loss": 2.3993239402770996, + "step": 277 + }, + { + "epoch": 0.07250440112147095, + "grad_norm": 6.625, + "learning_rate": 3.984448151734248e-05, + "loss": 2.4813575744628906, + "step": 278 + }, + { + "epoch": 0.07276520831974963, + "grad_norm": 7.0, + "learning_rate": 3.9842659664861536e-05, + "loss": 2.6131296157836914, + "step": 279 + }, + { + "epoch": 0.0730260155180283, + "grad_norm": 6.4375, + "learning_rate": 3.9840827245341894e-05, + "loss": 2.5049290657043457, + "step": 280 + }, + { + "epoch": 0.07328682271630697, + "grad_norm": 6.40625, + "learning_rate": 3.983898425975938e-05, + "loss": 2.5281827449798584, + "step": 281 + }, + { + "epoch": 0.07354762991458565, + "grad_norm": 6.3125, + "learning_rate": 3.9837130709095475e-05, + "loss": 2.6311264038085938, + "step": 282 + }, + { + "epoch": 0.07380843711286432, + "grad_norm": 5.9375, + "learning_rate": 3.9835266594337264e-05, + "loss": 2.3365821838378906, + "step": 283 + }, + { + "epoch": 0.07406924431114299, + "grad_norm": 6.15625, + "learning_rate": 3.983339191647747e-05, + "loss": 2.235295295715332, + "step": 284 + }, + { + "epoch": 0.07433005150942167, + "grad_norm": 6.03125, + "learning_rate": 3.983150667651442e-05, + "loss": 2.445535659790039, + "step": 285 + }, + { + "epoch": 0.07459085870770034, + "grad_norm": 6.40625, + "learning_rate": 3.982961087545211e-05, + "loss": 2.2840161323547363, + "step": 286 + }, + { + "epoch": 0.07485166590597901, + "grad_norm": 5.4375, + "learning_rate": 3.9827704514300105e-05, + "loss": 2.376690149307251, + "step": 287 + }, + { + "epoch": 0.07511247310425767, + "grad_norm": 5.8125, + "learning_rate": 3.9825787594073644e-05, + "loss": 2.3928864002227783, + "step": 288 + }, + { + "epoch": 0.07537328030253634, + "grad_norm": 6.125, + "learning_rate": 3.982386011579355e-05, + "loss": 2.3445351123809814, + "step": 289 + }, + { + "epoch": 0.07563408750081502, + "grad_norm": 5.9375, + "learning_rate": 3.9821922080486296e-05, + "loss": 2.208594560623169, + "step": 290 + }, + { + "epoch": 0.07589489469909369, + "grad_norm": 5.40625, + "learning_rate": 3.981997348918396e-05, + "loss": 2.094682216644287, + "step": 291 + }, + { + "epoch": 0.07615570189737236, + "grad_norm": 6.3125, + "learning_rate": 3.9818014342924245e-05, + "loss": 2.6614317893981934, + "step": 292 + }, + { + "epoch": 0.07641650909565104, + "grad_norm": 6.5625, + "learning_rate": 3.981604464275049e-05, + "loss": 2.593371629714966, + "step": 293 + }, + { + "epoch": 0.07667731629392971, + "grad_norm": 5.75, + "learning_rate": 3.981406438971163e-05, + "loss": 2.1724560260772705, + "step": 294 + }, + { + "epoch": 0.07693812349220838, + "grad_norm": 7.5625, + "learning_rate": 3.9812073584862234e-05, + "loss": 2.3324060440063477, + "step": 295 + }, + { + "epoch": 0.07719893069048706, + "grad_norm": 5.6875, + "learning_rate": 3.9810072229262495e-05, + "loss": 2.4392335414886475, + "step": 296 + }, + { + "epoch": 0.07745973788876573, + "grad_norm": 5.84375, + "learning_rate": 3.98080603239782e-05, + "loss": 2.3669943809509277, + "step": 297 + }, + { + "epoch": 0.0777205450870444, + "grad_norm": 6.40625, + "learning_rate": 3.98060378700808e-05, + "loss": 2.5146942138671875, + "step": 298 + }, + { + "epoch": 0.07798135228532307, + "grad_norm": 5.9375, + "learning_rate": 3.9804004868647315e-05, + "loss": 2.3613228797912598, + "step": 299 + }, + { + "epoch": 0.07824215948360175, + "grad_norm": 5.59375, + "learning_rate": 3.98019613207604e-05, + "loss": 2.367048740386963, + "step": 300 + }, + { + "epoch": 0.07850296668188042, + "grad_norm": 6.15625, + "learning_rate": 3.979990722750835e-05, + "loss": 2.5720415115356445, + "step": 301 + }, + { + "epoch": 0.0787637738801591, + "grad_norm": 5.53125, + "learning_rate": 3.979784258998503e-05, + "loss": 2.27433180809021, + "step": 302 + }, + { + "epoch": 0.07902458107843777, + "grad_norm": 6.125, + "learning_rate": 3.9795767409289965e-05, + "loss": 2.362941265106201, + "step": 303 + }, + { + "epoch": 0.07928538827671644, + "grad_norm": 5.40625, + "learning_rate": 3.979368168652826e-05, + "loss": 2.129410743713379, + "step": 304 + }, + { + "epoch": 0.07954619547499511, + "grad_norm": 5.8125, + "learning_rate": 3.9791585422810664e-05, + "loss": 2.547970771789551, + "step": 305 + }, + { + "epoch": 0.07980700267327379, + "grad_norm": 6.21875, + "learning_rate": 3.9789478619253505e-05, + "loss": 2.542482852935791, + "step": 306 + }, + { + "epoch": 0.08006780987155246, + "grad_norm": 6.46875, + "learning_rate": 3.978736127697876e-05, + "loss": 2.6509501934051514, + "step": 307 + }, + { + "epoch": 0.08032861706983113, + "grad_norm": 6.03125, + "learning_rate": 3.978523339711399e-05, + "loss": 2.4671332836151123, + "step": 308 + }, + { + "epoch": 0.0805894242681098, + "grad_norm": 6.125, + "learning_rate": 3.978309498079239e-05, + "loss": 2.0929112434387207, + "step": 309 + }, + { + "epoch": 0.08085023146638848, + "grad_norm": 7.0, + "learning_rate": 3.978094602915275e-05, + "loss": 2.3321259021759033, + "step": 310 + }, + { + "epoch": 0.08111103866466715, + "grad_norm": 6.03125, + "learning_rate": 3.977878654333947e-05, + "loss": 2.3965871334075928, + "step": 311 + }, + { + "epoch": 0.08137184586294582, + "grad_norm": 6.0, + "learning_rate": 3.977661652450257e-05, + "loss": 2.3253791332244873, + "step": 312 + }, + { + "epoch": 0.08163265306122448, + "grad_norm": 6.90625, + "learning_rate": 3.977443597379768e-05, + "loss": 2.251070976257324, + "step": 313 + }, + { + "epoch": 0.08189346025950316, + "grad_norm": 6.1875, + "learning_rate": 3.977224489238603e-05, + "loss": 2.551664352416992, + "step": 314 + }, + { + "epoch": 0.08215426745778183, + "grad_norm": 5.9375, + "learning_rate": 3.977004328143447e-05, + "loss": 2.367215394973755, + "step": 315 + }, + { + "epoch": 0.0824150746560605, + "grad_norm": 5.75, + "learning_rate": 3.9767831142115426e-05, + "loss": 2.073972702026367, + "step": 316 + }, + { + "epoch": 0.08267588185433918, + "grad_norm": 6.09375, + "learning_rate": 3.976560847560697e-05, + "loss": 2.297461748123169, + "step": 317 + }, + { + "epoch": 0.08293668905261785, + "grad_norm": 6.0, + "learning_rate": 3.9763375283092774e-05, + "loss": 2.428114891052246, + "step": 318 + }, + { + "epoch": 0.08319749625089652, + "grad_norm": 6.09375, + "learning_rate": 3.9761131565762084e-05, + "loss": 2.568549633026123, + "step": 319 + }, + { + "epoch": 0.0834583034491752, + "grad_norm": 5.875, + "learning_rate": 3.9758877324809786e-05, + "loss": 2.3213963508605957, + "step": 320 + }, + { + "epoch": 0.08371911064745387, + "grad_norm": 5.5, + "learning_rate": 3.975661256143635e-05, + "loss": 1.9857629537582397, + "step": 321 + }, + { + "epoch": 0.08397991784573254, + "grad_norm": 5.625, + "learning_rate": 3.975433727684786e-05, + "loss": 2.2222909927368164, + "step": 322 + }, + { + "epoch": 0.08424072504401121, + "grad_norm": 5.84375, + "learning_rate": 3.9752051472256e-05, + "loss": 2.161729335784912, + "step": 323 + }, + { + "epoch": 0.08450153224228989, + "grad_norm": 5.40625, + "learning_rate": 3.9749755148878055e-05, + "loss": 2.5349013805389404, + "step": 324 + }, + { + "epoch": 0.08476233944056856, + "grad_norm": 5.9375, + "learning_rate": 3.974744830793691e-05, + "loss": 2.358365535736084, + "step": 325 + }, + { + "epoch": 0.08502314663884723, + "grad_norm": 5.6875, + "learning_rate": 3.974513095066106e-05, + "loss": 2.1969616413116455, + "step": 326 + }, + { + "epoch": 0.0852839538371259, + "grad_norm": 5.78125, + "learning_rate": 3.974280307828459e-05, + "loss": 2.1518025398254395, + "step": 327 + }, + { + "epoch": 0.08554476103540458, + "grad_norm": 5.8125, + "learning_rate": 3.974046469204719e-05, + "loss": 2.4633238315582275, + "step": 328 + }, + { + "epoch": 0.08580556823368325, + "grad_norm": 5.96875, + "learning_rate": 3.9738115793194136e-05, + "loss": 2.2212085723876953, + "step": 329 + }, + { + "epoch": 0.08606637543196193, + "grad_norm": 5.4375, + "learning_rate": 3.9735756382976324e-05, + "loss": 2.3922016620635986, + "step": 330 + }, + { + "epoch": 0.0863271826302406, + "grad_norm": 6.03125, + "learning_rate": 3.973338646265024e-05, + "loss": 2.308976650238037, + "step": 331 + }, + { + "epoch": 0.08658798982851927, + "grad_norm": 5.90625, + "learning_rate": 3.973100603347797e-05, + "loss": 2.289294958114624, + "step": 332 + }, + { + "epoch": 0.08684879702679794, + "grad_norm": 5.28125, + "learning_rate": 3.972861509672717e-05, + "loss": 2.188870429992676, + "step": 333 + }, + { + "epoch": 0.08710960422507662, + "grad_norm": 5.84375, + "learning_rate": 3.972621365367113e-05, + "loss": 2.280560255050659, + "step": 334 + }, + { + "epoch": 0.08737041142335529, + "grad_norm": 5.53125, + "learning_rate": 3.9723801705588715e-05, + "loss": 2.329860210418701, + "step": 335 + }, + { + "epoch": 0.08763121862163396, + "grad_norm": 6.0, + "learning_rate": 3.972137925376439e-05, + "loss": 2.349754571914673, + "step": 336 + }, + { + "epoch": 0.08789202581991264, + "grad_norm": 6.40625, + "learning_rate": 3.9718946299488207e-05, + "loss": 2.040006637573242, + "step": 337 + }, + { + "epoch": 0.0881528330181913, + "grad_norm": 5.40625, + "learning_rate": 3.9716502844055806e-05, + "loss": 2.155709743499756, + "step": 338 + }, + { + "epoch": 0.08841364021646997, + "grad_norm": 6.21875, + "learning_rate": 3.971404888876844e-05, + "loss": 2.2252378463745117, + "step": 339 + }, + { + "epoch": 0.08867444741474864, + "grad_norm": 5.625, + "learning_rate": 3.971158443493295e-05, + "loss": 2.1346254348754883, + "step": 340 + }, + { + "epoch": 0.08893525461302731, + "grad_norm": 5.96875, + "learning_rate": 3.970910948386174e-05, + "loss": 2.261709213256836, + "step": 341 + }, + { + "epoch": 0.08919606181130599, + "grad_norm": 5.59375, + "learning_rate": 3.970662403687283e-05, + "loss": 2.0507190227508545, + "step": 342 + }, + { + "epoch": 0.08945686900958466, + "grad_norm": 5.875, + "learning_rate": 3.970412809528984e-05, + "loss": 2.4081969261169434, + "step": 343 + }, + { + "epoch": 0.08971767620786333, + "grad_norm": 5.6875, + "learning_rate": 3.970162166044194e-05, + "loss": 2.2700247764587402, + "step": 344 + }, + { + "epoch": 0.089978483406142, + "grad_norm": 5.75, + "learning_rate": 3.969910473366392e-05, + "loss": 2.2401342391967773, + "step": 345 + }, + { + "epoch": 0.09023929060442068, + "grad_norm": 5.65625, + "learning_rate": 3.969657731629615e-05, + "loss": 2.1488614082336426, + "step": 346 + }, + { + "epoch": 0.09050009780269935, + "grad_norm": 5.6875, + "learning_rate": 3.969403940968458e-05, + "loss": 2.3223090171813965, + "step": 347 + }, + { + "epoch": 0.09076090500097803, + "grad_norm": 6.0625, + "learning_rate": 3.969149101518075e-05, + "loss": 2.1582388877868652, + "step": 348 + }, + { + "epoch": 0.0910217121992567, + "grad_norm": 5.4375, + "learning_rate": 3.9688932134141795e-05, + "loss": 2.297391653060913, + "step": 349 + }, + { + "epoch": 0.09128251939753537, + "grad_norm": 5.59375, + "learning_rate": 3.968636276793041e-05, + "loss": 2.2849154472351074, + "step": 350 + }, + { + "epoch": 0.09154332659581405, + "grad_norm": 6.0, + "learning_rate": 3.9683782917914906e-05, + "loss": 2.274324417114258, + "step": 351 + }, + { + "epoch": 0.09180413379409272, + "grad_norm": 5.59375, + "learning_rate": 3.9681192585469146e-05, + "loss": 2.4225234985351562, + "step": 352 + }, + { + "epoch": 0.09206494099237139, + "grad_norm": 5.96875, + "learning_rate": 3.96785917719726e-05, + "loss": 2.265866756439209, + "step": 353 + }, + { + "epoch": 0.09232574819065006, + "grad_norm": 5.4375, + "learning_rate": 3.96759804788103e-05, + "loss": 2.2360856533050537, + "step": 354 + }, + { + "epoch": 0.09258655538892874, + "grad_norm": 5.4375, + "learning_rate": 3.9673358707372864e-05, + "loss": 1.9127092361450195, + "step": 355 + }, + { + "epoch": 0.09284736258720741, + "grad_norm": 5.9375, + "learning_rate": 3.967072645905651e-05, + "loss": 2.1417503356933594, + "step": 356 + }, + { + "epoch": 0.09310816978548608, + "grad_norm": 5.75, + "learning_rate": 3.9668083735263014e-05, + "loss": 2.449788808822632, + "step": 357 + }, + { + "epoch": 0.09336897698376476, + "grad_norm": 6.25, + "learning_rate": 3.9665430537399725e-05, + "loss": 1.9705320596694946, + "step": 358 + }, + { + "epoch": 0.09362978418204343, + "grad_norm": 5.53125, + "learning_rate": 3.9662766866879596e-05, + "loss": 1.9395697116851807, + "step": 359 + }, + { + "epoch": 0.0938905913803221, + "grad_norm": 5.625, + "learning_rate": 3.966009272512113e-05, + "loss": 2.2744157314300537, + "step": 360 + }, + { + "epoch": 0.09415139857860078, + "grad_norm": 5.3125, + "learning_rate": 3.9657408113548425e-05, + "loss": 2.2560882568359375, + "step": 361 + }, + { + "epoch": 0.09441220577687945, + "grad_norm": 5.625, + "learning_rate": 3.965471303359114e-05, + "loss": 2.3874456882476807, + "step": 362 + }, + { + "epoch": 0.09467301297515811, + "grad_norm": 5.28125, + "learning_rate": 3.965200748668453e-05, + "loss": 2.2133641242980957, + "step": 363 + }, + { + "epoch": 0.09493382017343678, + "grad_norm": 5.46875, + "learning_rate": 3.96492914742694e-05, + "loss": 2.106085777282715, + "step": 364 + }, + { + "epoch": 0.09519462737171545, + "grad_norm": 5.5625, + "learning_rate": 3.964656499779214e-05, + "loss": 2.3144335746765137, + "step": 365 + }, + { + "epoch": 0.09545543456999413, + "grad_norm": 5.21875, + "learning_rate": 3.964382805870473e-05, + "loss": 1.9709185361862183, + "step": 366 + }, + { + "epoch": 0.0957162417682728, + "grad_norm": 5.65625, + "learning_rate": 3.964108065846467e-05, + "loss": 2.133201837539673, + "step": 367 + }, + { + "epoch": 0.09597704896655147, + "grad_norm": 5.59375, + "learning_rate": 3.963832279853509e-05, + "loss": 2.362656354904175, + "step": 368 + }, + { + "epoch": 0.09623785616483015, + "grad_norm": 5.09375, + "learning_rate": 3.963555448038466e-05, + "loss": 2.0962352752685547, + "step": 369 + }, + { + "epoch": 0.09649866336310882, + "grad_norm": 5.46875, + "learning_rate": 3.963277570548761e-05, + "loss": 2.27280330657959, + "step": 370 + }, + { + "epoch": 0.09675947056138749, + "grad_norm": 5.875, + "learning_rate": 3.9629986475323773e-05, + "loss": 2.6300599575042725, + "step": 371 + }, + { + "epoch": 0.09702027775966617, + "grad_norm": 5.75, + "learning_rate": 3.962718679137852e-05, + "loss": 2.254918098449707, + "step": 372 + }, + { + "epoch": 0.09728108495794484, + "grad_norm": 5.34375, + "learning_rate": 3.96243766551428e-05, + "loss": 2.279177665710449, + "step": 373 + }, + { + "epoch": 0.09754189215622351, + "grad_norm": 5.09375, + "learning_rate": 3.9621556068113124e-05, + "loss": 2.049755096435547, + "step": 374 + }, + { + "epoch": 0.09780269935450218, + "grad_norm": 5.4375, + "learning_rate": 3.961872503179158e-05, + "loss": 2.3396801948547363, + "step": 375 + }, + { + "epoch": 0.09806350655278086, + "grad_norm": 6.21875, + "learning_rate": 3.961588354768579e-05, + "loss": 2.226260185241699, + "step": 376 + }, + { + "epoch": 0.09832431375105953, + "grad_norm": 5.5, + "learning_rate": 3.9613031617309e-05, + "loss": 2.1153340339660645, + "step": 377 + }, + { + "epoch": 0.0985851209493382, + "grad_norm": 5.21875, + "learning_rate": 3.9610169242179944e-05, + "loss": 1.9020158052444458, + "step": 378 + }, + { + "epoch": 0.09884592814761688, + "grad_norm": 6.03125, + "learning_rate": 3.9607296423822976e-05, + "loss": 2.5020313262939453, + "step": 379 + }, + { + "epoch": 0.09910673534589555, + "grad_norm": 5.15625, + "learning_rate": 3.9604413163767985e-05, + "loss": 1.8208396434783936, + "step": 380 + }, + { + "epoch": 0.09936754254417422, + "grad_norm": 6.8125, + "learning_rate": 3.960151946355043e-05, + "loss": 2.2956275939941406, + "step": 381 + }, + { + "epoch": 0.0996283497424529, + "grad_norm": 5.53125, + "learning_rate": 3.9598615324711325e-05, + "loss": 2.058102607727051, + "step": 382 + }, + { + "epoch": 0.09988915694073157, + "grad_norm": 6.15625, + "learning_rate": 3.9595700748797235e-05, + "loss": 2.239431142807007, + "step": 383 + }, + { + "epoch": 0.10014996413901024, + "grad_norm": 5.84375, + "learning_rate": 3.959277573736031e-05, + "loss": 2.1374924182891846, + "step": 384 + }, + { + "epoch": 0.10041077133728891, + "grad_norm": 6.0, + "learning_rate": 3.9589840291958224e-05, + "loss": 2.121920108795166, + "step": 385 + }, + { + "epoch": 0.10067157853556759, + "grad_norm": 5.71875, + "learning_rate": 3.958689441415423e-05, + "loss": 2.0743794441223145, + "step": 386 + }, + { + "epoch": 0.10093238573384626, + "grad_norm": 5.59375, + "learning_rate": 3.9583938105517127e-05, + "loss": 2.166443347930908, + "step": 387 + }, + { + "epoch": 0.10119319293212492, + "grad_norm": 5.40625, + "learning_rate": 3.958097136762128e-05, + "loss": 2.2176623344421387, + "step": 388 + }, + { + "epoch": 0.1014540001304036, + "grad_norm": 6.0, + "learning_rate": 3.957799420204659e-05, + "loss": 2.3164663314819336, + "step": 389 + }, + { + "epoch": 0.10171480732868227, + "grad_norm": 5.21875, + "learning_rate": 3.9575006610378524e-05, + "loss": 2.0331268310546875, + "step": 390 + }, + { + "epoch": 0.10197561452696094, + "grad_norm": 5.71875, + "learning_rate": 3.95720085942081e-05, + "loss": 2.378547191619873, + "step": 391 + }, + { + "epoch": 0.10223642172523961, + "grad_norm": 5.78125, + "learning_rate": 3.956900015513189e-05, + "loss": 1.9658644199371338, + "step": 392 + }, + { + "epoch": 0.10249722892351829, + "grad_norm": 5.03125, + "learning_rate": 3.9565981294752004e-05, + "loss": 2.014651298522949, + "step": 393 + }, + { + "epoch": 0.10275803612179696, + "grad_norm": 5.09375, + "learning_rate": 3.9562952014676116e-05, + "loss": 2.28570818901062, + "step": 394 + }, + { + "epoch": 0.10301884332007563, + "grad_norm": 5.5625, + "learning_rate": 3.955991231651744e-05, + "loss": 2.0314409732818604, + "step": 395 + }, + { + "epoch": 0.1032796505183543, + "grad_norm": 5.125, + "learning_rate": 3.9556862201894745e-05, + "loss": 2.262718915939331, + "step": 396 + }, + { + "epoch": 0.10354045771663298, + "grad_norm": 4.96875, + "learning_rate": 3.955380167243234e-05, + "loss": 2.2126426696777344, + "step": 397 + }, + { + "epoch": 0.10380126491491165, + "grad_norm": 5.625, + "learning_rate": 3.9550730729760086e-05, + "loss": 1.9544909000396729, + "step": 398 + }, + { + "epoch": 0.10406207211319032, + "grad_norm": 5.0, + "learning_rate": 3.954764937551338e-05, + "loss": 2.2505781650543213, + "step": 399 + }, + { + "epoch": 0.104322879311469, + "grad_norm": 5.40625, + "learning_rate": 3.9544557611333185e-05, + "loss": 2.292494297027588, + "step": 400 + }, + { + "epoch": 0.10458368650974767, + "grad_norm": 5.21875, + "learning_rate": 3.954145543886599e-05, + "loss": 2.254851818084717, + "step": 401 + }, + { + "epoch": 0.10484449370802634, + "grad_norm": 5.21875, + "learning_rate": 3.9538342859763814e-05, + "loss": 2.1972813606262207, + "step": 402 + }, + { + "epoch": 0.10510530090630502, + "grad_norm": 5.0625, + "learning_rate": 3.9535219875684256e-05, + "loss": 2.0275416374206543, + "step": 403 + }, + { + "epoch": 0.10536610810458369, + "grad_norm": 4.90625, + "learning_rate": 3.953208648829042e-05, + "loss": 2.085601806640625, + "step": 404 + }, + { + "epoch": 0.10562691530286236, + "grad_norm": 5.4375, + "learning_rate": 3.9528942699250975e-05, + "loss": 2.362072229385376, + "step": 405 + }, + { + "epoch": 0.10588772250114104, + "grad_norm": 4.90625, + "learning_rate": 3.9525788510240105e-05, + "loss": 2.1449997425079346, + "step": 406 + }, + { + "epoch": 0.10614852969941971, + "grad_norm": 5.25, + "learning_rate": 3.9522623922937565e-05, + "loss": 2.0494844913482666, + "step": 407 + }, + { + "epoch": 0.10640933689769838, + "grad_norm": 5.40625, + "learning_rate": 3.9519448939028604e-05, + "loss": 2.4976119995117188, + "step": 408 + }, + { + "epoch": 0.10667014409597705, + "grad_norm": 5.90625, + "learning_rate": 3.951626356020406e-05, + "loss": 2.2443673610687256, + "step": 409 + }, + { + "epoch": 0.10693095129425573, + "grad_norm": 5.625, + "learning_rate": 3.9513067788160264e-05, + "loss": 1.9980010986328125, + "step": 410 + }, + { + "epoch": 0.1071917584925344, + "grad_norm": 5.4375, + "learning_rate": 3.95098616245991e-05, + "loss": 1.8560242652893066, + "step": 411 + }, + { + "epoch": 0.10745256569081307, + "grad_norm": 5.25, + "learning_rate": 3.950664507122798e-05, + "loss": 2.078575611114502, + "step": 412 + }, + { + "epoch": 0.10771337288909173, + "grad_norm": 5.84375, + "learning_rate": 3.950341812975986e-05, + "loss": 2.282409429550171, + "step": 413 + }, + { + "epoch": 0.1079741800873704, + "grad_norm": 5.53125, + "learning_rate": 3.950018080191321e-05, + "loss": 2.147897243499756, + "step": 414 + }, + { + "epoch": 0.10823498728564908, + "grad_norm": 5.28125, + "learning_rate": 3.949693308941205e-05, + "loss": 1.7462961673736572, + "step": 415 + }, + { + "epoch": 0.10849579448392775, + "grad_norm": 5.53125, + "learning_rate": 3.9493674993985906e-05, + "loss": 2.07855224609375, + "step": 416 + }, + { + "epoch": 0.10875660168220642, + "grad_norm": 5.25, + "learning_rate": 3.949040651736987e-05, + "loss": 2.297809600830078, + "step": 417 + }, + { + "epoch": 0.1090174088804851, + "grad_norm": 5.03125, + "learning_rate": 3.948712766130454e-05, + "loss": 1.9195914268493652, + "step": 418 + }, + { + "epoch": 0.10927821607876377, + "grad_norm": 5.8125, + "learning_rate": 3.948383842753602e-05, + "loss": 2.3484573364257812, + "step": 419 + }, + { + "epoch": 0.10953902327704244, + "grad_norm": 5.5625, + "learning_rate": 3.948053881781598e-05, + "loss": 1.9339882135391235, + "step": 420 + }, + { + "epoch": 0.10979983047532112, + "grad_norm": 5.5, + "learning_rate": 3.9477228833901604e-05, + "loss": 2.1224734783172607, + "step": 421 + }, + { + "epoch": 0.11006063767359979, + "grad_norm": 5.03125, + "learning_rate": 3.947390847755559e-05, + "loss": 1.802683711051941, + "step": 422 + }, + { + "epoch": 0.11032144487187846, + "grad_norm": 5.3125, + "learning_rate": 3.9470577750546155e-05, + "loss": 2.37097430229187, + "step": 423 + }, + { + "epoch": 0.11058225207015714, + "grad_norm": 5.75, + "learning_rate": 3.946723665464706e-05, + "loss": 2.007756471633911, + "step": 424 + }, + { + "epoch": 0.11084305926843581, + "grad_norm": 5.34375, + "learning_rate": 3.946388519163757e-05, + "loss": 1.7305908203125, + "step": 425 + }, + { + "epoch": 0.11110386646671448, + "grad_norm": 5.78125, + "learning_rate": 3.946052336330249e-05, + "loss": 2.0597658157348633, + "step": 426 + }, + { + "epoch": 0.11136467366499316, + "grad_norm": 5.90625, + "learning_rate": 3.945715117143213e-05, + "loss": 2.43072772026062, + "step": 427 + }, + { + "epoch": 0.11162548086327183, + "grad_norm": 6.4375, + "learning_rate": 3.9453768617822305e-05, + "loss": 2.028156042098999, + "step": 428 + }, + { + "epoch": 0.1118862880615505, + "grad_norm": 5.15625, + "learning_rate": 3.945037570427439e-05, + "loss": 1.962646722793579, + "step": 429 + }, + { + "epoch": 0.11214709525982917, + "grad_norm": 6.15625, + "learning_rate": 3.944697243259523e-05, + "loss": 2.209850788116455, + "step": 430 + }, + { + "epoch": 0.11240790245810785, + "grad_norm": 5.09375, + "learning_rate": 3.944355880459723e-05, + "loss": 2.0976197719573975, + "step": 431 + }, + { + "epoch": 0.11266870965638652, + "grad_norm": 6.96875, + "learning_rate": 3.9440134822098264e-05, + "loss": 2.148529052734375, + "step": 432 + }, + { + "epoch": 0.1129295168546652, + "grad_norm": 5.09375, + "learning_rate": 3.9436700486921756e-05, + "loss": 1.90488862991333, + "step": 433 + }, + { + "epoch": 0.11319032405294387, + "grad_norm": 5.46875, + "learning_rate": 3.9433255800896646e-05, + "loss": 2.175771713256836, + "step": 434 + }, + { + "epoch": 0.11345113125122254, + "grad_norm": 5.0, + "learning_rate": 3.942980076585735e-05, + "loss": 1.9042470455169678, + "step": 435 + }, + { + "epoch": 0.11371193844950121, + "grad_norm": 6.34375, + "learning_rate": 3.942633538364383e-05, + "loss": 1.9849103689193726, + "step": 436 + }, + { + "epoch": 0.11397274564777989, + "grad_norm": 5.34375, + "learning_rate": 3.942285965610154e-05, + "loss": 2.1257541179656982, + "step": 437 + }, + { + "epoch": 0.11423355284605854, + "grad_norm": 5.40625, + "learning_rate": 3.941937358508145e-05, + "loss": 1.892786979675293, + "step": 438 + }, + { + "epoch": 0.11449436004433722, + "grad_norm": 5.0625, + "learning_rate": 3.9415877172440045e-05, + "loss": 2.0031538009643555, + "step": 439 + }, + { + "epoch": 0.11475516724261589, + "grad_norm": 5.375, + "learning_rate": 3.9412370420039295e-05, + "loss": 1.965477705001831, + "step": 440 + }, + { + "epoch": 0.11501597444089456, + "grad_norm": 4.65625, + "learning_rate": 3.94088533297467e-05, + "loss": 2.049996852874756, + "step": 441 + }, + { + "epoch": 0.11527678163917324, + "grad_norm": 5.21875, + "learning_rate": 3.9405325903435254e-05, + "loss": 2.175043821334839, + "step": 442 + }, + { + "epoch": 0.11553758883745191, + "grad_norm": 5.34375, + "learning_rate": 3.940178814298347e-05, + "loss": 2.2416770458221436, + "step": 443 + }, + { + "epoch": 0.11579839603573058, + "grad_norm": 6.125, + "learning_rate": 3.939824005027533e-05, + "loss": 1.885480523109436, + "step": 444 + }, + { + "epoch": 0.11605920323400926, + "grad_norm": 5.15625, + "learning_rate": 3.939468162720035e-05, + "loss": 2.01684308052063, + "step": 445 + }, + { + "epoch": 0.11632001043228793, + "grad_norm": 5.875, + "learning_rate": 3.939111287565354e-05, + "loss": 2.12481689453125, + "step": 446 + }, + { + "epoch": 0.1165808176305666, + "grad_norm": 5.25, + "learning_rate": 3.938753379753542e-05, + "loss": 1.9678071737289429, + "step": 447 + }, + { + "epoch": 0.11684162482884528, + "grad_norm": 5.9375, + "learning_rate": 3.9383944394751975e-05, + "loss": 1.8941469192504883, + "step": 448 + }, + { + "epoch": 0.11710243202712395, + "grad_norm": 5.625, + "learning_rate": 3.938034466921472e-05, + "loss": 2.2009029388427734, + "step": 449 + }, + { + "epoch": 0.11736323922540262, + "grad_norm": 4.75, + "learning_rate": 3.937673462284066e-05, + "loss": 1.7105543613433838, + "step": 450 + }, + { + "epoch": 0.1176240464236813, + "grad_norm": 5.3125, + "learning_rate": 3.937311425755229e-05, + "loss": 1.9967753887176514, + "step": 451 + }, + { + "epoch": 0.11788485362195997, + "grad_norm": 5.0625, + "learning_rate": 3.9369483575277615e-05, + "loss": 2.0934154987335205, + "step": 452 + }, + { + "epoch": 0.11814566082023864, + "grad_norm": 5.21875, + "learning_rate": 3.936584257795011e-05, + "loss": 2.0326435565948486, + "step": 453 + }, + { + "epoch": 0.11840646801851731, + "grad_norm": 5.71875, + "learning_rate": 3.936219126750876e-05, + "loss": 2.027519464492798, + "step": 454 + }, + { + "epoch": 0.11866727521679599, + "grad_norm": 5.25, + "learning_rate": 3.9358529645898054e-05, + "loss": 2.176551580429077, + "step": 455 + }, + { + "epoch": 0.11892808241507466, + "grad_norm": 5.40625, + "learning_rate": 3.935485771506794e-05, + "loss": 2.0712437629699707, + "step": 456 + }, + { + "epoch": 0.11918888961335333, + "grad_norm": 5.1875, + "learning_rate": 3.935117547697387e-05, + "loss": 1.9934316873550415, + "step": 457 + }, + { + "epoch": 0.119449696811632, + "grad_norm": 5.0625, + "learning_rate": 3.934748293357682e-05, + "loss": 2.2428603172302246, + "step": 458 + }, + { + "epoch": 0.11971050400991068, + "grad_norm": 5.125, + "learning_rate": 3.934378008684318e-05, + "loss": 2.0849642753601074, + "step": 459 + }, + { + "epoch": 0.11997131120818935, + "grad_norm": 4.90625, + "learning_rate": 3.934006693874489e-05, + "loss": 1.9194163084030151, + "step": 460 + }, + { + "epoch": 0.12023211840646802, + "grad_norm": 5.28125, + "learning_rate": 3.933634349125936e-05, + "loss": 2.1880245208740234, + "step": 461 + }, + { + "epoch": 0.1204929256047467, + "grad_norm": 5.46875, + "learning_rate": 3.933260974636948e-05, + "loss": 1.957980990409851, + "step": 462 + }, + { + "epoch": 0.12075373280302536, + "grad_norm": 5.09375, + "learning_rate": 3.932886570606361e-05, + "loss": 1.950570821762085, + "step": 463 + }, + { + "epoch": 0.12101454000130403, + "grad_norm": 5.71875, + "learning_rate": 3.9325111372335616e-05, + "loss": 2.274207353591919, + "step": 464 + }, + { + "epoch": 0.1212753471995827, + "grad_norm": 4.96875, + "learning_rate": 3.932134674718484e-05, + "loss": 2.1359615325927734, + "step": 465 + }, + { + "epoch": 0.12153615439786138, + "grad_norm": 5.375, + "learning_rate": 3.931757183261609e-05, + "loss": 2.3798720836639404, + "step": 466 + }, + { + "epoch": 0.12179696159614005, + "grad_norm": 4.9375, + "learning_rate": 3.9313786630639676e-05, + "loss": 2.214578628540039, + "step": 467 + }, + { + "epoch": 0.12205776879441872, + "grad_norm": 5.3125, + "learning_rate": 3.930999114327137e-05, + "loss": 2.2453343868255615, + "step": 468 + }, + { + "epoch": 0.1223185759926974, + "grad_norm": 4.6875, + "learning_rate": 3.930618537253242e-05, + "loss": 1.9698379039764404, + "step": 469 + }, + { + "epoch": 0.12257938319097607, + "grad_norm": 4.84375, + "learning_rate": 3.930236932044957e-05, + "loss": 1.7252676486968994, + "step": 470 + }, + { + "epoch": 0.12284019038925474, + "grad_norm": 4.5625, + "learning_rate": 3.929854298905502e-05, + "loss": 2.0300962924957275, + "step": 471 + }, + { + "epoch": 0.12310099758753341, + "grad_norm": 5.28125, + "learning_rate": 3.929470638038645e-05, + "loss": 2.0499699115753174, + "step": 472 + }, + { + "epoch": 0.12336180478581209, + "grad_norm": 5.09375, + "learning_rate": 3.9290859496487e-05, + "loss": 1.9883675575256348, + "step": 473 + }, + { + "epoch": 0.12362261198409076, + "grad_norm": 5.40625, + "learning_rate": 3.928700233940531e-05, + "loss": 2.0695719718933105, + "step": 474 + }, + { + "epoch": 0.12388341918236943, + "grad_norm": 5.0625, + "learning_rate": 3.928313491119548e-05, + "loss": 2.2247023582458496, + "step": 475 + }, + { + "epoch": 0.1241442263806481, + "grad_norm": 5.09375, + "learning_rate": 3.927925721391707e-05, + "loss": 2.141120195388794, + "step": 476 + }, + { + "epoch": 0.12440503357892678, + "grad_norm": 4.4375, + "learning_rate": 3.9275369249635106e-05, + "loss": 1.8078746795654297, + "step": 477 + }, + { + "epoch": 0.12466584077720545, + "grad_norm": 5.53125, + "learning_rate": 3.927147102042011e-05, + "loss": 2.255552053451538, + "step": 478 + }, + { + "epoch": 0.12492664797548413, + "grad_norm": 5.0625, + "learning_rate": 3.926756252834802e-05, + "loss": 2.1611335277557373, + "step": 479 + }, + { + "epoch": 0.12518745517376278, + "grad_norm": 4.90625, + "learning_rate": 3.92636437755003e-05, + "loss": 2.0030574798583984, + "step": 480 + }, + { + "epoch": 0.12544826237204146, + "grad_norm": 4.875, + "learning_rate": 3.9259714763963834e-05, + "loss": 1.8799573183059692, + "step": 481 + }, + { + "epoch": 0.12570906957032013, + "grad_norm": 5.0, + "learning_rate": 3.925577549583099e-05, + "loss": 1.946972131729126, + "step": 482 + }, + { + "epoch": 0.1259698767685988, + "grad_norm": 4.59375, + "learning_rate": 3.925182597319958e-05, + "loss": 2.016092538833618, + "step": 483 + }, + { + "epoch": 0.12623068396687748, + "grad_norm": 4.8125, + "learning_rate": 3.92478661981729e-05, + "loss": 2.0311694145202637, + "step": 484 + }, + { + "epoch": 0.12649149116515615, + "grad_norm": 5.09375, + "learning_rate": 3.924389617285969e-05, + "loss": 1.8325190544128418, + "step": 485 + }, + { + "epoch": 0.12675229836343482, + "grad_norm": 5.3125, + "learning_rate": 3.9239915899374153e-05, + "loss": 2.627725124359131, + "step": 486 + }, + { + "epoch": 0.1270131055617135, + "grad_norm": 7.65625, + "learning_rate": 3.923592537983595e-05, + "loss": 2.2398009300231934, + "step": 487 + }, + { + "epoch": 0.12727391275999217, + "grad_norm": 5.3125, + "learning_rate": 3.92319246163702e-05, + "loss": 1.9798343181610107, + "step": 488 + }, + { + "epoch": 0.12753471995827084, + "grad_norm": 4.84375, + "learning_rate": 3.922791361110747e-05, + "loss": 1.962763786315918, + "step": 489 + }, + { + "epoch": 0.12779552715654952, + "grad_norm": 6.8125, + "learning_rate": 3.9223892366183795e-05, + "loss": 2.0241403579711914, + "step": 490 + }, + { + "epoch": 0.1280563343548282, + "grad_norm": 5.0625, + "learning_rate": 3.921986088374064e-05, + "loss": 2.0447614192962646, + "step": 491 + }, + { + "epoch": 0.12831714155310686, + "grad_norm": 4.78125, + "learning_rate": 3.9215819165924956e-05, + "loss": 1.7903733253479004, + "step": 492 + }, + { + "epoch": 0.12857794875138553, + "grad_norm": 5.65625, + "learning_rate": 3.9211767214889114e-05, + "loss": 2.12595796585083, + "step": 493 + }, + { + "epoch": 0.1288387559496642, + "grad_norm": 5.6875, + "learning_rate": 3.9207705032790944e-05, + "loss": 1.962897539138794, + "step": 494 + }, + { + "epoch": 0.12909956314794288, + "grad_norm": 4.90625, + "learning_rate": 3.9203632621793726e-05, + "loss": 1.8787273168563843, + "step": 495 + }, + { + "epoch": 0.12936037034622155, + "grad_norm": 4.90625, + "learning_rate": 3.91995499840662e-05, + "loss": 1.9988367557525635, + "step": 496 + }, + { + "epoch": 0.12962117754450023, + "grad_norm": 5.125, + "learning_rate": 3.919545712178253e-05, + "loss": 2.086315155029297, + "step": 497 + }, + { + "epoch": 0.1298819847427789, + "grad_norm": 5.0625, + "learning_rate": 3.919135403712233e-05, + "loss": 1.9002385139465332, + "step": 498 + }, + { + "epoch": 0.13014279194105757, + "grad_norm": 5.5, + "learning_rate": 3.9187240732270675e-05, + "loss": 2.037102460861206, + "step": 499 + }, + { + "epoch": 0.13040359913933625, + "grad_norm": 4.96875, + "learning_rate": 3.9183117209418055e-05, + "loss": 1.9531993865966797, + "step": 500 + }, + { + "epoch": 0.13066440633761492, + "grad_norm": 5.15625, + "learning_rate": 3.917898347076043e-05, + "loss": 2.143584728240967, + "step": 501 + }, + { + "epoch": 0.1309252135358936, + "grad_norm": 4.78125, + "learning_rate": 3.917483951849919e-05, + "loss": 2.0562262535095215, + "step": 502 + }, + { + "epoch": 0.13118602073417227, + "grad_norm": 5.46875, + "learning_rate": 3.917068535484114e-05, + "loss": 2.0279130935668945, + "step": 503 + }, + { + "epoch": 0.13144682793245094, + "grad_norm": 5.0625, + "learning_rate": 3.916652098199857e-05, + "loss": 1.8175674676895142, + "step": 504 + }, + { + "epoch": 0.1317076351307296, + "grad_norm": 5.28125, + "learning_rate": 3.916234640218917e-05, + "loss": 2.081401824951172, + "step": 505 + }, + { + "epoch": 0.13196844232900828, + "grad_norm": 5.1875, + "learning_rate": 3.915816161763607e-05, + "loss": 1.9667434692382812, + "step": 506 + }, + { + "epoch": 0.13222924952728696, + "grad_norm": 4.65625, + "learning_rate": 3.915396663056784e-05, + "loss": 1.7676701545715332, + "step": 507 + }, + { + "epoch": 0.13249005672556563, + "grad_norm": 5.34375, + "learning_rate": 3.91497614432185e-05, + "loss": 2.0130467414855957, + "step": 508 + }, + { + "epoch": 0.1327508639238443, + "grad_norm": 5.78125, + "learning_rate": 3.914554605782749e-05, + "loss": 2.157017707824707, + "step": 509 + }, + { + "epoch": 0.13301167112212298, + "grad_norm": 5.3125, + "learning_rate": 3.914132047663965e-05, + "loss": 1.9875788688659668, + "step": 510 + }, + { + "epoch": 0.13327247832040165, + "grad_norm": 4.875, + "learning_rate": 3.91370847019053e-05, + "loss": 1.7901101112365723, + "step": 511 + }, + { + "epoch": 0.13353328551868032, + "grad_norm": 5.0625, + "learning_rate": 3.913283873588016e-05, + "loss": 2.2028799057006836, + "step": 512 + }, + { + "epoch": 0.133794092716959, + "grad_norm": 5.0625, + "learning_rate": 3.912858258082538e-05, + "loss": 2.07771635055542, + "step": 513 + }, + { + "epoch": 0.13405489991523767, + "grad_norm": 4.8125, + "learning_rate": 3.9124316239007535e-05, + "loss": 1.8496947288513184, + "step": 514 + }, + { + "epoch": 0.13431570711351634, + "grad_norm": 4.90625, + "learning_rate": 3.912003971269864e-05, + "loss": 2.029522180557251, + "step": 515 + }, + { + "epoch": 0.13457651431179501, + "grad_norm": 4.9375, + "learning_rate": 3.911575300417612e-05, + "loss": 2.2308075428009033, + "step": 516 + }, + { + "epoch": 0.1348373215100737, + "grad_norm": 5.0625, + "learning_rate": 3.911145611572282e-05, + "loss": 2.0410444736480713, + "step": 517 + }, + { + "epoch": 0.13509812870835236, + "grad_norm": 5.125, + "learning_rate": 3.9107149049627014e-05, + "loss": 2.071100950241089, + "step": 518 + }, + { + "epoch": 0.13535893590663103, + "grad_norm": 4.8125, + "learning_rate": 3.9102831808182384e-05, + "loss": 1.8825916051864624, + "step": 519 + }, + { + "epoch": 0.1356197431049097, + "grad_norm": 5.28125, + "learning_rate": 3.9098504393688055e-05, + "loss": 2.1796164512634277, + "step": 520 + }, + { + "epoch": 0.13588055030318838, + "grad_norm": 5.40625, + "learning_rate": 3.9094166808448546e-05, + "loss": 2.2033798694610596, + "step": 521 + }, + { + "epoch": 0.13614135750146705, + "grad_norm": 5.375, + "learning_rate": 3.908981905477381e-05, + "loss": 1.9869440793991089, + "step": 522 + }, + { + "epoch": 0.13640216469974573, + "grad_norm": 5.34375, + "learning_rate": 3.908546113497919e-05, + "loss": 1.9655147790908813, + "step": 523 + }, + { + "epoch": 0.1366629718980244, + "grad_norm": 10.5, + "learning_rate": 3.908109305138547e-05, + "loss": 2.0668673515319824, + "step": 524 + }, + { + "epoch": 0.13692377909630304, + "grad_norm": 5.15625, + "learning_rate": 3.9076714806318835e-05, + "loss": 2.0768802165985107, + "step": 525 + }, + { + "epoch": 0.13718458629458172, + "grad_norm": 4.96875, + "learning_rate": 3.907232640211089e-05, + "loss": 2.2760894298553467, + "step": 526 + }, + { + "epoch": 0.1374453934928604, + "grad_norm": 4.78125, + "learning_rate": 3.9067927841098614e-05, + "loss": 2.092144727706909, + "step": 527 + }, + { + "epoch": 0.13770620069113906, + "grad_norm": 5.09375, + "learning_rate": 3.906351912562445e-05, + "loss": 2.464493989944458, + "step": 528 + }, + { + "epoch": 0.13796700788941774, + "grad_norm": 5.09375, + "learning_rate": 3.9059100258036214e-05, + "loss": 2.144874095916748, + "step": 529 + }, + { + "epoch": 0.1382278150876964, + "grad_norm": 4.9375, + "learning_rate": 3.9054671240687134e-05, + "loss": 1.8832736015319824, + "step": 530 + }, + { + "epoch": 0.13848862228597508, + "grad_norm": 5.03125, + "learning_rate": 3.905023207593585e-05, + "loss": 2.051281690597534, + "step": 531 + }, + { + "epoch": 0.13874942948425376, + "grad_norm": 4.65625, + "learning_rate": 3.904578276614639e-05, + "loss": 1.959316372871399, + "step": 532 + }, + { + "epoch": 0.13901023668253243, + "grad_norm": 5.09375, + "learning_rate": 3.9041323313688215e-05, + "loss": 2.2046756744384766, + "step": 533 + }, + { + "epoch": 0.1392710438808111, + "grad_norm": 4.5625, + "learning_rate": 3.903685372093615e-05, + "loss": 2.0775370597839355, + "step": 534 + }, + { + "epoch": 0.13953185107908977, + "grad_norm": 5.03125, + "learning_rate": 3.903237399027044e-05, + "loss": 2.0336546897888184, + "step": 535 + }, + { + "epoch": 0.13979265827736845, + "grad_norm": 5.0625, + "learning_rate": 3.902788412407675e-05, + "loss": 1.987802505493164, + "step": 536 + }, + { + "epoch": 0.14005346547564712, + "grad_norm": 5.03125, + "learning_rate": 3.9023384124746085e-05, + "loss": 1.9189298152923584, + "step": 537 + }, + { + "epoch": 0.1403142726739258, + "grad_norm": 4.9375, + "learning_rate": 3.901887399467491e-05, + "loss": 1.8632619380950928, + "step": 538 + }, + { + "epoch": 0.14057507987220447, + "grad_norm": 5.25, + "learning_rate": 3.9014353736265034e-05, + "loss": 2.0026168823242188, + "step": 539 + }, + { + "epoch": 0.14083588707048314, + "grad_norm": 5.03125, + "learning_rate": 3.90098233519237e-05, + "loss": 1.866854190826416, + "step": 540 + }, + { + "epoch": 0.1410966942687618, + "grad_norm": 5.1875, + "learning_rate": 3.900528284406352e-05, + "loss": 2.2660746574401855, + "step": 541 + }, + { + "epoch": 0.1413575014670405, + "grad_norm": 5.375, + "learning_rate": 3.90007322151025e-05, + "loss": 2.1613218784332275, + "step": 542 + }, + { + "epoch": 0.14161830866531916, + "grad_norm": 5.21875, + "learning_rate": 3.899617146746404e-05, + "loss": 2.2124698162078857, + "step": 543 + }, + { + "epoch": 0.14187911586359783, + "grad_norm": 4.875, + "learning_rate": 3.899160060357693e-05, + "loss": 2.103794574737549, + "step": 544 + }, + { + "epoch": 0.1421399230618765, + "grad_norm": 4.75, + "learning_rate": 3.898701962587533e-05, + "loss": 1.8552050590515137, + "step": 545 + }, + { + "epoch": 0.14240073026015518, + "grad_norm": 5.5625, + "learning_rate": 3.898242853679882e-05, + "loss": 2.0167746543884277, + "step": 546 + }, + { + "epoch": 0.14266153745843385, + "grad_norm": 5.53125, + "learning_rate": 3.8977827338792334e-05, + "loss": 2.2357492446899414, + "step": 547 + }, + { + "epoch": 0.14292234465671252, + "grad_norm": 4.78125, + "learning_rate": 3.89732160343062e-05, + "loss": 1.7399003505706787, + "step": 548 + }, + { + "epoch": 0.1431831518549912, + "grad_norm": 4.96875, + "learning_rate": 3.896859462579614e-05, + "loss": 2.075052499771118, + "step": 549 + }, + { + "epoch": 0.14344395905326987, + "grad_norm": 4.375, + "learning_rate": 3.8963963115723234e-05, + "loss": 1.7772223949432373, + "step": 550 + }, + { + "epoch": 0.14370476625154854, + "grad_norm": 4.53125, + "learning_rate": 3.8959321506553955e-05, + "loss": 1.6828080415725708, + "step": 551 + }, + { + "epoch": 0.14396557344982722, + "grad_norm": 5.46875, + "learning_rate": 3.8954669800760164e-05, + "loss": 2.1450443267822266, + "step": 552 + }, + { + "epoch": 0.1442263806481059, + "grad_norm": 5.28125, + "learning_rate": 3.895000800081907e-05, + "loss": 2.1769192218780518, + "step": 553 + }, + { + "epoch": 0.14448718784638456, + "grad_norm": 4.8125, + "learning_rate": 3.894533610921328e-05, + "loss": 2.011871814727783, + "step": 554 + }, + { + "epoch": 0.14474799504466324, + "grad_norm": 4.8125, + "learning_rate": 3.8940654128430766e-05, + "loss": 1.8749043941497803, + "step": 555 + }, + { + "epoch": 0.1450088022429419, + "grad_norm": 4.9375, + "learning_rate": 3.893596206096489e-05, + "loss": 2.031252145767212, + "step": 556 + }, + { + "epoch": 0.14526960944122058, + "grad_norm": 5.0, + "learning_rate": 3.893125990931437e-05, + "loss": 2.2448625564575195, + "step": 557 + }, + { + "epoch": 0.14553041663949925, + "grad_norm": 5.125, + "learning_rate": 3.8926547675983286e-05, + "loss": 2.0833122730255127, + "step": 558 + }, + { + "epoch": 0.14579122383777793, + "grad_norm": 4.5, + "learning_rate": 3.89218253634811e-05, + "loss": 2.0020689964294434, + "step": 559 + }, + { + "epoch": 0.1460520310360566, + "grad_norm": 4.8125, + "learning_rate": 3.891709297432265e-05, + "loss": 1.9792006015777588, + "step": 560 + }, + { + "epoch": 0.14631283823433527, + "grad_norm": 4.78125, + "learning_rate": 3.891235051102812e-05, + "loss": 2.0448176860809326, + "step": 561 + }, + { + "epoch": 0.14657364543261395, + "grad_norm": 4.90625, + "learning_rate": 3.890759797612307e-05, + "loss": 2.2394766807556152, + "step": 562 + }, + { + "epoch": 0.14683445263089262, + "grad_norm": 4.84375, + "learning_rate": 3.890283537213842e-05, + "loss": 2.0798072814941406, + "step": 563 + }, + { + "epoch": 0.1470952598291713, + "grad_norm": 5.53125, + "learning_rate": 3.889806270161046e-05, + "loss": 2.048489809036255, + "step": 564 + }, + { + "epoch": 0.14735606702744997, + "grad_norm": 5.0625, + "learning_rate": 3.889327996708083e-05, + "loss": 2.113455295562744, + "step": 565 + }, + { + "epoch": 0.14761687422572864, + "grad_norm": 4.75, + "learning_rate": 3.888848717109653e-05, + "loss": 1.6562857627868652, + "step": 566 + }, + { + "epoch": 0.1478776814240073, + "grad_norm": 4.84375, + "learning_rate": 3.888368431620993e-05, + "loss": 2.054020404815674, + "step": 567 + }, + { + "epoch": 0.14813848862228599, + "grad_norm": 4.75, + "learning_rate": 3.887887140497875e-05, + "loss": 1.9090944528579712, + "step": 568 + }, + { + "epoch": 0.14839929582056466, + "grad_norm": 5.03125, + "learning_rate": 3.887404843996606e-05, + "loss": 1.945820927619934, + "step": 569 + }, + { + "epoch": 0.14866010301884333, + "grad_norm": 5.1875, + "learning_rate": 3.8869215423740285e-05, + "loss": 2.0362110137939453, + "step": 570 + }, + { + "epoch": 0.148920910217122, + "grad_norm": 4.71875, + "learning_rate": 3.886437235887522e-05, + "loss": 1.8869293928146362, + "step": 571 + }, + { + "epoch": 0.14918171741540068, + "grad_norm": 4.9375, + "learning_rate": 3.8859519247949984e-05, + "loss": 2.261014223098755, + "step": 572 + }, + { + "epoch": 0.14944252461367935, + "grad_norm": 4.90625, + "learning_rate": 3.8854656093549075e-05, + "loss": 1.9760932922363281, + "step": 573 + }, + { + "epoch": 0.14970333181195802, + "grad_norm": 4.5625, + "learning_rate": 3.8849782898262306e-05, + "loss": 1.7362785339355469, + "step": 574 + }, + { + "epoch": 0.14996413901023667, + "grad_norm": 12.8125, + "learning_rate": 3.884489966468486e-05, + "loss": 2.5838465690612793, + "step": 575 + }, + { + "epoch": 0.15022494620851534, + "grad_norm": 5.40625, + "learning_rate": 3.884000639541728e-05, + "loss": 1.9989123344421387, + "step": 576 + }, + { + "epoch": 0.15048575340679402, + "grad_norm": 5.6875, + "learning_rate": 3.883510309306541e-05, + "loss": 1.9393930435180664, + "step": 577 + }, + { + "epoch": 0.1507465606050727, + "grad_norm": 5.03125, + "learning_rate": 3.883018976024047e-05, + "loss": 1.9157646894454956, + "step": 578 + }, + { + "epoch": 0.15100736780335136, + "grad_norm": 5.40625, + "learning_rate": 3.8825266399559024e-05, + "loss": 1.9912060499191284, + "step": 579 + }, + { + "epoch": 0.15126817500163003, + "grad_norm": 5.21875, + "learning_rate": 3.8820333013642945e-05, + "loss": 1.8786393404006958, + "step": 580 + }, + { + "epoch": 0.1515289821999087, + "grad_norm": 5.3125, + "learning_rate": 3.881538960511948e-05, + "loss": 1.921290397644043, + "step": 581 + }, + { + "epoch": 0.15178978939818738, + "grad_norm": 5.09375, + "learning_rate": 3.881043617662121e-05, + "loss": 1.8739674091339111, + "step": 582 + }, + { + "epoch": 0.15205059659646605, + "grad_norm": 5.0, + "learning_rate": 3.880547273078602e-05, + "loss": 2.0182878971099854, + "step": 583 + }, + { + "epoch": 0.15231140379474473, + "grad_norm": 4.875, + "learning_rate": 3.880049927025715e-05, + "loss": 1.7300055027008057, + "step": 584 + }, + { + "epoch": 0.1525722109930234, + "grad_norm": 5.1875, + "learning_rate": 3.8795515797683194e-05, + "loss": 2.154013156890869, + "step": 585 + }, + { + "epoch": 0.15283301819130207, + "grad_norm": 4.8125, + "learning_rate": 3.8790522315718034e-05, + "loss": 2.199392318725586, + "step": 586 + }, + { + "epoch": 0.15309382538958075, + "grad_norm": 4.6875, + "learning_rate": 3.878551882702092e-05, + "loss": 1.8654108047485352, + "step": 587 + }, + { + "epoch": 0.15335463258785942, + "grad_norm": 4.5, + "learning_rate": 3.878050533425642e-05, + "loss": 2.0651044845581055, + "step": 588 + }, + { + "epoch": 0.1536154397861381, + "grad_norm": 4.65625, + "learning_rate": 3.8775481840094416e-05, + "loss": 1.9368878602981567, + "step": 589 + }, + { + "epoch": 0.15387624698441676, + "grad_norm": 5.40625, + "learning_rate": 3.8770448347210144e-05, + "loss": 1.880233645439148, + "step": 590 + }, + { + "epoch": 0.15413705418269544, + "grad_norm": 5.125, + "learning_rate": 3.8765404858284124e-05, + "loss": 2.0680623054504395, + "step": 591 + }, + { + "epoch": 0.1543978613809741, + "grad_norm": 5.78125, + "learning_rate": 3.876035137600224e-05, + "loss": 2.1295831203460693, + "step": 592 + }, + { + "epoch": 0.15465866857925278, + "grad_norm": 11.0625, + "learning_rate": 3.875528790305567e-05, + "loss": 2.007826328277588, + "step": 593 + }, + { + "epoch": 0.15491947577753146, + "grad_norm": 5.125, + "learning_rate": 3.875021444214093e-05, + "loss": 2.1482367515563965, + "step": 594 + }, + { + "epoch": 0.15518028297581013, + "grad_norm": 5.21875, + "learning_rate": 3.874513099595986e-05, + "loss": 2.0759899616241455, + "step": 595 + }, + { + "epoch": 0.1554410901740888, + "grad_norm": 5.5, + "learning_rate": 3.874003756721958e-05, + "loss": 2.073017120361328, + "step": 596 + }, + { + "epoch": 0.15570189737236748, + "grad_norm": 5.8125, + "learning_rate": 3.873493415863256e-05, + "loss": 2.0745091438293457, + "step": 597 + }, + { + "epoch": 0.15596270457064615, + "grad_norm": 4.6875, + "learning_rate": 3.872982077291659e-05, + "loss": 1.9696614742279053, + "step": 598 + }, + { + "epoch": 0.15622351176892482, + "grad_norm": 5.40625, + "learning_rate": 3.872469741279475e-05, + "loss": 2.2111566066741943, + "step": 599 + }, + { + "epoch": 0.1564843189672035, + "grad_norm": 4.625, + "learning_rate": 3.8719564080995434e-05, + "loss": 1.8812564611434937, + "step": 600 + }, + { + "epoch": 0.15674512616548217, + "grad_norm": 4.625, + "learning_rate": 3.871442078025237e-05, + "loss": 2.0892837047576904, + "step": 601 + }, + { + "epoch": 0.15700593336376084, + "grad_norm": 4.5625, + "learning_rate": 3.870926751330458e-05, + "loss": 1.8276432752609253, + "step": 602 + }, + { + "epoch": 0.15726674056203951, + "grad_norm": 5.03125, + "learning_rate": 3.870410428289637e-05, + "loss": 1.8986891508102417, + "step": 603 + }, + { + "epoch": 0.1575275477603182, + "grad_norm": 4.90625, + "learning_rate": 3.86989310917774e-05, + "loss": 2.030865430831909, + "step": 604 + }, + { + "epoch": 0.15778835495859686, + "grad_norm": 4.84375, + "learning_rate": 3.869374794270258e-05, + "loss": 1.9464844465255737, + "step": 605 + }, + { + "epoch": 0.15804916215687553, + "grad_norm": 5.09375, + "learning_rate": 3.868855483843218e-05, + "loss": 2.18831205368042, + "step": 606 + }, + { + "epoch": 0.1583099693551542, + "grad_norm": 4.46875, + "learning_rate": 3.868335178173174e-05, + "loss": 1.8682782649993896, + "step": 607 + }, + { + "epoch": 0.15857077655343288, + "grad_norm": 4.71875, + "learning_rate": 3.867813877537208e-05, + "loss": 1.9430899620056152, + "step": 608 + }, + { + "epoch": 0.15883158375171155, + "grad_norm": 4.875, + "learning_rate": 3.867291582212936e-05, + "loss": 2.0944416522979736, + "step": 609 + }, + { + "epoch": 0.15909239094999023, + "grad_norm": 4.78125, + "learning_rate": 3.866768292478502e-05, + "loss": 1.973773717880249, + "step": 610 + }, + { + "epoch": 0.1593531981482689, + "grad_norm": 5.0, + "learning_rate": 3.866244008612579e-05, + "loss": 2.190091609954834, + "step": 611 + }, + { + "epoch": 0.15961400534654757, + "grad_norm": 4.71875, + "learning_rate": 3.86571873089437e-05, + "loss": 1.9963393211364746, + "step": 612 + }, + { + "epoch": 0.15987481254482624, + "grad_norm": 5.0, + "learning_rate": 3.8651924596036066e-05, + "loss": 2.2495474815368652, + "step": 613 + }, + { + "epoch": 0.16013561974310492, + "grad_norm": 4.78125, + "learning_rate": 3.8646651950205514e-05, + "loss": 1.9934877157211304, + "step": 614 + }, + { + "epoch": 0.1603964269413836, + "grad_norm": 4.6875, + "learning_rate": 3.864136937425993e-05, + "loss": 2.1129329204559326, + "step": 615 + }, + { + "epoch": 0.16065723413966226, + "grad_norm": 4.875, + "learning_rate": 3.863607687101252e-05, + "loss": 1.91444993019104, + "step": 616 + }, + { + "epoch": 0.16091804133794094, + "grad_norm": 4.875, + "learning_rate": 3.863077444328175e-05, + "loss": 1.9214363098144531, + "step": 617 + }, + { + "epoch": 0.1611788485362196, + "grad_norm": 4.84375, + "learning_rate": 3.862546209389139e-05, + "loss": 1.8061554431915283, + "step": 618 + }, + { + "epoch": 0.16143965573449828, + "grad_norm": 4.71875, + "learning_rate": 3.862013982567048e-05, + "loss": 1.873937726020813, + "step": 619 + }, + { + "epoch": 0.16170046293277696, + "grad_norm": 5.21875, + "learning_rate": 3.861480764145335e-05, + "loss": 2.1080081462860107, + "step": 620 + }, + { + "epoch": 0.16196127013105563, + "grad_norm": 4.78125, + "learning_rate": 3.860946554407961e-05, + "loss": 2.0576019287109375, + "step": 621 + }, + { + "epoch": 0.1622220773293343, + "grad_norm": 5.1875, + "learning_rate": 3.860411353639415e-05, + "loss": 2.2772209644317627, + "step": 622 + }, + { + "epoch": 0.16248288452761298, + "grad_norm": 4.6875, + "learning_rate": 3.859875162124714e-05, + "loss": 2.103712558746338, + "step": 623 + }, + { + "epoch": 0.16274369172589165, + "grad_norm": 4.90625, + "learning_rate": 3.8593379801494015e-05, + "loss": 2.074312925338745, + "step": 624 + }, + { + "epoch": 0.1630044989241703, + "grad_norm": 5.78125, + "learning_rate": 3.858799807999549e-05, + "loss": 1.9695968627929688, + "step": 625 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 4.78125, + "learning_rate": 3.858260645961756e-05, + "loss": 1.8722575902938843, + "step": 626 + }, + { + "epoch": 0.16352611332072764, + "grad_norm": 4.65625, + "learning_rate": 3.857720494323149e-05, + "loss": 1.9032584428787231, + "step": 627 + }, + { + "epoch": 0.1637869205190063, + "grad_norm": 4.75, + "learning_rate": 3.8571793533713796e-05, + "loss": 1.9035115242004395, + "step": 628 + }, + { + "epoch": 0.16404772771728499, + "grad_norm": 5.09375, + "learning_rate": 3.856637223394629e-05, + "loss": 1.9448721408843994, + "step": 629 + }, + { + "epoch": 0.16430853491556366, + "grad_norm": 4.59375, + "learning_rate": 3.856094104681605e-05, + "loss": 1.8737752437591553, + "step": 630 + }, + { + "epoch": 0.16456934211384233, + "grad_norm": 5.375, + "learning_rate": 3.855549997521538e-05, + "loss": 1.9294793605804443, + "step": 631 + }, + { + "epoch": 0.164830149312121, + "grad_norm": 4.9375, + "learning_rate": 3.85500490220419e-05, + "loss": 1.7865889072418213, + "step": 632 + }, + { + "epoch": 0.16509095651039968, + "grad_norm": 4.71875, + "learning_rate": 3.8544588190198454e-05, + "loss": 1.6698765754699707, + "step": 633 + }, + { + "epoch": 0.16535176370867835, + "grad_norm": 5.0625, + "learning_rate": 3.8539117482593164e-05, + "loss": 2.2318742275238037, + "step": 634 + }, + { + "epoch": 0.16561257090695702, + "grad_norm": 4.75, + "learning_rate": 3.853363690213942e-05, + "loss": 1.8279545307159424, + "step": 635 + }, + { + "epoch": 0.1658733781052357, + "grad_norm": 4.84375, + "learning_rate": 3.852814645175584e-05, + "loss": 1.9168078899383545, + "step": 636 + }, + { + "epoch": 0.16613418530351437, + "grad_norm": 4.90625, + "learning_rate": 3.8522646134366336e-05, + "loss": 1.9145238399505615, + "step": 637 + }, + { + "epoch": 0.16639499250179304, + "grad_norm": 4.8125, + "learning_rate": 3.851713595290004e-05, + "loss": 1.8849806785583496, + "step": 638 + }, + { + "epoch": 0.16665579970007172, + "grad_norm": 4.53125, + "learning_rate": 3.851161591029135e-05, + "loss": 1.6500052213668823, + "step": 639 + }, + { + "epoch": 0.1669166068983504, + "grad_norm": 5.0, + "learning_rate": 3.8506086009479934e-05, + "loss": 1.8743053674697876, + "step": 640 + }, + { + "epoch": 0.16717741409662906, + "grad_norm": 4.84375, + "learning_rate": 3.850054625341068e-05, + "loss": 2.2005090713500977, + "step": 641 + }, + { + "epoch": 0.16743822129490774, + "grad_norm": 4.5625, + "learning_rate": 3.849499664503375e-05, + "loss": 1.7907366752624512, + "step": 642 + }, + { + "epoch": 0.1676990284931864, + "grad_norm": 4.4375, + "learning_rate": 3.848943718730452e-05, + "loss": 1.8465526103973389, + "step": 643 + }, + { + "epoch": 0.16795983569146508, + "grad_norm": 4.75, + "learning_rate": 3.848386788318365e-05, + "loss": 2.003734827041626, + "step": 644 + }, + { + "epoch": 0.16822064288974375, + "grad_norm": 4.78125, + "learning_rate": 3.847828873563702e-05, + "loss": 1.7648651599884033, + "step": 645 + }, + { + "epoch": 0.16848145008802243, + "grad_norm": 4.4375, + "learning_rate": 3.847269974763576e-05, + "loss": 2.020761728286743, + "step": 646 + }, + { + "epoch": 0.1687422572863011, + "grad_norm": 4.6875, + "learning_rate": 3.846710092215623e-05, + "loss": 1.9700047969818115, + "step": 647 + }, + { + "epoch": 0.16900306448457977, + "grad_norm": 4.4375, + "learning_rate": 3.846149226218003e-05, + "loss": 1.9700831174850464, + "step": 648 + }, + { + "epoch": 0.16926387168285845, + "grad_norm": 4.5, + "learning_rate": 3.845587377069403e-05, + "loss": 2.1578357219696045, + "step": 649 + }, + { + "epoch": 0.16952467888113712, + "grad_norm": 4.5625, + "learning_rate": 3.845024545069029e-05, + "loss": 2.0335230827331543, + "step": 650 + }, + { + "epoch": 0.1697854860794158, + "grad_norm": 4.34375, + "learning_rate": 3.8444607305166124e-05, + "loss": 1.9350706338882446, + "step": 651 + }, + { + "epoch": 0.17004629327769447, + "grad_norm": 4.375, + "learning_rate": 3.843895933712409e-05, + "loss": 2.063561201095581, + "step": 652 + }, + { + "epoch": 0.17030710047597314, + "grad_norm": 4.1875, + "learning_rate": 3.843330154957195e-05, + "loss": 1.7344597578048706, + "step": 653 + }, + { + "epoch": 0.1705679076742518, + "grad_norm": 4.78125, + "learning_rate": 3.8427633945522714e-05, + "loss": 2.1208086013793945, + "step": 654 + }, + { + "epoch": 0.17082871487253048, + "grad_norm": 4.5625, + "learning_rate": 3.842195652799463e-05, + "loss": 1.8748433589935303, + "step": 655 + }, + { + "epoch": 0.17108952207080916, + "grad_norm": 4.40625, + "learning_rate": 3.841626930001114e-05, + "loss": 1.809694766998291, + "step": 656 + }, + { + "epoch": 0.17135032926908783, + "grad_norm": 4.28125, + "learning_rate": 3.841057226460094e-05, + "loss": 1.9290765523910522, + "step": 657 + }, + { + "epoch": 0.1716111364673665, + "grad_norm": 4.6875, + "learning_rate": 3.840486542479793e-05, + "loss": 2.177703857421875, + "step": 658 + }, + { + "epoch": 0.17187194366564518, + "grad_norm": 4.375, + "learning_rate": 3.839914878364125e-05, + "loss": 1.9175394773483276, + "step": 659 + }, + { + "epoch": 0.17213275086392385, + "grad_norm": 6.28125, + "learning_rate": 3.8393422344175234e-05, + "loss": 2.1164231300354004, + "step": 660 + }, + { + "epoch": 0.17239355806220252, + "grad_norm": 4.6875, + "learning_rate": 3.838768610944946e-05, + "loss": 2.0701417922973633, + "step": 661 + }, + { + "epoch": 0.1726543652604812, + "grad_norm": 4.625, + "learning_rate": 3.8381940082518704e-05, + "loss": 2.0714147090911865, + "step": 662 + }, + { + "epoch": 0.17291517245875987, + "grad_norm": 4.5625, + "learning_rate": 3.8376184266442965e-05, + "loss": 1.839314341545105, + "step": 663 + }, + { + "epoch": 0.17317597965703854, + "grad_norm": 4.8125, + "learning_rate": 3.837041866428745e-05, + "loss": 2.001265048980713, + "step": 664 + }, + { + "epoch": 0.17343678685531722, + "grad_norm": 4.125, + "learning_rate": 3.83646432791226e-05, + "loss": 1.6682621240615845, + "step": 665 + }, + { + "epoch": 0.1736975940535959, + "grad_norm": 4.875, + "learning_rate": 3.835885811402402e-05, + "loss": 1.9657459259033203, + "step": 666 + }, + { + "epoch": 0.17395840125187456, + "grad_norm": 4.3125, + "learning_rate": 3.8353063172072564e-05, + "loss": 1.709079384803772, + "step": 667 + }, + { + "epoch": 0.17421920845015323, + "grad_norm": 4.46875, + "learning_rate": 3.834725845635428e-05, + "loss": 1.8939547538757324, + "step": 668 + }, + { + "epoch": 0.1744800156484319, + "grad_norm": 4.65625, + "learning_rate": 3.834144396996041e-05, + "loss": 2.047983169555664, + "step": 669 + }, + { + "epoch": 0.17474082284671058, + "grad_norm": 4.875, + "learning_rate": 3.833561971598743e-05, + "loss": 2.058068037033081, + "step": 670 + }, + { + "epoch": 0.17500163004498925, + "grad_norm": 4.53125, + "learning_rate": 3.832978569753697e-05, + "loss": 1.89093017578125, + "step": 671 + }, + { + "epoch": 0.17526243724326793, + "grad_norm": 4.875, + "learning_rate": 3.83239419177159e-05, + "loss": 1.9490283727645874, + "step": 672 + }, + { + "epoch": 0.1755232444415466, + "grad_norm": 4.6875, + "learning_rate": 3.831808837963628e-05, + "loss": 2.1818556785583496, + "step": 673 + }, + { + "epoch": 0.17578405163982527, + "grad_norm": 4.5625, + "learning_rate": 3.831222508641535e-05, + "loss": 2.1393942832946777, + "step": 674 + }, + { + "epoch": 0.17604485883810392, + "grad_norm": 4.3125, + "learning_rate": 3.830635204117557e-05, + "loss": 1.8520056009292603, + "step": 675 + }, + { + "epoch": 0.1763056660363826, + "grad_norm": 4.8125, + "learning_rate": 3.8300469247044564e-05, + "loss": 1.6711615324020386, + "step": 676 + }, + { + "epoch": 0.17656647323466126, + "grad_norm": 4.625, + "learning_rate": 3.829457670715518e-05, + "loss": 1.928394079208374, + "step": 677 + }, + { + "epoch": 0.17682728043293994, + "grad_norm": 4.5, + "learning_rate": 3.828867442464543e-05, + "loss": 1.7552798986434937, + "step": 678 + }, + { + "epoch": 0.1770880876312186, + "grad_norm": 4.875, + "learning_rate": 3.828276240265852e-05, + "loss": 2.078004837036133, + "step": 679 + }, + { + "epoch": 0.17734889482949728, + "grad_norm": 4.8125, + "learning_rate": 3.827684064434286e-05, + "loss": 2.1814630031585693, + "step": 680 + }, + { + "epoch": 0.17760970202777596, + "grad_norm": 4.78125, + "learning_rate": 3.827090915285202e-05, + "loss": 2.024282932281494, + "step": 681 + }, + { + "epoch": 0.17787050922605463, + "grad_norm": 4.46875, + "learning_rate": 3.8264967931344774e-05, + "loss": 1.7166085243225098, + "step": 682 + }, + { + "epoch": 0.1781313164243333, + "grad_norm": 4.625, + "learning_rate": 3.825901698298506e-05, + "loss": 1.7741438150405884, + "step": 683 + }, + { + "epoch": 0.17839212362261198, + "grad_norm": 4.3125, + "learning_rate": 3.8253056310942015e-05, + "loss": 1.6977009773254395, + "step": 684 + }, + { + "epoch": 0.17865293082089065, + "grad_norm": 4.40625, + "learning_rate": 3.824708591838993e-05, + "loss": 1.880244493484497, + "step": 685 + }, + { + "epoch": 0.17891373801916932, + "grad_norm": 4.625, + "learning_rate": 3.82411058085083e-05, + "loss": 2.0184810161590576, + "step": 686 + }, + { + "epoch": 0.179174545217448, + "grad_norm": 4.6875, + "learning_rate": 3.823511598448177e-05, + "loss": 1.8925732374191284, + "step": 687 + }, + { + "epoch": 0.17943535241572667, + "grad_norm": 4.625, + "learning_rate": 3.822911644950018e-05, + "loss": 1.8870201110839844, + "step": 688 + }, + { + "epoch": 0.17969615961400534, + "grad_norm": 4.78125, + "learning_rate": 3.822310720675852e-05, + "loss": 2.037837028503418, + "step": 689 + }, + { + "epoch": 0.179956966812284, + "grad_norm": 4.46875, + "learning_rate": 3.821708825945698e-05, + "loss": 1.8832042217254639, + "step": 690 + }, + { + "epoch": 0.1802177740105627, + "grad_norm": 4.8125, + "learning_rate": 3.821105961080088e-05, + "loss": 2.1521339416503906, + "step": 691 + }, + { + "epoch": 0.18047858120884136, + "grad_norm": 4.125, + "learning_rate": 3.820502126400073e-05, + "loss": 1.81393301486969, + "step": 692 + }, + { + "epoch": 0.18073938840712003, + "grad_norm": 4.28125, + "learning_rate": 3.81989732222722e-05, + "loss": 1.8241595029830933, + "step": 693 + }, + { + "epoch": 0.1810001956053987, + "grad_norm": 4.40625, + "learning_rate": 3.819291548883612e-05, + "loss": 1.8708091974258423, + "step": 694 + }, + { + "epoch": 0.18126100280367738, + "grad_norm": 4.75, + "learning_rate": 3.81868480669185e-05, + "loss": 1.8775160312652588, + "step": 695 + }, + { + "epoch": 0.18152181000195605, + "grad_norm": 4.75, + "learning_rate": 3.818077095975048e-05, + "loss": 1.9862464666366577, + "step": 696 + }, + { + "epoch": 0.18178261720023473, + "grad_norm": 4.59375, + "learning_rate": 3.817468417056836e-05, + "loss": 1.988217830657959, + "step": 697 + }, + { + "epoch": 0.1820434243985134, + "grad_norm": 4.9375, + "learning_rate": 3.816858770261363e-05, + "loss": 2.1213014125823975, + "step": 698 + }, + { + "epoch": 0.18230423159679207, + "grad_norm": 4.59375, + "learning_rate": 3.816248155913291e-05, + "loss": 1.8472071886062622, + "step": 699 + }, + { + "epoch": 0.18256503879507074, + "grad_norm": 4.15625, + "learning_rate": 3.815636574337796e-05, + "loss": 1.5301586389541626, + "step": 700 + }, + { + "epoch": 0.18282584599334942, + "grad_norm": 4.34375, + "learning_rate": 3.8150240258605714e-05, + "loss": 1.8507400751113892, + "step": 701 + }, + { + "epoch": 0.1830866531916281, + "grad_norm": 4.75, + "learning_rate": 3.8144105108078246e-05, + "loss": 1.957880973815918, + "step": 702 + }, + { + "epoch": 0.18334746038990676, + "grad_norm": 4.4375, + "learning_rate": 3.813796029506277e-05, + "loss": 2.0707597732543945, + "step": 703 + }, + { + "epoch": 0.18360826758818544, + "grad_norm": 4.3125, + "learning_rate": 3.813180582283167e-05, + "loss": 2.112607955932617, + "step": 704 + }, + { + "epoch": 0.1838690747864641, + "grad_norm": 4.375, + "learning_rate": 3.8125641694662445e-05, + "loss": 1.702808141708374, + "step": 705 + }, + { + "epoch": 0.18412988198474278, + "grad_norm": 4.1875, + "learning_rate": 3.8119467913837754e-05, + "loss": 1.906697154045105, + "step": 706 + }, + { + "epoch": 0.18439068918302146, + "grad_norm": 4.5, + "learning_rate": 3.811328448364538e-05, + "loss": 1.8337846994400024, + "step": 707 + }, + { + "epoch": 0.18465149638130013, + "grad_norm": 5.09375, + "learning_rate": 3.8107091407378275e-05, + "loss": 1.86478590965271, + "step": 708 + }, + { + "epoch": 0.1849123035795788, + "grad_norm": 4.0625, + "learning_rate": 3.81008886883345e-05, + "loss": 1.6813318729400635, + "step": 709 + }, + { + "epoch": 0.18517311077785747, + "grad_norm": 4.8125, + "learning_rate": 3.8094676329817256e-05, + "loss": 1.7286593914031982, + "step": 710 + }, + { + "epoch": 0.18543391797613615, + "grad_norm": 5.03125, + "learning_rate": 3.808845433513488e-05, + "loss": 2.1475114822387695, + "step": 711 + }, + { + "epoch": 0.18569472517441482, + "grad_norm": 4.28125, + "learning_rate": 3.8082222707600854e-05, + "loss": 1.9896444082260132, + "step": 712 + }, + { + "epoch": 0.1859555323726935, + "grad_norm": 4.46875, + "learning_rate": 3.807598145053376e-05, + "loss": 2.2578413486480713, + "step": 713 + }, + { + "epoch": 0.18621633957097217, + "grad_norm": 4.1875, + "learning_rate": 3.806973056725735e-05, + "loss": 1.6955958604812622, + "step": 714 + }, + { + "epoch": 0.18647714676925084, + "grad_norm": 4.46875, + "learning_rate": 3.8063470061100454e-05, + "loss": 1.8761972188949585, + "step": 715 + }, + { + "epoch": 0.1867379539675295, + "grad_norm": 4.375, + "learning_rate": 3.805719993539707e-05, + "loss": 1.8478339910507202, + "step": 716 + }, + { + "epoch": 0.18699876116580819, + "grad_norm": 4.34375, + "learning_rate": 3.805092019348628e-05, + "loss": 1.4803378582000732, + "step": 717 + }, + { + "epoch": 0.18725956836408686, + "grad_norm": 4.65625, + "learning_rate": 3.8044630838712326e-05, + "loss": 1.7085039615631104, + "step": 718 + }, + { + "epoch": 0.18752037556236553, + "grad_norm": 4.0625, + "learning_rate": 3.8038331874424546e-05, + "loss": 1.6832685470581055, + "step": 719 + }, + { + "epoch": 0.1877811827606442, + "grad_norm": 4.34375, + "learning_rate": 3.8032023303977384e-05, + "loss": 2.000880479812622, + "step": 720 + }, + { + "epoch": 0.18804198995892288, + "grad_norm": 4.625, + "learning_rate": 3.802570513073044e-05, + "loss": 1.9390875101089478, + "step": 721 + }, + { + "epoch": 0.18830279715720155, + "grad_norm": 4.90625, + "learning_rate": 3.801937735804838e-05, + "loss": 2.086874008178711, + "step": 722 + }, + { + "epoch": 0.18856360435548022, + "grad_norm": 4.96875, + "learning_rate": 3.801303998930103e-05, + "loss": 2.000013828277588, + "step": 723 + }, + { + "epoch": 0.1888244115537589, + "grad_norm": 4.3125, + "learning_rate": 3.800669302786328e-05, + "loss": 1.8913724422454834, + "step": 724 + }, + { + "epoch": 0.18908521875203754, + "grad_norm": 4.375, + "learning_rate": 3.800033647711515e-05, + "loss": 1.7368848323822021, + "step": 725 + }, + { + "epoch": 0.18934602595031622, + "grad_norm": 4.375, + "learning_rate": 3.7993970340441786e-05, + "loss": 1.822178840637207, + "step": 726 + }, + { + "epoch": 0.1896068331485949, + "grad_norm": 4.1875, + "learning_rate": 3.79875946212334e-05, + "loss": 1.549401044845581, + "step": 727 + }, + { + "epoch": 0.18986764034687356, + "grad_norm": 4.75, + "learning_rate": 3.798120932288534e-05, + "loss": 1.803915023803711, + "step": 728 + }, + { + "epoch": 0.19012844754515223, + "grad_norm": 4.96875, + "learning_rate": 3.797481444879803e-05, + "loss": 1.8997050523757935, + "step": 729 + }, + { + "epoch": 0.1903892547434309, + "grad_norm": 4.78125, + "learning_rate": 3.796841000237701e-05, + "loss": 1.73325777053833, + "step": 730 + }, + { + "epoch": 0.19065006194170958, + "grad_norm": 4.71875, + "learning_rate": 3.7961995987032924e-05, + "loss": 1.9174513816833496, + "step": 731 + }, + { + "epoch": 0.19091086913998825, + "grad_norm": 5.1875, + "learning_rate": 3.795557240618149e-05, + "loss": 1.9190399646759033, + "step": 732 + }, + { + "epoch": 0.19117167633826693, + "grad_norm": 4.78125, + "learning_rate": 3.794913926324353e-05, + "loss": 1.8463743925094604, + "step": 733 + }, + { + "epoch": 0.1914324835365456, + "grad_norm": 4.46875, + "learning_rate": 3.794269656164496e-05, + "loss": 1.7895328998565674, + "step": 734 + }, + { + "epoch": 0.19169329073482427, + "grad_norm": 4.21875, + "learning_rate": 3.793624430481679e-05, + "loss": 1.724631428718567, + "step": 735 + }, + { + "epoch": 0.19195409793310295, + "grad_norm": 4.59375, + "learning_rate": 3.792978249619512e-05, + "loss": 1.934809684753418, + "step": 736 + }, + { + "epoch": 0.19221490513138162, + "grad_norm": 4.25, + "learning_rate": 3.7923311139221114e-05, + "loss": 1.7765518426895142, + "step": 737 + }, + { + "epoch": 0.1924757123296603, + "grad_norm": 4.3125, + "learning_rate": 3.791683023734105e-05, + "loss": 1.8107788562774658, + "step": 738 + }, + { + "epoch": 0.19273651952793897, + "grad_norm": 4.46875, + "learning_rate": 3.7910339794006274e-05, + "loss": 1.9428932666778564, + "step": 739 + }, + { + "epoch": 0.19299732672621764, + "grad_norm": 4.46875, + "learning_rate": 3.790383981267322e-05, + "loss": 1.7820011377334595, + "step": 740 + }, + { + "epoch": 0.1932581339244963, + "grad_norm": 4.40625, + "learning_rate": 3.789733029680338e-05, + "loss": 1.9347243309020996, + "step": 741 + }, + { + "epoch": 0.19351894112277498, + "grad_norm": 4.40625, + "learning_rate": 3.789081124986337e-05, + "loss": 1.6909987926483154, + "step": 742 + }, + { + "epoch": 0.19377974832105366, + "grad_norm": 5.03125, + "learning_rate": 3.788428267532483e-05, + "loss": 1.7528774738311768, + "step": 743 + }, + { + "epoch": 0.19404055551933233, + "grad_norm": 4.59375, + "learning_rate": 3.787774457666451e-05, + "loss": 1.726656198501587, + "step": 744 + }, + { + "epoch": 0.194301362717611, + "grad_norm": 4.71875, + "learning_rate": 3.7871196957364206e-05, + "loss": 2.0491385459899902, + "step": 745 + }, + { + "epoch": 0.19456216991588968, + "grad_norm": 5.03125, + "learning_rate": 3.78646398209108e-05, + "loss": 1.9195506572723389, + "step": 746 + }, + { + "epoch": 0.19482297711416835, + "grad_norm": 4.375, + "learning_rate": 3.785807317079624e-05, + "loss": 1.758399486541748, + "step": 747 + }, + { + "epoch": 0.19508378431244702, + "grad_norm": 4.4375, + "learning_rate": 3.7851497010517554e-05, + "loss": 1.838735580444336, + "step": 748 + }, + { + "epoch": 0.1953445915107257, + "grad_norm": 4.34375, + "learning_rate": 3.78449113435768e-05, + "loss": 1.9476432800292969, + "step": 749 + }, + { + "epoch": 0.19560539870900437, + "grad_norm": 4.65625, + "learning_rate": 3.7838316173481127e-05, + "loss": 1.732071876525879, + "step": 750 + }, + { + "epoch": 0.19586620590728304, + "grad_norm": 4.21875, + "learning_rate": 3.783171150374273e-05, + "loss": 1.670494794845581, + "step": 751 + }, + { + "epoch": 0.19612701310556171, + "grad_norm": 4.53125, + "learning_rate": 3.782509733787888e-05, + "loss": 1.9444698095321655, + "step": 752 + }, + { + "epoch": 0.1963878203038404, + "grad_norm": 4.34375, + "learning_rate": 3.7818473679411886e-05, + "loss": 1.8395514488220215, + "step": 753 + }, + { + "epoch": 0.19664862750211906, + "grad_norm": 4.4375, + "learning_rate": 3.7811840531869124e-05, + "loss": 1.9476215839385986, + "step": 754 + }, + { + "epoch": 0.19690943470039773, + "grad_norm": 4.9375, + "learning_rate": 3.7805197898783015e-05, + "loss": 2.008249282836914, + "step": 755 + }, + { + "epoch": 0.1971702418986764, + "grad_norm": 4.75, + "learning_rate": 3.7798545783691055e-05, + "loss": 1.943685531616211, + "step": 756 + }, + { + "epoch": 0.19743104909695508, + "grad_norm": 4.5625, + "learning_rate": 3.7791884190135745e-05, + "loss": 1.8641271591186523, + "step": 757 + }, + { + "epoch": 0.19769185629523375, + "grad_norm": 4.59375, + "learning_rate": 3.778521312166467e-05, + "loss": 1.9630130529403687, + "step": 758 + }, + { + "epoch": 0.19795266349351243, + "grad_norm": 4.5625, + "learning_rate": 3.777853258183046e-05, + "loss": 2.0424537658691406, + "step": 759 + }, + { + "epoch": 0.1982134706917911, + "grad_norm": 4.53125, + "learning_rate": 3.7771842574190765e-05, + "loss": 2.040316104888916, + "step": 760 + }, + { + "epoch": 0.19847427789006977, + "grad_norm": 4.75, + "learning_rate": 3.7765143102308305e-05, + "loss": 2.004734754562378, + "step": 761 + }, + { + "epoch": 0.19873508508834845, + "grad_norm": 4.53125, + "learning_rate": 3.775843416975082e-05, + "loss": 1.8797520399093628, + "step": 762 + }, + { + "epoch": 0.19899589228662712, + "grad_norm": 4.40625, + "learning_rate": 3.775171578009109e-05, + "loss": 1.61118483543396, + "step": 763 + }, + { + "epoch": 0.1992566994849058, + "grad_norm": 4.375, + "learning_rate": 3.7744987936906934e-05, + "loss": 1.8276541233062744, + "step": 764 + }, + { + "epoch": 0.19951750668318446, + "grad_norm": 4.4375, + "learning_rate": 3.773825064378122e-05, + "loss": 1.9090209007263184, + "step": 765 + }, + { + "epoch": 0.19977831388146314, + "grad_norm": 4.59375, + "learning_rate": 3.773150390430183e-05, + "loss": 1.7013720273971558, + "step": 766 + }, + { + "epoch": 0.2000391210797418, + "grad_norm": 5.125, + "learning_rate": 3.7724747722061676e-05, + "loss": 1.9808199405670166, + "step": 767 + }, + { + "epoch": 0.20029992827802048, + "grad_norm": 4.34375, + "learning_rate": 3.771798210065871e-05, + "loss": 2.031911611557007, + "step": 768 + }, + { + "epoch": 0.20056073547629916, + "grad_norm": 4.53125, + "learning_rate": 3.7711207043695914e-05, + "loss": 2.065260410308838, + "step": 769 + }, + { + "epoch": 0.20082154267457783, + "grad_norm": 4.5, + "learning_rate": 3.770442255478128e-05, + "loss": 1.8915151357650757, + "step": 770 + }, + { + "epoch": 0.2010823498728565, + "grad_norm": 4.78125, + "learning_rate": 3.769762863752782e-05, + "loss": 1.715123176574707, + "step": 771 + }, + { + "epoch": 0.20134315707113518, + "grad_norm": 4.59375, + "learning_rate": 3.769082529555359e-05, + "loss": 1.732142686843872, + "step": 772 + }, + { + "epoch": 0.20160396426941385, + "grad_norm": 5.53125, + "learning_rate": 3.768401253248165e-05, + "loss": 1.9372129440307617, + "step": 773 + }, + { + "epoch": 0.20186477146769252, + "grad_norm": 4.71875, + "learning_rate": 3.767719035194007e-05, + "loss": 1.8738274574279785, + "step": 774 + }, + { + "epoch": 0.20212557866597117, + "grad_norm": 4.625, + "learning_rate": 3.767035875756195e-05, + "loss": 1.9400805234909058, + "step": 775 + }, + { + "epoch": 0.20238638586424984, + "grad_norm": 4.96875, + "learning_rate": 3.76635177529854e-05, + "loss": 1.754683256149292, + "step": 776 + }, + { + "epoch": 0.2026471930625285, + "grad_norm": 4.3125, + "learning_rate": 3.765666734185353e-05, + "loss": 2.086247682571411, + "step": 777 + }, + { + "epoch": 0.2029080002608072, + "grad_norm": 4.84375, + "learning_rate": 3.764980752781448e-05, + "loss": 1.793013572692871, + "step": 778 + }, + { + "epoch": 0.20316880745908586, + "grad_norm": 4.90625, + "learning_rate": 3.7642938314521374e-05, + "loss": 1.8967558145523071, + "step": 779 + }, + { + "epoch": 0.20342961465736453, + "grad_norm": 4.46875, + "learning_rate": 3.7636059705632355e-05, + "loss": 2.0488367080688477, + "step": 780 + }, + { + "epoch": 0.2036904218556432, + "grad_norm": 5.28125, + "learning_rate": 3.762917170481057e-05, + "loss": 1.8737905025482178, + "step": 781 + }, + { + "epoch": 0.20395122905392188, + "grad_norm": 4.375, + "learning_rate": 3.762227431572417e-05, + "loss": 1.8983911275863647, + "step": 782 + }, + { + "epoch": 0.20421203625220055, + "grad_norm": 4.75, + "learning_rate": 3.761536754204628e-05, + "loss": 1.9074043035507202, + "step": 783 + }, + { + "epoch": 0.20447284345047922, + "grad_norm": 4.34375, + "learning_rate": 3.7608451387455066e-05, + "loss": 1.863377332687378, + "step": 784 + }, + { + "epoch": 0.2047336506487579, + "grad_norm": 4.75, + "learning_rate": 3.760152585563367e-05, + "loss": 1.7177624702453613, + "step": 785 + }, + { + "epoch": 0.20499445784703657, + "grad_norm": 4.28125, + "learning_rate": 3.75945909502702e-05, + "loss": 1.6665986776351929, + "step": 786 + }, + { + "epoch": 0.20525526504531524, + "grad_norm": 4.59375, + "learning_rate": 3.75876466750578e-05, + "loss": 1.8127803802490234, + "step": 787 + }, + { + "epoch": 0.20551607224359392, + "grad_norm": 4.84375, + "learning_rate": 3.7580693033694576e-05, + "loss": 2.0583481788635254, + "step": 788 + }, + { + "epoch": 0.2057768794418726, + "grad_norm": 4.65625, + "learning_rate": 3.757373002988363e-05, + "loss": 2.0759923458099365, + "step": 789 + }, + { + "epoch": 0.20603768664015126, + "grad_norm": 4.53125, + "learning_rate": 3.756675766733306e-05, + "loss": 1.8449110984802246, + "step": 790 + }, + { + "epoch": 0.20629849383842994, + "grad_norm": 4.4375, + "learning_rate": 3.755977594975593e-05, + "loss": 1.890228271484375, + "step": 791 + }, + { + "epoch": 0.2065593010367086, + "grad_norm": 4.40625, + "learning_rate": 3.7552784880870294e-05, + "loss": 1.9156498908996582, + "step": 792 + }, + { + "epoch": 0.20682010823498728, + "grad_norm": 4.40625, + "learning_rate": 3.754578446439919e-05, + "loss": 1.7611085176467896, + "step": 793 + }, + { + "epoch": 0.20708091543326596, + "grad_norm": 4.3125, + "learning_rate": 3.753877470407062e-05, + "loss": 1.793421983718872, + "step": 794 + }, + { + "epoch": 0.20734172263154463, + "grad_norm": 4.40625, + "learning_rate": 3.753175560361758e-05, + "loss": 2.0085411071777344, + "step": 795 + }, + { + "epoch": 0.2076025298298233, + "grad_norm": 4.5625, + "learning_rate": 3.752472716677803e-05, + "loss": 1.8485441207885742, + "step": 796 + }, + { + "epoch": 0.20786333702810197, + "grad_norm": 4.53125, + "learning_rate": 3.7517689397294914e-05, + "loss": 1.9082846641540527, + "step": 797 + }, + { + "epoch": 0.20812414422638065, + "grad_norm": 4.15625, + "learning_rate": 3.751064229891612e-05, + "loss": 1.8202950954437256, + "step": 798 + }, + { + "epoch": 0.20838495142465932, + "grad_norm": 4.71875, + "learning_rate": 3.750358587539452e-05, + "loss": 1.9479665756225586, + "step": 799 + }, + { + "epoch": 0.208645758622938, + "grad_norm": 4.6875, + "learning_rate": 3.749652013048797e-05, + "loss": 2.0446767807006836, + "step": 800 + }, + { + "epoch": 0.20890656582121667, + "grad_norm": 4.5625, + "learning_rate": 3.748944506795926e-05, + "loss": 1.5322918891906738, + "step": 801 + }, + { + "epoch": 0.20916737301949534, + "grad_norm": 4.8125, + "learning_rate": 3.7482360691576146e-05, + "loss": 1.6456043720245361, + "step": 802 + }, + { + "epoch": 0.209428180217774, + "grad_norm": 4.09375, + "learning_rate": 3.747526700511137e-05, + "loss": 1.7690880298614502, + "step": 803 + }, + { + "epoch": 0.20968898741605269, + "grad_norm": 4.15625, + "learning_rate": 3.74681640123426e-05, + "loss": 1.7727999687194824, + "step": 804 + }, + { + "epoch": 0.20994979461433136, + "grad_norm": 5.75, + "learning_rate": 3.7461051717052474e-05, + "loss": 2.2625675201416016, + "step": 805 + }, + { + "epoch": 0.21021060181261003, + "grad_norm": 4.65625, + "learning_rate": 3.7453930123028594e-05, + "loss": 1.7446084022521973, + "step": 806 + }, + { + "epoch": 0.2104714090108887, + "grad_norm": 4.625, + "learning_rate": 3.744679923406351e-05, + "loss": 1.9689126014709473, + "step": 807 + }, + { + "epoch": 0.21073221620916738, + "grad_norm": 4.1875, + "learning_rate": 3.7439659053954685e-05, + "loss": 1.6769580841064453, + "step": 808 + }, + { + "epoch": 0.21099302340744605, + "grad_norm": 4.0625, + "learning_rate": 3.743250958650459e-05, + "loss": 1.8509106636047363, + "step": 809 + }, + { + "epoch": 0.21125383060572472, + "grad_norm": 4.28125, + "learning_rate": 3.74253508355206e-05, + "loss": 1.8638005256652832, + "step": 810 + }, + { + "epoch": 0.2115146378040034, + "grad_norm": 4.375, + "learning_rate": 3.7418182804815054e-05, + "loss": 1.8554314374923706, + "step": 811 + }, + { + "epoch": 0.21177544500228207, + "grad_norm": 4.5, + "learning_rate": 3.741100549820522e-05, + "loss": 1.6938271522521973, + "step": 812 + }, + { + "epoch": 0.21203625220056074, + "grad_norm": 4.5625, + "learning_rate": 3.740381891951332e-05, + "loss": 2.0607852935791016, + "step": 813 + }, + { + "epoch": 0.21229705939883942, + "grad_norm": 4.5625, + "learning_rate": 3.739662307256649e-05, + "loss": 2.0525259971618652, + "step": 814 + }, + { + "epoch": 0.2125578665971181, + "grad_norm": 4.40625, + "learning_rate": 3.738941796119682e-05, + "loss": 1.8823179006576538, + "step": 815 + }, + { + "epoch": 0.21281867379539676, + "grad_norm": 4.65625, + "learning_rate": 3.738220358924134e-05, + "loss": 1.9729381799697876, + "step": 816 + }, + { + "epoch": 0.21307948099367544, + "grad_norm": 4.40625, + "learning_rate": 3.7374979960542e-05, + "loss": 1.991550087928772, + "step": 817 + }, + { + "epoch": 0.2133402881919541, + "grad_norm": 4.09375, + "learning_rate": 3.736774707894568e-05, + "loss": 1.6832308769226074, + "step": 818 + }, + { + "epoch": 0.21360109539023278, + "grad_norm": 4.40625, + "learning_rate": 3.736050494830417e-05, + "loss": 1.9713823795318604, + "step": 819 + }, + { + "epoch": 0.21386190258851145, + "grad_norm": 4.5625, + "learning_rate": 3.735325357247424e-05, + "loss": 1.791365623474121, + "step": 820 + }, + { + "epoch": 0.21412270978679013, + "grad_norm": 4.46875, + "learning_rate": 3.7345992955317534e-05, + "loss": 1.7550973892211914, + "step": 821 + }, + { + "epoch": 0.2143835169850688, + "grad_norm": 4.625, + "learning_rate": 3.7338723100700615e-05, + "loss": 1.830931305885315, + "step": 822 + }, + { + "epoch": 0.21464432418334747, + "grad_norm": 4.625, + "learning_rate": 3.733144401249501e-05, + "loss": 2.0567502975463867, + "step": 823 + }, + { + "epoch": 0.21490513138162615, + "grad_norm": 4.28125, + "learning_rate": 3.732415569457711e-05, + "loss": 1.8143775463104248, + "step": 824 + }, + { + "epoch": 0.2151659385799048, + "grad_norm": 4.40625, + "learning_rate": 3.731685815082826e-05, + "loss": 1.614667534828186, + "step": 825 + }, + { + "epoch": 0.21542674577818346, + "grad_norm": 4.96875, + "learning_rate": 3.73095513851347e-05, + "loss": 1.9580087661743164, + "step": 826 + }, + { + "epoch": 0.21568755297646214, + "grad_norm": 4.25, + "learning_rate": 3.730223540138759e-05, + "loss": 1.9181026220321655, + "step": 827 + }, + { + "epoch": 0.2159483601747408, + "grad_norm": 4.3125, + "learning_rate": 3.7294910203482984e-05, + "loss": 1.6878881454467773, + "step": 828 + }, + { + "epoch": 0.21620916737301948, + "grad_norm": 4.5625, + "learning_rate": 3.728757579532187e-05, + "loss": 2.0224173069000244, + "step": 829 + }, + { + "epoch": 0.21646997457129816, + "grad_norm": 4.15625, + "learning_rate": 3.728023218081011e-05, + "loss": 1.8751661777496338, + "step": 830 + }, + { + "epoch": 0.21673078176957683, + "grad_norm": 4.125, + "learning_rate": 3.727287936385849e-05, + "loss": 1.7184031009674072, + "step": 831 + }, + { + "epoch": 0.2169915889678555, + "grad_norm": 4.5625, + "learning_rate": 3.7265517348382683e-05, + "loss": 1.8709547519683838, + "step": 832 + }, + { + "epoch": 0.21725239616613418, + "grad_norm": 4.59375, + "learning_rate": 3.7258146138303276e-05, + "loss": 2.057793617248535, + "step": 833 + }, + { + "epoch": 0.21751320336441285, + "grad_norm": 4.375, + "learning_rate": 3.725076573754574e-05, + "loss": 1.785442590713501, + "step": 834 + }, + { + "epoch": 0.21777401056269152, + "grad_norm": 4.03125, + "learning_rate": 3.724337615004045e-05, + "loss": 1.485368013381958, + "step": 835 + }, + { + "epoch": 0.2180348177609702, + "grad_norm": 4.4375, + "learning_rate": 3.7235977379722666e-05, + "loss": 1.9761755466461182, + "step": 836 + }, + { + "epoch": 0.21829562495924887, + "grad_norm": 4.4375, + "learning_rate": 3.722856943053253e-05, + "loss": 1.9895318746566772, + "step": 837 + }, + { + "epoch": 0.21855643215752754, + "grad_norm": 4.53125, + "learning_rate": 3.722115230641509e-05, + "loss": 1.6570758819580078, + "step": 838 + }, + { + "epoch": 0.21881723935580621, + "grad_norm": 4.6875, + "learning_rate": 3.721372601132027e-05, + "loss": 2.0249204635620117, + "step": 839 + }, + { + "epoch": 0.2190780465540849, + "grad_norm": 4.4375, + "learning_rate": 3.720629054920287e-05, + "loss": 1.8241016864776611, + "step": 840 + }, + { + "epoch": 0.21933885375236356, + "grad_norm": 4.125, + "learning_rate": 3.71988459240226e-05, + "loss": 1.8797310590744019, + "step": 841 + }, + { + "epoch": 0.21959966095064223, + "grad_norm": 4.90625, + "learning_rate": 3.719139213974403e-05, + "loss": 1.8294291496276855, + "step": 842 + }, + { + "epoch": 0.2198604681489209, + "grad_norm": 4.1875, + "learning_rate": 3.718392920033659e-05, + "loss": 1.7440972328186035, + "step": 843 + }, + { + "epoch": 0.22012127534719958, + "grad_norm": 4.21875, + "learning_rate": 3.7176457109774624e-05, + "loss": 1.7216081619262695, + "step": 844 + }, + { + "epoch": 0.22038208254547825, + "grad_norm": 4.5625, + "learning_rate": 3.716897587203733e-05, + "loss": 1.734525442123413, + "step": 845 + }, + { + "epoch": 0.22064288974375693, + "grad_norm": 4.25, + "learning_rate": 3.716148549110876e-05, + "loss": 1.861636757850647, + "step": 846 + }, + { + "epoch": 0.2209036969420356, + "grad_norm": 4.375, + "learning_rate": 3.7153985970977865e-05, + "loss": 1.9185471534729004, + "step": 847 + }, + { + "epoch": 0.22116450414031427, + "grad_norm": 4.25, + "learning_rate": 3.714647731563845e-05, + "loss": 1.6556049585342407, + "step": 848 + }, + { + "epoch": 0.22142531133859295, + "grad_norm": 5.0, + "learning_rate": 3.7138959529089175e-05, + "loss": 1.8159377574920654, + "step": 849 + }, + { + "epoch": 0.22168611853687162, + "grad_norm": 4.375, + "learning_rate": 3.713143261533359e-05, + "loss": 2.021803140640259, + "step": 850 + }, + { + "epoch": 0.2219469257351503, + "grad_norm": 4.46875, + "learning_rate": 3.712389657838007e-05, + "loss": 1.7354223728179932, + "step": 851 + }, + { + "epoch": 0.22220773293342896, + "grad_norm": 5.0, + "learning_rate": 3.7116351422241894e-05, + "loss": 2.040472984313965, + "step": 852 + }, + { + "epoch": 0.22246854013170764, + "grad_norm": 4.15625, + "learning_rate": 3.7108797150937136e-05, + "loss": 1.7748935222625732, + "step": 853 + }, + { + "epoch": 0.2227293473299863, + "grad_norm": 4.3125, + "learning_rate": 3.710123376848878e-05, + "loss": 1.8850452899932861, + "step": 854 + }, + { + "epoch": 0.22299015452826498, + "grad_norm": 4.1875, + "learning_rate": 3.709366127892464e-05, + "loss": 1.8661198616027832, + "step": 855 + }, + { + "epoch": 0.22325096172654366, + "grad_norm": 4.09375, + "learning_rate": 3.7086079686277376e-05, + "loss": 1.8567036390304565, + "step": 856 + }, + { + "epoch": 0.22351176892482233, + "grad_norm": 4.71875, + "learning_rate": 3.7078488994584496e-05, + "loss": 1.8610680103302002, + "step": 857 + }, + { + "epoch": 0.223772576123101, + "grad_norm": 4.6875, + "learning_rate": 3.7070889207888375e-05, + "loss": 1.8955552577972412, + "step": 858 + }, + { + "epoch": 0.22403338332137968, + "grad_norm": 4.65625, + "learning_rate": 3.706328033023619e-05, + "loss": 1.9822683334350586, + "step": 859 + }, + { + "epoch": 0.22429419051965835, + "grad_norm": 4.34375, + "learning_rate": 3.7055662365679994e-05, + "loss": 1.6399036645889282, + "step": 860 + }, + { + "epoch": 0.22455499771793702, + "grad_norm": 4.34375, + "learning_rate": 3.704803531827668e-05, + "loss": 1.6619651317596436, + "step": 861 + }, + { + "epoch": 0.2248158049162157, + "grad_norm": 4.53125, + "learning_rate": 3.704039919208795e-05, + "loss": 1.5787312984466553, + "step": 862 + }, + { + "epoch": 0.22507661211449437, + "grad_norm": 4.09375, + "learning_rate": 3.703275399118037e-05, + "loss": 1.7187163829803467, + "step": 863 + }, + { + "epoch": 0.22533741931277304, + "grad_norm": 4.75, + "learning_rate": 3.7025099719625324e-05, + "loss": 2.1511130332946777, + "step": 864 + }, + { + "epoch": 0.2255982265110517, + "grad_norm": 4.40625, + "learning_rate": 3.7017436381499026e-05, + "loss": 1.8955907821655273, + "step": 865 + }, + { + "epoch": 0.2258590337093304, + "grad_norm": 4.9375, + "learning_rate": 3.700976398088252e-05, + "loss": 1.884652853012085, + "step": 866 + }, + { + "epoch": 0.22611984090760906, + "grad_norm": 4.625, + "learning_rate": 3.7002082521861675e-05, + "loss": 1.8265893459320068, + "step": 867 + }, + { + "epoch": 0.22638064810588773, + "grad_norm": 4.1875, + "learning_rate": 3.699439200852719e-05, + "loss": 1.5954231023788452, + "step": 868 + }, + { + "epoch": 0.2266414553041664, + "grad_norm": 4.625, + "learning_rate": 3.69866924449746e-05, + "loss": 1.8270741701126099, + "step": 869 + }, + { + "epoch": 0.22690226250244508, + "grad_norm": 4.28125, + "learning_rate": 3.6978983835304204e-05, + "loss": 1.938004732131958, + "step": 870 + }, + { + "epoch": 0.22716306970072375, + "grad_norm": 4.1875, + "learning_rate": 3.697126618362119e-05, + "loss": 1.695405125617981, + "step": 871 + }, + { + "epoch": 0.22742387689900243, + "grad_norm": 4.96875, + "learning_rate": 3.69635394940355e-05, + "loss": 1.9706010818481445, + "step": 872 + }, + { + "epoch": 0.2276846840972811, + "grad_norm": 4.4375, + "learning_rate": 3.695580377066194e-05, + "loss": 1.896984577178955, + "step": 873 + }, + { + "epoch": 0.22794549129555977, + "grad_norm": 4.78125, + "learning_rate": 3.6948059017620095e-05, + "loss": 1.9504402875900269, + "step": 874 + }, + { + "epoch": 0.22820629849383842, + "grad_norm": 4.4375, + "learning_rate": 3.694030523903436e-05, + "loss": 1.9126397371292114, + "step": 875 + }, + { + "epoch": 0.2284671056921171, + "grad_norm": 4.4375, + "learning_rate": 3.6932542439033955e-05, + "loss": 1.790719747543335, + "step": 876 + }, + { + "epoch": 0.22872791289039576, + "grad_norm": 4.4375, + "learning_rate": 3.692477062175289e-05, + "loss": 1.9425745010375977, + "step": 877 + }, + { + "epoch": 0.22898872008867444, + "grad_norm": 4.5, + "learning_rate": 3.691698979132996e-05, + "loss": 1.9001245498657227, + "step": 878 + }, + { + "epoch": 0.2292495272869531, + "grad_norm": 4.34375, + "learning_rate": 3.690919995190881e-05, + "loss": 1.8583731651306152, + "step": 879 + }, + { + "epoch": 0.22951033448523178, + "grad_norm": 4.15625, + "learning_rate": 3.690140110763784e-05, + "loss": 1.8205029964447021, + "step": 880 + }, + { + "epoch": 0.22977114168351045, + "grad_norm": 4.5625, + "learning_rate": 3.6893593262670246e-05, + "loss": 1.9404447078704834, + "step": 881 + }, + { + "epoch": 0.23003194888178913, + "grad_norm": 3.96875, + "learning_rate": 3.688577642116405e-05, + "loss": 1.7789771556854248, + "step": 882 + }, + { + "epoch": 0.2302927560800678, + "grad_norm": 4.40625, + "learning_rate": 3.6877950587282025e-05, + "loss": 2.017627716064453, + "step": 883 + }, + { + "epoch": 0.23055356327834647, + "grad_norm": 4.34375, + "learning_rate": 3.687011576519177e-05, + "loss": 1.93501615524292, + "step": 884 + }, + { + "epoch": 0.23081437047662515, + "grad_norm": 4.6875, + "learning_rate": 3.686227195906564e-05, + "loss": 1.960984706878662, + "step": 885 + }, + { + "epoch": 0.23107517767490382, + "grad_norm": 4.03125, + "learning_rate": 3.6854419173080784e-05, + "loss": 1.81727933883667, + "step": 886 + }, + { + "epoch": 0.2313359848731825, + "grad_norm": 4.65625, + "learning_rate": 3.6846557411419145e-05, + "loss": 1.7302303314208984, + "step": 887 + }, + { + "epoch": 0.23159679207146117, + "grad_norm": 4.28125, + "learning_rate": 3.683868667826744e-05, + "loss": 1.8151025772094727, + "step": 888 + }, + { + "epoch": 0.23185759926973984, + "grad_norm": 4.625, + "learning_rate": 3.683080697781715e-05, + "loss": 1.9974563121795654, + "step": 889 + }, + { + "epoch": 0.2321184064680185, + "grad_norm": 4.0625, + "learning_rate": 3.682291831426454e-05, + "loss": 1.8311131000518799, + "step": 890 + }, + { + "epoch": 0.23237921366629719, + "grad_norm": 4.40625, + "learning_rate": 3.6815020691810664e-05, + "loss": 1.8545184135437012, + "step": 891 + }, + { + "epoch": 0.23264002086457586, + "grad_norm": 5.3125, + "learning_rate": 3.680711411466133e-05, + "loss": 1.6972341537475586, + "step": 892 + }, + { + "epoch": 0.23290082806285453, + "grad_norm": 4.90625, + "learning_rate": 3.679919858702711e-05, + "loss": 2.1778340339660645, + "step": 893 + }, + { + "epoch": 0.2331616352611332, + "grad_norm": 4.625, + "learning_rate": 3.679127411312336e-05, + "loss": 2.0662150382995605, + "step": 894 + }, + { + "epoch": 0.23342244245941188, + "grad_norm": 4.0625, + "learning_rate": 3.678334069717018e-05, + "loss": 1.7575409412384033, + "step": 895 + }, + { + "epoch": 0.23368324965769055, + "grad_norm": 4.96875, + "learning_rate": 3.6775398343392444e-05, + "loss": 2.143287420272827, + "step": 896 + }, + { + "epoch": 0.23394405685596922, + "grad_norm": 4.09375, + "learning_rate": 3.67674470560198e-05, + "loss": 1.69294011592865, + "step": 897 + }, + { + "epoch": 0.2342048640542479, + "grad_norm": 4.25, + "learning_rate": 3.675948683928662e-05, + "loss": 1.6591373682022095, + "step": 898 + }, + { + "epoch": 0.23446567125252657, + "grad_norm": 6.625, + "learning_rate": 3.675151769743206e-05, + "loss": 2.171626329421997, + "step": 899 + }, + { + "epoch": 0.23472647845080524, + "grad_norm": 4.34375, + "learning_rate": 3.674353963470001e-05, + "loss": 1.7921541929244995, + "step": 900 + }, + { + "epoch": 0.23498728564908392, + "grad_norm": 4.78125, + "learning_rate": 3.673555265533913e-05, + "loss": 1.960420846939087, + "step": 901 + }, + { + "epoch": 0.2352480928473626, + "grad_norm": 4.75, + "learning_rate": 3.672755676360281e-05, + "loss": 2.0104479789733887, + "step": 902 + }, + { + "epoch": 0.23550890004564126, + "grad_norm": 4.6875, + "learning_rate": 3.671955196374919e-05, + "loss": 2.0661163330078125, + "step": 903 + }, + { + "epoch": 0.23576970724391993, + "grad_norm": 4.375, + "learning_rate": 3.671153826004116e-05, + "loss": 2.0342602729797363, + "step": 904 + }, + { + "epoch": 0.2360305144421986, + "grad_norm": 4.3125, + "learning_rate": 3.6703515656746365e-05, + "loss": 1.9294260740280151, + "step": 905 + }, + { + "epoch": 0.23629132164047728, + "grad_norm": 3.953125, + "learning_rate": 3.669548415813715e-05, + "loss": 1.389187216758728, + "step": 906 + }, + { + "epoch": 0.23655212883875595, + "grad_norm": 4.125, + "learning_rate": 3.668744376849064e-05, + "loss": 1.5896613597869873, + "step": 907 + }, + { + "epoch": 0.23681293603703463, + "grad_norm": 4.09375, + "learning_rate": 3.6679394492088666e-05, + "loss": 1.739761233329773, + "step": 908 + }, + { + "epoch": 0.2370737432353133, + "grad_norm": 4.125, + "learning_rate": 3.66713363332178e-05, + "loss": 1.8526043891906738, + "step": 909 + }, + { + "epoch": 0.23733455043359197, + "grad_norm": 4.5625, + "learning_rate": 3.666326929616935e-05, + "loss": 1.7840735912322998, + "step": 910 + }, + { + "epoch": 0.23759535763187065, + "grad_norm": 4.53125, + "learning_rate": 3.665519338523935e-05, + "loss": 1.9899275302886963, + "step": 911 + }, + { + "epoch": 0.23785616483014932, + "grad_norm": 4.5, + "learning_rate": 3.6647108604728546e-05, + "loss": 1.8728816509246826, + "step": 912 + }, + { + "epoch": 0.238116972028428, + "grad_norm": 4.09375, + "learning_rate": 3.6639014958942436e-05, + "loss": 1.7578339576721191, + "step": 913 + }, + { + "epoch": 0.23837777922670667, + "grad_norm": 4.15625, + "learning_rate": 3.66309124521912e-05, + "loss": 1.9648104906082153, + "step": 914 + }, + { + "epoch": 0.23863858642498534, + "grad_norm": 4.25, + "learning_rate": 3.662280108878978e-05, + "loss": 1.7992619276046753, + "step": 915 + }, + { + "epoch": 0.238899393623264, + "grad_norm": 10.1875, + "learning_rate": 3.6614680873057796e-05, + "loss": 2.0010461807250977, + "step": 916 + }, + { + "epoch": 0.23916020082154268, + "grad_norm": 4.625, + "learning_rate": 3.6606551809319614e-05, + "loss": 1.8452863693237305, + "step": 917 + }, + { + "epoch": 0.23942100801982136, + "grad_norm": 4.375, + "learning_rate": 3.659841390190429e-05, + "loss": 1.7582532167434692, + "step": 918 + }, + { + "epoch": 0.23968181521810003, + "grad_norm": 4.53125, + "learning_rate": 3.65902671551456e-05, + "loss": 1.7853057384490967, + "step": 919 + }, + { + "epoch": 0.2399426224163787, + "grad_norm": 4.5625, + "learning_rate": 3.658211157338202e-05, + "loss": 1.660017490386963, + "step": 920 + }, + { + "epoch": 0.24020342961465738, + "grad_norm": 4.625, + "learning_rate": 3.657394716095673e-05, + "loss": 1.7863976955413818, + "step": 921 + }, + { + "epoch": 0.24046423681293605, + "grad_norm": 4.21875, + "learning_rate": 3.656577392221763e-05, + "loss": 1.577195405960083, + "step": 922 + }, + { + "epoch": 0.24072504401121472, + "grad_norm": 4.34375, + "learning_rate": 3.655759186151731e-05, + "loss": 1.7188184261322021, + "step": 923 + }, + { + "epoch": 0.2409858512094934, + "grad_norm": 4.0625, + "learning_rate": 3.654940098321305e-05, + "loss": 1.5993618965148926, + "step": 924 + }, + { + "epoch": 0.24124665840777204, + "grad_norm": 4.34375, + "learning_rate": 3.654120129166682e-05, + "loss": 1.9271817207336426, + "step": 925 + }, + { + "epoch": 0.24150746560605071, + "grad_norm": 4.09375, + "learning_rate": 3.653299279124532e-05, + "loss": 1.5634753704071045, + "step": 926 + }, + { + "epoch": 0.2417682728043294, + "grad_norm": 4.15625, + "learning_rate": 3.65247754863199e-05, + "loss": 1.7926197052001953, + "step": 927 + }, + { + "epoch": 0.24202908000260806, + "grad_norm": 4.1875, + "learning_rate": 3.651654938126662e-05, + "loss": 2.1989662647247314, + "step": 928 + }, + { + "epoch": 0.24228988720088673, + "grad_norm": 4.21875, + "learning_rate": 3.650831448046623e-05, + "loss": 1.8017569780349731, + "step": 929 + }, + { + "epoch": 0.2425506943991654, + "grad_norm": 4.1875, + "learning_rate": 3.650007078830414e-05, + "loss": 1.9014790058135986, + "step": 930 + }, + { + "epoch": 0.24281150159744408, + "grad_norm": 4.25, + "learning_rate": 3.649181830917046e-05, + "loss": 1.6705433130264282, + "step": 931 + }, + { + "epoch": 0.24307230879572275, + "grad_norm": 4.65625, + "learning_rate": 3.6483557047459994e-05, + "loss": 1.9792215824127197, + "step": 932 + }, + { + "epoch": 0.24333311599400143, + "grad_norm": 3.921875, + "learning_rate": 3.6475287007572194e-05, + "loss": 1.7080910205841064, + "step": 933 + }, + { + "epoch": 0.2435939231922801, + "grad_norm": 4.46875, + "learning_rate": 3.6467008193911195e-05, + "loss": 2.0196614265441895, + "step": 934 + }, + { + "epoch": 0.24385473039055877, + "grad_norm": 4.0625, + "learning_rate": 3.645872061088581e-05, + "loss": 1.619052529335022, + "step": 935 + }, + { + "epoch": 0.24411553758883744, + "grad_norm": 4.28125, + "learning_rate": 3.645042426290954e-05, + "loss": 1.7661082744598389, + "step": 936 + }, + { + "epoch": 0.24437634478711612, + "grad_norm": 4.0625, + "learning_rate": 3.6442119154400506e-05, + "loss": 1.5300219058990479, + "step": 937 + }, + { + "epoch": 0.2446371519853948, + "grad_norm": 4.25, + "learning_rate": 3.6433805289781535e-05, + "loss": 1.6125133037567139, + "step": 938 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 4.75, + "learning_rate": 3.6425482673480114e-05, + "loss": 1.6887743473052979, + "step": 939 + }, + { + "epoch": 0.24515876638195214, + "grad_norm": 4.40625, + "learning_rate": 3.641715130992836e-05, + "loss": 1.908347249031067, + "step": 940 + }, + { + "epoch": 0.2454195735802308, + "grad_norm": 4.25, + "learning_rate": 3.6408811203563084e-05, + "loss": 2.0740489959716797, + "step": 941 + }, + { + "epoch": 0.24568038077850948, + "grad_norm": 4.28125, + "learning_rate": 3.640046235882574e-05, + "loss": 1.7063522338867188, + "step": 942 + }, + { + "epoch": 0.24594118797678816, + "grad_norm": 4.3125, + "learning_rate": 3.6392104780162425e-05, + "loss": 1.7976257801055908, + "step": 943 + }, + { + "epoch": 0.24620199517506683, + "grad_norm": 4.28125, + "learning_rate": 3.63837384720239e-05, + "loss": 1.89145827293396, + "step": 944 + }, + { + "epoch": 0.2464628023733455, + "grad_norm": 4.125, + "learning_rate": 3.6375363438865574e-05, + "loss": 1.5576190948486328, + "step": 945 + }, + { + "epoch": 0.24672360957162418, + "grad_norm": 4.46875, + "learning_rate": 3.63669796851475e-05, + "loss": 1.8441598415374756, + "step": 946 + }, + { + "epoch": 0.24698441676990285, + "grad_norm": 4.125, + "learning_rate": 3.6358587215334355e-05, + "loss": 1.5619696378707886, + "step": 947 + }, + { + "epoch": 0.24724522396818152, + "grad_norm": 4.40625, + "learning_rate": 3.6350186033895505e-05, + "loss": 1.5821995735168457, + "step": 948 + }, + { + "epoch": 0.2475060311664602, + "grad_norm": 4.0, + "learning_rate": 3.634177614530491e-05, + "loss": 1.66362464427948, + "step": 949 + }, + { + "epoch": 0.24776683836473887, + "grad_norm": 4.15625, + "learning_rate": 3.633335755404119e-05, + "loss": 1.8080108165740967, + "step": 950 + }, + { + "epoch": 0.24802764556301754, + "grad_norm": 4.53125, + "learning_rate": 3.63249302645876e-05, + "loss": 2.0615901947021484, + "step": 951 + }, + { + "epoch": 0.2482884527612962, + "grad_norm": 4.25, + "learning_rate": 3.6316494281432e-05, + "loss": 1.8065787553787231, + "step": 952 + }, + { + "epoch": 0.2485492599595749, + "grad_norm": 4.4375, + "learning_rate": 3.630804960906693e-05, + "loss": 1.9311704635620117, + "step": 953 + }, + { + "epoch": 0.24881006715785356, + "grad_norm": 4.125, + "learning_rate": 3.62995962519895e-05, + "loss": 1.6823601722717285, + "step": 954 + }, + { + "epoch": 0.24907087435613223, + "grad_norm": 4.15625, + "learning_rate": 3.629113421470149e-05, + "loss": 1.729414463043213, + "step": 955 + }, + { + "epoch": 0.2493316815544109, + "grad_norm": 3.921875, + "learning_rate": 3.628266350170929e-05, + "loss": 1.53072190284729, + "step": 956 + }, + { + "epoch": 0.24959248875268958, + "grad_norm": 4.96875, + "learning_rate": 3.6274184117523885e-05, + "loss": 2.1300301551818848, + "step": 957 + }, + { + "epoch": 0.24985329595096825, + "grad_norm": 4.3125, + "learning_rate": 3.626569606666092e-05, + "loss": 1.8616907596588135, + "step": 958 + }, + { + "epoch": 0.2501141031492469, + "grad_norm": 4.4375, + "learning_rate": 3.625719935364061e-05, + "loss": 1.9490491151809692, + "step": 959 + }, + { + "epoch": 0.25037491034752557, + "grad_norm": 4.3125, + "learning_rate": 3.624869398298783e-05, + "loss": 1.6784336566925049, + "step": 960 + }, + { + "epoch": 0.25063571754580427, + "grad_norm": 3.96875, + "learning_rate": 3.624017995923204e-05, + "loss": 1.5734858512878418, + "step": 961 + }, + { + "epoch": 0.2508965247440829, + "grad_norm": 4.0625, + "learning_rate": 3.6231657286907294e-05, + "loss": 1.7256945371627808, + "step": 962 + }, + { + "epoch": 0.2511573319423616, + "grad_norm": 4.3125, + "learning_rate": 3.622312597055229e-05, + "loss": 1.8039542436599731, + "step": 963 + }, + { + "epoch": 0.25141813914064026, + "grad_norm": 4.125, + "learning_rate": 3.6214586014710285e-05, + "loss": 1.8100578784942627, + "step": 964 + }, + { + "epoch": 0.25167894633891896, + "grad_norm": 4.53125, + "learning_rate": 3.6206037423929175e-05, + "loss": 2.144407033920288, + "step": 965 + }, + { + "epoch": 0.2519397535371976, + "grad_norm": 4.375, + "learning_rate": 3.619748020276143e-05, + "loss": 1.6897966861724854, + "step": 966 + }, + { + "epoch": 0.2522005607354763, + "grad_norm": 4.25, + "learning_rate": 3.618891435576414e-05, + "loss": 1.4720361232757568, + "step": 967 + }, + { + "epoch": 0.25246136793375495, + "grad_norm": 4.4375, + "learning_rate": 3.6180339887498953e-05, + "loss": 1.9113588333129883, + "step": 968 + }, + { + "epoch": 0.25272217513203366, + "grad_norm": 4.1875, + "learning_rate": 3.617175680253214e-05, + "loss": 1.7395099401474, + "step": 969 + }, + { + "epoch": 0.2529829823303123, + "grad_norm": 4.40625, + "learning_rate": 3.6163165105434545e-05, + "loss": 1.8829009532928467, + "step": 970 + }, + { + "epoch": 0.253243789528591, + "grad_norm": 4.40625, + "learning_rate": 3.615456480078162e-05, + "loss": 1.7133891582489014, + "step": 971 + }, + { + "epoch": 0.25350459672686965, + "grad_norm": 4.46875, + "learning_rate": 3.6145955893153355e-05, + "loss": 1.9484844207763672, + "step": 972 + }, + { + "epoch": 0.25376540392514835, + "grad_norm": 4.28125, + "learning_rate": 3.613733838713437e-05, + "loss": 1.6234208345413208, + "step": 973 + }, + { + "epoch": 0.254026211123427, + "grad_norm": 4.8125, + "learning_rate": 3.612871228731384e-05, + "loss": 1.8469617366790771, + "step": 974 + }, + { + "epoch": 0.2542870183217057, + "grad_norm": 4.3125, + "learning_rate": 3.612007759828552e-05, + "loss": 1.8527976274490356, + "step": 975 + }, + { + "epoch": 0.25454782551998434, + "grad_norm": 4.15625, + "learning_rate": 3.611143432464773e-05, + "loss": 1.6421080827713013, + "step": 976 + }, + { + "epoch": 0.25480863271826304, + "grad_norm": 4.0, + "learning_rate": 3.610278247100339e-05, + "loss": 1.7878233194351196, + "step": 977 + }, + { + "epoch": 0.2550694399165417, + "grad_norm": 4.78125, + "learning_rate": 3.609412204195996e-05, + "loss": 1.8771930932998657, + "step": 978 + }, + { + "epoch": 0.2553302471148204, + "grad_norm": 4.625, + "learning_rate": 3.608545304212948e-05, + "loss": 2.0811333656311035, + "step": 979 + }, + { + "epoch": 0.25559105431309903, + "grad_norm": 4.34375, + "learning_rate": 3.607677547612855e-05, + "loss": 1.7313621044158936, + "step": 980 + }, + { + "epoch": 0.25585186151137773, + "grad_norm": 4.46875, + "learning_rate": 3.6068089348578335e-05, + "loss": 1.8115135431289673, + "step": 981 + }, + { + "epoch": 0.2561126687096564, + "grad_norm": 4.40625, + "learning_rate": 3.6059394664104554e-05, + "loss": 2.017341136932373, + "step": 982 + }, + { + "epoch": 0.2563734759079351, + "grad_norm": 4.28125, + "learning_rate": 3.605069142733749e-05, + "loss": 1.7018239498138428, + "step": 983 + }, + { + "epoch": 0.2566342831062137, + "grad_norm": 4.03125, + "learning_rate": 3.604197964291199e-05, + "loss": 1.8607988357543945, + "step": 984 + }, + { + "epoch": 0.2568950903044924, + "grad_norm": 4.0, + "learning_rate": 3.6033259315467406e-05, + "loss": 1.794417142868042, + "step": 985 + }, + { + "epoch": 0.25715589750277107, + "grad_norm": 4.21875, + "learning_rate": 3.602453044964771e-05, + "loss": 1.9194737672805786, + "step": 986 + }, + { + "epoch": 0.25741670470104977, + "grad_norm": 4.3125, + "learning_rate": 3.6015793050101364e-05, + "loss": 1.6993803977966309, + "step": 987 + }, + { + "epoch": 0.2576775118993284, + "grad_norm": 4.46875, + "learning_rate": 3.60070471214814e-05, + "loss": 1.7307885885238647, + "step": 988 + }, + { + "epoch": 0.2579383190976071, + "grad_norm": 4.40625, + "learning_rate": 3.59982926684454e-05, + "loss": 1.7992185354232788, + "step": 989 + }, + { + "epoch": 0.25819912629588576, + "grad_norm": 4.0625, + "learning_rate": 3.598952969565545e-05, + "loss": 1.8427271842956543, + "step": 990 + }, + { + "epoch": 0.25845993349416446, + "grad_norm": 4.40625, + "learning_rate": 3.598075820777822e-05, + "loss": 1.8364369869232178, + "step": 991 + }, + { + "epoch": 0.2587207406924431, + "grad_norm": 3.96875, + "learning_rate": 3.597197820948487e-05, + "loss": 1.682704210281372, + "step": 992 + }, + { + "epoch": 0.2589815478907218, + "grad_norm": 4.75, + "learning_rate": 3.5963189705451124e-05, + "loss": 1.9118919372558594, + "step": 993 + }, + { + "epoch": 0.25924235508900045, + "grad_norm": 3.96875, + "learning_rate": 3.595439270035722e-05, + "loss": 1.6127727031707764, + "step": 994 + }, + { + "epoch": 0.25950316228727915, + "grad_norm": 4.03125, + "learning_rate": 3.594558719888793e-05, + "loss": 1.6650826930999756, + "step": 995 + }, + { + "epoch": 0.2597639694855578, + "grad_norm": 4.15625, + "learning_rate": 3.593677320573256e-05, + "loss": 1.7605465650558472, + "step": 996 + }, + { + "epoch": 0.2600247766838365, + "grad_norm": 4.25, + "learning_rate": 3.5927950725584905e-05, + "loss": 1.442620038986206, + "step": 997 + }, + { + "epoch": 0.26028558388211515, + "grad_norm": 4.125, + "learning_rate": 3.591911976314332e-05, + "loss": 2.185831069946289, + "step": 998 + }, + { + "epoch": 0.26054639108039385, + "grad_norm": 4.03125, + "learning_rate": 3.591028032311065e-05, + "loss": 1.6497381925582886, + "step": 999 + }, + { + "epoch": 0.2608071982786725, + "grad_norm": 4.59375, + "learning_rate": 3.590143241019426e-05, + "loss": 1.868830680847168, + "step": 1000 + }, + { + "epoch": 0.26106800547695114, + "grad_norm": 4.125, + "learning_rate": 3.5892576029106034e-05, + "loss": 1.7232383489608765, + "step": 1001 + }, + { + "epoch": 0.26132881267522984, + "grad_norm": 4.8125, + "learning_rate": 3.588371118456237e-05, + "loss": 1.8657559156417847, + "step": 1002 + }, + { + "epoch": 0.2615896198735085, + "grad_norm": 4.15625, + "learning_rate": 3.587483788128415e-05, + "loss": 1.80161452293396, + "step": 1003 + }, + { + "epoch": 0.2618504270717872, + "grad_norm": 4.25, + "learning_rate": 3.5865956123996785e-05, + "loss": 1.4268081188201904, + "step": 1004 + }, + { + "epoch": 0.26211123427006583, + "grad_norm": 4.5, + "learning_rate": 3.585706591743018e-05, + "loss": 1.8081510066986084, + "step": 1005 + }, + { + "epoch": 0.26237204146834453, + "grad_norm": 4.3125, + "learning_rate": 3.584816726631873e-05, + "loss": 1.6240954399108887, + "step": 1006 + }, + { + "epoch": 0.2626328486666232, + "grad_norm": 4.625, + "learning_rate": 3.5839260175401345e-05, + "loss": 1.733473300933838, + "step": 1007 + }, + { + "epoch": 0.2628936558649019, + "grad_norm": 4.15625, + "learning_rate": 3.5830344649421416e-05, + "loss": 1.8932445049285889, + "step": 1008 + }, + { + "epoch": 0.2631544630631805, + "grad_norm": 4.59375, + "learning_rate": 3.5821420693126834e-05, + "loss": 1.9630615711212158, + "step": 1009 + }, + { + "epoch": 0.2634152702614592, + "grad_norm": 4.375, + "learning_rate": 3.581248831126996e-05, + "loss": 1.6601322889328003, + "step": 1010 + }, + { + "epoch": 0.26367607745973787, + "grad_norm": 4.1875, + "learning_rate": 3.580354750860768e-05, + "loss": 1.8083221912384033, + "step": 1011 + }, + { + "epoch": 0.26393688465801657, + "grad_norm": 4.5625, + "learning_rate": 3.579459828990133e-05, + "loss": 1.8334442377090454, + "step": 1012 + }, + { + "epoch": 0.2641976918562952, + "grad_norm": 4.96875, + "learning_rate": 3.5785640659916736e-05, + "loss": 2.0063252449035645, + "step": 1013 + }, + { + "epoch": 0.2644584990545739, + "grad_norm": 4.40625, + "learning_rate": 3.5776674623424226e-05, + "loss": 2.0151400566101074, + "step": 1014 + }, + { + "epoch": 0.26471930625285256, + "grad_norm": 3.84375, + "learning_rate": 3.5767700185198556e-05, + "loss": 1.4993696212768555, + "step": 1015 + }, + { + "epoch": 0.26498011345113126, + "grad_norm": 4.25, + "learning_rate": 3.575871735001901e-05, + "loss": 1.8012977838516235, + "step": 1016 + }, + { + "epoch": 0.2652409206494099, + "grad_norm": 4.28125, + "learning_rate": 3.5749726122669316e-05, + "loss": 1.8131016492843628, + "step": 1017 + }, + { + "epoch": 0.2655017278476886, + "grad_norm": 4.40625, + "learning_rate": 3.574072650793767e-05, + "loss": 1.7579952478408813, + "step": 1018 + }, + { + "epoch": 0.26576253504596725, + "grad_norm": 4.78125, + "learning_rate": 3.573171851061674e-05, + "loss": 1.9900325536727905, + "step": 1019 + }, + { + "epoch": 0.26602334224424595, + "grad_norm": 3.984375, + "learning_rate": 3.5722702135503664e-05, + "loss": 1.738937258720398, + "step": 1020 + }, + { + "epoch": 0.2662841494425246, + "grad_norm": 4.78125, + "learning_rate": 3.571367738740003e-05, + "loss": 1.9216761589050293, + "step": 1021 + }, + { + "epoch": 0.2665449566408033, + "grad_norm": 3.984375, + "learning_rate": 3.570464427111189e-05, + "loss": 1.5162055492401123, + "step": 1022 + }, + { + "epoch": 0.26680576383908194, + "grad_norm": 4.5625, + "learning_rate": 3.569560279144976e-05, + "loss": 1.8345317840576172, + "step": 1023 + }, + { + "epoch": 0.26706657103736064, + "grad_norm": 4.4375, + "learning_rate": 3.568655295322859e-05, + "loss": 1.8400464057922363, + "step": 1024 + }, + { + "epoch": 0.2673273782356393, + "grad_norm": 4.15625, + "learning_rate": 3.567749476126781e-05, + "loss": 1.690098762512207, + "step": 1025 + }, + { + "epoch": 0.267588185433918, + "grad_norm": 4.34375, + "learning_rate": 3.566842822039127e-05, + "loss": 1.9592373371124268, + "step": 1026 + }, + { + "epoch": 0.26784899263219664, + "grad_norm": 4.5, + "learning_rate": 3.565935333542729e-05, + "loss": 1.7009608745574951, + "step": 1027 + }, + { + "epoch": 0.26810979983047534, + "grad_norm": 3.9375, + "learning_rate": 3.56502701112086e-05, + "loss": 1.7458827495574951, + "step": 1028 + }, + { + "epoch": 0.268370607028754, + "grad_norm": 4.34375, + "learning_rate": 3.564117855257242e-05, + "loss": 1.8003566265106201, + "step": 1029 + }, + { + "epoch": 0.2686314142270327, + "grad_norm": 4.71875, + "learning_rate": 3.5632078664360365e-05, + "loss": 1.5618364810943604, + "step": 1030 + }, + { + "epoch": 0.26889222142531133, + "grad_norm": 4.1875, + "learning_rate": 3.562297045141851e-05, + "loss": 1.9563771486282349, + "step": 1031 + }, + { + "epoch": 0.26915302862359003, + "grad_norm": 4.75, + "learning_rate": 3.561385391859736e-05, + "loss": 1.5928397178649902, + "step": 1032 + }, + { + "epoch": 0.2694138358218687, + "grad_norm": 4.5625, + "learning_rate": 3.560472907075183e-05, + "loss": 1.7969861030578613, + "step": 1033 + }, + { + "epoch": 0.2696746430201474, + "grad_norm": 3.890625, + "learning_rate": 3.559559591274129e-05, + "loss": 1.464586615562439, + "step": 1034 + }, + { + "epoch": 0.269935450218426, + "grad_norm": 4.40625, + "learning_rate": 3.558645444942953e-05, + "loss": 1.7187366485595703, + "step": 1035 + }, + { + "epoch": 0.2701962574167047, + "grad_norm": 4.4375, + "learning_rate": 3.557730468568476e-05, + "loss": 1.5232993364334106, + "step": 1036 + }, + { + "epoch": 0.27045706461498337, + "grad_norm": 4.3125, + "learning_rate": 3.556814662637959e-05, + "loss": 1.7988381385803223, + "step": 1037 + }, + { + "epoch": 0.27071787181326207, + "grad_norm": 4.25, + "learning_rate": 3.555898027639109e-05, + "loss": 1.8296515941619873, + "step": 1038 + }, + { + "epoch": 0.2709786790115407, + "grad_norm": 4.125, + "learning_rate": 3.55498056406007e-05, + "loss": 1.3963137865066528, + "step": 1039 + }, + { + "epoch": 0.2712394862098194, + "grad_norm": 4.75, + "learning_rate": 3.554062272389431e-05, + "loss": 1.8541409969329834, + "step": 1040 + }, + { + "epoch": 0.27150029340809806, + "grad_norm": 4.15625, + "learning_rate": 3.553143153116219e-05, + "loss": 1.891303300857544, + "step": 1041 + }, + { + "epoch": 0.27176110060637676, + "grad_norm": 4.875, + "learning_rate": 3.552223206729904e-05, + "loss": 2.0532147884368896, + "step": 1042 + }, + { + "epoch": 0.2720219078046554, + "grad_norm": 3.859375, + "learning_rate": 3.551302433720396e-05, + "loss": 1.566463828086853, + "step": 1043 + }, + { + "epoch": 0.2722827150029341, + "grad_norm": 4.375, + "learning_rate": 3.550380834578044e-05, + "loss": 1.5827128887176514, + "step": 1044 + }, + { + "epoch": 0.27254352220121275, + "grad_norm": 4.4375, + "learning_rate": 3.5494584097936375e-05, + "loss": 1.8478995561599731, + "step": 1045 + }, + { + "epoch": 0.27280432939949145, + "grad_norm": 6.125, + "learning_rate": 3.5485351598584066e-05, + "loss": 1.8030246496200562, + "step": 1046 + }, + { + "epoch": 0.2730651365977701, + "grad_norm": 4.3125, + "learning_rate": 3.54761108526402e-05, + "loss": 1.7071260213851929, + "step": 1047 + }, + { + "epoch": 0.2733259437960488, + "grad_norm": 4.09375, + "learning_rate": 3.5466861865025856e-05, + "loss": 1.7506829500198364, + "step": 1048 + }, + { + "epoch": 0.27358675099432744, + "grad_norm": 4.0625, + "learning_rate": 3.54576046406665e-05, + "loss": 2.0367753505706787, + "step": 1049 + }, + { + "epoch": 0.2738475581926061, + "grad_norm": 3.890625, + "learning_rate": 3.544833918449199e-05, + "loss": 1.6471517086029053, + "step": 1050 + }, + { + "epoch": 0.2741083653908848, + "grad_norm": 4.3125, + "learning_rate": 3.5439065501436575e-05, + "loss": 2.0317742824554443, + "step": 1051 + }, + { + "epoch": 0.27436917258916343, + "grad_norm": 3.9375, + "learning_rate": 3.5429783596438864e-05, + "loss": 1.732642650604248, + "step": 1052 + }, + { + "epoch": 0.27462997978744214, + "grad_norm": 4.03125, + "learning_rate": 3.5420493474441855e-05, + "loss": 1.9185147285461426, + "step": 1053 + }, + { + "epoch": 0.2748907869857208, + "grad_norm": 4.0625, + "learning_rate": 3.5411195140392936e-05, + "loss": 1.7298734188079834, + "step": 1054 + }, + { + "epoch": 0.2751515941839995, + "grad_norm": 4.28125, + "learning_rate": 3.540188859924384e-05, + "loss": 1.9768015146255493, + "step": 1055 + }, + { + "epoch": 0.2754124013822781, + "grad_norm": 5.40625, + "learning_rate": 3.539257385595069e-05, + "loss": 1.898254632949829, + "step": 1056 + }, + { + "epoch": 0.2756732085805568, + "grad_norm": 4.15625, + "learning_rate": 3.538325091547398e-05, + "loss": 1.6079350709915161, + "step": 1057 + }, + { + "epoch": 0.2759340157788355, + "grad_norm": 4.21875, + "learning_rate": 3.537391978277856e-05, + "loss": 1.719800591468811, + "step": 1058 + }, + { + "epoch": 0.2761948229771142, + "grad_norm": 26.0, + "learning_rate": 3.536458046283364e-05, + "loss": 2.0866620540618896, + "step": 1059 + }, + { + "epoch": 0.2764556301753928, + "grad_norm": 4.53125, + "learning_rate": 3.535523296061279e-05, + "loss": 1.798237919807434, + "step": 1060 + }, + { + "epoch": 0.2767164373736715, + "grad_norm": 3.921875, + "learning_rate": 3.534587728109396e-05, + "loss": 1.8382494449615479, + "step": 1061 + }, + { + "epoch": 0.27697724457195017, + "grad_norm": 4.6875, + "learning_rate": 3.533651342925942e-05, + "loss": 1.8185405731201172, + "step": 1062 + }, + { + "epoch": 0.27723805177022887, + "grad_norm": 4.46875, + "learning_rate": 3.532714141009583e-05, + "loss": 1.7275090217590332, + "step": 1063 + }, + { + "epoch": 0.2774988589685075, + "grad_norm": 4.46875, + "learning_rate": 3.531776122859415e-05, + "loss": 2.008340358734131, + "step": 1064 + }, + { + "epoch": 0.2777596661667862, + "grad_norm": 4.4375, + "learning_rate": 3.530837288974974e-05, + "loss": 1.8108479976654053, + "step": 1065 + }, + { + "epoch": 0.27802047336506486, + "grad_norm": 4.125, + "learning_rate": 3.529897639856226e-05, + "loss": 1.894986867904663, + "step": 1066 + }, + { + "epoch": 0.27828128056334356, + "grad_norm": 3.984375, + "learning_rate": 3.528957176003575e-05, + "loss": 1.6555955410003662, + "step": 1067 + }, + { + "epoch": 0.2785420877616222, + "grad_norm": 4.53125, + "learning_rate": 3.528015897917856e-05, + "loss": 1.951297402381897, + "step": 1068 + }, + { + "epoch": 0.2788028949599009, + "grad_norm": 4.21875, + "learning_rate": 3.527073806100338e-05, + "loss": 1.9099540710449219, + "step": 1069 + }, + { + "epoch": 0.27906370215817955, + "grad_norm": 3.71875, + "learning_rate": 3.5261309010527256e-05, + "loss": 1.778804063796997, + "step": 1070 + }, + { + "epoch": 0.27932450935645825, + "grad_norm": 4.21875, + "learning_rate": 3.525187183277153e-05, + "loss": 1.7820208072662354, + "step": 1071 + }, + { + "epoch": 0.2795853165547369, + "grad_norm": 3.953125, + "learning_rate": 3.52424265327619e-05, + "loss": 1.6652235984802246, + "step": 1072 + }, + { + "epoch": 0.2798461237530156, + "grad_norm": 4.03125, + "learning_rate": 3.5232973115528375e-05, + "loss": 1.863331913948059, + "step": 1073 + }, + { + "epoch": 0.28010693095129424, + "grad_norm": 3.859375, + "learning_rate": 3.52235115861053e-05, + "loss": 1.4964725971221924, + "step": 1074 + }, + { + "epoch": 0.28036773814957294, + "grad_norm": 4.03125, + "learning_rate": 3.5214041949531324e-05, + "loss": 1.6539411544799805, + "step": 1075 + }, + { + "epoch": 0.2806285453478516, + "grad_norm": 4.625, + "learning_rate": 3.5204564210849425e-05, + "loss": 1.8654205799102783, + "step": 1076 + }, + { + "epoch": 0.2808893525461303, + "grad_norm": 4.21875, + "learning_rate": 3.519507837510689e-05, + "loss": 1.7179417610168457, + "step": 1077 + }, + { + "epoch": 0.28115015974440893, + "grad_norm": 3.953125, + "learning_rate": 3.518558444735532e-05, + "loss": 1.6654682159423828, + "step": 1078 + }, + { + "epoch": 0.28141096694268763, + "grad_norm": 4.125, + "learning_rate": 3.517608243265063e-05, + "loss": 1.6149661540985107, + "step": 1079 + }, + { + "epoch": 0.2816717741409663, + "grad_norm": 4.25, + "learning_rate": 3.516657233605302e-05, + "loss": 1.7557415962219238, + "step": 1080 + }, + { + "epoch": 0.281932581339245, + "grad_norm": 4.03125, + "learning_rate": 3.5157054162627036e-05, + "loss": 1.8852646350860596, + "step": 1081 + }, + { + "epoch": 0.2821933885375236, + "grad_norm": 4.3125, + "learning_rate": 3.514752791744147e-05, + "loss": 2.0640451908111572, + "step": 1082 + }, + { + "epoch": 0.2824541957358023, + "grad_norm": 4.59375, + "learning_rate": 3.513799360556947e-05, + "loss": 1.7899377346038818, + "step": 1083 + }, + { + "epoch": 0.282715002934081, + "grad_norm": 4.0625, + "learning_rate": 3.512845123208844e-05, + "loss": 1.7269141674041748, + "step": 1084 + }, + { + "epoch": 0.2829758101323597, + "grad_norm": 4.1875, + "learning_rate": 3.511890080208008e-05, + "loss": 1.6378408670425415, + "step": 1085 + }, + { + "epoch": 0.2832366173306383, + "grad_norm": 4.28125, + "learning_rate": 3.510934232063041e-05, + "loss": 1.911826491355896, + "step": 1086 + }, + { + "epoch": 0.283497424528917, + "grad_norm": 4.71875, + "learning_rate": 3.509977579282971e-05, + "loss": 1.7262004613876343, + "step": 1087 + }, + { + "epoch": 0.28375823172719566, + "grad_norm": 4.40625, + "learning_rate": 3.509020122377254e-05, + "loss": 1.9643630981445312, + "step": 1088 + }, + { + "epoch": 0.28401903892547437, + "grad_norm": 4.125, + "learning_rate": 3.508061861855777e-05, + "loss": 1.7308140993118286, + "step": 1089 + }, + { + "epoch": 0.284279846123753, + "grad_norm": 4.21875, + "learning_rate": 3.507102798228852e-05, + "loss": 1.5944818258285522, + "step": 1090 + }, + { + "epoch": 0.2845406533220317, + "grad_norm": 4.15625, + "learning_rate": 3.5061429320072225e-05, + "loss": 1.4689910411834717, + "step": 1091 + }, + { + "epoch": 0.28480146052031036, + "grad_norm": 3.8125, + "learning_rate": 3.505182263702055e-05, + "loss": 1.7175511121749878, + "step": 1092 + }, + { + "epoch": 0.28506226771858906, + "grad_norm": 4.625, + "learning_rate": 3.504220793824945e-05, + "loss": 1.98661470413208, + "step": 1093 + }, + { + "epoch": 0.2853230749168677, + "grad_norm": 4.375, + "learning_rate": 3.503258522887917e-05, + "loss": 1.6025145053863525, + "step": 1094 + }, + { + "epoch": 0.2855838821151464, + "grad_norm": 3.703125, + "learning_rate": 3.502295451403419e-05, + "loss": 1.6541790962219238, + "step": 1095 + }, + { + "epoch": 0.28584468931342505, + "grad_norm": 3.828125, + "learning_rate": 3.501331579884325e-05, + "loss": 1.834327220916748, + "step": 1096 + }, + { + "epoch": 0.28610549651170375, + "grad_norm": 3.921875, + "learning_rate": 3.5003669088439395e-05, + "loss": 1.7379425764083862, + "step": 1097 + }, + { + "epoch": 0.2863663037099824, + "grad_norm": 4.1875, + "learning_rate": 3.499401438795988e-05, + "loss": 1.7236168384552002, + "step": 1098 + }, + { + "epoch": 0.2866271109082611, + "grad_norm": 4.1875, + "learning_rate": 3.498435170254624e-05, + "loss": 1.6990818977355957, + "step": 1099 + }, + { + "epoch": 0.28688791810653974, + "grad_norm": 4.03125, + "learning_rate": 3.497468103734424e-05, + "loss": 1.7150160074234009, + "step": 1100 + }, + { + "epoch": 0.2871487253048184, + "grad_norm": 4.28125, + "learning_rate": 3.4965002397503936e-05, + "loss": 1.8012375831604004, + "step": 1101 + }, + { + "epoch": 0.2874095325030971, + "grad_norm": 3.96875, + "learning_rate": 3.495531578817958e-05, + "loss": 1.4892457723617554, + "step": 1102 + }, + { + "epoch": 0.28767033970137573, + "grad_norm": 4.3125, + "learning_rate": 3.4945621214529705e-05, + "loss": 1.8427650928497314, + "step": 1103 + }, + { + "epoch": 0.28793114689965443, + "grad_norm": 3.75, + "learning_rate": 3.493591868171709e-05, + "loss": 1.5933095216751099, + "step": 1104 + }, + { + "epoch": 0.2881919540979331, + "grad_norm": 4.375, + "learning_rate": 3.49262081949087e-05, + "loss": 1.6940237283706665, + "step": 1105 + }, + { + "epoch": 0.2884527612962118, + "grad_norm": 3.953125, + "learning_rate": 3.4916489759275794e-05, + "loss": 1.495390772819519, + "step": 1106 + }, + { + "epoch": 0.2887135684944904, + "grad_norm": 3.859375, + "learning_rate": 3.490676337999383e-05, + "loss": 1.590932846069336, + "step": 1107 + }, + { + "epoch": 0.2889743756927691, + "grad_norm": 3.96875, + "learning_rate": 3.489702906224253e-05, + "loss": 1.7562646865844727, + "step": 1108 + }, + { + "epoch": 0.28923518289104777, + "grad_norm": 3.9375, + "learning_rate": 3.48872868112058e-05, + "loss": 1.5710026025772095, + "step": 1109 + }, + { + "epoch": 0.28949599008932647, + "grad_norm": 4.125, + "learning_rate": 3.48775366320718e-05, + "loss": 1.8648991584777832, + "step": 1110 + }, + { + "epoch": 0.2897567972876051, + "grad_norm": 4.21875, + "learning_rate": 3.48677785300329e-05, + "loss": 1.6395639181137085, + "step": 1111 + }, + { + "epoch": 0.2900176044858838, + "grad_norm": 4.5, + "learning_rate": 3.48580125102857e-05, + "loss": 1.6861207485198975, + "step": 1112 + }, + { + "epoch": 0.29027841168416246, + "grad_norm": 4.21875, + "learning_rate": 3.484823857803101e-05, + "loss": 1.7843270301818848, + "step": 1113 + }, + { + "epoch": 0.29053921888244116, + "grad_norm": 4.15625, + "learning_rate": 3.483845673847386e-05, + "loss": 1.8121414184570312, + "step": 1114 + }, + { + "epoch": 0.2908000260807198, + "grad_norm": 4.09375, + "learning_rate": 3.482866699682347e-05, + "loss": 1.9957536458969116, + "step": 1115 + }, + { + "epoch": 0.2910608332789985, + "grad_norm": 4.46875, + "learning_rate": 3.4818869358293285e-05, + "loss": 1.9593991041183472, + "step": 1116 + }, + { + "epoch": 0.29132164047727716, + "grad_norm": 4.09375, + "learning_rate": 3.4809063828100965e-05, + "loss": 1.7332122325897217, + "step": 1117 + }, + { + "epoch": 0.29158244767555586, + "grad_norm": 4.375, + "learning_rate": 3.479925041146836e-05, + "loss": 1.657759189605713, + "step": 1118 + }, + { + "epoch": 0.2918432548738345, + "grad_norm": 4.625, + "learning_rate": 3.4789429113621517e-05, + "loss": 1.997182011604309, + "step": 1119 + }, + { + "epoch": 0.2921040620721132, + "grad_norm": 4.0625, + "learning_rate": 3.477959993979068e-05, + "loss": 1.7544701099395752, + "step": 1120 + }, + { + "epoch": 0.29236486927039185, + "grad_norm": 4.3125, + "learning_rate": 3.476976289521029e-05, + "loss": 1.771339774131775, + "step": 1121 + }, + { + "epoch": 0.29262567646867055, + "grad_norm": 4.125, + "learning_rate": 3.475991798511899e-05, + "loss": 1.9331625699996948, + "step": 1122 + }, + { + "epoch": 0.2928864836669492, + "grad_norm": 4.0625, + "learning_rate": 3.47500652147596e-05, + "loss": 2.144134521484375, + "step": 1123 + }, + { + "epoch": 0.2931472908652279, + "grad_norm": 4.53125, + "learning_rate": 3.4740204589379125e-05, + "loss": 1.9860496520996094, + "step": 1124 + }, + { + "epoch": 0.29340809806350654, + "grad_norm": 4.34375, + "learning_rate": 3.4730336114228756e-05, + "loss": 1.868838906288147, + "step": 1125 + }, + { + "epoch": 0.29366890526178524, + "grad_norm": 4.09375, + "learning_rate": 3.472045979456387e-05, + "loss": 1.7470306158065796, + "step": 1126 + }, + { + "epoch": 0.2939297124600639, + "grad_norm": 4.1875, + "learning_rate": 3.471057563564403e-05, + "loss": 1.7575474977493286, + "step": 1127 + }, + { + "epoch": 0.2941905196583426, + "grad_norm": 4.21875, + "learning_rate": 3.470068364273292e-05, + "loss": 1.9841556549072266, + "step": 1128 + }, + { + "epoch": 0.29445132685662123, + "grad_norm": 4.53125, + "learning_rate": 3.4690783821098476e-05, + "loss": 1.9896951913833618, + "step": 1129 + }, + { + "epoch": 0.29471213405489993, + "grad_norm": 4.125, + "learning_rate": 3.468087617601275e-05, + "loss": 1.8877949714660645, + "step": 1130 + }, + { + "epoch": 0.2949729412531786, + "grad_norm": 3.9375, + "learning_rate": 3.4670960712751965e-05, + "loss": 1.5972007513046265, + "step": 1131 + }, + { + "epoch": 0.2952337484514573, + "grad_norm": 4.25, + "learning_rate": 3.4661037436596526e-05, + "loss": 1.761352777481079, + "step": 1132 + }, + { + "epoch": 0.2954945556497359, + "grad_norm": 4.75, + "learning_rate": 3.465110635283099e-05, + "loss": 1.855859637260437, + "step": 1133 + }, + { + "epoch": 0.2957553628480146, + "grad_norm": 4.09375, + "learning_rate": 3.464116746674407e-05, + "loss": 1.6813459396362305, + "step": 1134 + }, + { + "epoch": 0.29601617004629327, + "grad_norm": 3.8125, + "learning_rate": 3.4631220783628624e-05, + "loss": 1.7292029857635498, + "step": 1135 + }, + { + "epoch": 0.29627697724457197, + "grad_norm": 4.53125, + "learning_rate": 3.462126630878168e-05, + "loss": 1.9538642168045044, + "step": 1136 + }, + { + "epoch": 0.2965377844428506, + "grad_norm": 4.09375, + "learning_rate": 3.4611304047504416e-05, + "loss": 1.7921640872955322, + "step": 1137 + }, + { + "epoch": 0.2967985916411293, + "grad_norm": 4.4375, + "learning_rate": 3.460133400510214e-05, + "loss": 1.724027156829834, + "step": 1138 + }, + { + "epoch": 0.29705939883940796, + "grad_norm": 4.375, + "learning_rate": 3.459135618688431e-05, + "loss": 1.9083806276321411, + "step": 1139 + }, + { + "epoch": 0.29732020603768666, + "grad_norm": 4.0, + "learning_rate": 3.4581370598164535e-05, + "loss": 1.5803298950195312, + "step": 1140 + }, + { + "epoch": 0.2975810132359653, + "grad_norm": 3.875, + "learning_rate": 3.4571377244260554e-05, + "loss": 1.4782859086990356, + "step": 1141 + }, + { + "epoch": 0.297841820434244, + "grad_norm": 4.375, + "learning_rate": 3.456137613049424e-05, + "loss": 1.9525219202041626, + "step": 1142 + }, + { + "epoch": 0.29810262763252265, + "grad_norm": 3.921875, + "learning_rate": 3.45513672621916e-05, + "loss": 1.9345943927764893, + "step": 1143 + }, + { + "epoch": 0.29836343483080136, + "grad_norm": 4.125, + "learning_rate": 3.454135064468278e-05, + "loss": 1.9073323011398315, + "step": 1144 + }, + { + "epoch": 0.29862424202908, + "grad_norm": 4.21875, + "learning_rate": 3.453132628330203e-05, + "loss": 1.7295219898223877, + "step": 1145 + }, + { + "epoch": 0.2988850492273587, + "grad_norm": 4.28125, + "learning_rate": 3.452129418338775e-05, + "loss": 1.6505075693130493, + "step": 1146 + }, + { + "epoch": 0.29914585642563735, + "grad_norm": 4.25, + "learning_rate": 3.4511254350282436e-05, + "loss": 1.7804162502288818, + "step": 1147 + }, + { + "epoch": 0.29940666362391605, + "grad_norm": 4.125, + "learning_rate": 3.450120678933273e-05, + "loss": 1.8169751167297363, + "step": 1148 + }, + { + "epoch": 0.2996674708221947, + "grad_norm": 4.28125, + "learning_rate": 3.449115150588936e-05, + "loss": 1.6251006126403809, + "step": 1149 + }, + { + "epoch": 0.29992827802047334, + "grad_norm": 4.25, + "learning_rate": 3.448108850530719e-05, + "loss": 1.8406720161437988, + "step": 1150 + }, + { + "epoch": 0.30018908521875204, + "grad_norm": 4.25, + "learning_rate": 3.4471017792945186e-05, + "loss": 1.9019505977630615, + "step": 1151 + }, + { + "epoch": 0.3004498924170307, + "grad_norm": 4.125, + "learning_rate": 3.4460939374166424e-05, + "loss": 1.896089792251587, + "step": 1152 + }, + { + "epoch": 0.3007106996153094, + "grad_norm": 4.03125, + "learning_rate": 3.4450853254338065e-05, + "loss": 1.679295301437378, + "step": 1153 + }, + { + "epoch": 0.30097150681358803, + "grad_norm": 4.25, + "learning_rate": 3.44407594388314e-05, + "loss": 1.6873747110366821, + "step": 1154 + }, + { + "epoch": 0.30123231401186673, + "grad_norm": 4.0, + "learning_rate": 3.443065793302179e-05, + "loss": 1.688307523727417, + "step": 1155 + }, + { + "epoch": 0.3014931212101454, + "grad_norm": 4.5625, + "learning_rate": 3.4420548742288716e-05, + "loss": 1.8257405757904053, + "step": 1156 + }, + { + "epoch": 0.3017539284084241, + "grad_norm": 4.53125, + "learning_rate": 3.441043187201574e-05, + "loss": 1.7855994701385498, + "step": 1157 + }, + { + "epoch": 0.3020147356067027, + "grad_norm": 3.859375, + "learning_rate": 3.4400307327590517e-05, + "loss": 1.6374878883361816, + "step": 1158 + }, + { + "epoch": 0.3022755428049814, + "grad_norm": 4.25, + "learning_rate": 3.439017511440478e-05, + "loss": 1.905398964881897, + "step": 1159 + }, + { + "epoch": 0.30253635000326007, + "grad_norm": 4.21875, + "learning_rate": 3.4380035237854355e-05, + "loss": 1.79018235206604, + "step": 1160 + }, + { + "epoch": 0.30279715720153877, + "grad_norm": 4.4375, + "learning_rate": 3.436988770333915e-05, + "loss": 1.934191107749939, + "step": 1161 + }, + { + "epoch": 0.3030579643998174, + "grad_norm": 4.34375, + "learning_rate": 3.435973251626314e-05, + "loss": 1.959614634513855, + "step": 1162 + }, + { + "epoch": 0.3033187715980961, + "grad_norm": 4.96875, + "learning_rate": 3.4349569682034394e-05, + "loss": 1.8245642185211182, + "step": 1163 + }, + { + "epoch": 0.30357957879637476, + "grad_norm": 4.03125, + "learning_rate": 3.433939920606503e-05, + "loss": 1.9029959440231323, + "step": 1164 + }, + { + "epoch": 0.30384038599465346, + "grad_norm": 4.3125, + "learning_rate": 3.432922109377125e-05, + "loss": 1.563682198524475, + "step": 1165 + }, + { + "epoch": 0.3041011931929321, + "grad_norm": 4.1875, + "learning_rate": 3.4319035350573334e-05, + "loss": 2.009978771209717, + "step": 1166 + }, + { + "epoch": 0.3043620003912108, + "grad_norm": 3.703125, + "learning_rate": 3.43088419818956e-05, + "loss": 1.6859945058822632, + "step": 1167 + }, + { + "epoch": 0.30462280758948945, + "grad_norm": 3.71875, + "learning_rate": 3.4298640993166446e-05, + "loss": 1.7476444244384766, + "step": 1168 + }, + { + "epoch": 0.30488361478776815, + "grad_norm": 3.875, + "learning_rate": 3.4288432389818314e-05, + "loss": 1.4783931970596313, + "step": 1169 + }, + { + "epoch": 0.3051444219860468, + "grad_norm": 3.9375, + "learning_rate": 3.427821617728771e-05, + "loss": 1.5559719800949097, + "step": 1170 + }, + { + "epoch": 0.3054052291843255, + "grad_norm": 4.03125, + "learning_rate": 3.4267992361015196e-05, + "loss": 1.6822757720947266, + "step": 1171 + }, + { + "epoch": 0.30566603638260414, + "grad_norm": 3.859375, + "learning_rate": 3.4257760946445375e-05, + "loss": 1.708196759223938, + "step": 1172 + }, + { + "epoch": 0.30592684358088285, + "grad_norm": 4.40625, + "learning_rate": 3.42475219390269e-05, + "loss": 1.585463523864746, + "step": 1173 + }, + { + "epoch": 0.3061876507791615, + "grad_norm": 4.15625, + "learning_rate": 3.423727534421247e-05, + "loss": 1.7142834663391113, + "step": 1174 + }, + { + "epoch": 0.3064484579774402, + "grad_norm": 4.0625, + "learning_rate": 3.422702116745881e-05, + "loss": 1.822965383529663, + "step": 1175 + }, + { + "epoch": 0.30670926517571884, + "grad_norm": 3.734375, + "learning_rate": 3.42167594142267e-05, + "loss": 1.6955723762512207, + "step": 1176 + }, + { + "epoch": 0.30697007237399754, + "grad_norm": 3.796875, + "learning_rate": 3.420649008998095e-05, + "loss": 1.9219430685043335, + "step": 1177 + }, + { + "epoch": 0.3072308795722762, + "grad_norm": 4.03125, + "learning_rate": 3.419621320019041e-05, + "loss": 1.8803141117095947, + "step": 1178 + }, + { + "epoch": 0.3074916867705549, + "grad_norm": 3.859375, + "learning_rate": 3.418592875032793e-05, + "loss": 1.6397581100463867, + "step": 1179 + }, + { + "epoch": 0.30775249396883353, + "grad_norm": 3.9375, + "learning_rate": 3.417563674587043e-05, + "loss": 1.4602605104446411, + "step": 1180 + }, + { + "epoch": 0.30801330116711223, + "grad_norm": 3.890625, + "learning_rate": 3.4165337192298804e-05, + "loss": 1.7662440538406372, + "step": 1181 + }, + { + "epoch": 0.3082741083653909, + "grad_norm": 4.125, + "learning_rate": 3.415503009509801e-05, + "loss": 1.7334061861038208, + "step": 1182 + }, + { + "epoch": 0.3085349155636696, + "grad_norm": 4.90625, + "learning_rate": 3.4144715459756995e-05, + "loss": 1.7847474813461304, + "step": 1183 + }, + { + "epoch": 0.3087957227619482, + "grad_norm": 3.953125, + "learning_rate": 3.413439329176874e-05, + "loss": 1.8660763502120972, + "step": 1184 + }, + { + "epoch": 0.3090565299602269, + "grad_norm": 3.875, + "learning_rate": 3.41240635966302e-05, + "loss": 1.4467748403549194, + "step": 1185 + }, + { + "epoch": 0.30931733715850557, + "grad_norm": 3.765625, + "learning_rate": 3.411372637984241e-05, + "loss": 1.6627625226974487, + "step": 1186 + }, + { + "epoch": 0.30957814435678427, + "grad_norm": 4.21875, + "learning_rate": 3.4103381646910325e-05, + "loss": 1.893566370010376, + "step": 1187 + }, + { + "epoch": 0.3098389515550629, + "grad_norm": 4.71875, + "learning_rate": 3.409302940334296e-05, + "loss": 1.5209918022155762, + "step": 1188 + }, + { + "epoch": 0.3100997587533416, + "grad_norm": 4.125, + "learning_rate": 3.408266965465332e-05, + "loss": 1.8906986713409424, + "step": 1189 + }, + { + "epoch": 0.31036056595162026, + "grad_norm": 4.25, + "learning_rate": 3.40723024063584e-05, + "loss": 1.5039809942245483, + "step": 1190 + }, + { + "epoch": 0.31062137314989896, + "grad_norm": 4.15625, + "learning_rate": 3.406192766397918e-05, + "loss": 1.6729252338409424, + "step": 1191 + }, + { + "epoch": 0.3108821803481776, + "grad_norm": 4.125, + "learning_rate": 3.405154543304065e-05, + "loss": 1.5411571264266968, + "step": 1192 + }, + { + "epoch": 0.3111429875464563, + "grad_norm": 4.03125, + "learning_rate": 3.404115571907176e-05, + "loss": 1.9417058229446411, + "step": 1193 + }, + { + "epoch": 0.31140379474473495, + "grad_norm": 3.734375, + "learning_rate": 3.4030758527605484e-05, + "loss": 1.615490198135376, + "step": 1194 + }, + { + "epoch": 0.31166460194301365, + "grad_norm": 4.375, + "learning_rate": 3.402035386417875e-05, + "loss": 2.0656657218933105, + "step": 1195 + }, + { + "epoch": 0.3119254091412923, + "grad_norm": 4.375, + "learning_rate": 3.4009941734332476e-05, + "loss": 1.805072546005249, + "step": 1196 + }, + { + "epoch": 0.312186216339571, + "grad_norm": 4.1875, + "learning_rate": 3.399952214361154e-05, + "loss": 1.7341935634613037, + "step": 1197 + }, + { + "epoch": 0.31244702353784964, + "grad_norm": 3.984375, + "learning_rate": 3.398909509756482e-05, + "loss": 1.8699414730072021, + "step": 1198 + }, + { + "epoch": 0.3127078307361283, + "grad_norm": 3.75, + "learning_rate": 3.397866060174515e-05, + "loss": 1.4938279390335083, + "step": 1199 + }, + { + "epoch": 0.312968637934407, + "grad_norm": 4.15625, + "learning_rate": 3.3968218661709315e-05, + "loss": 1.7651143074035645, + "step": 1200 + }, + { + "epoch": 0.31322944513268564, + "grad_norm": 3.953125, + "learning_rate": 3.3957769283018106e-05, + "loss": 1.6848008632659912, + "step": 1201 + }, + { + "epoch": 0.31349025233096434, + "grad_norm": 4.0625, + "learning_rate": 3.394731247123623e-05, + "loss": 1.8530536890029907, + "step": 1202 + }, + { + "epoch": 0.313751059529243, + "grad_norm": 4.3125, + "learning_rate": 3.393684823193238e-05, + "loss": 2.0332212448120117, + "step": 1203 + }, + { + "epoch": 0.3140118667275217, + "grad_norm": 4.0, + "learning_rate": 3.39263765706792e-05, + "loss": 1.8089815378189087, + "step": 1204 + }, + { + "epoch": 0.3142726739258003, + "grad_norm": 3.9375, + "learning_rate": 3.3915897493053274e-05, + "loss": 1.7100820541381836, + "step": 1205 + }, + { + "epoch": 0.31453348112407903, + "grad_norm": 3.890625, + "learning_rate": 3.390541100463515e-05, + "loss": 1.4569581747055054, + "step": 1206 + }, + { + "epoch": 0.3147942883223577, + "grad_norm": 3.953125, + "learning_rate": 3.3894917111009325e-05, + "loss": 1.7145471572875977, + "step": 1207 + }, + { + "epoch": 0.3150550955206364, + "grad_norm": 4.03125, + "learning_rate": 3.388441581776423e-05, + "loss": 1.8081591129302979, + "step": 1208 + }, + { + "epoch": 0.315315902718915, + "grad_norm": 4.6875, + "learning_rate": 3.3873907130492236e-05, + "loss": 1.6673164367675781, + "step": 1209 + }, + { + "epoch": 0.3155767099171937, + "grad_norm": 4.125, + "learning_rate": 3.386339105478964e-05, + "loss": 1.787574052810669, + "step": 1210 + }, + { + "epoch": 0.31583751711547237, + "grad_norm": 3.6875, + "learning_rate": 3.385286759625672e-05, + "loss": 1.460659146308899, + "step": 1211 + }, + { + "epoch": 0.31609832431375107, + "grad_norm": 4.0, + "learning_rate": 3.384233676049762e-05, + "loss": 1.524317979812622, + "step": 1212 + }, + { + "epoch": 0.3163591315120297, + "grad_norm": 4.0625, + "learning_rate": 3.3831798553120475e-05, + "loss": 1.6283299922943115, + "step": 1213 + }, + { + "epoch": 0.3166199387103084, + "grad_norm": 3.609375, + "learning_rate": 3.38212529797373e-05, + "loss": 1.3031306266784668, + "step": 1214 + }, + { + "epoch": 0.31688074590858706, + "grad_norm": 4.625, + "learning_rate": 3.381070004596405e-05, + "loss": 1.764394998550415, + "step": 1215 + }, + { + "epoch": 0.31714155310686576, + "grad_norm": 4.0, + "learning_rate": 3.3800139757420604e-05, + "loss": 1.557260513305664, + "step": 1216 + }, + { + "epoch": 0.3174023603051444, + "grad_norm": 3.875, + "learning_rate": 3.3789572119730766e-05, + "loss": 1.5873910188674927, + "step": 1217 + }, + { + "epoch": 0.3176631675034231, + "grad_norm": 4.09375, + "learning_rate": 3.377899713852222e-05, + "loss": 1.6127475500106812, + "step": 1218 + }, + { + "epoch": 0.31792397470170175, + "grad_norm": 4.03125, + "learning_rate": 3.376841481942659e-05, + "loss": 1.7642548084259033, + "step": 1219 + }, + { + "epoch": 0.31818478189998045, + "grad_norm": 4.15625, + "learning_rate": 3.3757825168079396e-05, + "loss": 1.804701328277588, + "step": 1220 + }, + { + "epoch": 0.3184455890982591, + "grad_norm": 3.9375, + "learning_rate": 3.374722819012008e-05, + "loss": 1.7389459609985352, + "step": 1221 + }, + { + "epoch": 0.3187063962965378, + "grad_norm": 4.0625, + "learning_rate": 3.373662389119196e-05, + "loss": 1.4407830238342285, + "step": 1222 + }, + { + "epoch": 0.31896720349481644, + "grad_norm": 4.09375, + "learning_rate": 3.372601227694226e-05, + "loss": 1.6595262289047241, + "step": 1223 + }, + { + "epoch": 0.31922801069309514, + "grad_norm": 4.15625, + "learning_rate": 3.3715393353022116e-05, + "loss": 1.8524608612060547, + "step": 1224 + }, + { + "epoch": 0.3194888178913738, + "grad_norm": 4.125, + "learning_rate": 3.370476712508654e-05, + "loss": 1.6108800172805786, + "step": 1225 + }, + { + "epoch": 0.3197496250896525, + "grad_norm": 4.0625, + "learning_rate": 3.369413359879445e-05, + "loss": 1.7554543018341064, + "step": 1226 + }, + { + "epoch": 0.32001043228793113, + "grad_norm": 3.6875, + "learning_rate": 3.368349277980861e-05, + "loss": 1.501410961151123, + "step": 1227 + }, + { + "epoch": 0.32027123948620984, + "grad_norm": 4.21875, + "learning_rate": 3.367284467379572e-05, + "loss": 2.095388412475586, + "step": 1228 + }, + { + "epoch": 0.3205320466844885, + "grad_norm": 4.1875, + "learning_rate": 3.366218928642634e-05, + "loss": 1.7752673625946045, + "step": 1229 + }, + { + "epoch": 0.3207928538827672, + "grad_norm": 3.9375, + "learning_rate": 3.36515266233749e-05, + "loss": 1.790653944015503, + "step": 1230 + }, + { + "epoch": 0.3210536610810458, + "grad_norm": 3.875, + "learning_rate": 3.364085669031971e-05, + "loss": 1.5785245895385742, + "step": 1231 + }, + { + "epoch": 0.3213144682793245, + "grad_norm": 4.09375, + "learning_rate": 3.3630179492942944e-05, + "loss": 1.9329664707183838, + "step": 1232 + }, + { + "epoch": 0.3215752754776032, + "grad_norm": 3.90625, + "learning_rate": 3.361949503693066e-05, + "loss": 1.6032710075378418, + "step": 1233 + }, + { + "epoch": 0.3218360826758819, + "grad_norm": 4.0, + "learning_rate": 3.360880332797278e-05, + "loss": 1.7220112085342407, + "step": 1234 + }, + { + "epoch": 0.3220968898741605, + "grad_norm": 3.953125, + "learning_rate": 3.359810437176307e-05, + "loss": 1.717103362083435, + "step": 1235 + }, + { + "epoch": 0.3223576970724392, + "grad_norm": 3.890625, + "learning_rate": 3.3587398173999166e-05, + "loss": 1.6662729978561401, + "step": 1236 + }, + { + "epoch": 0.32261850427071787, + "grad_norm": 3.796875, + "learning_rate": 3.3576684740382584e-05, + "loss": 1.6473667621612549, + "step": 1237 + }, + { + "epoch": 0.32287931146899657, + "grad_norm": 3.828125, + "learning_rate": 3.356596407661864e-05, + "loss": 1.7563276290893555, + "step": 1238 + }, + { + "epoch": 0.3231401186672752, + "grad_norm": 3.75, + "learning_rate": 3.3555236188416555e-05, + "loss": 1.6142089366912842, + "step": 1239 + }, + { + "epoch": 0.3234009258655539, + "grad_norm": 4.125, + "learning_rate": 3.354450108148937e-05, + "loss": 1.732367753982544, + "step": 1240 + }, + { + "epoch": 0.32366173306383256, + "grad_norm": 4.0, + "learning_rate": 3.3533758761553966e-05, + "loss": 1.6796579360961914, + "step": 1241 + }, + { + "epoch": 0.32392254026211126, + "grad_norm": 3.890625, + "learning_rate": 3.352300923433108e-05, + "loss": 1.8754359483718872, + "step": 1242 + }, + { + "epoch": 0.3241833474603899, + "grad_norm": 4.1875, + "learning_rate": 3.351225250554528e-05, + "loss": 1.778498649597168, + "step": 1243 + }, + { + "epoch": 0.3244441546586686, + "grad_norm": 4.21875, + "learning_rate": 3.350148858092497e-05, + "loss": 2.1445441246032715, + "step": 1244 + }, + { + "epoch": 0.32470496185694725, + "grad_norm": 4.46875, + "learning_rate": 3.349071746620238e-05, + "loss": 1.8901653289794922, + "step": 1245 + }, + { + "epoch": 0.32496576905522595, + "grad_norm": 4.34375, + "learning_rate": 3.347993916711358e-05, + "loss": 1.9751983880996704, + "step": 1246 + }, + { + "epoch": 0.3252265762535046, + "grad_norm": 3.5625, + "learning_rate": 3.346915368939847e-05, + "loss": 1.3669779300689697, + "step": 1247 + }, + { + "epoch": 0.3254873834517833, + "grad_norm": 4.15625, + "learning_rate": 3.3458361038800754e-05, + "loss": 1.8745354413986206, + "step": 1248 + }, + { + "epoch": 0.32574819065006194, + "grad_norm": 4.46875, + "learning_rate": 3.3447561221067964e-05, + "loss": 1.5999069213867188, + "step": 1249 + }, + { + "epoch": 0.3260089978483406, + "grad_norm": 4.34375, + "learning_rate": 3.343675424195146e-05, + "loss": 2.0362579822540283, + "step": 1250 + }, + { + "epoch": 0.3262698050466193, + "grad_norm": 4.0625, + "learning_rate": 3.342594010720639e-05, + "loss": 1.6629884243011475, + "step": 1251 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 4.3125, + "learning_rate": 3.3415118822591744e-05, + "loss": 2.0487184524536133, + "step": 1252 + }, + { + "epoch": 0.32679141944317663, + "grad_norm": 4.0, + "learning_rate": 3.34042903938703e-05, + "loss": 1.6013741493225098, + "step": 1253 + }, + { + "epoch": 0.3270522266414553, + "grad_norm": 4.0, + "learning_rate": 3.3393454826808645e-05, + "loss": 1.8632688522338867, + "step": 1254 + }, + { + "epoch": 0.327313033839734, + "grad_norm": 4.65625, + "learning_rate": 3.3382612127177166e-05, + "loss": 2.017110586166382, + "step": 1255 + }, + { + "epoch": 0.3275738410380126, + "grad_norm": 4.34375, + "learning_rate": 3.337176230075005e-05, + "loss": 1.9384896755218506, + "step": 1256 + }, + { + "epoch": 0.3278346482362913, + "grad_norm": 4.46875, + "learning_rate": 3.3360905353305284e-05, + "loss": 1.7275506258010864, + "step": 1257 + }, + { + "epoch": 0.32809545543456997, + "grad_norm": 4.15625, + "learning_rate": 3.335004129062464e-05, + "loss": 1.731511116027832, + "step": 1258 + }, + { + "epoch": 0.32835626263284867, + "grad_norm": 4.15625, + "learning_rate": 3.3339170118493674e-05, + "loss": 1.887991189956665, + "step": 1259 + }, + { + "epoch": 0.3286170698311273, + "grad_norm": 3.96875, + "learning_rate": 3.332829184270175e-05, + "loss": 1.6650071144104004, + "step": 1260 + }, + { + "epoch": 0.328877877029406, + "grad_norm": 4.09375, + "learning_rate": 3.331740646904199e-05, + "loss": 1.8969206809997559, + "step": 1261 + }, + { + "epoch": 0.32913868422768466, + "grad_norm": 4.0625, + "learning_rate": 3.3306514003311305e-05, + "loss": 1.8191999197006226, + "step": 1262 + }, + { + "epoch": 0.32939949142596336, + "grad_norm": 3.6875, + "learning_rate": 3.32956144513104e-05, + "loss": 1.7069573402404785, + "step": 1263 + }, + { + "epoch": 0.329660298624242, + "grad_norm": 3.796875, + "learning_rate": 3.328470781884372e-05, + "loss": 1.6841368675231934, + "step": 1264 + }, + { + "epoch": 0.3299211058225207, + "grad_norm": 4.5, + "learning_rate": 3.327379411171951e-05, + "loss": 1.7754472494125366, + "step": 1265 + }, + { + "epoch": 0.33018191302079936, + "grad_norm": 4.125, + "learning_rate": 3.326287333574977e-05, + "loss": 1.608618140220642, + "step": 1266 + }, + { + "epoch": 0.33044272021907806, + "grad_norm": 4.0625, + "learning_rate": 3.3251945496750253e-05, + "loss": 1.667032241821289, + "step": 1267 + }, + { + "epoch": 0.3307035274173567, + "grad_norm": 4.0625, + "learning_rate": 3.324101060054051e-05, + "loss": 1.7562623023986816, + "step": 1268 + }, + { + "epoch": 0.3309643346156354, + "grad_norm": 4.03125, + "learning_rate": 3.32300686529438e-05, + "loss": 1.6734570264816284, + "step": 1269 + }, + { + "epoch": 0.33122514181391405, + "grad_norm": 4.21875, + "learning_rate": 3.321911965978718e-05, + "loss": 1.7349474430084229, + "step": 1270 + }, + { + "epoch": 0.33148594901219275, + "grad_norm": 4.25, + "learning_rate": 3.320816362690145e-05, + "loss": 1.8015398979187012, + "step": 1271 + }, + { + "epoch": 0.3317467562104714, + "grad_norm": 4.03125, + "learning_rate": 3.319720056012113e-05, + "loss": 1.8009923696517944, + "step": 1272 + }, + { + "epoch": 0.3320075634087501, + "grad_norm": 4.21875, + "learning_rate": 3.318623046528453e-05, + "loss": 1.8440279960632324, + "step": 1273 + }, + { + "epoch": 0.33226837060702874, + "grad_norm": 3.953125, + "learning_rate": 3.3175253348233654e-05, + "loss": 1.6640136241912842, + "step": 1274 + }, + { + "epoch": 0.33252917780530744, + "grad_norm": 4.40625, + "learning_rate": 3.316426921481429e-05, + "loss": 1.7970125675201416, + "step": 1275 + }, + { + "epoch": 0.3327899850035861, + "grad_norm": 4.125, + "learning_rate": 3.315327807087595e-05, + "loss": 1.9269849061965942, + "step": 1276 + }, + { + "epoch": 0.3330507922018648, + "grad_norm": 3.984375, + "learning_rate": 3.314227992227186e-05, + "loss": 1.5076572895050049, + "step": 1277 + }, + { + "epoch": 0.33331159940014343, + "grad_norm": 4.6875, + "learning_rate": 3.3131274774859e-05, + "loss": 1.6746129989624023, + "step": 1278 + }, + { + "epoch": 0.33357240659842213, + "grad_norm": 3.890625, + "learning_rate": 3.312026263449805e-05, + "loss": 1.6816201210021973, + "step": 1279 + }, + { + "epoch": 0.3338332137967008, + "grad_norm": 4.125, + "learning_rate": 3.310924350705345e-05, + "loss": 1.7868664264678955, + "step": 1280 + }, + { + "epoch": 0.3340940209949795, + "grad_norm": 3.875, + "learning_rate": 3.309821739839333e-05, + "loss": 1.4542313814163208, + "step": 1281 + }, + { + "epoch": 0.3343548281932581, + "grad_norm": 3.875, + "learning_rate": 3.308718431438956e-05, + "loss": 1.7355276346206665, + "step": 1282 + }, + { + "epoch": 0.3346156353915368, + "grad_norm": 3.859375, + "learning_rate": 3.3076144260917705e-05, + "loss": 1.4578869342803955, + "step": 1283 + }, + { + "epoch": 0.33487644258981547, + "grad_norm": 3.875, + "learning_rate": 3.306509724385706e-05, + "loss": 1.5557750463485718, + "step": 1284 + }, + { + "epoch": 0.33513724978809417, + "grad_norm": 3.84375, + "learning_rate": 3.3054043269090614e-05, + "loss": 1.6612582206726074, + "step": 1285 + }, + { + "epoch": 0.3353980569863728, + "grad_norm": 4.0625, + "learning_rate": 3.304298234250506e-05, + "loss": 1.7065796852111816, + "step": 1286 + }, + { + "epoch": 0.3356588641846515, + "grad_norm": 4.25, + "learning_rate": 3.303191446999082e-05, + "loss": 1.929694652557373, + "step": 1287 + }, + { + "epoch": 0.33591967138293016, + "grad_norm": 3.875, + "learning_rate": 3.302083965744198e-05, + "loss": 1.6119188070297241, + "step": 1288 + }, + { + "epoch": 0.33618047858120886, + "grad_norm": 4.21875, + "learning_rate": 3.300975791075633e-05, + "loss": 1.7780215740203857, + "step": 1289 + }, + { + "epoch": 0.3364412857794875, + "grad_norm": 3.90625, + "learning_rate": 3.2998669235835374e-05, + "loss": 1.674952745437622, + "step": 1290 + }, + { + "epoch": 0.3367020929777662, + "grad_norm": 4.03125, + "learning_rate": 3.298757363858429e-05, + "loss": 1.6719112396240234, + "step": 1291 + }, + { + "epoch": 0.33696290017604486, + "grad_norm": 3.890625, + "learning_rate": 3.297647112491193e-05, + "loss": 1.6144449710845947, + "step": 1292 + }, + { + "epoch": 0.33722370737432356, + "grad_norm": 4.28125, + "learning_rate": 3.2965361700730856e-05, + "loss": 2.0725719928741455, + "step": 1293 + }, + { + "epoch": 0.3374845145726022, + "grad_norm": 4.1875, + "learning_rate": 3.2954245371957294e-05, + "loss": 1.604860782623291, + "step": 1294 + }, + { + "epoch": 0.3377453217708809, + "grad_norm": 4.3125, + "learning_rate": 3.294312214451115e-05, + "loss": 1.7692651748657227, + "step": 1295 + }, + { + "epoch": 0.33800612896915955, + "grad_norm": 3.875, + "learning_rate": 3.293199202431599e-05, + "loss": 1.7182050943374634, + "step": 1296 + }, + { + "epoch": 0.33826693616743825, + "grad_norm": 3.8125, + "learning_rate": 3.292085501729909e-05, + "loss": 1.5841974020004272, + "step": 1297 + }, + { + "epoch": 0.3385277433657169, + "grad_norm": 3.859375, + "learning_rate": 3.290971112939135e-05, + "loss": 1.6726911067962646, + "step": 1298 + }, + { + "epoch": 0.33878855056399554, + "grad_norm": 3.546875, + "learning_rate": 3.289856036652736e-05, + "loss": 1.6453895568847656, + "step": 1299 + }, + { + "epoch": 0.33904935776227424, + "grad_norm": 4.03125, + "learning_rate": 3.288740273464535e-05, + "loss": 1.8880107402801514, + "step": 1300 + }, + { + "epoch": 0.3393101649605529, + "grad_norm": 3.859375, + "learning_rate": 3.287623823968724e-05, + "loss": 1.6029824018478394, + "step": 1301 + }, + { + "epoch": 0.3395709721588316, + "grad_norm": 4.0, + "learning_rate": 3.2865066887598566e-05, + "loss": 1.5460083484649658, + "step": 1302 + }, + { + "epoch": 0.33983177935711023, + "grad_norm": 3.75, + "learning_rate": 3.285388868432856e-05, + "loss": 1.419440746307373, + "step": 1303 + }, + { + "epoch": 0.34009258655538893, + "grad_norm": 4.15625, + "learning_rate": 3.284270363583005e-05, + "loss": 1.952426791191101, + "step": 1304 + }, + { + "epoch": 0.3403533937536676, + "grad_norm": 3.9375, + "learning_rate": 3.283151174805957e-05, + "loss": 1.7862493991851807, + "step": 1305 + }, + { + "epoch": 0.3406142009519463, + "grad_norm": 4.1875, + "learning_rate": 3.282031302697724e-05, + "loss": 1.757668137550354, + "step": 1306 + }, + { + "epoch": 0.3408750081502249, + "grad_norm": 4.0, + "learning_rate": 3.280910747854685e-05, + "loss": 1.7689999341964722, + "step": 1307 + }, + { + "epoch": 0.3411358153485036, + "grad_norm": 4.0625, + "learning_rate": 3.279789510873583e-05, + "loss": 1.647916555404663, + "step": 1308 + }, + { + "epoch": 0.34139662254678227, + "grad_norm": 4.125, + "learning_rate": 3.278667592351521e-05, + "loss": 1.7402260303497314, + "step": 1309 + }, + { + "epoch": 0.34165742974506097, + "grad_norm": 4.40625, + "learning_rate": 3.277544992885969e-05, + "loss": 1.8295246362686157, + "step": 1310 + }, + { + "epoch": 0.3419182369433396, + "grad_norm": 4.21875, + "learning_rate": 3.2764217130747566e-05, + "loss": 1.785295009613037, + "step": 1311 + }, + { + "epoch": 0.3421790441416183, + "grad_norm": 3.71875, + "learning_rate": 3.275297753516078e-05, + "loss": 1.67706298828125, + "step": 1312 + }, + { + "epoch": 0.34243985133989696, + "grad_norm": 3.75, + "learning_rate": 3.274173114808487e-05, + "loss": 1.673936128616333, + "step": 1313 + }, + { + "epoch": 0.34270065853817566, + "grad_norm": 4.03125, + "learning_rate": 3.273047797550901e-05, + "loss": 1.7098627090454102, + "step": 1314 + }, + { + "epoch": 0.3429614657364543, + "grad_norm": 4.125, + "learning_rate": 3.2719218023425976e-05, + "loss": 1.8933629989624023, + "step": 1315 + }, + { + "epoch": 0.343222272934733, + "grad_norm": 4.25, + "learning_rate": 3.270795129783217e-05, + "loss": 1.7215807437896729, + "step": 1316 + }, + { + "epoch": 0.34348308013301165, + "grad_norm": 4.3125, + "learning_rate": 3.2696677804727574e-05, + "loss": 1.659914493560791, + "step": 1317 + }, + { + "epoch": 0.34374388733129035, + "grad_norm": 3.875, + "learning_rate": 3.26853975501158e-05, + "loss": 1.5705589056015015, + "step": 1318 + }, + { + "epoch": 0.344004694529569, + "grad_norm": 4.125, + "learning_rate": 3.267411054000406e-05, + "loss": 1.8723785877227783, + "step": 1319 + }, + { + "epoch": 0.3442655017278477, + "grad_norm": 4.0, + "learning_rate": 3.266281678040314e-05, + "loss": 1.6139472723007202, + "step": 1320 + }, + { + "epoch": 0.34452630892612635, + "grad_norm": 3.875, + "learning_rate": 3.265151627732744e-05, + "loss": 1.7100131511688232, + "step": 1321 + }, + { + "epoch": 0.34478711612440505, + "grad_norm": 3.890625, + "learning_rate": 3.2640209036794946e-05, + "loss": 1.6928737163543701, + "step": 1322 + }, + { + "epoch": 0.3450479233226837, + "grad_norm": 3.84375, + "learning_rate": 3.262889506482723e-05, + "loss": 1.6608216762542725, + "step": 1323 + }, + { + "epoch": 0.3453087305209624, + "grad_norm": 3.84375, + "learning_rate": 3.261757436744946e-05, + "loss": 1.524505376815796, + "step": 1324 + }, + { + "epoch": 0.34556953771924104, + "grad_norm": 3.75, + "learning_rate": 3.2606246950690365e-05, + "loss": 1.627318024635315, + "step": 1325 + }, + { + "epoch": 0.34583034491751974, + "grad_norm": 4.03125, + "learning_rate": 3.259491282058227e-05, + "loss": 1.7727982997894287, + "step": 1326 + }, + { + "epoch": 0.3460911521157984, + "grad_norm": 4.3125, + "learning_rate": 3.2583571983161074e-05, + "loss": 1.734523892402649, + "step": 1327 + }, + { + "epoch": 0.3463519593140771, + "grad_norm": 4.03125, + "learning_rate": 3.2572224444466226e-05, + "loss": 1.7290492057800293, + "step": 1328 + }, + { + "epoch": 0.34661276651235573, + "grad_norm": 4.375, + "learning_rate": 3.2560870210540786e-05, + "loss": 1.9091473817825317, + "step": 1329 + }, + { + "epoch": 0.34687357371063443, + "grad_norm": 4.125, + "learning_rate": 3.254950928743133e-05, + "loss": 1.7047741413116455, + "step": 1330 + }, + { + "epoch": 0.3471343809089131, + "grad_norm": 3.6875, + "learning_rate": 3.2538141681188035e-05, + "loss": 1.5911457538604736, + "step": 1331 + }, + { + "epoch": 0.3473951881071918, + "grad_norm": 3.90625, + "learning_rate": 3.2526767397864614e-05, + "loss": 1.6464033126831055, + "step": 1332 + }, + { + "epoch": 0.3476559953054704, + "grad_norm": 3.703125, + "learning_rate": 3.2515386443518356e-05, + "loss": 1.6222318410873413, + "step": 1333 + }, + { + "epoch": 0.3479168025037491, + "grad_norm": 3.8125, + "learning_rate": 3.250399882421007e-05, + "loss": 1.6447246074676514, + "step": 1334 + }, + { + "epoch": 0.34817760970202777, + "grad_norm": 3.6875, + "learning_rate": 3.249260454600416e-05, + "loss": 1.5842132568359375, + "step": 1335 + }, + { + "epoch": 0.34843841690030647, + "grad_norm": 4.0, + "learning_rate": 3.2481203614968544e-05, + "loss": 1.8774179220199585, + "step": 1336 + }, + { + "epoch": 0.3486992240985851, + "grad_norm": 3.875, + "learning_rate": 3.246979603717467e-05, + "loss": 1.8143694400787354, + "step": 1337 + }, + { + "epoch": 0.3489600312968638, + "grad_norm": 3.84375, + "learning_rate": 3.2458381818697576e-05, + "loss": 1.4577972888946533, + "step": 1338 + }, + { + "epoch": 0.34922083849514246, + "grad_norm": 4.125, + "learning_rate": 3.244696096561579e-05, + "loss": 1.8097434043884277, + "step": 1339 + }, + { + "epoch": 0.34948164569342116, + "grad_norm": 3.796875, + "learning_rate": 3.2435533484011385e-05, + "loss": 1.5269027948379517, + "step": 1340 + }, + { + "epoch": 0.3497424528916998, + "grad_norm": 4.0, + "learning_rate": 3.242409937996999e-05, + "loss": 1.495283603668213, + "step": 1341 + }, + { + "epoch": 0.3500032600899785, + "grad_norm": 4.09375, + "learning_rate": 3.2412658659580715e-05, + "loss": 1.5215660333633423, + "step": 1342 + }, + { + "epoch": 0.35026406728825715, + "grad_norm": 3.828125, + "learning_rate": 3.240121132893623e-05, + "loss": 1.4833729267120361, + "step": 1343 + }, + { + "epoch": 0.35052487448653585, + "grad_norm": 3.734375, + "learning_rate": 3.2389757394132706e-05, + "loss": 1.6995420455932617, + "step": 1344 + }, + { + "epoch": 0.3507856816848145, + "grad_norm": 3.765625, + "learning_rate": 3.2378296861269854e-05, + "loss": 1.4850215911865234, + "step": 1345 + }, + { + "epoch": 0.3510464888830932, + "grad_norm": 4.28125, + "learning_rate": 3.236682973645087e-05, + "loss": 1.8568706512451172, + "step": 1346 + }, + { + "epoch": 0.35130729608137184, + "grad_norm": 4.09375, + "learning_rate": 3.235535602578246e-05, + "loss": 1.7425060272216797, + "step": 1347 + }, + { + "epoch": 0.35156810327965055, + "grad_norm": 4.0625, + "learning_rate": 3.234387573537488e-05, + "loss": 1.425740361213684, + "step": 1348 + }, + { + "epoch": 0.3518289104779292, + "grad_norm": 3.609375, + "learning_rate": 3.233238887134184e-05, + "loss": 1.6624091863632202, + "step": 1349 + }, + { + "epoch": 0.35208971767620784, + "grad_norm": 4.25, + "learning_rate": 3.2320895439800584e-05, + "loss": 1.684777021408081, + "step": 1350 + }, + { + "epoch": 0.35235052487448654, + "grad_norm": 4.15625, + "learning_rate": 3.230939544687183e-05, + "loss": 1.9346016645431519, + "step": 1351 + }, + { + "epoch": 0.3526113320727652, + "grad_norm": 4.0, + "learning_rate": 3.229788889867981e-05, + "loss": 1.682251214981079, + "step": 1352 + }, + { + "epoch": 0.3528721392710439, + "grad_norm": 4.0, + "learning_rate": 3.2286375801352225e-05, + "loss": 1.6682612895965576, + "step": 1353 + }, + { + "epoch": 0.35313294646932253, + "grad_norm": 4.1875, + "learning_rate": 3.2274856161020284e-05, + "loss": 1.6654565334320068, + "step": 1354 + }, + { + "epoch": 0.35339375366760123, + "grad_norm": 3.953125, + "learning_rate": 3.226332998381867e-05, + "loss": 1.6384330987930298, + "step": 1355 + }, + { + "epoch": 0.3536545608658799, + "grad_norm": 4.15625, + "learning_rate": 3.225179727588556e-05, + "loss": 1.5750579833984375, + "step": 1356 + }, + { + "epoch": 0.3539153680641586, + "grad_norm": 3.828125, + "learning_rate": 3.2240258043362593e-05, + "loss": 1.6457281112670898, + "step": 1357 + }, + { + "epoch": 0.3541761752624372, + "grad_norm": 3.765625, + "learning_rate": 3.222871229239489e-05, + "loss": 1.5718562602996826, + "step": 1358 + }, + { + "epoch": 0.3544369824607159, + "grad_norm": 4.1875, + "learning_rate": 3.221716002913103e-05, + "loss": 1.8065204620361328, + "step": 1359 + }, + { + "epoch": 0.35469778965899457, + "grad_norm": 4.03125, + "learning_rate": 3.220560125972309e-05, + "loss": 1.7895827293395996, + "step": 1360 + }, + { + "epoch": 0.35495859685727327, + "grad_norm": 3.859375, + "learning_rate": 3.219403599032659e-05, + "loss": 1.6034718751907349, + "step": 1361 + }, + { + "epoch": 0.3552194040555519, + "grad_norm": 4.21875, + "learning_rate": 3.21824642271005e-05, + "loss": 1.5476500988006592, + "step": 1362 + }, + { + "epoch": 0.3554802112538306, + "grad_norm": 4.34375, + "learning_rate": 3.217088597620728e-05, + "loss": 1.6288042068481445, + "step": 1363 + }, + { + "epoch": 0.35574101845210926, + "grad_norm": 3.796875, + "learning_rate": 3.215930124381282e-05, + "loss": 1.642390489578247, + "step": 1364 + }, + { + "epoch": 0.35600182565038796, + "grad_norm": 3.859375, + "learning_rate": 3.2147710036086475e-05, + "loss": 1.6608468294143677, + "step": 1365 + }, + { + "epoch": 0.3562626328486666, + "grad_norm": 3.703125, + "learning_rate": 3.2136112359201043e-05, + "loss": 1.7599210739135742, + "step": 1366 + }, + { + "epoch": 0.3565234400469453, + "grad_norm": 4.09375, + "learning_rate": 3.212450821933277e-05, + "loss": 1.5129823684692383, + "step": 1367 + }, + { + "epoch": 0.35678424724522395, + "grad_norm": 4.0625, + "learning_rate": 3.211289762266132e-05, + "loss": 1.5158467292785645, + "step": 1368 + }, + { + "epoch": 0.35704505444350265, + "grad_norm": 3.609375, + "learning_rate": 3.210128057536985e-05, + "loss": 1.4805352687835693, + "step": 1369 + }, + { + "epoch": 0.3573058616417813, + "grad_norm": 3.9375, + "learning_rate": 3.20896570836449e-05, + "loss": 1.7511478662490845, + "step": 1370 + }, + { + "epoch": 0.35756666884006, + "grad_norm": 4.15625, + "learning_rate": 3.207802715367647e-05, + "loss": 1.7829283475875854, + "step": 1371 + }, + { + "epoch": 0.35782747603833864, + "grad_norm": 3.96875, + "learning_rate": 3.2066390791657966e-05, + "loss": 1.6291167736053467, + "step": 1372 + }, + { + "epoch": 0.35808828323661734, + "grad_norm": 3.984375, + "learning_rate": 3.2054748003786245e-05, + "loss": 1.9838212728500366, + "step": 1373 + }, + { + "epoch": 0.358349090434896, + "grad_norm": 4.03125, + "learning_rate": 3.2043098796261575e-05, + "loss": 1.783674955368042, + "step": 1374 + }, + { + "epoch": 0.3586098976331747, + "grad_norm": 3.734375, + "learning_rate": 3.203144317528764e-05, + "loss": 1.5914146900177002, + "step": 1375 + }, + { + "epoch": 0.35887070483145334, + "grad_norm": 4.09375, + "learning_rate": 3.2019781147071526e-05, + "loss": 1.6053392887115479, + "step": 1376 + }, + { + "epoch": 0.35913151202973204, + "grad_norm": 3.671875, + "learning_rate": 3.2008112717823765e-05, + "loss": 1.4937987327575684, + "step": 1377 + }, + { + "epoch": 0.3593923192280107, + "grad_norm": 3.890625, + "learning_rate": 3.199643789375828e-05, + "loss": 1.5063151121139526, + "step": 1378 + }, + { + "epoch": 0.3596531264262894, + "grad_norm": 4.03125, + "learning_rate": 3.198475668109239e-05, + "loss": 1.5862890481948853, + "step": 1379 + }, + { + "epoch": 0.359913933624568, + "grad_norm": 3.3125, + "learning_rate": 3.197306908604682e-05, + "loss": 1.3541672229766846, + "step": 1380 + }, + { + "epoch": 0.36017474082284673, + "grad_norm": 3.90625, + "learning_rate": 3.196137511484571e-05, + "loss": 1.6211347579956055, + "step": 1381 + }, + { + "epoch": 0.3604355480211254, + "grad_norm": 3.9375, + "learning_rate": 3.194967477371658e-05, + "loss": 1.8768589496612549, + "step": 1382 + }, + { + "epoch": 0.3606963552194041, + "grad_norm": 3.640625, + "learning_rate": 3.1937968068890346e-05, + "loss": 1.3836852312088013, + "step": 1383 + }, + { + "epoch": 0.3609571624176827, + "grad_norm": 3.734375, + "learning_rate": 3.192625500660132e-05, + "loss": 1.5915032625198364, + "step": 1384 + }, + { + "epoch": 0.3612179696159614, + "grad_norm": 3.84375, + "learning_rate": 3.191453559308718e-05, + "loss": 1.5451784133911133, + "step": 1385 + }, + { + "epoch": 0.36147877681424007, + "grad_norm": 3.921875, + "learning_rate": 3.190280983458901e-05, + "loss": 1.7580339908599854, + "step": 1386 + }, + { + "epoch": 0.36173958401251877, + "grad_norm": 4.15625, + "learning_rate": 3.189107773735126e-05, + "loss": 1.6647831201553345, + "step": 1387 + }, + { + "epoch": 0.3620003912107974, + "grad_norm": 3.921875, + "learning_rate": 3.1879339307621765e-05, + "loss": 1.7435990571975708, + "step": 1388 + }, + { + "epoch": 0.3622611984090761, + "grad_norm": 3.875, + "learning_rate": 3.1867594551651704e-05, + "loss": 1.602888822555542, + "step": 1389 + }, + { + "epoch": 0.36252200560735476, + "grad_norm": 3.53125, + "learning_rate": 3.185584347569567e-05, + "loss": 1.54853093624115, + "step": 1390 + }, + { + "epoch": 0.36278281280563346, + "grad_norm": 3.890625, + "learning_rate": 3.184408608601158e-05, + "loss": 1.764925241470337, + "step": 1391 + }, + { + "epoch": 0.3630436200039121, + "grad_norm": 4.03125, + "learning_rate": 3.183232238886075e-05, + "loss": 1.7794710397720337, + "step": 1392 + }, + { + "epoch": 0.3633044272021908, + "grad_norm": 6.5625, + "learning_rate": 3.182055239050782e-05, + "loss": 1.8926446437835693, + "step": 1393 + }, + { + "epoch": 0.36356523440046945, + "grad_norm": 3.953125, + "learning_rate": 3.18087760972208e-05, + "loss": 1.7303396463394165, + "step": 1394 + }, + { + "epoch": 0.36382604159874815, + "grad_norm": 4.09375, + "learning_rate": 3.1796993515271075e-05, + "loss": 1.884756088256836, + "step": 1395 + }, + { + "epoch": 0.3640868487970268, + "grad_norm": 4.0, + "learning_rate": 3.1785204650933334e-05, + "loss": 1.5551866292953491, + "step": 1396 + }, + { + "epoch": 0.3643476559953055, + "grad_norm": 4.03125, + "learning_rate": 3.177340951048566e-05, + "loss": 1.7906616926193237, + "step": 1397 + }, + { + "epoch": 0.36460846319358414, + "grad_norm": 3.96875, + "learning_rate": 3.176160810020943e-05, + "loss": 1.7069594860076904, + "step": 1398 + }, + { + "epoch": 0.3648692703918628, + "grad_norm": 3.78125, + "learning_rate": 3.1749800426389405e-05, + "loss": 1.9896750450134277, + "step": 1399 + }, + { + "epoch": 0.3651300775901415, + "grad_norm": 4.09375, + "learning_rate": 3.1737986495313644e-05, + "loss": 1.3037837743759155, + "step": 1400 + }, + { + "epoch": 0.36539088478842013, + "grad_norm": 3.96875, + "learning_rate": 3.1726166313273565e-05, + "loss": 1.5864887237548828, + "step": 1401 + }, + { + "epoch": 0.36565169198669883, + "grad_norm": 4.03125, + "learning_rate": 3.1714339886563896e-05, + "loss": 1.5846668481826782, + "step": 1402 + }, + { + "epoch": 0.3659124991849775, + "grad_norm": 3.609375, + "learning_rate": 3.170250722148271e-05, + "loss": 1.5023304224014282, + "step": 1403 + }, + { + "epoch": 0.3661733063832562, + "grad_norm": 4.0, + "learning_rate": 3.169066832433139e-05, + "loss": 1.797839879989624, + "step": 1404 + }, + { + "epoch": 0.3664341135815348, + "grad_norm": 4.0625, + "learning_rate": 3.167882320141463e-05, + "loss": 1.8674392700195312, + "step": 1405 + }, + { + "epoch": 0.3666949207798135, + "grad_norm": 3.6875, + "learning_rate": 3.166697185904046e-05, + "loss": 1.5936272144317627, + "step": 1406 + }, + { + "epoch": 0.36695572797809217, + "grad_norm": 4.21875, + "learning_rate": 3.1655114303520216e-05, + "loss": 1.7863364219665527, + "step": 1407 + }, + { + "epoch": 0.3672165351763709, + "grad_norm": 3.8125, + "learning_rate": 3.1643250541168515e-05, + "loss": 1.576570987701416, + "step": 1408 + }, + { + "epoch": 0.3674773423746495, + "grad_norm": 4.125, + "learning_rate": 3.163138057830332e-05, + "loss": 1.69913649559021, + "step": 1409 + }, + { + "epoch": 0.3677381495729282, + "grad_norm": 3.703125, + "learning_rate": 3.161950442124587e-05, + "loss": 1.4786731004714966, + "step": 1410 + }, + { + "epoch": 0.36799895677120686, + "grad_norm": 3.828125, + "learning_rate": 3.160762207632071e-05, + "loss": 1.6378079652786255, + "step": 1411 + }, + { + "epoch": 0.36825976396948557, + "grad_norm": 3.5, + "learning_rate": 3.1595733549855697e-05, + "loss": 1.553241491317749, + "step": 1412 + }, + { + "epoch": 0.3685205711677642, + "grad_norm": 3.75, + "learning_rate": 3.158383884818195e-05, + "loss": 1.5734972953796387, + "step": 1413 + }, + { + "epoch": 0.3687813783660429, + "grad_norm": 4.0625, + "learning_rate": 3.1571937977633884e-05, + "loss": 1.666394591331482, + "step": 1414 + }, + { + "epoch": 0.36904218556432156, + "grad_norm": 3.84375, + "learning_rate": 3.1560030944549226e-05, + "loss": 1.7138550281524658, + "step": 1415 + }, + { + "epoch": 0.36930299276260026, + "grad_norm": 3.8125, + "learning_rate": 3.1548117755268945e-05, + "loss": 1.4449464082717896, + "step": 1416 + }, + { + "epoch": 0.3695637999608789, + "grad_norm": 4.0625, + "learning_rate": 3.1536198416137325e-05, + "loss": 1.5007790327072144, + "step": 1417 + }, + { + "epoch": 0.3698246071591576, + "grad_norm": 4.03125, + "learning_rate": 3.1524272933501895e-05, + "loss": 1.4527007341384888, + "step": 1418 + }, + { + "epoch": 0.37008541435743625, + "grad_norm": 3.5, + "learning_rate": 3.151234131371348e-05, + "loss": 1.612570881843567, + "step": 1419 + }, + { + "epoch": 0.37034622155571495, + "grad_norm": 4.09375, + "learning_rate": 3.150040356312614e-05, + "loss": 1.774835467338562, + "step": 1420 + }, + { + "epoch": 0.3706070287539936, + "grad_norm": 3.9375, + "learning_rate": 3.148845968809725e-05, + "loss": 1.7513749599456787, + "step": 1421 + }, + { + "epoch": 0.3708678359522723, + "grad_norm": 3.9375, + "learning_rate": 3.147650969498741e-05, + "loss": 1.6648533344268799, + "step": 1422 + }, + { + "epoch": 0.37112864315055094, + "grad_norm": 3.890625, + "learning_rate": 3.146455359016048e-05, + "loss": 1.6554616689682007, + "step": 1423 + }, + { + "epoch": 0.37138945034882964, + "grad_norm": 4.15625, + "learning_rate": 3.1452591379983574e-05, + "loss": 1.7418549060821533, + "step": 1424 + }, + { + "epoch": 0.3716502575471083, + "grad_norm": 4.28125, + "learning_rate": 3.144062307082709e-05, + "loss": 1.7103866338729858, + "step": 1425 + }, + { + "epoch": 0.371911064745387, + "grad_norm": 3.515625, + "learning_rate": 3.142864866906462e-05, + "loss": 1.495460867881775, + "step": 1426 + }, + { + "epoch": 0.37217187194366563, + "grad_norm": 4.53125, + "learning_rate": 3.141666818107306e-05, + "loss": 1.5710725784301758, + "step": 1427 + }, + { + "epoch": 0.37243267914194433, + "grad_norm": 4.03125, + "learning_rate": 3.1404681613232476e-05, + "loss": 1.6216447353363037, + "step": 1428 + }, + { + "epoch": 0.372693486340223, + "grad_norm": 4.0, + "learning_rate": 3.139268897192625e-05, + "loss": 1.8143309354782104, + "step": 1429 + }, + { + "epoch": 0.3729542935385017, + "grad_norm": 3.84375, + "learning_rate": 3.138069026354095e-05, + "loss": 1.5283796787261963, + "step": 1430 + }, + { + "epoch": 0.3732151007367803, + "grad_norm": 4.28125, + "learning_rate": 3.1368685494466375e-05, + "loss": 1.5353344678878784, + "step": 1431 + }, + { + "epoch": 0.373475907935059, + "grad_norm": 4.1875, + "learning_rate": 3.1356674671095564e-05, + "loss": 1.8828372955322266, + "step": 1432 + }, + { + "epoch": 0.37373671513333767, + "grad_norm": 4.0, + "learning_rate": 3.134465779982479e-05, + "loss": 1.6504466533660889, + "step": 1433 + }, + { + "epoch": 0.37399752233161637, + "grad_norm": 4.0625, + "learning_rate": 3.133263488705353e-05, + "loss": 1.8509012460708618, + "step": 1434 + }, + { + "epoch": 0.374258329529895, + "grad_norm": 3.75, + "learning_rate": 3.132060593918448e-05, + "loss": 1.6439563035964966, + "step": 1435 + }, + { + "epoch": 0.3745191367281737, + "grad_norm": 4.125, + "learning_rate": 3.1308570962623554e-05, + "loss": 1.6253024339675903, + "step": 1436 + }, + { + "epoch": 0.37477994392645236, + "grad_norm": 4.28125, + "learning_rate": 3.129652996377987e-05, + "loss": 1.900834321975708, + "step": 1437 + }, + { + "epoch": 0.37504075112473106, + "grad_norm": 3.75, + "learning_rate": 3.1284482949065776e-05, + "loss": 1.5885049104690552, + "step": 1438 + }, + { + "epoch": 0.3753015583230097, + "grad_norm": 4.375, + "learning_rate": 3.127242992489679e-05, + "loss": 1.9497604370117188, + "step": 1439 + }, + { + "epoch": 0.3755623655212884, + "grad_norm": 3.734375, + "learning_rate": 3.126037089769165e-05, + "loss": 1.7633403539657593, + "step": 1440 + }, + { + "epoch": 0.37582317271956706, + "grad_norm": 3.625, + "learning_rate": 3.12483058738723e-05, + "loss": 1.619724988937378, + "step": 1441 + }, + { + "epoch": 0.37608397991784576, + "grad_norm": 3.609375, + "learning_rate": 3.123623485986385e-05, + "loss": 1.611403465270996, + "step": 1442 + }, + { + "epoch": 0.3763447871161244, + "grad_norm": 3.921875, + "learning_rate": 3.1224157862094624e-05, + "loss": 1.6440799236297607, + "step": 1443 + }, + { + "epoch": 0.3766055943144031, + "grad_norm": 3.734375, + "learning_rate": 3.121207488699612e-05, + "loss": 1.7324094772338867, + "step": 1444 + }, + { + "epoch": 0.37686640151268175, + "grad_norm": 3.8125, + "learning_rate": 3.1199985941003025e-05, + "loss": 1.7377610206604004, + "step": 1445 + }, + { + "epoch": 0.37712720871096045, + "grad_norm": 3.734375, + "learning_rate": 3.11878910305532e-05, + "loss": 1.4954262971878052, + "step": 1446 + }, + { + "epoch": 0.3773880159092391, + "grad_norm": 3.71875, + "learning_rate": 3.117579016208769e-05, + "loss": 1.6021431684494019, + "step": 1447 + }, + { + "epoch": 0.3776488231075178, + "grad_norm": 3.828125, + "learning_rate": 3.1163683342050716e-05, + "loss": 1.5622243881225586, + "step": 1448 + }, + { + "epoch": 0.37790963030579644, + "grad_norm": 3.84375, + "learning_rate": 3.115157057688964e-05, + "loss": 1.568746566772461, + "step": 1449 + }, + { + "epoch": 0.3781704375040751, + "grad_norm": 3.78125, + "learning_rate": 3.113945187305504e-05, + "loss": 1.6814937591552734, + "step": 1450 + }, + { + "epoch": 0.3784312447023538, + "grad_norm": 3.984375, + "learning_rate": 3.11273272370006e-05, + "loss": 1.6775844097137451, + "step": 1451 + }, + { + "epoch": 0.37869205190063243, + "grad_norm": 3.953125, + "learning_rate": 3.1115196675183216e-05, + "loss": 1.4709616899490356, + "step": 1452 + }, + { + "epoch": 0.37895285909891113, + "grad_norm": 3.84375, + "learning_rate": 3.11030601940629e-05, + "loss": 1.746448040008545, + "step": 1453 + }, + { + "epoch": 0.3792136662971898, + "grad_norm": 3.75, + "learning_rate": 3.109091780010283e-05, + "loss": 1.7106285095214844, + "step": 1454 + }, + { + "epoch": 0.3794744734954685, + "grad_norm": 3.859375, + "learning_rate": 3.107876949976934e-05, + "loss": 1.6017910242080688, + "step": 1455 + }, + { + "epoch": 0.3797352806937471, + "grad_norm": 3.953125, + "learning_rate": 3.106661529953191e-05, + "loss": 1.951802134513855, + "step": 1456 + }, + { + "epoch": 0.3799960878920258, + "grad_norm": 3.65625, + "learning_rate": 3.105445520586314e-05, + "loss": 1.2404531240463257, + "step": 1457 + }, + { + "epoch": 0.38025689509030447, + "grad_norm": 3.5625, + "learning_rate": 3.1042289225238796e-05, + "loss": 1.6789586544036865, + "step": 1458 + }, + { + "epoch": 0.38051770228858317, + "grad_norm": 3.765625, + "learning_rate": 3.103011736413776e-05, + "loss": 1.8510938882827759, + "step": 1459 + }, + { + "epoch": 0.3807785094868618, + "grad_norm": 4.03125, + "learning_rate": 3.101793962904205e-05, + "loss": 1.9462577104568481, + "step": 1460 + }, + { + "epoch": 0.3810393166851405, + "grad_norm": 3.8125, + "learning_rate": 3.100575602643683e-05, + "loss": 1.7952697277069092, + "step": 1461 + }, + { + "epoch": 0.38130012388341916, + "grad_norm": 3.609375, + "learning_rate": 3.099356656281035e-05, + "loss": 1.5525479316711426, + "step": 1462 + }, + { + "epoch": 0.38156093108169786, + "grad_norm": 3.734375, + "learning_rate": 3.098137124465403e-05, + "loss": 1.5512548685073853, + "step": 1463 + }, + { + "epoch": 0.3818217382799765, + "grad_norm": 3.84375, + "learning_rate": 3.096917007846237e-05, + "loss": 1.47243070602417, + "step": 1464 + }, + { + "epoch": 0.3820825454782552, + "grad_norm": 7.34375, + "learning_rate": 3.095696307073299e-05, + "loss": 2.14477801322937, + "step": 1465 + }, + { + "epoch": 0.38234335267653385, + "grad_norm": 4.0625, + "learning_rate": 3.094475022796664e-05, + "loss": 1.8921345472335815, + "step": 1466 + }, + { + "epoch": 0.38260415987481255, + "grad_norm": 3.953125, + "learning_rate": 3.093253155666715e-05, + "loss": 1.5091718435287476, + "step": 1467 + }, + { + "epoch": 0.3828649670730912, + "grad_norm": 3.71875, + "learning_rate": 3.0920307063341485e-05, + "loss": 1.737138032913208, + "step": 1468 + }, + { + "epoch": 0.3831257742713699, + "grad_norm": 3.390625, + "learning_rate": 3.090807675449969e-05, + "loss": 1.533186435699463, + "step": 1469 + }, + { + "epoch": 0.38338658146964855, + "grad_norm": 4.53125, + "learning_rate": 3.0895840636654906e-05, + "loss": 1.9540660381317139, + "step": 1470 + }, + { + "epoch": 0.38364738866792725, + "grad_norm": 3.921875, + "learning_rate": 3.088359871632337e-05, + "loss": 1.6391453742980957, + "step": 1471 + }, + { + "epoch": 0.3839081958662059, + "grad_norm": 3.65625, + "learning_rate": 3.0871351000024425e-05, + "loss": 1.4946112632751465, + "step": 1472 + }, + { + "epoch": 0.3841690030644846, + "grad_norm": 3.53125, + "learning_rate": 3.085909749428048e-05, + "loss": 1.6111618280410767, + "step": 1473 + }, + { + "epoch": 0.38442981026276324, + "grad_norm": 4.15625, + "learning_rate": 3.084683820561703e-05, + "loss": 1.9099327325820923, + "step": 1474 + }, + { + "epoch": 0.38469061746104194, + "grad_norm": 4.125, + "learning_rate": 3.083457314056267e-05, + "loss": 1.5812824964523315, + "step": 1475 + }, + { + "epoch": 0.3849514246593206, + "grad_norm": 4.0625, + "learning_rate": 3.082230230564904e-05, + "loss": 1.604433536529541, + "step": 1476 + }, + { + "epoch": 0.3852122318575993, + "grad_norm": 4.03125, + "learning_rate": 3.081002570741086e-05, + "loss": 1.717907428741455, + "step": 1477 + }, + { + "epoch": 0.38547303905587793, + "grad_norm": 4.125, + "learning_rate": 3.0797743352385956e-05, + "loss": 1.7131648063659668, + "step": 1478 + }, + { + "epoch": 0.38573384625415663, + "grad_norm": 3.8125, + "learning_rate": 3.078545524711517e-05, + "loss": 1.4925047159194946, + "step": 1479 + }, + { + "epoch": 0.3859946534524353, + "grad_norm": 3.9375, + "learning_rate": 3.0773161398142435e-05, + "loss": 1.5858057737350464, + "step": 1480 + }, + { + "epoch": 0.386255460650714, + "grad_norm": 3.875, + "learning_rate": 3.076086181201474e-05, + "loss": 1.7264738082885742, + "step": 1481 + }, + { + "epoch": 0.3865162678489926, + "grad_norm": 3.84375, + "learning_rate": 3.0748556495282104e-05, + "loss": 1.7423782348632812, + "step": 1482 + }, + { + "epoch": 0.3867770750472713, + "grad_norm": 3.78125, + "learning_rate": 3.0736245454497634e-05, + "loss": 1.4618861675262451, + "step": 1483 + }, + { + "epoch": 0.38703788224554997, + "grad_norm": 3.734375, + "learning_rate": 3.072392869621747e-05, + "loss": 1.5003267526626587, + "step": 1484 + }, + { + "epoch": 0.38729868944382867, + "grad_norm": 3.953125, + "learning_rate": 3.0711606227000794e-05, + "loss": 1.821825623512268, + "step": 1485 + }, + { + "epoch": 0.3875594966421073, + "grad_norm": 4.0, + "learning_rate": 3.069927805340983e-05, + "loss": 1.6688201427459717, + "step": 1486 + }, + { + "epoch": 0.387820303840386, + "grad_norm": 4.09375, + "learning_rate": 3.068694418200985e-05, + "loss": 1.8684214353561401, + "step": 1487 + }, + { + "epoch": 0.38808111103866466, + "grad_norm": 3.921875, + "learning_rate": 3.0674604619369136e-05, + "loss": 1.8903346061706543, + "step": 1488 + }, + { + "epoch": 0.38834191823694336, + "grad_norm": 3.875, + "learning_rate": 3.0662259372059026e-05, + "loss": 1.7969683408737183, + "step": 1489 + }, + { + "epoch": 0.388602725435222, + "grad_norm": 3.90625, + "learning_rate": 3.064990844665388e-05, + "loss": 1.8461135625839233, + "step": 1490 + }, + { + "epoch": 0.3888635326335007, + "grad_norm": 3.78125, + "learning_rate": 3.063755184973107e-05, + "loss": 1.934802532196045, + "step": 1491 + }, + { + "epoch": 0.38912433983177935, + "grad_norm": 3.859375, + "learning_rate": 3.062518958787099e-05, + "loss": 1.5508052110671997, + "step": 1492 + }, + { + "epoch": 0.38938514703005805, + "grad_norm": 3.8125, + "learning_rate": 3.061282166765707e-05, + "loss": 1.5577564239501953, + "step": 1493 + }, + { + "epoch": 0.3896459542283367, + "grad_norm": 3.890625, + "learning_rate": 3.0600448095675736e-05, + "loss": 1.7710663080215454, + "step": 1494 + }, + { + "epoch": 0.3899067614266154, + "grad_norm": 3.71875, + "learning_rate": 3.0588068878516435e-05, + "loss": 1.5946844816207886, + "step": 1495 + }, + { + "epoch": 0.39016756862489405, + "grad_norm": 3.890625, + "learning_rate": 3.0575684022771595e-05, + "loss": 1.5420547723770142, + "step": 1496 + }, + { + "epoch": 0.39042837582317275, + "grad_norm": 3.890625, + "learning_rate": 3.0563293535036676e-05, + "loss": 1.6369975805282593, + "step": 1497 + }, + { + "epoch": 0.3906891830214514, + "grad_norm": 3.71875, + "learning_rate": 3.055089742191013e-05, + "loss": 1.6601423025131226, + "step": 1498 + }, + { + "epoch": 0.39094999021973004, + "grad_norm": 4.21875, + "learning_rate": 3.05384956899934e-05, + "loss": 1.8204602003097534, + "step": 1499 + }, + { + "epoch": 0.39121079741800874, + "grad_norm": 4.21875, + "learning_rate": 3.052608834589091e-05, + "loss": 1.9024076461791992, + "step": 1500 + }, + { + "epoch": 0.3914716046162874, + "grad_norm": 4.03125, + "learning_rate": 3.0513675396210094e-05, + "loss": 1.7785924673080444, + "step": 1501 + }, + { + "epoch": 0.3917324118145661, + "grad_norm": 4.4375, + "learning_rate": 3.050125684756137e-05, + "loss": 1.9502075910568237, + "step": 1502 + }, + { + "epoch": 0.39199321901284473, + "grad_norm": 3.90625, + "learning_rate": 3.048883270655812e-05, + "loss": 1.6539990901947021, + "step": 1503 + }, + { + "epoch": 0.39225402621112343, + "grad_norm": 3.921875, + "learning_rate": 3.047640297981671e-05, + "loss": 1.8033804893493652, + "step": 1504 + }, + { + "epoch": 0.3925148334094021, + "grad_norm": 4.09375, + "learning_rate": 3.04639676739565e-05, + "loss": 1.6304492950439453, + "step": 1505 + }, + { + "epoch": 0.3927756406076808, + "grad_norm": 3.734375, + "learning_rate": 3.045152679559979e-05, + "loss": 1.5990183353424072, + "step": 1506 + }, + { + "epoch": 0.3930364478059594, + "grad_norm": 3.640625, + "learning_rate": 3.0439080351371875e-05, + "loss": 1.6147968769073486, + "step": 1507 + }, + { + "epoch": 0.3932972550042381, + "grad_norm": 3.421875, + "learning_rate": 3.0426628347900996e-05, + "loss": 1.553178071975708, + "step": 1508 + }, + { + "epoch": 0.39355806220251677, + "grad_norm": 3.734375, + "learning_rate": 3.0414170791818355e-05, + "loss": 1.5697667598724365, + "step": 1509 + }, + { + "epoch": 0.39381886940079547, + "grad_norm": 3.265625, + "learning_rate": 3.0401707689758133e-05, + "loss": 1.266568899154663, + "step": 1510 + }, + { + "epoch": 0.3940796765990741, + "grad_norm": 3.578125, + "learning_rate": 3.0389239048357437e-05, + "loss": 1.5824153423309326, + "step": 1511 + }, + { + "epoch": 0.3943404837973528, + "grad_norm": 4.1875, + "learning_rate": 3.0376764874256337e-05, + "loss": 1.812801718711853, + "step": 1512 + }, + { + "epoch": 0.39460129099563146, + "grad_norm": 3.609375, + "learning_rate": 3.036428517409785e-05, + "loss": 1.650933027267456, + "step": 1513 + }, + { + "epoch": 0.39486209819391016, + "grad_norm": 3.515625, + "learning_rate": 3.0351799954527927e-05, + "loss": 1.4789142608642578, + "step": 1514 + }, + { + "epoch": 0.3951229053921888, + "grad_norm": 3.765625, + "learning_rate": 3.033930922219548e-05, + "loss": 1.656179428100586, + "step": 1515 + }, + { + "epoch": 0.3953837125904675, + "grad_norm": 4.40625, + "learning_rate": 3.032681298375233e-05, + "loss": 1.6824867725372314, + "step": 1516 + }, + { + "epoch": 0.39564451978874615, + "grad_norm": 4.0625, + "learning_rate": 3.031431124585324e-05, + "loss": 1.7417092323303223, + "step": 1517 + }, + { + "epoch": 0.39590532698702485, + "grad_norm": 3.796875, + "learning_rate": 3.0301804015155906e-05, + "loss": 1.629252314567566, + "step": 1518 + }, + { + "epoch": 0.3961661341853035, + "grad_norm": 3.5625, + "learning_rate": 3.0289291298320952e-05, + "loss": 1.5575857162475586, + "step": 1519 + }, + { + "epoch": 0.3964269413835822, + "grad_norm": 3.953125, + "learning_rate": 3.027677310201192e-05, + "loss": 1.5468329191207886, + "step": 1520 + }, + { + "epoch": 0.39668774858186084, + "grad_norm": 3.6875, + "learning_rate": 3.0264249432895254e-05, + "loss": 1.6860032081604004, + "step": 1521 + }, + { + "epoch": 0.39694855578013954, + "grad_norm": 3.78125, + "learning_rate": 3.0251720297640336e-05, + "loss": 1.6084399223327637, + "step": 1522 + }, + { + "epoch": 0.3972093629784182, + "grad_norm": 4.46875, + "learning_rate": 3.0239185702919452e-05, + "loss": 1.4937280416488647, + "step": 1523 + }, + { + "epoch": 0.3974701701766969, + "grad_norm": 3.625, + "learning_rate": 3.0226645655407795e-05, + "loss": 1.6108170747756958, + "step": 1524 + }, + { + "epoch": 0.39773097737497554, + "grad_norm": 4.34375, + "learning_rate": 3.0214100161783445e-05, + "loss": 1.752903699874878, + "step": 1525 + }, + { + "epoch": 0.39799178457325424, + "grad_norm": 3.78125, + "learning_rate": 3.0201549228727417e-05, + "loss": 1.7260315418243408, + "step": 1526 + }, + { + "epoch": 0.3982525917715329, + "grad_norm": 3.6875, + "learning_rate": 3.018899286292359e-05, + "loss": 1.5754700899124146, + "step": 1527 + }, + { + "epoch": 0.3985133989698116, + "grad_norm": 4.3125, + "learning_rate": 3.017643107105876e-05, + "loss": 1.630751609802246, + "step": 1528 + }, + { + "epoch": 0.39877420616809023, + "grad_norm": 3.890625, + "learning_rate": 3.0163863859822596e-05, + "loss": 1.6684372425079346, + "step": 1529 + }, + { + "epoch": 0.39903501336636893, + "grad_norm": 3.984375, + "learning_rate": 3.0151291235907643e-05, + "loss": 1.8939087390899658, + "step": 1530 + }, + { + "epoch": 0.3992958205646476, + "grad_norm": 3.828125, + "learning_rate": 3.0138713206009376e-05, + "loss": 1.4833927154541016, + "step": 1531 + }, + { + "epoch": 0.3995566277629263, + "grad_norm": 3.96875, + "learning_rate": 3.0126129776826095e-05, + "loss": 1.6794382333755493, + "step": 1532 + }, + { + "epoch": 0.3998174349612049, + "grad_norm": 3.6875, + "learning_rate": 3.0113540955059e-05, + "loss": 1.7359819412231445, + "step": 1533 + }, + { + "epoch": 0.4000782421594836, + "grad_norm": 3.78125, + "learning_rate": 3.0100946747412173e-05, + "loss": 1.6286976337432861, + "step": 1534 + }, + { + "epoch": 0.40033904935776227, + "grad_norm": 3.796875, + "learning_rate": 3.0088347160592534e-05, + "loss": 1.8103272914886475, + "step": 1535 + }, + { + "epoch": 0.40059985655604097, + "grad_norm": 4.09375, + "learning_rate": 3.0075742201309898e-05, + "loss": 1.7070512771606445, + "step": 1536 + }, + { + "epoch": 0.4008606637543196, + "grad_norm": 3.3125, + "learning_rate": 3.0063131876276917e-05, + "loss": 1.4519729614257812, + "step": 1537 + }, + { + "epoch": 0.4011214709525983, + "grad_norm": 3.828125, + "learning_rate": 3.0050516192209126e-05, + "loss": 1.6558887958526611, + "step": 1538 + }, + { + "epoch": 0.40138227815087696, + "grad_norm": 3.8125, + "learning_rate": 3.003789515582489e-05, + "loss": 1.4730424880981445, + "step": 1539 + }, + { + "epoch": 0.40164308534915566, + "grad_norm": 3.875, + "learning_rate": 3.0025268773845436e-05, + "loss": 1.8217370510101318, + "step": 1540 + }, + { + "epoch": 0.4019038925474343, + "grad_norm": 3.9375, + "learning_rate": 3.001263705299484e-05, + "loss": 1.6720647811889648, + "step": 1541 + }, + { + "epoch": 0.402164699745713, + "grad_norm": 3.796875, + "learning_rate": 3.0000000000000004e-05, + "loss": 1.663779377937317, + "step": 1542 + }, + { + "epoch": 0.40242550694399165, + "grad_norm": 3.6875, + "learning_rate": 2.9987357621590693e-05, + "loss": 1.4999834299087524, + "step": 1543 + }, + { + "epoch": 0.40268631414227035, + "grad_norm": 3.71875, + "learning_rate": 2.9974709924499498e-05, + "loss": 1.628490924835205, + "step": 1544 + }, + { + "epoch": 0.402947121340549, + "grad_norm": 4.125, + "learning_rate": 2.9962056915461844e-05, + "loss": 1.6849451065063477, + "step": 1545 + }, + { + "epoch": 0.4032079285388277, + "grad_norm": 4.0625, + "learning_rate": 2.994939860121598e-05, + "loss": 1.4971082210540771, + "step": 1546 + }, + { + "epoch": 0.40346873573710634, + "grad_norm": 4.0625, + "learning_rate": 2.993673498850297e-05, + "loss": 1.6485989093780518, + "step": 1547 + }, + { + "epoch": 0.40372954293538504, + "grad_norm": 3.875, + "learning_rate": 2.9924066084066737e-05, + "loss": 1.4791631698608398, + "step": 1548 + }, + { + "epoch": 0.4039903501336637, + "grad_norm": 3.921875, + "learning_rate": 2.9911391894653975e-05, + "loss": 1.6571983098983765, + "step": 1549 + }, + { + "epoch": 0.40425115733194233, + "grad_norm": 3.703125, + "learning_rate": 2.9898712427014227e-05, + "loss": 1.6108894348144531, + "step": 1550 + }, + { + "epoch": 0.40451196453022104, + "grad_norm": 3.546875, + "learning_rate": 2.9886027687899843e-05, + "loss": 1.719677209854126, + "step": 1551 + }, + { + "epoch": 0.4047727717284997, + "grad_norm": 3.890625, + "learning_rate": 2.9873337684065945e-05, + "loss": 1.770775318145752, + "step": 1552 + }, + { + "epoch": 0.4050335789267784, + "grad_norm": 3.796875, + "learning_rate": 2.9860642422270517e-05, + "loss": 1.6228448152542114, + "step": 1553 + }, + { + "epoch": 0.405294386125057, + "grad_norm": 3.65625, + "learning_rate": 2.9847941909274295e-05, + "loss": 1.7396163940429688, + "step": 1554 + }, + { + "epoch": 0.4055551933233357, + "grad_norm": 3.375, + "learning_rate": 2.983523615184083e-05, + "loss": 1.175154685974121, + "step": 1555 + }, + { + "epoch": 0.4058160005216144, + "grad_norm": 4.09375, + "learning_rate": 2.9822525156736467e-05, + "loss": 1.6514532566070557, + "step": 1556 + }, + { + "epoch": 0.4060768077198931, + "grad_norm": 3.71875, + "learning_rate": 2.980980893073034e-05, + "loss": 1.2293200492858887, + "step": 1557 + }, + { + "epoch": 0.4063376149181717, + "grad_norm": 3.6875, + "learning_rate": 2.9797087480594366e-05, + "loss": 1.437143087387085, + "step": 1558 + }, + { + "epoch": 0.4065984221164504, + "grad_norm": 3.515625, + "learning_rate": 2.9784360813103236e-05, + "loss": 1.4445056915283203, + "step": 1559 + }, + { + "epoch": 0.40685922931472907, + "grad_norm": 3.734375, + "learning_rate": 2.9771628935034434e-05, + "loss": 1.692763328552246, + "step": 1560 + }, + { + "epoch": 0.40712003651300777, + "grad_norm": 3.828125, + "learning_rate": 2.9758891853168213e-05, + "loss": 1.7990310192108154, + "step": 1561 + }, + { + "epoch": 0.4073808437112864, + "grad_norm": 3.71875, + "learning_rate": 2.97461495742876e-05, + "loss": 1.3989152908325195, + "step": 1562 + }, + { + "epoch": 0.4076416509095651, + "grad_norm": 3.796875, + "learning_rate": 2.973340210517839e-05, + "loss": 1.4927902221679688, + "step": 1563 + }, + { + "epoch": 0.40790245810784376, + "grad_norm": 3.5625, + "learning_rate": 2.9720649452629123e-05, + "loss": 1.634340524673462, + "step": 1564 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 3.90625, + "learning_rate": 2.9707891623431126e-05, + "loss": 1.6583830118179321, + "step": 1565 + }, + { + "epoch": 0.4084240725044011, + "grad_norm": 3.71875, + "learning_rate": 2.9695128624378468e-05, + "loss": 1.4272788763046265, + "step": 1566 + }, + { + "epoch": 0.4086848797026798, + "grad_norm": 3.765625, + "learning_rate": 2.968236046226798e-05, + "loss": 1.62437105178833, + "step": 1567 + }, + { + "epoch": 0.40894568690095845, + "grad_norm": 3.484375, + "learning_rate": 2.9669587143899235e-05, + "loss": 1.4198951721191406, + "step": 1568 + }, + { + "epoch": 0.40920649409923715, + "grad_norm": 3.6875, + "learning_rate": 2.965680867607455e-05, + "loss": 1.5796867609024048, + "step": 1569 + }, + { + "epoch": 0.4094673012975158, + "grad_norm": 3.3125, + "learning_rate": 2.9644025065599e-05, + "loss": 1.423644781112671, + "step": 1570 + }, + { + "epoch": 0.4097281084957945, + "grad_norm": 3.90625, + "learning_rate": 2.9631236319280376e-05, + "loss": 1.70688796043396, + "step": 1571 + }, + { + "epoch": 0.40998891569407314, + "grad_norm": 3.875, + "learning_rate": 2.9618442443929218e-05, + "loss": 1.974204182624817, + "step": 1572 + }, + { + "epoch": 0.41024972289235184, + "grad_norm": 3.609375, + "learning_rate": 2.9605643446358798e-05, + "loss": 1.5921353101730347, + "step": 1573 + }, + { + "epoch": 0.4105105300906305, + "grad_norm": 4.03125, + "learning_rate": 2.959283933338511e-05, + "loss": 1.6732794046401978, + "step": 1574 + }, + { + "epoch": 0.4107713372889092, + "grad_norm": 3.640625, + "learning_rate": 2.9580030111826882e-05, + "loss": 1.6262917518615723, + "step": 1575 + }, + { + "epoch": 0.41103214448718783, + "grad_norm": 3.828125, + "learning_rate": 2.9567215788505544e-05, + "loss": 1.9774980545043945, + "step": 1576 + }, + { + "epoch": 0.41129295168546653, + "grad_norm": 3.671875, + "learning_rate": 2.955439637024526e-05, + "loss": 1.4540205001831055, + "step": 1577 + }, + { + "epoch": 0.4115537588837452, + "grad_norm": 3.65625, + "learning_rate": 2.9541571863872903e-05, + "loss": 1.752079725265503, + "step": 1578 + }, + { + "epoch": 0.4118145660820239, + "grad_norm": 3.75, + "learning_rate": 2.9528742276218053e-05, + "loss": 1.6492635011672974, + "step": 1579 + }, + { + "epoch": 0.4120753732803025, + "grad_norm": 3.625, + "learning_rate": 2.9515907614113e-05, + "loss": 1.4227832555770874, + "step": 1580 + }, + { + "epoch": 0.4123361804785812, + "grad_norm": 3.59375, + "learning_rate": 2.9503067884392726e-05, + "loss": 1.2912888526916504, + "step": 1581 + }, + { + "epoch": 0.41259698767685987, + "grad_norm": 4.21875, + "learning_rate": 2.9490223093894924e-05, + "loss": 1.5593420267105103, + "step": 1582 + }, + { + "epoch": 0.4128577948751386, + "grad_norm": 3.9375, + "learning_rate": 2.9477373249459974e-05, + "loss": 1.7273547649383545, + "step": 1583 + }, + { + "epoch": 0.4131186020734172, + "grad_norm": 3.484375, + "learning_rate": 2.946451835793096e-05, + "loss": 1.5384719371795654, + "step": 1584 + }, + { + "epoch": 0.4133794092716959, + "grad_norm": 3.875, + "learning_rate": 2.945165842615364e-05, + "loss": 1.8384695053100586, + "step": 1585 + }, + { + "epoch": 0.41364021646997456, + "grad_norm": 3.84375, + "learning_rate": 2.943879346097645e-05, + "loss": 1.7200520038604736, + "step": 1586 + }, + { + "epoch": 0.41390102366825327, + "grad_norm": 3.984375, + "learning_rate": 2.942592346925053e-05, + "loss": 1.5968172550201416, + "step": 1587 + }, + { + "epoch": 0.4141618308665319, + "grad_norm": 3.65625, + "learning_rate": 2.941304845782968e-05, + "loss": 1.5619614124298096, + "step": 1588 + }, + { + "epoch": 0.4144226380648106, + "grad_norm": 3.828125, + "learning_rate": 2.9400168433570378e-05, + "loss": 1.551422119140625, + "step": 1589 + }, + { + "epoch": 0.41468344526308926, + "grad_norm": 3.8125, + "learning_rate": 2.938728340333177e-05, + "loss": 1.6820770502090454, + "step": 1590 + }, + { + "epoch": 0.41494425246136796, + "grad_norm": 3.78125, + "learning_rate": 2.9374393373975663e-05, + "loss": 1.576513648033142, + "step": 1591 + }, + { + "epoch": 0.4152050596596466, + "grad_norm": 4.0625, + "learning_rate": 2.936149835236655e-05, + "loss": 1.6498585939407349, + "step": 1592 + }, + { + "epoch": 0.4154658668579253, + "grad_norm": 3.5, + "learning_rate": 2.9348598345371545e-05, + "loss": 1.6826300621032715, + "step": 1593 + }, + { + "epoch": 0.41572667405620395, + "grad_norm": 3.78125, + "learning_rate": 2.933569335986044e-05, + "loss": 1.864450216293335, + "step": 1594 + }, + { + "epoch": 0.41598748125448265, + "grad_norm": 3.9375, + "learning_rate": 2.9322783402705687e-05, + "loss": 1.5203943252563477, + "step": 1595 + }, + { + "epoch": 0.4162482884527613, + "grad_norm": 3.875, + "learning_rate": 2.9309868480782356e-05, + "loss": 1.7502679824829102, + "step": 1596 + }, + { + "epoch": 0.41650909565104, + "grad_norm": 3.8125, + "learning_rate": 2.9296948600968194e-05, + "loss": 1.60343599319458, + "step": 1597 + }, + { + "epoch": 0.41676990284931864, + "grad_norm": 3.59375, + "learning_rate": 2.928402377014356e-05, + "loss": 1.7471981048583984, + "step": 1598 + }, + { + "epoch": 0.4170307100475973, + "grad_norm": 3.84375, + "learning_rate": 2.9271093995191467e-05, + "loss": 1.9182765483856201, + "step": 1599 + }, + { + "epoch": 0.417291517245876, + "grad_norm": 3.859375, + "learning_rate": 2.9258159282997555e-05, + "loss": 1.9174833297729492, + "step": 1600 + }, + { + "epoch": 0.41755232444415463, + "grad_norm": 3.609375, + "learning_rate": 2.9245219640450103e-05, + "loss": 1.5726749897003174, + "step": 1601 + }, + { + "epoch": 0.41781313164243333, + "grad_norm": 3.9375, + "learning_rate": 2.9232275074439996e-05, + "loss": 1.4135874509811401, + "step": 1602 + }, + { + "epoch": 0.418073938840712, + "grad_norm": 3.578125, + "learning_rate": 2.9219325591860753e-05, + "loss": 1.641174554824829, + "step": 1603 + }, + { + "epoch": 0.4183347460389907, + "grad_norm": 3.890625, + "learning_rate": 2.9206371199608518e-05, + "loss": 1.7001125812530518, + "step": 1604 + }, + { + "epoch": 0.4185955532372693, + "grad_norm": 4.28125, + "learning_rate": 2.9193411904582033e-05, + "loss": 1.8250946998596191, + "step": 1605 + }, + { + "epoch": 0.418856360435548, + "grad_norm": 3.8125, + "learning_rate": 2.9180447713682664e-05, + "loss": 1.8040634393692017, + "step": 1606 + }, + { + "epoch": 0.41911716763382667, + "grad_norm": 3.96875, + "learning_rate": 2.9167478633814376e-05, + "loss": 1.6127195358276367, + "step": 1607 + }, + { + "epoch": 0.41937797483210537, + "grad_norm": 3.65625, + "learning_rate": 2.9154504671883747e-05, + "loss": 1.5782577991485596, + "step": 1608 + }, + { + "epoch": 0.419638782030384, + "grad_norm": 3.4375, + "learning_rate": 2.9141525834799952e-05, + "loss": 1.6237680912017822, + "step": 1609 + }, + { + "epoch": 0.4198995892286627, + "grad_norm": 3.8125, + "learning_rate": 2.912854212947475e-05, + "loss": 1.5012826919555664, + "step": 1610 + }, + { + "epoch": 0.42016039642694136, + "grad_norm": 4.15625, + "learning_rate": 2.9115553562822508e-05, + "loss": 1.7397178411483765, + "step": 1611 + }, + { + "epoch": 0.42042120362522006, + "grad_norm": 3.78125, + "learning_rate": 2.9102560141760178e-05, + "loss": 1.4469175338745117, + "step": 1612 + }, + { + "epoch": 0.4206820108234987, + "grad_norm": 3.75, + "learning_rate": 2.908956187320729e-05, + "loss": 1.3980048894882202, + "step": 1613 + }, + { + "epoch": 0.4209428180217774, + "grad_norm": 3.546875, + "learning_rate": 2.9076558764085966e-05, + "loss": 1.52538001537323, + "step": 1614 + }, + { + "epoch": 0.42120362522005605, + "grad_norm": 4.0, + "learning_rate": 2.9063550821320897e-05, + "loss": 1.826903223991394, + "step": 1615 + }, + { + "epoch": 0.42146443241833476, + "grad_norm": 3.4375, + "learning_rate": 2.9050538051839355e-05, + "loss": 1.2689564228057861, + "step": 1616 + }, + { + "epoch": 0.4217252396166134, + "grad_norm": 3.640625, + "learning_rate": 2.903752046257117e-05, + "loss": 1.546547532081604, + "step": 1617 + }, + { + "epoch": 0.4219860468148921, + "grad_norm": 3.625, + "learning_rate": 2.9024498060448758e-05, + "loss": 1.3963298797607422, + "step": 1618 + }, + { + "epoch": 0.42224685401317075, + "grad_norm": 3.859375, + "learning_rate": 2.901147085240709e-05, + "loss": 1.47343909740448, + "step": 1619 + }, + { + "epoch": 0.42250766121144945, + "grad_norm": 3.890625, + "learning_rate": 2.899843884538368e-05, + "loss": 1.4737505912780762, + "step": 1620 + }, + { + "epoch": 0.4227684684097281, + "grad_norm": 3.578125, + "learning_rate": 2.8985402046318625e-05, + "loss": 1.5501151084899902, + "step": 1621 + }, + { + "epoch": 0.4230292756080068, + "grad_norm": 4.0, + "learning_rate": 2.8972360462154557e-05, + "loss": 1.7948087453842163, + "step": 1622 + }, + { + "epoch": 0.42329008280628544, + "grad_norm": 3.90625, + "learning_rate": 2.8959314099836654e-05, + "loss": 1.4594502449035645, + "step": 1623 + }, + { + "epoch": 0.42355089000456414, + "grad_norm": 3.828125, + "learning_rate": 2.8946262966312652e-05, + "loss": 1.6952762603759766, + "step": 1624 + }, + { + "epoch": 0.4238116972028428, + "grad_norm": 3.6875, + "learning_rate": 2.893320706853282e-05, + "loss": 1.5199551582336426, + "step": 1625 + }, + { + "epoch": 0.4240725044011215, + "grad_norm": 3.8125, + "learning_rate": 2.892014641344997e-05, + "loss": 1.7249001264572144, + "step": 1626 + }, + { + "epoch": 0.42433331159940013, + "grad_norm": 3.828125, + "learning_rate": 2.890708100801943e-05, + "loss": 1.660285472869873, + "step": 1627 + }, + { + "epoch": 0.42459411879767883, + "grad_norm": 3.765625, + "learning_rate": 2.8894010859199073e-05, + "loss": 1.7036306858062744, + "step": 1628 + }, + { + "epoch": 0.4248549259959575, + "grad_norm": 3.765625, + "learning_rate": 2.8880935973949304e-05, + "loss": 1.5827271938323975, + "step": 1629 + }, + { + "epoch": 0.4251157331942362, + "grad_norm": 4.125, + "learning_rate": 2.8867856359233027e-05, + "loss": 1.3944931030273438, + "step": 1630 + }, + { + "epoch": 0.4253765403925148, + "grad_norm": 3.9375, + "learning_rate": 2.8854772022015694e-05, + "loss": 1.742563247680664, + "step": 1631 + }, + { + "epoch": 0.4256373475907935, + "grad_norm": 3.578125, + "learning_rate": 2.8841682969265242e-05, + "loss": 1.4844765663146973, + "step": 1632 + }, + { + "epoch": 0.42589815478907217, + "grad_norm": 3.78125, + "learning_rate": 2.8828589207952152e-05, + "loss": 1.5548579692840576, + "step": 1633 + }, + { + "epoch": 0.42615896198735087, + "grad_norm": 3.53125, + "learning_rate": 2.8815490745049372e-05, + "loss": 1.2696446180343628, + "step": 1634 + }, + { + "epoch": 0.4264197691856295, + "grad_norm": 3.890625, + "learning_rate": 2.8802387587532395e-05, + "loss": 1.6950145959854126, + "step": 1635 + }, + { + "epoch": 0.4266805763839082, + "grad_norm": 3.6875, + "learning_rate": 2.8789279742379196e-05, + "loss": 1.7176172733306885, + "step": 1636 + }, + { + "epoch": 0.42694138358218686, + "grad_norm": 4.5, + "learning_rate": 2.8776167216570225e-05, + "loss": 1.8122673034667969, + "step": 1637 + }, + { + "epoch": 0.42720219078046556, + "grad_norm": 3.953125, + "learning_rate": 2.876305001708847e-05, + "loss": 1.6122087240219116, + "step": 1638 + }, + { + "epoch": 0.4274629979787442, + "grad_norm": 3.90625, + "learning_rate": 2.874992815091937e-05, + "loss": 1.713416337966919, + "step": 1639 + }, + { + "epoch": 0.4277238051770229, + "grad_norm": 3.703125, + "learning_rate": 2.873680162505087e-05, + "loss": 1.697394609451294, + "step": 1640 + }, + { + "epoch": 0.42798461237530155, + "grad_norm": 4.0625, + "learning_rate": 2.8723670446473373e-05, + "loss": 1.6875641345977783, + "step": 1641 + }, + { + "epoch": 0.42824541957358025, + "grad_norm": 3.78125, + "learning_rate": 2.8710534622179797e-05, + "loss": 1.820279598236084, + "step": 1642 + }, + { + "epoch": 0.4285062267718589, + "grad_norm": 3.734375, + "learning_rate": 2.8697394159165505e-05, + "loss": 1.5553900003433228, + "step": 1643 + }, + { + "epoch": 0.4287670339701376, + "grad_norm": 5.375, + "learning_rate": 2.868424906442833e-05, + "loss": 1.6352019309997559, + "step": 1644 + }, + { + "epoch": 0.42902784116841625, + "grad_norm": 3.625, + "learning_rate": 2.867109934496859e-05, + "loss": 1.4842324256896973, + "step": 1645 + }, + { + "epoch": 0.42928864836669495, + "grad_norm": 3.859375, + "learning_rate": 2.865794500778905e-05, + "loss": 1.4266735315322876, + "step": 1646 + }, + { + "epoch": 0.4295494555649736, + "grad_norm": 3.890625, + "learning_rate": 2.864478605989494e-05, + "loss": 1.5456125736236572, + "step": 1647 + }, + { + "epoch": 0.4298102627632523, + "grad_norm": 3.515625, + "learning_rate": 2.8631622508293957e-05, + "loss": 1.5186576843261719, + "step": 1648 + }, + { + "epoch": 0.43007106996153094, + "grad_norm": 3.859375, + "learning_rate": 2.8618454359996217e-05, + "loss": 1.5909136533737183, + "step": 1649 + }, + { + "epoch": 0.4303318771598096, + "grad_norm": 4.03125, + "learning_rate": 2.8605281622014315e-05, + "loss": 1.7734607458114624, + "step": 1650 + }, + { + "epoch": 0.4305926843580883, + "grad_norm": 4.0, + "learning_rate": 2.8592104301363285e-05, + "loss": 1.7636637687683105, + "step": 1651 + }, + { + "epoch": 0.43085349155636693, + "grad_norm": 3.890625, + "learning_rate": 2.8578922405060593e-05, + "loss": 1.7131767272949219, + "step": 1652 + }, + { + "epoch": 0.43111429875464563, + "grad_norm": 3.796875, + "learning_rate": 2.8565735940126146e-05, + "loss": 1.5171430110931396, + "step": 1653 + }, + { + "epoch": 0.4313751059529243, + "grad_norm": 4.28125, + "learning_rate": 2.855254491358227e-05, + "loss": 1.241947889328003, + "step": 1654 + }, + { + "epoch": 0.431635913151203, + "grad_norm": 4.0, + "learning_rate": 2.8539349332453758e-05, + "loss": 1.6244800090789795, + "step": 1655 + }, + { + "epoch": 0.4318967203494816, + "grad_norm": 3.828125, + "learning_rate": 2.852614920376778e-05, + "loss": 1.6008328199386597, + "step": 1656 + }, + { + "epoch": 0.4321575275477603, + "grad_norm": 3.828125, + "learning_rate": 2.8512944534553968e-05, + "loss": 1.7080821990966797, + "step": 1657 + }, + { + "epoch": 0.43241833474603897, + "grad_norm": 4.1875, + "learning_rate": 2.849973533184435e-05, + "loss": 1.818986177444458, + "step": 1658 + }, + { + "epoch": 0.43267914194431767, + "grad_norm": 3.796875, + "learning_rate": 2.8486521602673368e-05, + "loss": 1.7770615816116333, + "step": 1659 + }, + { + "epoch": 0.4329399491425963, + "grad_norm": 3.6875, + "learning_rate": 2.8473303354077894e-05, + "loss": 1.6648075580596924, + "step": 1660 + }, + { + "epoch": 0.433200756340875, + "grad_norm": 3.84375, + "learning_rate": 2.8460080593097177e-05, + "loss": 1.87071692943573, + "step": 1661 + }, + { + "epoch": 0.43346156353915366, + "grad_norm": 3.59375, + "learning_rate": 2.8446853326772902e-05, + "loss": 1.5345838069915771, + "step": 1662 + }, + { + "epoch": 0.43372237073743236, + "grad_norm": 4.0, + "learning_rate": 2.8433621562149122e-05, + "loss": 1.5579233169555664, + "step": 1663 + }, + { + "epoch": 0.433983177935711, + "grad_norm": 3.765625, + "learning_rate": 2.8420385306272303e-05, + "loss": 1.5652178525924683, + "step": 1664 + }, + { + "epoch": 0.4342439851339897, + "grad_norm": 3.890625, + "learning_rate": 2.8407144566191315e-05, + "loss": 1.7516753673553467, + "step": 1665 + }, + { + "epoch": 0.43450479233226835, + "grad_norm": 3.765625, + "learning_rate": 2.839389934895738e-05, + "loss": 1.4254251718521118, + "step": 1666 + }, + { + "epoch": 0.43476559953054705, + "grad_norm": 3.609375, + "learning_rate": 2.8380649661624135e-05, + "loss": 1.5584107637405396, + "step": 1667 + }, + { + "epoch": 0.4350264067288257, + "grad_norm": 3.75, + "learning_rate": 2.836739551124759e-05, + "loss": 1.6238042116165161, + "step": 1668 + }, + { + "epoch": 0.4352872139271044, + "grad_norm": 3.796875, + "learning_rate": 2.8354136904886123e-05, + "loss": 1.6247994899749756, + "step": 1669 + }, + { + "epoch": 0.43554802112538304, + "grad_norm": 3.6875, + "learning_rate": 2.8340873849600502e-05, + "loss": 1.4396026134490967, + "step": 1670 + }, + { + "epoch": 0.43580882832366175, + "grad_norm": 3.828125, + "learning_rate": 2.832760635245383e-05, + "loss": 1.3682128190994263, + "step": 1671 + }, + { + "epoch": 0.4360696355219404, + "grad_norm": 4.0, + "learning_rate": 2.831433442051163e-05, + "loss": 1.592055082321167, + "step": 1672 + }, + { + "epoch": 0.4363304427202191, + "grad_norm": 3.65625, + "learning_rate": 2.830105806084174e-05, + "loss": 1.4300789833068848, + "step": 1673 + }, + { + "epoch": 0.43659124991849774, + "grad_norm": 3.625, + "learning_rate": 2.828777728051437e-05, + "loss": 1.423119068145752, + "step": 1674 + }, + { + "epoch": 0.43685205711677644, + "grad_norm": 3.90625, + "learning_rate": 2.8274492086602085e-05, + "loss": 1.757512092590332, + "step": 1675 + }, + { + "epoch": 0.4371128643150551, + "grad_norm": 3.84375, + "learning_rate": 2.826120248617981e-05, + "loss": 1.5190269947052002, + "step": 1676 + }, + { + "epoch": 0.4373736715133338, + "grad_norm": 4.0625, + "learning_rate": 2.8247908486324807e-05, + "loss": 1.508905291557312, + "step": 1677 + }, + { + "epoch": 0.43763447871161243, + "grad_norm": 3.65625, + "learning_rate": 2.8234610094116676e-05, + "loss": 1.6778496503829956, + "step": 1678 + }, + { + "epoch": 0.43789528590989113, + "grad_norm": 3.671875, + "learning_rate": 2.8221307316637365e-05, + "loss": 1.9285204410552979, + "step": 1679 + }, + { + "epoch": 0.4381560931081698, + "grad_norm": 3.953125, + "learning_rate": 2.8208000160971153e-05, + "loss": 1.6861218214035034, + "step": 1680 + }, + { + "epoch": 0.4384169003064485, + "grad_norm": 3.78125, + "learning_rate": 2.8194688634204647e-05, + "loss": 1.6404638290405273, + "step": 1681 + }, + { + "epoch": 0.4386777075047271, + "grad_norm": 3.84375, + "learning_rate": 2.8181372743426805e-05, + "loss": 1.6675424575805664, + "step": 1682 + }, + { + "epoch": 0.4389385147030058, + "grad_norm": 3.8125, + "learning_rate": 2.8168052495728866e-05, + "loss": 1.6485905647277832, + "step": 1683 + }, + { + "epoch": 0.43919932190128447, + "grad_norm": 4.1875, + "learning_rate": 2.8154727898204434e-05, + "loss": 1.7377148866653442, + "step": 1684 + }, + { + "epoch": 0.43946012909956317, + "grad_norm": 3.703125, + "learning_rate": 2.8141398957949397e-05, + "loss": 1.6704241037368774, + "step": 1685 + }, + { + "epoch": 0.4397209362978418, + "grad_norm": 3.703125, + "learning_rate": 2.8128065682061975e-05, + "loss": 1.7559125423431396, + "step": 1686 + }, + { + "epoch": 0.4399817434961205, + "grad_norm": 3.8125, + "learning_rate": 2.8114728077642693e-05, + "loss": 1.6977081298828125, + "step": 1687 + }, + { + "epoch": 0.44024255069439916, + "grad_norm": 3.53125, + "learning_rate": 2.8101386151794362e-05, + "loss": 1.3879613876342773, + "step": 1688 + }, + { + "epoch": 0.44050335789267786, + "grad_norm": 3.765625, + "learning_rate": 2.8088039911622133e-05, + "loss": 1.4685391187667847, + "step": 1689 + }, + { + "epoch": 0.4407641650909565, + "grad_norm": 3.75, + "learning_rate": 2.8074689364233414e-05, + "loss": 1.8438893556594849, + "step": 1690 + }, + { + "epoch": 0.4410249722892352, + "grad_norm": 3.65625, + "learning_rate": 2.8061334516737936e-05, + "loss": 1.6127426624298096, + "step": 1691 + }, + { + "epoch": 0.44128577948751385, + "grad_norm": 3.453125, + "learning_rate": 2.80479753762477e-05, + "loss": 1.3222421407699585, + "step": 1692 + }, + { + "epoch": 0.44154658668579255, + "grad_norm": 3.71875, + "learning_rate": 2.8034611949877005e-05, + "loss": 1.6235663890838623, + "step": 1693 + }, + { + "epoch": 0.4418073938840712, + "grad_norm": 4.4375, + "learning_rate": 2.8021244244742437e-05, + "loss": 1.9033315181732178, + "step": 1694 + }, + { + "epoch": 0.4420682010823499, + "grad_norm": 3.75, + "learning_rate": 2.8007872267962844e-05, + "loss": 1.569352388381958, + "step": 1695 + }, + { + "epoch": 0.44232900828062854, + "grad_norm": 3.484375, + "learning_rate": 2.7994496026659363e-05, + "loss": 1.3566945791244507, + "step": 1696 + }, + { + "epoch": 0.44258981547890724, + "grad_norm": 4.0, + "learning_rate": 2.798111552795539e-05, + "loss": 1.5274192094802856, + "step": 1697 + }, + { + "epoch": 0.4428506226771859, + "grad_norm": 3.625, + "learning_rate": 2.7967730778976596e-05, + "loss": 1.5132932662963867, + "step": 1698 + }, + { + "epoch": 0.44311142987546454, + "grad_norm": 3.90625, + "learning_rate": 2.795434178685093e-05, + "loss": 1.6494386196136475, + "step": 1699 + }, + { + "epoch": 0.44337223707374324, + "grad_norm": 3.671875, + "learning_rate": 2.7940948558708567e-05, + "loss": 1.6655917167663574, + "step": 1700 + }, + { + "epoch": 0.4436330442720219, + "grad_norm": 3.609375, + "learning_rate": 2.792755110168196e-05, + "loss": 1.5109964609146118, + "step": 1701 + }, + { + "epoch": 0.4438938514703006, + "grad_norm": 3.625, + "learning_rate": 2.791414942290582e-05, + "loss": 1.4205164909362793, + "step": 1702 + }, + { + "epoch": 0.4441546586685792, + "grad_norm": 3.890625, + "learning_rate": 2.7900743529517087e-05, + "loss": 1.5652105808258057, + "step": 1703 + }, + { + "epoch": 0.44441546586685793, + "grad_norm": 3.546875, + "learning_rate": 2.7887333428654955e-05, + "loss": 1.481246829032898, + "step": 1704 + }, + { + "epoch": 0.4446762730651366, + "grad_norm": 3.5625, + "learning_rate": 2.7873919127460857e-05, + "loss": 1.777045726776123, + "step": 1705 + }, + { + "epoch": 0.4449370802634153, + "grad_norm": 3.40625, + "learning_rate": 2.7860500633078475e-05, + "loss": 1.5149214267730713, + "step": 1706 + }, + { + "epoch": 0.4451978874616939, + "grad_norm": 3.65625, + "learning_rate": 2.7847077952653704e-05, + "loss": 1.6715518236160278, + "step": 1707 + }, + { + "epoch": 0.4454586946599726, + "grad_norm": 3.546875, + "learning_rate": 2.7833651093334686e-05, + "loss": 1.6214019060134888, + "step": 1708 + }, + { + "epoch": 0.44571950185825127, + "grad_norm": 3.734375, + "learning_rate": 2.7820220062271768e-05, + "loss": 1.4817593097686768, + "step": 1709 + }, + { + "epoch": 0.44598030905652997, + "grad_norm": 3.703125, + "learning_rate": 2.780678486661753e-05, + "loss": 1.4205501079559326, + "step": 1710 + }, + { + "epoch": 0.4462411162548086, + "grad_norm": 3.5, + "learning_rate": 2.779334551352679e-05, + "loss": 1.573449730873108, + "step": 1711 + }, + { + "epoch": 0.4465019234530873, + "grad_norm": 3.546875, + "learning_rate": 2.7779902010156542e-05, + "loss": 1.5362462997436523, + "step": 1712 + }, + { + "epoch": 0.44676273065136596, + "grad_norm": 3.75, + "learning_rate": 2.776645436366602e-05, + "loss": 1.4577736854553223, + "step": 1713 + }, + { + "epoch": 0.44702353784964466, + "grad_norm": 3.390625, + "learning_rate": 2.7753002581216636e-05, + "loss": 1.5574638843536377, + "step": 1714 + }, + { + "epoch": 0.4472843450479233, + "grad_norm": 3.46875, + "learning_rate": 2.7739546669972046e-05, + "loss": 1.3225568532943726, + "step": 1715 + }, + { + "epoch": 0.447545152246202, + "grad_norm": 3.546875, + "learning_rate": 2.7726086637098064e-05, + "loss": 1.5711350440979004, + "step": 1716 + }, + { + "epoch": 0.44780595944448065, + "grad_norm": 4.125, + "learning_rate": 2.771262248976272e-05, + "loss": 1.6645259857177734, + "step": 1717 + }, + { + "epoch": 0.44806676664275935, + "grad_norm": 3.5, + "learning_rate": 2.769915423513623e-05, + "loss": 1.3624897003173828, + "step": 1718 + }, + { + "epoch": 0.448327573841038, + "grad_norm": 3.796875, + "learning_rate": 2.7685681880390995e-05, + "loss": 1.7277605533599854, + "step": 1719 + }, + { + "epoch": 0.4485883810393167, + "grad_norm": 3.890625, + "learning_rate": 2.7672205432701607e-05, + "loss": 1.7653427124023438, + "step": 1720 + }, + { + "epoch": 0.44884918823759534, + "grad_norm": 3.734375, + "learning_rate": 2.7658724899244833e-05, + "loss": 1.591906189918518, + "step": 1721 + }, + { + "epoch": 0.44910999543587404, + "grad_norm": 3.609375, + "learning_rate": 2.7645240287199606e-05, + "loss": 1.6018164157867432, + "step": 1722 + }, + { + "epoch": 0.4493708026341527, + "grad_norm": 3.53125, + "learning_rate": 2.7631751603747058e-05, + "loss": 1.5946130752563477, + "step": 1723 + }, + { + "epoch": 0.4496316098324314, + "grad_norm": 3.53125, + "learning_rate": 2.7618258856070458e-05, + "loss": 1.4172343015670776, + "step": 1724 + }, + { + "epoch": 0.44989241703071003, + "grad_norm": 3.875, + "learning_rate": 2.7604762051355262e-05, + "loss": 1.9235308170318604, + "step": 1725 + }, + { + "epoch": 0.45015322422898874, + "grad_norm": 3.421875, + "learning_rate": 2.7591261196789072e-05, + "loss": 1.4664264917373657, + "step": 1726 + }, + { + "epoch": 0.4504140314272674, + "grad_norm": 3.640625, + "learning_rate": 2.7577756299561654e-05, + "loss": 1.406175136566162, + "step": 1727 + }, + { + "epoch": 0.4506748386255461, + "grad_norm": 3.546875, + "learning_rate": 2.7564247366864926e-05, + "loss": 1.6239509582519531, + "step": 1728 + }, + { + "epoch": 0.4509356458238247, + "grad_norm": 3.5, + "learning_rate": 2.7550734405892954e-05, + "loss": 1.8283617496490479, + "step": 1729 + }, + { + "epoch": 0.4511964530221034, + "grad_norm": 3.546875, + "learning_rate": 2.753721742384196e-05, + "loss": 1.433314561843872, + "step": 1730 + }, + { + "epoch": 0.4514572602203821, + "grad_norm": 3.640625, + "learning_rate": 2.7523696427910272e-05, + "loss": 1.6373162269592285, + "step": 1731 + }, + { + "epoch": 0.4517180674186608, + "grad_norm": 3.6875, + "learning_rate": 2.7510171425298408e-05, + "loss": 1.5311187505722046, + "step": 1732 + }, + { + "epoch": 0.4519788746169394, + "grad_norm": 3.6875, + "learning_rate": 2.7496642423208975e-05, + "loss": 1.498328685760498, + "step": 1733 + }, + { + "epoch": 0.4522396818152181, + "grad_norm": 3.703125, + "learning_rate": 2.7483109428846736e-05, + "loss": 1.7286779880523682, + "step": 1734 + }, + { + "epoch": 0.45250048901349677, + "grad_norm": 3.9375, + "learning_rate": 2.7469572449418564e-05, + "loss": 1.7031946182250977, + "step": 1735 + }, + { + "epoch": 0.45276129621177547, + "grad_norm": 3.9375, + "learning_rate": 2.7456031492133472e-05, + "loss": 1.6917610168457031, + "step": 1736 + }, + { + "epoch": 0.4530221034100541, + "grad_norm": 3.765625, + "learning_rate": 2.7442486564202577e-05, + "loss": 1.4163310527801514, + "step": 1737 + }, + { + "epoch": 0.4532829106083328, + "grad_norm": 3.875, + "learning_rate": 2.742893767283911e-05, + "loss": 1.7884117364883423, + "step": 1738 + }, + { + "epoch": 0.45354371780661146, + "grad_norm": 3.515625, + "learning_rate": 2.741538482525842e-05, + "loss": 1.5097811222076416, + "step": 1739 + }, + { + "epoch": 0.45380452500489016, + "grad_norm": 3.75, + "learning_rate": 2.740182802867796e-05, + "loss": 1.71995210647583, + "step": 1740 + }, + { + "epoch": 0.4540653322031688, + "grad_norm": 3.703125, + "learning_rate": 2.738826729031728e-05, + "loss": 1.719955563545227, + "step": 1741 + }, + { + "epoch": 0.4543261394014475, + "grad_norm": 3.78125, + "learning_rate": 2.7374702617398052e-05, + "loss": 1.5219910144805908, + "step": 1742 + }, + { + "epoch": 0.45458694659972615, + "grad_norm": 4.25, + "learning_rate": 2.7361134017144012e-05, + "loss": 1.7454659938812256, + "step": 1743 + }, + { + "epoch": 0.45484775379800485, + "grad_norm": 3.546875, + "learning_rate": 2.7347561496781007e-05, + "loss": 1.7231475114822388, + "step": 1744 + }, + { + "epoch": 0.4551085609962835, + "grad_norm": 3.46875, + "learning_rate": 2.7333985063536963e-05, + "loss": 1.6025141477584839, + "step": 1745 + }, + { + "epoch": 0.4553693681945622, + "grad_norm": 4.0, + "learning_rate": 2.73204047246419e-05, + "loss": 1.6790502071380615, + "step": 1746 + }, + { + "epoch": 0.45563017539284084, + "grad_norm": 3.734375, + "learning_rate": 2.7306820487327906e-05, + "loss": 1.6374878883361816, + "step": 1747 + }, + { + "epoch": 0.45589098259111954, + "grad_norm": 3.578125, + "learning_rate": 2.7293232358829146e-05, + "loss": 1.6954729557037354, + "step": 1748 + }, + { + "epoch": 0.4561517897893982, + "grad_norm": 3.734375, + "learning_rate": 2.7279640346381877e-05, + "loss": 1.764048457145691, + "step": 1749 + }, + { + "epoch": 0.45641259698767683, + "grad_norm": 3.578125, + "learning_rate": 2.7266044457224394e-05, + "loss": 1.6400312185287476, + "step": 1750 + }, + { + "epoch": 0.45667340418595553, + "grad_norm": 3.59375, + "learning_rate": 2.7252444698597078e-05, + "loss": 1.5373930931091309, + "step": 1751 + }, + { + "epoch": 0.4569342113842342, + "grad_norm": 3.546875, + "learning_rate": 2.723884107774236e-05, + "loss": 1.3972481489181519, + "step": 1752 + }, + { + "epoch": 0.4571950185825129, + "grad_norm": 3.5625, + "learning_rate": 2.7225233601904738e-05, + "loss": 1.7710322141647339, + "step": 1753 + }, + { + "epoch": 0.4574558257807915, + "grad_norm": 3.4375, + "learning_rate": 2.721162227833076e-05, + "loss": 1.5051907300949097, + "step": 1754 + }, + { + "epoch": 0.4577166329790702, + "grad_norm": 4.0625, + "learning_rate": 2.7198007114269005e-05, + "loss": 1.6756287813186646, + "step": 1755 + }, + { + "epoch": 0.45797744017734887, + "grad_norm": 3.59375, + "learning_rate": 2.7184388116970124e-05, + "loss": 1.4749387502670288, + "step": 1756 + }, + { + "epoch": 0.45823824737562757, + "grad_norm": 3.84375, + "learning_rate": 2.7170765293686792e-05, + "loss": 1.9337652921676636, + "step": 1757 + }, + { + "epoch": 0.4584990545739062, + "grad_norm": 3.53125, + "learning_rate": 2.715713865167373e-05, + "loss": 1.5626142024993896, + "step": 1758 + }, + { + "epoch": 0.4587598617721849, + "grad_norm": 3.453125, + "learning_rate": 2.7143508198187695e-05, + "loss": 1.5024030208587646, + "step": 1759 + }, + { + "epoch": 0.45902066897046356, + "grad_norm": 3.609375, + "learning_rate": 2.712987394048746e-05, + "loss": 1.4093332290649414, + "step": 1760 + }, + { + "epoch": 0.45928147616874226, + "grad_norm": 3.71875, + "learning_rate": 2.7116235885833835e-05, + "loss": 1.5896199941635132, + "step": 1761 + }, + { + "epoch": 0.4595422833670209, + "grad_norm": 3.609375, + "learning_rate": 2.710259404148965e-05, + "loss": 1.6533608436584473, + "step": 1762 + }, + { + "epoch": 0.4598030905652996, + "grad_norm": 3.53125, + "learning_rate": 2.708894841471975e-05, + "loss": 1.6842036247253418, + "step": 1763 + }, + { + "epoch": 0.46006389776357826, + "grad_norm": 3.65625, + "learning_rate": 2.7075299012791003e-05, + "loss": 1.488107442855835, + "step": 1764 + }, + { + "epoch": 0.46032470496185696, + "grad_norm": 3.703125, + "learning_rate": 2.7061645842972266e-05, + "loss": 1.551996111869812, + "step": 1765 + }, + { + "epoch": 0.4605855121601356, + "grad_norm": 3.671875, + "learning_rate": 2.704798891253444e-05, + "loss": 1.6990528106689453, + "step": 1766 + }, + { + "epoch": 0.4608463193584143, + "grad_norm": 3.609375, + "learning_rate": 2.703432822875039e-05, + "loss": 1.503858208656311, + "step": 1767 + }, + { + "epoch": 0.46110712655669295, + "grad_norm": 3.609375, + "learning_rate": 2.7020663798895003e-05, + "loss": 1.5476514101028442, + "step": 1768 + }, + { + "epoch": 0.46136793375497165, + "grad_norm": 3.71875, + "learning_rate": 2.7006995630245155e-05, + "loss": 1.4874258041381836, + "step": 1769 + }, + { + "epoch": 0.4616287409532503, + "grad_norm": 3.6875, + "learning_rate": 2.6993323730079707e-05, + "loss": 1.786684513092041, + "step": 1770 + }, + { + "epoch": 0.461889548151529, + "grad_norm": 3.84375, + "learning_rate": 2.6979648105679523e-05, + "loss": 1.6916794776916504, + "step": 1771 + }, + { + "epoch": 0.46215035534980764, + "grad_norm": 3.859375, + "learning_rate": 2.6965968764327425e-05, + "loss": 1.5331114530563354, + "step": 1772 + }, + { + "epoch": 0.46241116254808634, + "grad_norm": 3.765625, + "learning_rate": 2.695228571330824e-05, + "loss": 1.9602546691894531, + "step": 1773 + }, + { + "epoch": 0.462671969746365, + "grad_norm": 3.421875, + "learning_rate": 2.6938598959908752e-05, + "loss": 1.4702377319335938, + "step": 1774 + }, + { + "epoch": 0.4629327769446437, + "grad_norm": 3.703125, + "learning_rate": 2.6924908511417737e-05, + "loss": 1.6842867136001587, + "step": 1775 + }, + { + "epoch": 0.46319358414292233, + "grad_norm": 3.4375, + "learning_rate": 2.6911214375125917e-05, + "loss": 1.5531668663024902, + "step": 1776 + }, + { + "epoch": 0.46345439134120103, + "grad_norm": 3.21875, + "learning_rate": 2.6897516558325984e-05, + "loss": 1.2932814359664917, + "step": 1777 + }, + { + "epoch": 0.4637151985394797, + "grad_norm": 4.34375, + "learning_rate": 2.6883815068312597e-05, + "loss": 1.5881843566894531, + "step": 1778 + }, + { + "epoch": 0.4639760057377584, + "grad_norm": 3.375, + "learning_rate": 2.687010991238237e-05, + "loss": 1.4304906129837036, + "step": 1779 + }, + { + "epoch": 0.464236812936037, + "grad_norm": 3.625, + "learning_rate": 2.6856401097833863e-05, + "loss": 1.5073388814926147, + "step": 1780 + }, + { + "epoch": 0.4644976201343157, + "grad_norm": 3.796875, + "learning_rate": 2.68426886319676e-05, + "loss": 1.7035486698150635, + "step": 1781 + }, + { + "epoch": 0.46475842733259437, + "grad_norm": 3.890625, + "learning_rate": 2.6828972522086013e-05, + "loss": 1.650022029876709, + "step": 1782 + }, + { + "epoch": 0.46501923453087307, + "grad_norm": 3.921875, + "learning_rate": 2.6815252775493523e-05, + "loss": 1.7883479595184326, + "step": 1783 + }, + { + "epoch": 0.4652800417291517, + "grad_norm": 3.53125, + "learning_rate": 2.6801529399496446e-05, + "loss": 1.5400499105453491, + "step": 1784 + }, + { + "epoch": 0.4655408489274304, + "grad_norm": 3.9375, + "learning_rate": 2.6787802401403062e-05, + "loss": 1.4867982864379883, + "step": 1785 + }, + { + "epoch": 0.46580165612570906, + "grad_norm": 3.328125, + "learning_rate": 2.677407178852356e-05, + "loss": 1.2686187028884888, + "step": 1786 + }, + { + "epoch": 0.46606246332398776, + "grad_norm": 3.671875, + "learning_rate": 2.6760337568170056e-05, + "loss": 1.719443917274475, + "step": 1787 + }, + { + "epoch": 0.4663232705222664, + "grad_norm": 3.46875, + "learning_rate": 2.6746599747656605e-05, + "loss": 1.4701546430587769, + "step": 1788 + }, + { + "epoch": 0.4665840777205451, + "grad_norm": 3.46875, + "learning_rate": 2.6732858334299155e-05, + "loss": 1.4711968898773193, + "step": 1789 + }, + { + "epoch": 0.46684488491882375, + "grad_norm": 3.6875, + "learning_rate": 2.6719113335415572e-05, + "loss": 1.6861298084259033, + "step": 1790 + }, + { + "epoch": 0.46710569211710246, + "grad_norm": 3.8125, + "learning_rate": 2.670536475832566e-05, + "loss": 1.5621323585510254, + "step": 1791 + }, + { + "epoch": 0.4673664993153811, + "grad_norm": 3.703125, + "learning_rate": 2.6691612610351084e-05, + "loss": 1.4175682067871094, + "step": 1792 + }, + { + "epoch": 0.4676273065136598, + "grad_norm": 3.6875, + "learning_rate": 2.6677856898815443e-05, + "loss": 1.5656529664993286, + "step": 1793 + }, + { + "epoch": 0.46788811371193845, + "grad_norm": 3.421875, + "learning_rate": 2.6664097631044224e-05, + "loss": 1.355933427810669, + "step": 1794 + }, + { + "epoch": 0.46814892091021715, + "grad_norm": 3.390625, + "learning_rate": 2.66503348143648e-05, + "loss": 1.2955758571624756, + "step": 1795 + }, + { + "epoch": 0.4684097281084958, + "grad_norm": 4.09375, + "learning_rate": 2.663656845610645e-05, + "loss": 1.8883273601531982, + "step": 1796 + }, + { + "epoch": 0.4686705353067745, + "grad_norm": 3.65625, + "learning_rate": 2.6622798563600324e-05, + "loss": 1.5600733757019043, + "step": 1797 + }, + { + "epoch": 0.46893134250505314, + "grad_norm": 3.5, + "learning_rate": 2.660902514417947e-05, + "loss": 1.6002708673477173, + "step": 1798 + }, + { + "epoch": 0.4691921497033318, + "grad_norm": 3.71875, + "learning_rate": 2.659524820517879e-05, + "loss": 1.5094125270843506, + "step": 1799 + }, + { + "epoch": 0.4694529569016105, + "grad_norm": 3.71875, + "learning_rate": 2.658146775393509e-05, + "loss": 1.6929123401641846, + "step": 1800 + }, + { + "epoch": 0.46971376409988913, + "grad_norm": 3.609375, + "learning_rate": 2.656768379778702e-05, + "loss": 1.4931977987289429, + "step": 1801 + }, + { + "epoch": 0.46997457129816783, + "grad_norm": 3.40625, + "learning_rate": 2.6553896344075113e-05, + "loss": 1.4910286664962769, + "step": 1802 + }, + { + "epoch": 0.4702353784964465, + "grad_norm": 3.546875, + "learning_rate": 2.6540105400141756e-05, + "loss": 1.7399252653121948, + "step": 1803 + }, + { + "epoch": 0.4704961856947252, + "grad_norm": 3.609375, + "learning_rate": 2.652631097333121e-05, + "loss": 1.6574151515960693, + "step": 1804 + }, + { + "epoch": 0.4707569928930038, + "grad_norm": 3.640625, + "learning_rate": 2.6512513070989567e-05, + "loss": 1.7319456338882446, + "step": 1805 + }, + { + "epoch": 0.4710178000912825, + "grad_norm": 3.671875, + "learning_rate": 2.6498711700464786e-05, + "loss": 1.7306711673736572, + "step": 1806 + }, + { + "epoch": 0.47127860728956117, + "grad_norm": 3.484375, + "learning_rate": 2.648490686910666e-05, + "loss": 1.4612176418304443, + "step": 1807 + }, + { + "epoch": 0.47153941448783987, + "grad_norm": 3.5625, + "learning_rate": 2.647109858426685e-05, + "loss": 1.3843921422958374, + "step": 1808 + }, + { + "epoch": 0.4718002216861185, + "grad_norm": 3.375, + "learning_rate": 2.645728685329883e-05, + "loss": 1.324055552482605, + "step": 1809 + }, + { + "epoch": 0.4720610288843972, + "grad_norm": 3.484375, + "learning_rate": 2.6443471683557925e-05, + "loss": 1.6804945468902588, + "step": 1810 + }, + { + "epoch": 0.47232183608267586, + "grad_norm": 3.703125, + "learning_rate": 2.6429653082401276e-05, + "loss": 1.5219495296478271, + "step": 1811 + }, + { + "epoch": 0.47258264328095456, + "grad_norm": 3.65625, + "learning_rate": 2.6415831057187867e-05, + "loss": 1.696804404258728, + "step": 1812 + }, + { + "epoch": 0.4728434504792332, + "grad_norm": 3.765625, + "learning_rate": 2.6402005615278505e-05, + "loss": 1.6895540952682495, + "step": 1813 + }, + { + "epoch": 0.4731042576775119, + "grad_norm": 5.28125, + "learning_rate": 2.6388176764035794e-05, + "loss": 1.4110807180404663, + "step": 1814 + }, + { + "epoch": 0.47336506487579055, + "grad_norm": 3.640625, + "learning_rate": 2.6374344510824194e-05, + "loss": 1.7220664024353027, + "step": 1815 + }, + { + "epoch": 0.47362587207406925, + "grad_norm": 3.609375, + "learning_rate": 2.6360508863009928e-05, + "loss": 1.6977365016937256, + "step": 1816 + }, + { + "epoch": 0.4738866792723479, + "grad_norm": 3.578125, + "learning_rate": 2.634666982796107e-05, + "loss": 1.7042901515960693, + "step": 1817 + }, + { + "epoch": 0.4741474864706266, + "grad_norm": 3.84375, + "learning_rate": 2.6332827413047475e-05, + "loss": 1.6732020378112793, + "step": 1818 + }, + { + "epoch": 0.47440829366890525, + "grad_norm": 4.0625, + "learning_rate": 2.63189816256408e-05, + "loss": 1.7227288484573364, + "step": 1819 + }, + { + "epoch": 0.47466910086718395, + "grad_norm": 3.375, + "learning_rate": 2.6305132473114502e-05, + "loss": 1.3994221687316895, + "step": 1820 + }, + { + "epoch": 0.4749299080654626, + "grad_norm": 3.828125, + "learning_rate": 2.6291279962843828e-05, + "loss": 1.4534467458724976, + "step": 1821 + }, + { + "epoch": 0.4751907152637413, + "grad_norm": 3.875, + "learning_rate": 2.6277424102205817e-05, + "loss": 1.467596173286438, + "step": 1822 + }, + { + "epoch": 0.47545152246201994, + "grad_norm": 3.578125, + "learning_rate": 2.626356489857929e-05, + "loss": 1.4756677150726318, + "step": 1823 + }, + { + "epoch": 0.47571232966029864, + "grad_norm": 3.625, + "learning_rate": 2.624970235934484e-05, + "loss": 1.6284886598587036, + "step": 1824 + }, + { + "epoch": 0.4759731368585773, + "grad_norm": 3.71875, + "learning_rate": 2.6235836491884845e-05, + "loss": 1.5227932929992676, + "step": 1825 + }, + { + "epoch": 0.476233944056856, + "grad_norm": 3.75, + "learning_rate": 2.6221967303583463e-05, + "loss": 1.5853811502456665, + "step": 1826 + }, + { + "epoch": 0.47649475125513463, + "grad_norm": 3.59375, + "learning_rate": 2.6208094801826603e-05, + "loss": 1.5051393508911133, + "step": 1827 + }, + { + "epoch": 0.47675555845341333, + "grad_norm": 3.484375, + "learning_rate": 2.6194218994001956e-05, + "loss": 1.3453019857406616, + "step": 1828 + }, + { + "epoch": 0.477016365651692, + "grad_norm": 3.890625, + "learning_rate": 2.618033988749895e-05, + "loss": 1.648142695426941, + "step": 1829 + }, + { + "epoch": 0.4772771728499707, + "grad_norm": 3.8125, + "learning_rate": 2.61664574897088e-05, + "loss": 1.4317494630813599, + "step": 1830 + }, + { + "epoch": 0.4775379800482493, + "grad_norm": 3.796875, + "learning_rate": 2.6152571808024446e-05, + "loss": 1.5820590257644653, + "step": 1831 + }, + { + "epoch": 0.477798787246528, + "grad_norm": 3.75, + "learning_rate": 2.6138682849840602e-05, + "loss": 1.859946608543396, + "step": 1832 + }, + { + "epoch": 0.47805959444480667, + "grad_norm": 3.671875, + "learning_rate": 2.6124790622553696e-05, + "loss": 1.586352825164795, + "step": 1833 + }, + { + "epoch": 0.47832040164308537, + "grad_norm": 3.4375, + "learning_rate": 2.611089513356193e-05, + "loss": 1.6238490343093872, + "step": 1834 + }, + { + "epoch": 0.478581208841364, + "grad_norm": 3.734375, + "learning_rate": 2.6096996390265226e-05, + "loss": 1.5462846755981445, + "step": 1835 + }, + { + "epoch": 0.4788420160396427, + "grad_norm": 3.6875, + "learning_rate": 2.6083094400065236e-05, + "loss": 1.3191076517105103, + "step": 1836 + }, + { + "epoch": 0.47910282323792136, + "grad_norm": 3.734375, + "learning_rate": 2.6069189170365354e-05, + "loss": 1.6824617385864258, + "step": 1837 + }, + { + "epoch": 0.47936363043620006, + "grad_norm": 3.59375, + "learning_rate": 2.6055280708570667e-05, + "loss": 1.3834890127182007, + "step": 1838 + }, + { + "epoch": 0.4796244376344787, + "grad_norm": 3.515625, + "learning_rate": 2.6041369022088044e-05, + "loss": 1.6102575063705444, + "step": 1839 + }, + { + "epoch": 0.4798852448327574, + "grad_norm": 3.53125, + "learning_rate": 2.602745411832601e-05, + "loss": 1.3793972730636597, + "step": 1840 + }, + { + "epoch": 0.48014605203103605, + "grad_norm": 3.890625, + "learning_rate": 2.601353600469483e-05, + "loss": 1.7498921155929565, + "step": 1841 + }, + { + "epoch": 0.48040685922931475, + "grad_norm": 3.71875, + "learning_rate": 2.5999614688606482e-05, + "loss": 1.7192661762237549, + "step": 1842 + }, + { + "epoch": 0.4806676664275934, + "grad_norm": 3.4375, + "learning_rate": 2.5985690177474646e-05, + "loss": 1.7017711400985718, + "step": 1843 + }, + { + "epoch": 0.4809284736258721, + "grad_norm": 3.515625, + "learning_rate": 2.59717624787147e-05, + "loss": 1.6084730625152588, + "step": 1844 + }, + { + "epoch": 0.48118928082415074, + "grad_norm": 3.59375, + "learning_rate": 2.5957831599743713e-05, + "loss": 1.6439201831817627, + "step": 1845 + }, + { + "epoch": 0.48145008802242945, + "grad_norm": 3.703125, + "learning_rate": 2.594389754798046e-05, + "loss": 1.3723888397216797, + "step": 1846 + }, + { + "epoch": 0.4817108952207081, + "grad_norm": 3.640625, + "learning_rate": 2.5929960330845402e-05, + "loss": 1.4593439102172852, + "step": 1847 + }, + { + "epoch": 0.4819717024189868, + "grad_norm": 3.734375, + "learning_rate": 2.5916019955760687e-05, + "loss": 1.5289146900177002, + "step": 1848 + }, + { + "epoch": 0.48223250961726544, + "grad_norm": 3.78125, + "learning_rate": 2.5902076430150143e-05, + "loss": 1.5675907135009766, + "step": 1849 + }, + { + "epoch": 0.4824933168155441, + "grad_norm": 3.671875, + "learning_rate": 2.5888129761439268e-05, + "loss": 1.5333256721496582, + "step": 1850 + }, + { + "epoch": 0.4827541240138228, + "grad_norm": 4.09375, + "learning_rate": 2.587417995705525e-05, + "loss": 1.9039056301116943, + "step": 1851 + }, + { + "epoch": 0.48301493121210143, + "grad_norm": 3.4375, + "learning_rate": 2.586022702442693e-05, + "loss": 1.4152421951293945, + "step": 1852 + }, + { + "epoch": 0.48327573841038013, + "grad_norm": 3.4375, + "learning_rate": 2.5846270970984826e-05, + "loss": 1.4902153015136719, + "step": 1853 + }, + { + "epoch": 0.4835365456086588, + "grad_norm": 3.6875, + "learning_rate": 2.5832311804161127e-05, + "loss": 1.768594741821289, + "step": 1854 + }, + { + "epoch": 0.4837973528069375, + "grad_norm": 3.671875, + "learning_rate": 2.581834953138964e-05, + "loss": 1.5538244247436523, + "step": 1855 + }, + { + "epoch": 0.4840581600052161, + "grad_norm": 3.84375, + "learning_rate": 2.580438416010588e-05, + "loss": 1.8284088373184204, + "step": 1856 + }, + { + "epoch": 0.4843189672034948, + "grad_norm": 3.5, + "learning_rate": 2.5790415697746976e-05, + "loss": 1.479146122932434, + "step": 1857 + }, + { + "epoch": 0.48457977440177347, + "grad_norm": 3.78125, + "learning_rate": 2.5776444151751712e-05, + "loss": 1.6835163831710815, + "step": 1858 + }, + { + "epoch": 0.48484058160005217, + "grad_norm": 3.671875, + "learning_rate": 2.5762469529560514e-05, + "loss": 1.6849534511566162, + "step": 1859 + }, + { + "epoch": 0.4851013887983308, + "grad_norm": 3.640625, + "learning_rate": 2.5748491838615457e-05, + "loss": 1.5472744703292847, + "step": 1860 + }, + { + "epoch": 0.4853621959966095, + "grad_norm": 3.609375, + "learning_rate": 2.5734511086360236e-05, + "loss": 1.6519335508346558, + "step": 1861 + }, + { + "epoch": 0.48562300319488816, + "grad_norm": 3.53125, + "learning_rate": 2.5720527280240172e-05, + "loss": 1.5347322225570679, + "step": 1862 + }, + { + "epoch": 0.48588381039316686, + "grad_norm": 3.515625, + "learning_rate": 2.5706540427702234e-05, + "loss": 1.4124343395233154, + "step": 1863 + }, + { + "epoch": 0.4861446175914455, + "grad_norm": 3.765625, + "learning_rate": 2.5692550536194988e-05, + "loss": 1.6010658740997314, + "step": 1864 + }, + { + "epoch": 0.4864054247897242, + "grad_norm": 3.703125, + "learning_rate": 2.5678557613168645e-05, + "loss": 1.6980068683624268, + "step": 1865 + }, + { + "epoch": 0.48666623198800285, + "grad_norm": 3.796875, + "learning_rate": 2.566456166607501e-05, + "loss": 1.8062810897827148, + "step": 1866 + }, + { + "epoch": 0.48692703918628155, + "grad_norm": 4.03125, + "learning_rate": 2.56505627023675e-05, + "loss": 1.6686768531799316, + "step": 1867 + }, + { + "epoch": 0.4871878463845602, + "grad_norm": 3.96875, + "learning_rate": 2.5636560729501154e-05, + "loss": 1.660166621208191, + "step": 1868 + }, + { + "epoch": 0.4874486535828389, + "grad_norm": 3.625, + "learning_rate": 2.56225557549326e-05, + "loss": 1.8616141080856323, + "step": 1869 + }, + { + "epoch": 0.48770946078111754, + "grad_norm": 4.0, + "learning_rate": 2.5608547786120056e-05, + "loss": 1.7120239734649658, + "step": 1870 + }, + { + "epoch": 0.48797026797939624, + "grad_norm": 3.578125, + "learning_rate": 2.5594536830523362e-05, + "loss": 1.5653290748596191, + "step": 1871 + }, + { + "epoch": 0.4882310751776749, + "grad_norm": 4.15625, + "learning_rate": 2.5580522895603917e-05, + "loss": 1.5002496242523193, + "step": 1872 + }, + { + "epoch": 0.4884918823759536, + "grad_norm": 3.75, + "learning_rate": 2.5566505988824738e-05, + "loss": 1.466421127319336, + "step": 1873 + }, + { + "epoch": 0.48875268957423224, + "grad_norm": 4.0, + "learning_rate": 2.5552486117650398e-05, + "loss": 1.4143470525741577, + "step": 1874 + }, + { + "epoch": 0.48901349677251094, + "grad_norm": 3.84375, + "learning_rate": 2.5538463289547068e-05, + "loss": 1.8513165712356567, + "step": 1875 + }, + { + "epoch": 0.4892743039707896, + "grad_norm": 3.5625, + "learning_rate": 2.5524437511982472e-05, + "loss": 1.4836419820785522, + "step": 1876 + }, + { + "epoch": 0.4895351111690683, + "grad_norm": 3.5625, + "learning_rate": 2.551040879242593e-05, + "loss": 1.5473915338516235, + "step": 1877 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 3.5625, + "learning_rate": 2.5496377138348313e-05, + "loss": 1.438124418258667, + "step": 1878 + }, + { + "epoch": 0.49005672556562563, + "grad_norm": 3.625, + "learning_rate": 2.5482342557222057e-05, + "loss": 1.5178227424621582, + "step": 1879 + }, + { + "epoch": 0.4903175327639043, + "grad_norm": 3.6875, + "learning_rate": 2.546830505652116e-05, + "loss": 1.6444778442382812, + "step": 1880 + }, + { + "epoch": 0.490578339962183, + "grad_norm": 3.5625, + "learning_rate": 2.545426464372117e-05, + "loss": 1.6002541780471802, + "step": 1881 + }, + { + "epoch": 0.4908391471604616, + "grad_norm": 3.546875, + "learning_rate": 2.5440221326299187e-05, + "loss": 1.7041120529174805, + "step": 1882 + }, + { + "epoch": 0.4910999543587403, + "grad_norm": 3.578125, + "learning_rate": 2.5426175111733873e-05, + "loss": 1.511730432510376, + "step": 1883 + }, + { + "epoch": 0.49136076155701897, + "grad_norm": 3.75, + "learning_rate": 2.5412126007505404e-05, + "loss": 1.7230110168457031, + "step": 1884 + }, + { + "epoch": 0.49162156875529767, + "grad_norm": 3.703125, + "learning_rate": 2.5398074021095517e-05, + "loss": 1.4343509674072266, + "step": 1885 + }, + { + "epoch": 0.4918823759535763, + "grad_norm": 3.5625, + "learning_rate": 2.5384019159987473e-05, + "loss": 1.3298263549804688, + "step": 1886 + }, + { + "epoch": 0.492143183151855, + "grad_norm": 3.78125, + "learning_rate": 2.536996143166608e-05, + "loss": 1.591562271118164, + "step": 1887 + }, + { + "epoch": 0.49240399035013366, + "grad_norm": 3.9375, + "learning_rate": 2.535590084361764e-05, + "loss": 1.6372700929641724, + "step": 1888 + }, + { + "epoch": 0.49266479754841236, + "grad_norm": 4.28125, + "learning_rate": 2.5341837403330015e-05, + "loss": 1.6347156763076782, + "step": 1889 + }, + { + "epoch": 0.492925604746691, + "grad_norm": 3.5625, + "learning_rate": 2.5327771118292575e-05, + "loss": 1.5804529190063477, + "step": 1890 + }, + { + "epoch": 0.4931864119449697, + "grad_norm": 3.890625, + "learning_rate": 2.5313701995996177e-05, + "loss": 1.7619483470916748, + "step": 1891 + }, + { + "epoch": 0.49344721914324835, + "grad_norm": 3.453125, + "learning_rate": 2.529963004393324e-05, + "loss": 1.387556791305542, + "step": 1892 + }, + { + "epoch": 0.49370802634152705, + "grad_norm": 3.78125, + "learning_rate": 2.5285555269597635e-05, + "loss": 1.744908094406128, + "step": 1893 + }, + { + "epoch": 0.4939688335398057, + "grad_norm": 3.8125, + "learning_rate": 2.5271477680484776e-05, + "loss": 1.6964850425720215, + "step": 1894 + }, + { + "epoch": 0.4942296407380844, + "grad_norm": 3.703125, + "learning_rate": 2.5257397284091572e-05, + "loss": 1.4730907678604126, + "step": 1895 + }, + { + "epoch": 0.49449044793636304, + "grad_norm": 4.375, + "learning_rate": 2.52433140879164e-05, + "loss": 1.6733832359313965, + "step": 1896 + }, + { + "epoch": 0.49475125513464174, + "grad_norm": 3.453125, + "learning_rate": 2.5229228099459153e-05, + "loss": 1.5286256074905396, + "step": 1897 + }, + { + "epoch": 0.4950120623329204, + "grad_norm": 3.765625, + "learning_rate": 2.5215139326221206e-05, + "loss": 1.682433843612671, + "step": 1898 + }, + { + "epoch": 0.49527286953119903, + "grad_norm": 3.765625, + "learning_rate": 2.5201047775705414e-05, + "loss": 1.7269163131713867, + "step": 1899 + }, + { + "epoch": 0.49553367672947773, + "grad_norm": 4.375, + "learning_rate": 2.5186953455416106e-05, + "loss": 1.81180739402771, + "step": 1900 + }, + { + "epoch": 0.4957944839277564, + "grad_norm": 3.875, + "learning_rate": 2.51728563728591e-05, + "loss": 1.527209758758545, + "step": 1901 + }, + { + "epoch": 0.4960552911260351, + "grad_norm": 3.828125, + "learning_rate": 2.515875653554167e-05, + "loss": 1.8661195039749146, + "step": 1902 + }, + { + "epoch": 0.4963160983243137, + "grad_norm": 3.421875, + "learning_rate": 2.5144653950972565e-05, + "loss": 1.3593074083328247, + "step": 1903 + }, + { + "epoch": 0.4965769055225924, + "grad_norm": 3.765625, + "learning_rate": 2.5130548626662002e-05, + "loss": 1.5782065391540527, + "step": 1904 + }, + { + "epoch": 0.49683771272087107, + "grad_norm": 3.484375, + "learning_rate": 2.511644057012164e-05, + "loss": 1.5576541423797607, + "step": 1905 + }, + { + "epoch": 0.4970985199191498, + "grad_norm": 3.4375, + "learning_rate": 2.510232978886461e-05, + "loss": 1.5748167037963867, + "step": 1906 + }, + { + "epoch": 0.4973593271174284, + "grad_norm": 3.453125, + "learning_rate": 2.5088216290405495e-05, + "loss": 1.518371820449829, + "step": 1907 + }, + { + "epoch": 0.4976201343157071, + "grad_norm": 3.5625, + "learning_rate": 2.5074100082260304e-05, + "loss": 1.7400927543640137, + "step": 1908 + }, + { + "epoch": 0.49788094151398576, + "grad_norm": 3.59375, + "learning_rate": 2.5059981171946515e-05, + "loss": 1.4117454290390015, + "step": 1909 + }, + { + "epoch": 0.49814174871226446, + "grad_norm": 3.578125, + "learning_rate": 2.5045859566983016e-05, + "loss": 1.3402498960494995, + "step": 1910 + }, + { + "epoch": 0.4984025559105431, + "grad_norm": 4.0, + "learning_rate": 2.5031735274890176e-05, + "loss": 1.872905969619751, + "step": 1911 + }, + { + "epoch": 0.4986633631088218, + "grad_norm": 3.9375, + "learning_rate": 2.501760830318974e-05, + "loss": 1.6585395336151123, + "step": 1912 + }, + { + "epoch": 0.49892417030710046, + "grad_norm": 3.78125, + "learning_rate": 2.5003478659404906e-05, + "loss": 1.6110572814941406, + "step": 1913 + }, + { + "epoch": 0.49918497750537916, + "grad_norm": 3.609375, + "learning_rate": 2.4989346351060314e-05, + "loss": 1.711033582687378, + "step": 1914 + }, + { + "epoch": 0.4994457847036578, + "grad_norm": 3.640625, + "learning_rate": 2.4975211385681986e-05, + "loss": 1.8739043474197388, + "step": 1915 + }, + { + "epoch": 0.4997065919019365, + "grad_norm": 3.65625, + "learning_rate": 2.4961073770797394e-05, + "loss": 1.5758452415466309, + "step": 1916 + }, + { + "epoch": 0.49996739910021515, + "grad_norm": 3.390625, + "learning_rate": 2.494693351393539e-05, + "loss": 1.5570107698440552, + "step": 1917 + }, + { + "epoch": 0.5002282062984938, + "grad_norm": 3.5625, + "learning_rate": 2.4932790622626247e-05, + "loss": 1.5484333038330078, + "step": 1918 + }, + { + "epoch": 0.5004890134967726, + "grad_norm": 3.59375, + "learning_rate": 2.4918645104401648e-05, + "loss": 1.6143624782562256, + "step": 1919 + }, + { + "epoch": 0.5007498206950511, + "grad_norm": 3.640625, + "learning_rate": 2.4904496966794662e-05, + "loss": 1.6884369850158691, + "step": 1920 + }, + { + "epoch": 0.5010106278933298, + "grad_norm": 3.71875, + "learning_rate": 2.4890346217339768e-05, + "loss": 1.5124157667160034, + "step": 1921 + }, + { + "epoch": 0.5012714350916085, + "grad_norm": 3.828125, + "learning_rate": 2.4876192863572816e-05, + "loss": 1.6060041189193726, + "step": 1922 + }, + { + "epoch": 0.5015322422898872, + "grad_norm": 3.828125, + "learning_rate": 2.4862036913031053e-05, + "loss": 1.4871320724487305, + "step": 1923 + }, + { + "epoch": 0.5017930494881658, + "grad_norm": 3.5625, + "learning_rate": 2.4847878373253118e-05, + "loss": 1.3923563957214355, + "step": 1924 + }, + { + "epoch": 0.5020538566864445, + "grad_norm": 3.578125, + "learning_rate": 2.4833717251779014e-05, + "loss": 1.5572978258132935, + "step": 1925 + }, + { + "epoch": 0.5023146638847232, + "grad_norm": 3.53125, + "learning_rate": 2.4819553556150134e-05, + "loss": 1.6809754371643066, + "step": 1926 + }, + { + "epoch": 0.5025754710830019, + "grad_norm": 4.03125, + "learning_rate": 2.4805387293909214e-05, + "loss": 1.8645470142364502, + "step": 1927 + }, + { + "epoch": 0.5028362782812805, + "grad_norm": 3.859375, + "learning_rate": 2.4791218472600396e-05, + "loss": 1.772527813911438, + "step": 1928 + }, + { + "epoch": 0.5030970854795592, + "grad_norm": 3.828125, + "learning_rate": 2.4777047099769157e-05, + "loss": 1.609299898147583, + "step": 1929 + }, + { + "epoch": 0.5033578926778379, + "grad_norm": 3.625, + "learning_rate": 2.4762873182962338e-05, + "loss": 1.748628854751587, + "step": 1930 + }, + { + "epoch": 0.5036186998761166, + "grad_norm": 3.5625, + "learning_rate": 2.4748696729728135e-05, + "loss": 1.6891801357269287, + "step": 1931 + }, + { + "epoch": 0.5038795070743952, + "grad_norm": 3.984375, + "learning_rate": 2.4734517747616106e-05, + "loss": 1.5500144958496094, + "step": 1932 + }, + { + "epoch": 0.5041403142726739, + "grad_norm": 3.328125, + "learning_rate": 2.472033624417715e-05, + "loss": 1.177302598953247, + "step": 1933 + }, + { + "epoch": 0.5044011214709526, + "grad_norm": 3.765625, + "learning_rate": 2.4706152226963484e-05, + "loss": 1.510635495185852, + "step": 1934 + }, + { + "epoch": 0.5046619286692313, + "grad_norm": 3.875, + "learning_rate": 2.46919657035287e-05, + "loss": 1.5966476202011108, + "step": 1935 + }, + { + "epoch": 0.5049227358675099, + "grad_norm": 3.625, + "learning_rate": 2.467777668142771e-05, + "loss": 1.3704627752304077, + "step": 1936 + }, + { + "epoch": 0.5051835430657886, + "grad_norm": 3.640625, + "learning_rate": 2.466358516821675e-05, + "loss": 1.6136150360107422, + "step": 1937 + }, + { + "epoch": 0.5054443502640673, + "grad_norm": 3.671875, + "learning_rate": 2.46493911714534e-05, + "loss": 1.286433219909668, + "step": 1938 + }, + { + "epoch": 0.505705157462346, + "grad_norm": 3.640625, + "learning_rate": 2.4635194698696544e-05, + "loss": 1.5212979316711426, + "step": 1939 + }, + { + "epoch": 0.5059659646606246, + "grad_norm": 3.484375, + "learning_rate": 2.4620995757506393e-05, + "loss": 1.5038641691207886, + "step": 1940 + }, + { + "epoch": 0.5062267718589033, + "grad_norm": 3.546875, + "learning_rate": 2.4606794355444467e-05, + "loss": 1.5387482643127441, + "step": 1941 + }, + { + "epoch": 0.506487579057182, + "grad_norm": 3.671875, + "learning_rate": 2.4592590500073607e-05, + "loss": 1.5222899913787842, + "step": 1942 + }, + { + "epoch": 0.5067483862554607, + "grad_norm": 3.75, + "learning_rate": 2.4578384198957957e-05, + "loss": 1.7417032718658447, + "step": 1943 + }, + { + "epoch": 0.5070091934537393, + "grad_norm": 3.265625, + "learning_rate": 2.456417545966295e-05, + "loss": 1.7328388690948486, + "step": 1944 + }, + { + "epoch": 0.507270000652018, + "grad_norm": 3.359375, + "learning_rate": 2.4549964289755347e-05, + "loss": 1.432841181755066, + "step": 1945 + }, + { + "epoch": 0.5075308078502967, + "grad_norm": 4.125, + "learning_rate": 2.453575069680317e-05, + "loss": 1.5138942003250122, + "step": 1946 + }, + { + "epoch": 0.5077916150485754, + "grad_norm": 3.703125, + "learning_rate": 2.4521534688375747e-05, + "loss": 1.4251571893692017, + "step": 1947 + }, + { + "epoch": 0.508052422246854, + "grad_norm": 3.828125, + "learning_rate": 2.4507316272043705e-05, + "loss": 1.2827256917953491, + "step": 1948 + }, + { + "epoch": 0.5083132294451327, + "grad_norm": 3.703125, + "learning_rate": 2.449309545537892e-05, + "loss": 1.6564579010009766, + "step": 1949 + }, + { + "epoch": 0.5085740366434114, + "grad_norm": 3.78125, + "learning_rate": 2.447887224595458e-05, + "loss": 1.5971944332122803, + "step": 1950 + }, + { + "epoch": 0.50883484384169, + "grad_norm": 3.640625, + "learning_rate": 2.4464646651345133e-05, + "loss": 1.6550348997116089, + "step": 1951 + }, + { + "epoch": 0.5090956510399687, + "grad_norm": 3.4375, + "learning_rate": 2.445041867912629e-05, + "loss": 1.5908136367797852, + "step": 1952 + }, + { + "epoch": 0.5093564582382474, + "grad_norm": 3.5625, + "learning_rate": 2.4436188336875044e-05, + "loss": 1.4241156578063965, + "step": 1953 + }, + { + "epoch": 0.5096172654365261, + "grad_norm": 3.25, + "learning_rate": 2.4421955632169638e-05, + "loss": 1.4336529970169067, + "step": 1954 + }, + { + "epoch": 0.5098780726348047, + "grad_norm": 3.78125, + "learning_rate": 2.440772057258958e-05, + "loss": 1.6574651002883911, + "step": 1955 + }, + { + "epoch": 0.5101388798330834, + "grad_norm": 3.84375, + "learning_rate": 2.439348316571563e-05, + "loss": 1.8338189125061035, + "step": 1956 + }, + { + "epoch": 0.5103996870313621, + "grad_norm": 3.671875, + "learning_rate": 2.4379243419129788e-05, + "loss": 1.7848278284072876, + "step": 1957 + }, + { + "epoch": 0.5106604942296408, + "grad_norm": 3.5, + "learning_rate": 2.436500134041532e-05, + "loss": 1.265929102897644, + "step": 1958 + }, + { + "epoch": 0.5109213014279194, + "grad_norm": 3.703125, + "learning_rate": 2.4350756937156718e-05, + "loss": 1.5478613376617432, + "step": 1959 + }, + { + "epoch": 0.5111821086261981, + "grad_norm": 3.984375, + "learning_rate": 2.433651021693972e-05, + "loss": 1.741408348083496, + "step": 1960 + }, + { + "epoch": 0.5114429158244768, + "grad_norm": 3.46875, + "learning_rate": 2.4322261187351287e-05, + "loss": 1.689663290977478, + "step": 1961 + }, + { + "epoch": 0.5117037230227555, + "grad_norm": 3.546875, + "learning_rate": 2.430800985597963e-05, + "loss": 1.541811466217041, + "step": 1962 + }, + { + "epoch": 0.511964530221034, + "grad_norm": 3.359375, + "learning_rate": 2.429375623041417e-05, + "loss": 1.5056332349777222, + "step": 1963 + }, + { + "epoch": 0.5122253374193128, + "grad_norm": 3.625, + "learning_rate": 2.4279500318245542e-05, + "loss": 1.6486773490905762, + "step": 1964 + }, + { + "epoch": 0.5124861446175915, + "grad_norm": 3.734375, + "learning_rate": 2.426524212706563e-05, + "loss": 1.5406248569488525, + "step": 1965 + }, + { + "epoch": 0.5127469518158702, + "grad_norm": 3.40625, + "learning_rate": 2.4250981664467488e-05, + "loss": 1.1409528255462646, + "step": 1966 + }, + { + "epoch": 0.5130077590141487, + "grad_norm": 3.890625, + "learning_rate": 2.423671893804543e-05, + "loss": 1.627926230430603, + "step": 1967 + }, + { + "epoch": 0.5132685662124274, + "grad_norm": 3.515625, + "learning_rate": 2.4222453955394928e-05, + "loss": 1.7604104280471802, + "step": 1968 + }, + { + "epoch": 0.5135293734107061, + "grad_norm": 3.5625, + "learning_rate": 2.4208186724112683e-05, + "loss": 1.5717003345489502, + "step": 1969 + }, + { + "epoch": 0.5137901806089848, + "grad_norm": 3.375, + "learning_rate": 2.419391725179659e-05, + "loss": 1.3920912742614746, + "step": 1970 + }, + { + "epoch": 0.5140509878072634, + "grad_norm": 3.5, + "learning_rate": 2.417964554604573e-05, + "loss": 1.8886692523956299, + "step": 1971 + }, + { + "epoch": 0.5143117950055421, + "grad_norm": 3.265625, + "learning_rate": 2.4165371614460388e-05, + "loss": 1.5830036401748657, + "step": 1972 + }, + { + "epoch": 0.5145726022038208, + "grad_norm": 3.84375, + "learning_rate": 2.415109546464201e-05, + "loss": 1.675565481185913, + "step": 1973 + }, + { + "epoch": 0.5148334094020995, + "grad_norm": 3.515625, + "learning_rate": 2.4136817104193244e-05, + "loss": 1.5147334337234497, + "step": 1974 + }, + { + "epoch": 0.5150942166003781, + "grad_norm": 3.3125, + "learning_rate": 2.412253654071791e-05, + "loss": 1.4753445386886597, + "step": 1975 + }, + { + "epoch": 0.5153550237986568, + "grad_norm": 3.640625, + "learning_rate": 2.4108253781820998e-05, + "loss": 1.3679883480072021, + "step": 1976 + }, + { + "epoch": 0.5156158309969355, + "grad_norm": 3.546875, + "learning_rate": 2.4093968835108674e-05, + "loss": 1.4993984699249268, + "step": 1977 + }, + { + "epoch": 0.5158766381952142, + "grad_norm": 3.421875, + "learning_rate": 2.407968170818825e-05, + "loss": 1.2360178232192993, + "step": 1978 + }, + { + "epoch": 0.5161374453934928, + "grad_norm": 3.6875, + "learning_rate": 2.406539240866823e-05, + "loss": 1.4463670253753662, + "step": 1979 + }, + { + "epoch": 0.5163982525917715, + "grad_norm": 3.546875, + "learning_rate": 2.4051100944158252e-05, + "loss": 1.7772748470306396, + "step": 1980 + }, + { + "epoch": 0.5166590597900502, + "grad_norm": 3.46875, + "learning_rate": 2.403680732226911e-05, + "loss": 1.677770733833313, + "step": 1981 + }, + { + "epoch": 0.5169198669883289, + "grad_norm": 3.671875, + "learning_rate": 2.4022511550612757e-05, + "loss": 1.669695258140564, + "step": 1982 + }, + { + "epoch": 0.5171806741866075, + "grad_norm": 3.46875, + "learning_rate": 2.400821363680227e-05, + "loss": 1.4655897617340088, + "step": 1983 + }, + { + "epoch": 0.5174414813848862, + "grad_norm": 3.546875, + "learning_rate": 2.3993913588451898e-05, + "loss": 1.4056119918823242, + "step": 1984 + }, + { + "epoch": 0.5177022885831649, + "grad_norm": 3.40625, + "learning_rate": 2.3979611413177003e-05, + "loss": 1.5410664081573486, + "step": 1985 + }, + { + "epoch": 0.5179630957814436, + "grad_norm": 3.5625, + "learning_rate": 2.3965307118594077e-05, + "loss": 1.8617842197418213, + "step": 1986 + }, + { + "epoch": 0.5182239029797222, + "grad_norm": 3.484375, + "learning_rate": 2.3951000712320768e-05, + "loss": 1.5264947414398193, + "step": 1987 + }, + { + "epoch": 0.5184847101780009, + "grad_norm": 3.625, + "learning_rate": 2.393669220197581e-05, + "loss": 1.6375635862350464, + "step": 1988 + }, + { + "epoch": 0.5187455173762796, + "grad_norm": 3.578125, + "learning_rate": 2.3922381595179096e-05, + "loss": 1.56282377243042, + "step": 1989 + }, + { + "epoch": 0.5190063245745583, + "grad_norm": 3.625, + "learning_rate": 2.3908068899551604e-05, + "loss": 1.568946123123169, + "step": 1990 + }, + { + "epoch": 0.5192671317728369, + "grad_norm": 3.203125, + "learning_rate": 2.3893754122715446e-05, + "loss": 1.2424476146697998, + "step": 1991 + }, + { + "epoch": 0.5195279389711156, + "grad_norm": 3.640625, + "learning_rate": 2.3879437272293827e-05, + "loss": 1.5834656953811646, + "step": 1992 + }, + { + "epoch": 0.5197887461693943, + "grad_norm": 3.28125, + "learning_rate": 2.3865118355911066e-05, + "loss": 1.5506184101104736, + "step": 1993 + }, + { + "epoch": 0.520049553367673, + "grad_norm": 3.703125, + "learning_rate": 2.3850797381192586e-05, + "loss": 1.4371000528335571, + "step": 1994 + }, + { + "epoch": 0.5203103605659516, + "grad_norm": 3.828125, + "learning_rate": 2.3836474355764887e-05, + "loss": 1.6684209108352661, + "step": 1995 + }, + { + "epoch": 0.5205711677642303, + "grad_norm": 3.671875, + "learning_rate": 2.382214928725559e-05, + "loss": 1.3157470226287842, + "step": 1996 + }, + { + "epoch": 0.520831974962509, + "grad_norm": 3.609375, + "learning_rate": 2.380782218329337e-05, + "loss": 1.7018301486968994, + "step": 1997 + }, + { + "epoch": 0.5210927821607877, + "grad_norm": 3.265625, + "learning_rate": 2.3793493051508012e-05, + "loss": 1.5252659320831299, + "step": 1998 + }, + { + "epoch": 0.5213535893590663, + "grad_norm": 3.59375, + "learning_rate": 2.3779161899530383e-05, + "loss": 1.513904094696045, + "step": 1999 + }, + { + "epoch": 0.521614396557345, + "grad_norm": 3.703125, + "learning_rate": 2.3764828734992392e-05, + "loss": 1.476542592048645, + "step": 2000 + }, + { + "epoch": 0.5218752037556237, + "grad_norm": 3.328125, + "learning_rate": 2.3750493565527063e-05, + "loss": 1.3485337495803833, + "step": 2001 + }, + { + "epoch": 0.5221360109539023, + "grad_norm": 3.53125, + "learning_rate": 2.373615639876846e-05, + "loss": 1.6200525760650635, + "step": 2002 + }, + { + "epoch": 0.522396818152181, + "grad_norm": 3.578125, + "learning_rate": 2.372181724235172e-05, + "loss": 1.9040131568908691, + "step": 2003 + }, + { + "epoch": 0.5226576253504597, + "grad_norm": 3.90625, + "learning_rate": 2.3707476103913037e-05, + "loss": 1.5234922170639038, + "step": 2004 + }, + { + "epoch": 0.5229184325487384, + "grad_norm": 3.4375, + "learning_rate": 2.3693132991089663e-05, + "loss": 1.3111425638198853, + "step": 2005 + }, + { + "epoch": 0.523179239747017, + "grad_norm": 3.40625, + "learning_rate": 2.36787879115199e-05, + "loss": 1.6711585521697998, + "step": 2006 + }, + { + "epoch": 0.5234400469452957, + "grad_norm": 3.484375, + "learning_rate": 2.3664440872843098e-05, + "loss": 1.6181418895721436, + "step": 2007 + }, + { + "epoch": 0.5237008541435744, + "grad_norm": 3.84375, + "learning_rate": 2.365009188269965e-05, + "loss": 1.4819923639297485, + "step": 2008 + }, + { + "epoch": 0.5239616613418531, + "grad_norm": 3.453125, + "learning_rate": 2.363574094873098e-05, + "loss": 1.5990458726882935, + "step": 2009 + }, + { + "epoch": 0.5242224685401317, + "grad_norm": 4.53125, + "learning_rate": 2.3621388078579566e-05, + "loss": 1.9280414581298828, + "step": 2010 + }, + { + "epoch": 0.5244832757384104, + "grad_norm": 3.453125, + "learning_rate": 2.3607033279888905e-05, + "loss": 1.5945062637329102, + "step": 2011 + }, + { + "epoch": 0.5247440829366891, + "grad_norm": 3.21875, + "learning_rate": 2.3592676560303512e-05, + "loss": 1.4565190076828003, + "step": 2012 + }, + { + "epoch": 0.5250048901349678, + "grad_norm": 3.359375, + "learning_rate": 2.357831792746895e-05, + "loss": 1.3984203338623047, + "step": 2013 + }, + { + "epoch": 0.5252656973332464, + "grad_norm": 3.421875, + "learning_rate": 2.356395738903177e-05, + "loss": 1.573155403137207, + "step": 2014 + }, + { + "epoch": 0.525526504531525, + "grad_norm": 3.53125, + "learning_rate": 2.354959495263957e-05, + "loss": 1.536868929862976, + "step": 2015 + }, + { + "epoch": 0.5257873117298038, + "grad_norm": 3.453125, + "learning_rate": 2.3535230625940936e-05, + "loss": 1.5260744094848633, + "step": 2016 + }, + { + "epoch": 0.5260481189280825, + "grad_norm": 3.46875, + "learning_rate": 2.352086441658546e-05, + "loss": 1.3998295068740845, + "step": 2017 + }, + { + "epoch": 0.526308926126361, + "grad_norm": 3.65625, + "learning_rate": 2.350649633222376e-05, + "loss": 1.337104320526123, + "step": 2018 + }, + { + "epoch": 0.5265697333246397, + "grad_norm": 3.546875, + "learning_rate": 2.349212638050742e-05, + "loss": 1.6615405082702637, + "step": 2019 + }, + { + "epoch": 0.5268305405229184, + "grad_norm": 3.71875, + "learning_rate": 2.347775456908904e-05, + "loss": 1.4596977233886719, + "step": 2020 + }, + { + "epoch": 0.5270913477211971, + "grad_norm": 3.421875, + "learning_rate": 2.3463380905622214e-05, + "loss": 1.543012022972107, + "step": 2021 + }, + { + "epoch": 0.5273521549194757, + "grad_norm": 3.890625, + "learning_rate": 2.34490053977615e-05, + "loss": 1.8023302555084229, + "step": 2022 + }, + { + "epoch": 0.5276129621177544, + "grad_norm": 3.53125, + "learning_rate": 2.3434628053162465e-05, + "loss": 1.5883944034576416, + "step": 2023 + }, + { + "epoch": 0.5278737693160331, + "grad_norm": 3.6875, + "learning_rate": 2.3420248879481632e-05, + "loss": 1.5604071617126465, + "step": 2024 + }, + { + "epoch": 0.5281345765143118, + "grad_norm": 3.625, + "learning_rate": 2.3405867884376504e-05, + "loss": 1.4639250040054321, + "step": 2025 + }, + { + "epoch": 0.5283953837125904, + "grad_norm": 3.546875, + "learning_rate": 2.3391485075505567e-05, + "loss": 1.5720698833465576, + "step": 2026 + }, + { + "epoch": 0.5286561909108691, + "grad_norm": 3.34375, + "learning_rate": 2.3377100460528256e-05, + "loss": 1.2630044221878052, + "step": 2027 + }, + { + "epoch": 0.5289169981091478, + "grad_norm": 3.546875, + "learning_rate": 2.3362714047104987e-05, + "loss": 1.364597201347351, + "step": 2028 + }, + { + "epoch": 0.5291778053074265, + "grad_norm": 3.6875, + "learning_rate": 2.3348325842897102e-05, + "loss": 1.5645570755004883, + "step": 2029 + }, + { + "epoch": 0.5294386125057051, + "grad_norm": 3.5, + "learning_rate": 2.3333935855566922e-05, + "loss": 1.4444186687469482, + "step": 2030 + }, + { + "epoch": 0.5296994197039838, + "grad_norm": 3.484375, + "learning_rate": 2.331954409277772e-05, + "loss": 1.4813719987869263, + "step": 2031 + }, + { + "epoch": 0.5299602269022625, + "grad_norm": 3.375, + "learning_rate": 2.33051505621937e-05, + "loss": 1.4920804500579834, + "step": 2032 + }, + { + "epoch": 0.5302210341005412, + "grad_norm": 3.59375, + "learning_rate": 2.329075527148002e-05, + "loss": 1.4645588397979736, + "step": 2033 + }, + { + "epoch": 0.5304818412988198, + "grad_norm": 3.484375, + "learning_rate": 2.3276358228302757e-05, + "loss": 1.6457539796829224, + "step": 2034 + }, + { + "epoch": 0.5307426484970985, + "grad_norm": 3.375, + "learning_rate": 2.326195944032894e-05, + "loss": 1.6536197662353516, + "step": 2035 + }, + { + "epoch": 0.5310034556953772, + "grad_norm": 3.59375, + "learning_rate": 2.3247558915226526e-05, + "loss": 1.5841575860977173, + "step": 2036 + }, + { + "epoch": 0.5312642628936559, + "grad_norm": 3.609375, + "learning_rate": 2.3233156660664384e-05, + "loss": 1.562570333480835, + "step": 2037 + }, + { + "epoch": 0.5315250700919345, + "grad_norm": 3.484375, + "learning_rate": 2.3218752684312308e-05, + "loss": 1.4996845722198486, + "step": 2038 + }, + { + "epoch": 0.5317858772902132, + "grad_norm": 3.75, + "learning_rate": 2.320434699384102e-05, + "loss": 1.6510419845581055, + "step": 2039 + }, + { + "epoch": 0.5320466844884919, + "grad_norm": 3.578125, + "learning_rate": 2.3189939596922156e-05, + "loss": 1.5520703792572021, + "step": 2040 + }, + { + "epoch": 0.5323074916867706, + "grad_norm": 3.46875, + "learning_rate": 2.3175530501228227e-05, + "loss": 1.5717568397521973, + "step": 2041 + }, + { + "epoch": 0.5325682988850492, + "grad_norm": 3.984375, + "learning_rate": 2.3161119714432693e-05, + "loss": 1.6424689292907715, + "step": 2042 + }, + { + "epoch": 0.5328291060833279, + "grad_norm": 3.53125, + "learning_rate": 2.314670724420989e-05, + "loss": 1.5468082427978516, + "step": 2043 + }, + { + "epoch": 0.5330899132816066, + "grad_norm": 3.515625, + "learning_rate": 2.3132293098235056e-05, + "loss": 1.6908055543899536, + "step": 2044 + }, + { + "epoch": 0.5333507204798853, + "grad_norm": 3.515625, + "learning_rate": 2.3117877284184322e-05, + "loss": 1.4291343688964844, + "step": 2045 + }, + { + "epoch": 0.5336115276781639, + "grad_norm": 3.46875, + "learning_rate": 2.3103459809734706e-05, + "loss": 1.5319087505340576, + "step": 2046 + }, + { + "epoch": 0.5338723348764426, + "grad_norm": 3.921875, + "learning_rate": 2.3089040682564104e-05, + "loss": 1.5633468627929688, + "step": 2047 + }, + { + "epoch": 0.5341331420747213, + "grad_norm": 3.890625, + "learning_rate": 2.307461991035131e-05, + "loss": 1.7004632949829102, + "step": 2048 + }, + { + "epoch": 0.5343939492729999, + "grad_norm": 3.34375, + "learning_rate": 2.3060197500775977e-05, + "loss": 1.5623658895492554, + "step": 2049 + }, + { + "epoch": 0.5346547564712786, + "grad_norm": 3.65625, + "learning_rate": 2.304577346151864e-05, + "loss": 1.5550426244735718, + "step": 2050 + }, + { + "epoch": 0.5349155636695573, + "grad_norm": 3.5, + "learning_rate": 2.303134780026069e-05, + "loss": 1.608680248260498, + "step": 2051 + }, + { + "epoch": 0.535176370867836, + "grad_norm": 3.984375, + "learning_rate": 2.3016920524684396e-05, + "loss": 1.9391027688980103, + "step": 2052 + }, + { + "epoch": 0.5354371780661146, + "grad_norm": 3.8125, + "learning_rate": 2.300249164247288e-05, + "loss": 1.6023705005645752, + "step": 2053 + }, + { + "epoch": 0.5356979852643933, + "grad_norm": 3.53125, + "learning_rate": 2.298806116131012e-05, + "loss": 1.5014597177505493, + "step": 2054 + }, + { + "epoch": 0.535958792462672, + "grad_norm": 3.828125, + "learning_rate": 2.297362908888093e-05, + "loss": 1.5135198831558228, + "step": 2055 + }, + { + "epoch": 0.5362195996609507, + "grad_norm": 3.515625, + "learning_rate": 2.2959195432871012e-05, + "loss": 1.4736385345458984, + "step": 2056 + }, + { + "epoch": 0.5364804068592293, + "grad_norm": 3.734375, + "learning_rate": 2.2944760200966876e-05, + "loss": 1.570051670074463, + "step": 2057 + }, + { + "epoch": 0.536741214057508, + "grad_norm": 3.5625, + "learning_rate": 2.2930323400855875e-05, + "loss": 1.6457270383834839, + "step": 2058 + }, + { + "epoch": 0.5370020212557867, + "grad_norm": 3.4375, + "learning_rate": 2.2915885040226205e-05, + "loss": 1.40768301486969, + "step": 2059 + }, + { + "epoch": 0.5372628284540654, + "grad_norm": 3.921875, + "learning_rate": 2.29014451267669e-05, + "loss": 1.632968544960022, + "step": 2060 + }, + { + "epoch": 0.537523635652344, + "grad_norm": 3.203125, + "learning_rate": 2.2887003668167803e-05, + "loss": 1.3858742713928223, + "step": 2061 + }, + { + "epoch": 0.5377844428506227, + "grad_norm": 3.40625, + "learning_rate": 2.28725606721196e-05, + "loss": 1.4711406230926514, + "step": 2062 + }, + { + "epoch": 0.5380452500489014, + "grad_norm": 3.53125, + "learning_rate": 2.2858116146313772e-05, + "loss": 1.7600085735321045, + "step": 2063 + }, + { + "epoch": 0.5383060572471801, + "grad_norm": 3.53125, + "learning_rate": 2.2843670098442634e-05, + "loss": 1.6487839221954346, + "step": 2064 + }, + { + "epoch": 0.5385668644454586, + "grad_norm": 3.296875, + "learning_rate": 2.2829222536199308e-05, + "loss": 1.547398567199707, + "step": 2065 + }, + { + "epoch": 0.5388276716437373, + "grad_norm": 3.484375, + "learning_rate": 2.281477346727772e-05, + "loss": 1.616747260093689, + "step": 2066 + }, + { + "epoch": 0.539088478842016, + "grad_norm": 3.828125, + "learning_rate": 2.2800322899372586e-05, + "loss": 1.5411043167114258, + "step": 2067 + }, + { + "epoch": 0.5393492860402948, + "grad_norm": 3.984375, + "learning_rate": 2.2785870840179437e-05, + "loss": 1.7756426334381104, + "step": 2068 + }, + { + "epoch": 0.5396100932385733, + "grad_norm": 3.28125, + "learning_rate": 2.2771417297394613e-05, + "loss": 1.355830192565918, + "step": 2069 + }, + { + "epoch": 0.539870900436852, + "grad_norm": 3.765625, + "learning_rate": 2.27569622787152e-05, + "loss": 1.398935317993164, + "step": 2070 + }, + { + "epoch": 0.5401317076351307, + "grad_norm": 3.8125, + "learning_rate": 2.274250579183911e-05, + "loss": 1.6798679828643799, + "step": 2071 + }, + { + "epoch": 0.5403925148334094, + "grad_norm": 3.78125, + "learning_rate": 2.2728047844465006e-05, + "loss": 1.7959800958633423, + "step": 2072 + }, + { + "epoch": 0.540653322031688, + "grad_norm": 3.390625, + "learning_rate": 2.2713588444292358e-05, + "loss": 1.4074865579605103, + "step": 2073 + }, + { + "epoch": 0.5409141292299667, + "grad_norm": 3.46875, + "learning_rate": 2.2699127599021397e-05, + "loss": 1.409622311592102, + "step": 2074 + }, + { + "epoch": 0.5411749364282454, + "grad_norm": 3.328125, + "learning_rate": 2.2684665316353112e-05, + "loss": 1.2508794069290161, + "step": 2075 + }, + { + "epoch": 0.5414357436265241, + "grad_norm": 3.578125, + "learning_rate": 2.2670201603989275e-05, + "loss": 1.6247743368148804, + "step": 2076 + }, + { + "epoch": 0.5416965508248027, + "grad_norm": 3.203125, + "learning_rate": 2.265573646963241e-05, + "loss": 1.2508021593093872, + "step": 2077 + }, + { + "epoch": 0.5419573580230814, + "grad_norm": 3.453125, + "learning_rate": 2.26412699209858e-05, + "loss": 1.3514494895935059, + "step": 2078 + }, + { + "epoch": 0.5422181652213601, + "grad_norm": 3.5, + "learning_rate": 2.2626801965753483e-05, + "loss": 1.5631085634231567, + "step": 2079 + }, + { + "epoch": 0.5424789724196388, + "grad_norm": 3.578125, + "learning_rate": 2.2612332611640243e-05, + "loss": 1.5388072729110718, + "step": 2080 + }, + { + "epoch": 0.5427397796179174, + "grad_norm": 3.328125, + "learning_rate": 2.259786186635161e-05, + "loss": 1.2644977569580078, + "step": 2081 + }, + { + "epoch": 0.5430005868161961, + "grad_norm": 3.40625, + "learning_rate": 2.258338973759386e-05, + "loss": 1.6229095458984375, + "step": 2082 + }, + { + "epoch": 0.5432613940144748, + "grad_norm": 3.46875, + "learning_rate": 2.2568916233074004e-05, + "loss": 1.4737906455993652, + "step": 2083 + }, + { + "epoch": 0.5435222012127535, + "grad_norm": 3.421875, + "learning_rate": 2.2554441360499775e-05, + "loss": 1.6421562433242798, + "step": 2084 + }, + { + "epoch": 0.5437830084110321, + "grad_norm": 3.59375, + "learning_rate": 2.253996512757964e-05, + "loss": 1.3732681274414062, + "step": 2085 + }, + { + "epoch": 0.5440438156093108, + "grad_norm": 3.5, + "learning_rate": 2.2525487542022808e-05, + "loss": 1.5210660696029663, + "step": 2086 + }, + { + "epoch": 0.5443046228075895, + "grad_norm": 3.25, + "learning_rate": 2.2511008611539177e-05, + "loss": 1.2596254348754883, + "step": 2087 + }, + { + "epoch": 0.5445654300058682, + "grad_norm": 3.265625, + "learning_rate": 2.249652834383939e-05, + "loss": 1.3855814933776855, + "step": 2088 + }, + { + "epoch": 0.5448262372041468, + "grad_norm": 3.53125, + "learning_rate": 2.2482046746634784e-05, + "loss": 1.484673023223877, + "step": 2089 + }, + { + "epoch": 0.5450870444024255, + "grad_norm": 3.78125, + "learning_rate": 2.2467563827637414e-05, + "loss": 1.5831198692321777, + "step": 2090 + }, + { + "epoch": 0.5453478516007042, + "grad_norm": 3.484375, + "learning_rate": 2.2453079594560025e-05, + "loss": 1.5446763038635254, + "step": 2091 + }, + { + "epoch": 0.5456086587989829, + "grad_norm": 3.5625, + "learning_rate": 2.2438594055116077e-05, + "loss": 1.385252594947815, + "step": 2092 + }, + { + "epoch": 0.5458694659972615, + "grad_norm": 3.609375, + "learning_rate": 2.2424107217019724e-05, + "loss": 1.5257917642593384, + "step": 2093 + }, + { + "epoch": 0.5461302731955402, + "grad_norm": 3.625, + "learning_rate": 2.24096190879858e-05, + "loss": 1.4672267436981201, + "step": 2094 + }, + { + "epoch": 0.5463910803938189, + "grad_norm": 3.515625, + "learning_rate": 2.2395129675729845e-05, + "loss": 1.6204917430877686, + "step": 2095 + }, + { + "epoch": 0.5466518875920976, + "grad_norm": 3.734375, + "learning_rate": 2.238063898796806e-05, + "loss": 1.6788408756256104, + "step": 2096 + }, + { + "epoch": 0.5469126947903762, + "grad_norm": 3.53125, + "learning_rate": 2.236614703241734e-05, + "loss": 1.4796342849731445, + "step": 2097 + }, + { + "epoch": 0.5471735019886549, + "grad_norm": 3.625, + "learning_rate": 2.2351653816795263e-05, + "loss": 1.5955634117126465, + "step": 2098 + }, + { + "epoch": 0.5474343091869336, + "grad_norm": 3.40625, + "learning_rate": 2.233715934882005e-05, + "loss": 1.5928817987442017, + "step": 2099 + }, + { + "epoch": 0.5476951163852122, + "grad_norm": 3.40625, + "learning_rate": 2.232266363621062e-05, + "loss": 1.5585155487060547, + "step": 2100 + }, + { + "epoch": 0.5479559235834909, + "grad_norm": 3.859375, + "learning_rate": 2.230816668668653e-05, + "loss": 1.6276161670684814, + "step": 2101 + }, + { + "epoch": 0.5482167307817696, + "grad_norm": 3.421875, + "learning_rate": 2.2293668507968015e-05, + "loss": 1.4985865354537964, + "step": 2102 + }, + { + "epoch": 0.5484775379800483, + "grad_norm": 3.75, + "learning_rate": 2.2279169107775944e-05, + "loss": 1.6169122457504272, + "step": 2103 + }, + { + "epoch": 0.5487383451783269, + "grad_norm": 3.390625, + "learning_rate": 2.2264668493831863e-05, + "loss": 1.5767686367034912, + "step": 2104 + }, + { + "epoch": 0.5489991523766056, + "grad_norm": 3.84375, + "learning_rate": 2.225016667385795e-05, + "loss": 1.4845507144927979, + "step": 2105 + }, + { + "epoch": 0.5492599595748843, + "grad_norm": 3.59375, + "learning_rate": 2.2235663655577006e-05, + "loss": 1.5769743919372559, + "step": 2106 + }, + { + "epoch": 0.549520766773163, + "grad_norm": 3.59375, + "learning_rate": 2.2221159446712513e-05, + "loss": 1.3913387060165405, + "step": 2107 + }, + { + "epoch": 0.5497815739714416, + "grad_norm": 3.59375, + "learning_rate": 2.2206654054988545e-05, + "loss": 1.4873542785644531, + "step": 2108 + }, + { + "epoch": 0.5500423811697203, + "grad_norm": 3.515625, + "learning_rate": 2.2192147488129837e-05, + "loss": 1.367735505104065, + "step": 2109 + }, + { + "epoch": 0.550303188367999, + "grad_norm": 3.421875, + "learning_rate": 2.2177639753861735e-05, + "loss": 1.3265107870101929, + "step": 2110 + }, + { + "epoch": 0.5505639955662777, + "grad_norm": 3.625, + "learning_rate": 2.21631308599102e-05, + "loss": 1.437415361404419, + "step": 2111 + }, + { + "epoch": 0.5508248027645563, + "grad_norm": 3.53125, + "learning_rate": 2.2148620814001828e-05, + "loss": 1.553528070449829, + "step": 2112 + }, + { + "epoch": 0.551085609962835, + "grad_norm": 3.6875, + "learning_rate": 2.2134109623863815e-05, + "loss": 1.4988465309143066, + "step": 2113 + }, + { + "epoch": 0.5513464171611137, + "grad_norm": 3.53125, + "learning_rate": 2.2119597297223976e-05, + "loss": 1.4551095962524414, + "step": 2114 + }, + { + "epoch": 0.5516072243593924, + "grad_norm": 3.46875, + "learning_rate": 2.2105083841810718e-05, + "loss": 1.2718291282653809, + "step": 2115 + }, + { + "epoch": 0.551868031557671, + "grad_norm": 3.484375, + "learning_rate": 2.209056926535307e-05, + "loss": 1.6388218402862549, + "step": 2116 + }, + { + "epoch": 0.5521288387559496, + "grad_norm": 3.71875, + "learning_rate": 2.207605357558064e-05, + "loss": 1.6339783668518066, + "step": 2117 + }, + { + "epoch": 0.5523896459542283, + "grad_norm": 3.28125, + "learning_rate": 2.2061536780223634e-05, + "loss": 1.6556386947631836, + "step": 2118 + }, + { + "epoch": 0.552650453152507, + "grad_norm": 3.625, + "learning_rate": 2.2047018887012838e-05, + "loss": 1.7499902248382568, + "step": 2119 + }, + { + "epoch": 0.5529112603507856, + "grad_norm": 3.390625, + "learning_rate": 2.2032499903679648e-05, + "loss": 1.5917760133743286, + "step": 2120 + }, + { + "epoch": 0.5531720675490643, + "grad_norm": 3.359375, + "learning_rate": 2.201797983795601e-05, + "loss": 1.4344977140426636, + "step": 2121 + }, + { + "epoch": 0.553432874747343, + "grad_norm": 3.234375, + "learning_rate": 2.200345869757448e-05, + "loss": 1.4399155378341675, + "step": 2122 + }, + { + "epoch": 0.5536936819456217, + "grad_norm": 3.359375, + "learning_rate": 2.1988936490268142e-05, + "loss": 1.5594520568847656, + "step": 2123 + }, + { + "epoch": 0.5539544891439003, + "grad_norm": 3.515625, + "learning_rate": 2.1974413223770695e-05, + "loss": 1.6037397384643555, + "step": 2124 + }, + { + "epoch": 0.554215296342179, + "grad_norm": 3.875, + "learning_rate": 2.195988890581637e-05, + "loss": 1.6540207862854004, + "step": 2125 + }, + { + "epoch": 0.5544761035404577, + "grad_norm": 3.640625, + "learning_rate": 2.1945363544139963e-05, + "loss": 1.4645299911499023, + "step": 2126 + }, + { + "epoch": 0.5547369107387364, + "grad_norm": 3.34375, + "learning_rate": 2.193083714647685e-05, + "loss": 1.3683663606643677, + "step": 2127 + }, + { + "epoch": 0.554997717937015, + "grad_norm": 3.453125, + "learning_rate": 2.1916309720562915e-05, + "loss": 1.2491161823272705, + "step": 2128 + }, + { + "epoch": 0.5552585251352937, + "grad_norm": 3.609375, + "learning_rate": 2.1901781274134633e-05, + "loss": 1.5425299406051636, + "step": 2129 + }, + { + "epoch": 0.5555193323335724, + "grad_norm": 3.59375, + "learning_rate": 2.1887251814928998e-05, + "loss": 1.514522671699524, + "step": 2130 + }, + { + "epoch": 0.5557801395318511, + "grad_norm": 3.40625, + "learning_rate": 2.1872721350683552e-05, + "loss": 1.4008015394210815, + "step": 2131 + }, + { + "epoch": 0.5560409467301297, + "grad_norm": 3.5, + "learning_rate": 2.1858189889136363e-05, + "loss": 1.4071357250213623, + "step": 2132 + }, + { + "epoch": 0.5563017539284084, + "grad_norm": 3.46875, + "learning_rate": 2.1843657438026038e-05, + "loss": 1.4505395889282227, + "step": 2133 + }, + { + "epoch": 0.5565625611266871, + "grad_norm": 5.34375, + "learning_rate": 2.182912400509172e-05, + "loss": 2.069634437561035, + "step": 2134 + }, + { + "epoch": 0.5568233683249658, + "grad_norm": 3.53125, + "learning_rate": 2.181458959807305e-05, + "loss": 1.4999897480010986, + "step": 2135 + }, + { + "epoch": 0.5570841755232444, + "grad_norm": 3.203125, + "learning_rate": 2.1800054224710213e-05, + "loss": 1.4152805805206299, + "step": 2136 + }, + { + "epoch": 0.5573449827215231, + "grad_norm": 3.6875, + "learning_rate": 2.1785517892743887e-05, + "loss": 1.6478434801101685, + "step": 2137 + }, + { + "epoch": 0.5576057899198018, + "grad_norm": 3.765625, + "learning_rate": 2.1770980609915283e-05, + "loss": 1.5277208089828491, + "step": 2138 + }, + { + "epoch": 0.5578665971180805, + "grad_norm": 3.609375, + "learning_rate": 2.1756442383966102e-05, + "loss": 1.4693225622177124, + "step": 2139 + }, + { + "epoch": 0.5581274043163591, + "grad_norm": 3.171875, + "learning_rate": 2.174190322263855e-05, + "loss": 1.1739648580551147, + "step": 2140 + }, + { + "epoch": 0.5583882115146378, + "grad_norm": 3.4375, + "learning_rate": 2.172736313367533e-05, + "loss": 1.4084835052490234, + "step": 2141 + }, + { + "epoch": 0.5586490187129165, + "grad_norm": 3.578125, + "learning_rate": 2.171282212481965e-05, + "loss": 1.7365339994430542, + "step": 2142 + }, + { + "epoch": 0.5589098259111952, + "grad_norm": 3.421875, + "learning_rate": 2.1698280203815193e-05, + "loss": 1.377223014831543, + "step": 2143 + }, + { + "epoch": 0.5591706331094738, + "grad_norm": 3.671875, + "learning_rate": 2.1683737378406143e-05, + "loss": 1.5929501056671143, + "step": 2144 + }, + { + "epoch": 0.5594314403077525, + "grad_norm": 3.640625, + "learning_rate": 2.1669193656337147e-05, + "loss": 1.577168345451355, + "step": 2145 + }, + { + "epoch": 0.5596922475060312, + "grad_norm": 3.609375, + "learning_rate": 2.1654649045353348e-05, + "loss": 1.5571691989898682, + "step": 2146 + }, + { + "epoch": 0.5599530547043099, + "grad_norm": 3.75, + "learning_rate": 2.164010355320035e-05, + "loss": 1.7082301378250122, + "step": 2147 + }, + { + "epoch": 0.5602138619025885, + "grad_norm": 3.375, + "learning_rate": 2.162555718762423e-05, + "loss": 1.5191845893859863, + "step": 2148 + }, + { + "epoch": 0.5604746691008672, + "grad_norm": 3.421875, + "learning_rate": 2.1611009956371533e-05, + "loss": 1.469129204750061, + "step": 2149 + }, + { + "epoch": 0.5607354762991459, + "grad_norm": 3.578125, + "learning_rate": 2.1596461867189257e-05, + "loss": 1.5439426898956299, + "step": 2150 + }, + { + "epoch": 0.5609962834974245, + "grad_norm": 3.625, + "learning_rate": 2.1581912927824878e-05, + "loss": 1.470725655555725, + "step": 2151 + }, + { + "epoch": 0.5612570906957032, + "grad_norm": 3.609375, + "learning_rate": 2.156736314602629e-05, + "loss": 1.6040174961090088, + "step": 2152 + }, + { + "epoch": 0.5615178978939819, + "grad_norm": 3.703125, + "learning_rate": 2.1552812529541865e-05, + "loss": 1.6059777736663818, + "step": 2153 + }, + { + "epoch": 0.5617787050922606, + "grad_norm": 3.59375, + "learning_rate": 2.1538261086120408e-05, + "loss": 1.501955509185791, + "step": 2154 + }, + { + "epoch": 0.5620395122905392, + "grad_norm": 3.515625, + "learning_rate": 2.1523708823511168e-05, + "loss": 1.4939640760421753, + "step": 2155 + }, + { + "epoch": 0.5623003194888179, + "grad_norm": 3.390625, + "learning_rate": 2.1509155749463823e-05, + "loss": 1.7001293897628784, + "step": 2156 + }, + { + "epoch": 0.5625611266870966, + "grad_norm": 3.625, + "learning_rate": 2.149460187172849e-05, + "loss": 1.603058099746704, + "step": 2157 + }, + { + "epoch": 0.5628219338853753, + "grad_norm": 3.875, + "learning_rate": 2.148004719805571e-05, + "loss": 1.5233949422836304, + "step": 2158 + }, + { + "epoch": 0.5630827410836539, + "grad_norm": 3.296875, + "learning_rate": 2.146549173619646e-05, + "loss": 1.4987554550170898, + "step": 2159 + }, + { + "epoch": 0.5633435482819326, + "grad_norm": 3.46875, + "learning_rate": 2.145093549390211e-05, + "loss": 1.684701681137085, + "step": 2160 + }, + { + "epoch": 0.5636043554802113, + "grad_norm": 3.46875, + "learning_rate": 2.143637847892448e-05, + "loss": 1.4607975482940674, + "step": 2161 + }, + { + "epoch": 0.56386516267849, + "grad_norm": 3.390625, + "learning_rate": 2.1421820699015763e-05, + "loss": 1.4865243434906006, + "step": 2162 + }, + { + "epoch": 0.5641259698767686, + "grad_norm": 3.375, + "learning_rate": 2.1407262161928607e-05, + "loss": 1.3964680433273315, + "step": 2163 + }, + { + "epoch": 0.5643867770750473, + "grad_norm": 3.453125, + "learning_rate": 2.1392702875416017e-05, + "loss": 1.5070065259933472, + "step": 2164 + }, + { + "epoch": 0.564647584273326, + "grad_norm": 3.453125, + "learning_rate": 2.1378142847231417e-05, + "loss": 1.4372220039367676, + "step": 2165 + }, + { + "epoch": 0.5649083914716047, + "grad_norm": 3.59375, + "learning_rate": 2.1363582085128635e-05, + "loss": 1.735957384109497, + "step": 2166 + }, + { + "epoch": 0.5651691986698832, + "grad_norm": 3.546875, + "learning_rate": 2.134902059686187e-05, + "loss": 1.5055205821990967, + "step": 2167 + }, + { + "epoch": 0.565430005868162, + "grad_norm": 3.421875, + "learning_rate": 2.1334458390185736e-05, + "loss": 1.274411916732788, + "step": 2168 + }, + { + "epoch": 0.5656908130664406, + "grad_norm": 3.4375, + "learning_rate": 2.131989547285519e-05, + "loss": 1.6582330465316772, + "step": 2169 + }, + { + "epoch": 0.5659516202647193, + "grad_norm": 3.78125, + "learning_rate": 2.1305331852625596e-05, + "loss": 1.6835277080535889, + "step": 2170 + }, + { + "epoch": 0.5662124274629979, + "grad_norm": 3.265625, + "learning_rate": 2.129076753725269e-05, + "loss": 1.3967995643615723, + "step": 2171 + }, + { + "epoch": 0.5664732346612766, + "grad_norm": 3.578125, + "learning_rate": 2.1276202534492566e-05, + "loss": 1.6190301179885864, + "step": 2172 + }, + { + "epoch": 0.5667340418595553, + "grad_norm": 3.453125, + "learning_rate": 2.126163685210171e-05, + "loss": 1.456328272819519, + "step": 2173 + }, + { + "epoch": 0.566994849057834, + "grad_norm": 3.53125, + "learning_rate": 2.1247070497836926e-05, + "loss": 1.5893317461013794, + "step": 2174 + }, + { + "epoch": 0.5672556562561126, + "grad_norm": 3.625, + "learning_rate": 2.123250347945542e-05, + "loss": 1.6280925273895264, + "step": 2175 + }, + { + "epoch": 0.5675164634543913, + "grad_norm": 3.359375, + "learning_rate": 2.1217935804714722e-05, + "loss": 1.316066861152649, + "step": 2176 + }, + { + "epoch": 0.56777727065267, + "grad_norm": 3.515625, + "learning_rate": 2.120336748137273e-05, + "loss": 1.6684691905975342, + "step": 2177 + }, + { + "epoch": 0.5680380778509487, + "grad_norm": 3.46875, + "learning_rate": 2.1188798517187683e-05, + "loss": 1.643919587135315, + "step": 2178 + }, + { + "epoch": 0.5682988850492273, + "grad_norm": 3.828125, + "learning_rate": 2.117422891991814e-05, + "loss": 1.4487578868865967, + "step": 2179 + }, + { + "epoch": 0.568559692247506, + "grad_norm": 3.546875, + "learning_rate": 2.1159658697323044e-05, + "loss": 1.3514080047607422, + "step": 2180 + }, + { + "epoch": 0.5688204994457847, + "grad_norm": 3.703125, + "learning_rate": 2.1145087857161614e-05, + "loss": 1.6331803798675537, + "step": 2181 + }, + { + "epoch": 0.5690813066440634, + "grad_norm": 3.5625, + "learning_rate": 2.1130516407193445e-05, + "loss": 1.6580138206481934, + "step": 2182 + }, + { + "epoch": 0.569342113842342, + "grad_norm": 3.765625, + "learning_rate": 2.1115944355178427e-05, + "loss": 1.5402663946151733, + "step": 2183 + }, + { + "epoch": 0.5696029210406207, + "grad_norm": 3.453125, + "learning_rate": 2.1101371708876786e-05, + "loss": 1.4032838344573975, + "step": 2184 + }, + { + "epoch": 0.5698637282388994, + "grad_norm": 3.375, + "learning_rate": 2.1086798476049068e-05, + "loss": 1.3917063474655151, + "step": 2185 + }, + { + "epoch": 0.5701245354371781, + "grad_norm": 3.5, + "learning_rate": 2.1072224664456114e-05, + "loss": 1.6108860969543457, + "step": 2186 + }, + { + "epoch": 0.5703853426354567, + "grad_norm": 3.421875, + "learning_rate": 2.1057650281859083e-05, + "loss": 1.412126064300537, + "step": 2187 + }, + { + "epoch": 0.5706461498337354, + "grad_norm": 3.734375, + "learning_rate": 2.104307533601944e-05, + "loss": 1.647977352142334, + "step": 2188 + }, + { + "epoch": 0.5709069570320141, + "grad_norm": 3.59375, + "learning_rate": 2.1028499834698946e-05, + "loss": 1.6325180530548096, + "step": 2189 + }, + { + "epoch": 0.5711677642302928, + "grad_norm": 3.296875, + "learning_rate": 2.101392378565967e-05, + "loss": 1.5504090785980225, + "step": 2190 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 3.421875, + "learning_rate": 2.0999347196663943e-05, + "loss": 1.5248416662216187, + "step": 2191 + }, + { + "epoch": 0.5716893786268501, + "grad_norm": 3.421875, + "learning_rate": 2.0984770075474414e-05, + "loss": 1.59442138671875, + "step": 2192 + }, + { + "epoch": 0.5719501858251288, + "grad_norm": 3.453125, + "learning_rate": 2.0970192429854004e-05, + "loss": 1.3801133632659912, + "step": 2193 + }, + { + "epoch": 0.5722109930234075, + "grad_norm": 3.609375, + "learning_rate": 2.0955614267565915e-05, + "loss": 1.6320899724960327, + "step": 2194 + }, + { + "epoch": 0.5724718002216861, + "grad_norm": 3.609375, + "learning_rate": 2.0941035596373625e-05, + "loss": 1.6819474697113037, + "step": 2195 + }, + { + "epoch": 0.5727326074199648, + "grad_norm": 3.390625, + "learning_rate": 2.0926456424040865e-05, + "loss": 1.2906243801116943, + "step": 2196 + }, + { + "epoch": 0.5729934146182435, + "grad_norm": 3.28125, + "learning_rate": 2.091187675833167e-05, + "loss": 1.4808273315429688, + "step": 2197 + }, + { + "epoch": 0.5732542218165222, + "grad_norm": 3.84375, + "learning_rate": 2.08972966070103e-05, + "loss": 1.463750958442688, + "step": 2198 + }, + { + "epoch": 0.5735150290148008, + "grad_norm": 3.8125, + "learning_rate": 2.0882715977841296e-05, + "loss": 1.793060302734375, + "step": 2199 + }, + { + "epoch": 0.5737758362130795, + "grad_norm": 3.515625, + "learning_rate": 2.0868134878589452e-05, + "loss": 1.4461506605148315, + "step": 2200 + }, + { + "epoch": 0.5740366434113582, + "grad_norm": 3.328125, + "learning_rate": 2.0853553317019798e-05, + "loss": 1.4556095600128174, + "step": 2201 + }, + { + "epoch": 0.5742974506096368, + "grad_norm": 3.546875, + "learning_rate": 2.083897130089763e-05, + "loss": 1.4989452362060547, + "step": 2202 + }, + { + "epoch": 0.5745582578079155, + "grad_norm": 3.578125, + "learning_rate": 2.082438883798847e-05, + "loss": 1.666111946105957, + "step": 2203 + }, + { + "epoch": 0.5748190650061942, + "grad_norm": 3.359375, + "learning_rate": 2.080980593605808e-05, + "loss": 1.2804036140441895, + "step": 2204 + }, + { + "epoch": 0.5750798722044729, + "grad_norm": 3.765625, + "learning_rate": 2.079522260287247e-05, + "loss": 1.3963851928710938, + "step": 2205 + }, + { + "epoch": 0.5753406794027515, + "grad_norm": 3.53125, + "learning_rate": 2.0780638846197857e-05, + "loss": 1.283686637878418, + "step": 2206 + }, + { + "epoch": 0.5756014866010302, + "grad_norm": 3.296875, + "learning_rate": 2.076605467380071e-05, + "loss": 1.511763572692871, + "step": 2207 + }, + { + "epoch": 0.5758622937993089, + "grad_norm": 3.703125, + "learning_rate": 2.0751470093447694e-05, + "loss": 1.594511866569519, + "step": 2208 + }, + { + "epoch": 0.5761231009975876, + "grad_norm": 3.484375, + "learning_rate": 2.0736885112905708e-05, + "loss": 1.5738933086395264, + "step": 2209 + }, + { + "epoch": 0.5763839081958662, + "grad_norm": 3.640625, + "learning_rate": 2.0722299739941857e-05, + "loss": 1.3943856954574585, + "step": 2210 + }, + { + "epoch": 0.5766447153941449, + "grad_norm": 3.578125, + "learning_rate": 2.0707713982323456e-05, + "loss": 1.4696235656738281, + "step": 2211 + }, + { + "epoch": 0.5769055225924236, + "grad_norm": 3.15625, + "learning_rate": 2.069312784781803e-05, + "loss": 1.5212304592132568, + "step": 2212 + }, + { + "epoch": 0.5771663297907023, + "grad_norm": 3.453125, + "learning_rate": 2.067854134419329e-05, + "loss": 1.3783750534057617, + "step": 2213 + }, + { + "epoch": 0.5774271369889808, + "grad_norm": 3.734375, + "learning_rate": 2.066395447921717e-05, + "loss": 1.7481436729431152, + "step": 2214 + }, + { + "epoch": 0.5776879441872595, + "grad_norm": 3.796875, + "learning_rate": 2.064936726065776e-05, + "loss": 1.6693224906921387, + "step": 2215 + }, + { + "epoch": 0.5779487513855383, + "grad_norm": 3.515625, + "learning_rate": 2.063477969628337e-05, + "loss": 1.3997015953063965, + "step": 2216 + }, + { + "epoch": 0.578209558583817, + "grad_norm": 3.484375, + "learning_rate": 2.0620191793862485e-05, + "loss": 1.6478688716888428, + "step": 2217 + }, + { + "epoch": 0.5784703657820955, + "grad_norm": 3.625, + "learning_rate": 2.0605603561163762e-05, + "loss": 1.5532352924346924, + "step": 2218 + }, + { + "epoch": 0.5787311729803742, + "grad_norm": 3.5, + "learning_rate": 2.059101500595605e-05, + "loss": 1.639623761177063, + "step": 2219 + }, + { + "epoch": 0.5789919801786529, + "grad_norm": 3.75, + "learning_rate": 2.0576426136008344e-05, + "loss": 1.3887747526168823, + "step": 2220 + }, + { + "epoch": 0.5792527873769316, + "grad_norm": 3.4375, + "learning_rate": 2.0561836959089828e-05, + "loss": 1.713215947151184, + "step": 2221 + }, + { + "epoch": 0.5795135945752102, + "grad_norm": 3.296875, + "learning_rate": 2.054724748296985e-05, + "loss": 1.2385673522949219, + "step": 2222 + }, + { + "epoch": 0.5797744017734889, + "grad_norm": 3.671875, + "learning_rate": 2.0532657715417895e-05, + "loss": 1.5485032796859741, + "step": 2223 + }, + { + "epoch": 0.5800352089717676, + "grad_norm": 3.328125, + "learning_rate": 2.0518067664203643e-05, + "loss": 1.702805519104004, + "step": 2224 + }, + { + "epoch": 0.5802960161700463, + "grad_norm": 3.34375, + "learning_rate": 2.0503477337096878e-05, + "loss": 1.4689847230911255, + "step": 2225 + }, + { + "epoch": 0.5805568233683249, + "grad_norm": 3.5625, + "learning_rate": 2.048888674186756e-05, + "loss": 1.5059980154037476, + "step": 2226 + }, + { + "epoch": 0.5808176305666036, + "grad_norm": 3.5, + "learning_rate": 2.0474295886285797e-05, + "loss": 1.4363094568252563, + "step": 2227 + }, + { + "epoch": 0.5810784377648823, + "grad_norm": 3.921875, + "learning_rate": 2.045970477812181e-05, + "loss": 1.6230552196502686, + "step": 2228 + }, + { + "epoch": 0.581339244963161, + "grad_norm": 3.75, + "learning_rate": 2.0445113425145983e-05, + "loss": 1.429176688194275, + "step": 2229 + }, + { + "epoch": 0.5816000521614396, + "grad_norm": 3.765625, + "learning_rate": 2.0430521835128795e-05, + "loss": 1.6176044940948486, + "step": 2230 + }, + { + "epoch": 0.5818608593597183, + "grad_norm": 3.625, + "learning_rate": 2.0415930015840896e-05, + "loss": 1.7044963836669922, + "step": 2231 + }, + { + "epoch": 0.582121666557997, + "grad_norm": 3.390625, + "learning_rate": 2.0401337975053024e-05, + "loss": 1.4895380735397339, + "step": 2232 + }, + { + "epoch": 0.5823824737562757, + "grad_norm": 3.265625, + "learning_rate": 2.038674572053604e-05, + "loss": 1.3310184478759766, + "step": 2233 + }, + { + "epoch": 0.5826432809545543, + "grad_norm": 3.28125, + "learning_rate": 2.0372153260060937e-05, + "loss": 1.7036465406417847, + "step": 2234 + }, + { + "epoch": 0.582904088152833, + "grad_norm": 3.234375, + "learning_rate": 2.035756060139879e-05, + "loss": 1.4056061506271362, + "step": 2235 + }, + { + "epoch": 0.5831648953511117, + "grad_norm": 3.453125, + "learning_rate": 2.034296775232081e-05, + "loss": 1.565107822418213, + "step": 2236 + }, + { + "epoch": 0.5834257025493904, + "grad_norm": 3.625, + "learning_rate": 2.0328374720598286e-05, + "loss": 1.4062299728393555, + "step": 2237 + }, + { + "epoch": 0.583686509747669, + "grad_norm": 3.375, + "learning_rate": 2.0313781514002615e-05, + "loss": 1.3680182695388794, + "step": 2238 + }, + { + "epoch": 0.5839473169459477, + "grad_norm": 3.6875, + "learning_rate": 2.0299188140305276e-05, + "loss": 1.4250578880310059, + "step": 2239 + }, + { + "epoch": 0.5842081241442264, + "grad_norm": 3.515625, + "learning_rate": 2.028459460727785e-05, + "loss": 1.310401439666748, + "step": 2240 + }, + { + "epoch": 0.5844689313425051, + "grad_norm": 3.4375, + "learning_rate": 2.027000092269201e-05, + "loss": 1.3361576795578003, + "step": 2241 + }, + { + "epoch": 0.5847297385407837, + "grad_norm": 3.375, + "learning_rate": 2.025540709431948e-05, + "loss": 1.2321393489837646, + "step": 2242 + }, + { + "epoch": 0.5849905457390624, + "grad_norm": 3.59375, + "learning_rate": 2.0240813129932086e-05, + "loss": 1.6547679901123047, + "step": 2243 + }, + { + "epoch": 0.5852513529373411, + "grad_norm": 3.453125, + "learning_rate": 2.0226219037301723e-05, + "loss": 1.5261492729187012, + "step": 2244 + }, + { + "epoch": 0.5855121601356198, + "grad_norm": 3.1875, + "learning_rate": 2.021162482420034e-05, + "loss": 1.3989250659942627, + "step": 2245 + }, + { + "epoch": 0.5857729673338984, + "grad_norm": 3.3125, + "learning_rate": 2.0197030498399975e-05, + "loss": 1.4565719366073608, + "step": 2246 + }, + { + "epoch": 0.5860337745321771, + "grad_norm": 3.578125, + "learning_rate": 2.0182436067672695e-05, + "loss": 1.4274851083755493, + "step": 2247 + }, + { + "epoch": 0.5862945817304558, + "grad_norm": 3.4375, + "learning_rate": 2.0167841539790657e-05, + "loss": 1.513249158859253, + "step": 2248 + }, + { + "epoch": 0.5865553889287344, + "grad_norm": 3.234375, + "learning_rate": 2.0153246922526034e-05, + "loss": 1.4265291690826416, + "step": 2249 + }, + { + "epoch": 0.5868161961270131, + "grad_norm": 3.515625, + "learning_rate": 2.0138652223651084e-05, + "loss": 1.5014431476593018, + "step": 2250 + }, + { + "epoch": 0.5870770033252918, + "grad_norm": 3.5625, + "learning_rate": 2.0124057450938062e-05, + "loss": 1.5550901889801025, + "step": 2251 + }, + { + "epoch": 0.5873378105235705, + "grad_norm": 3.40625, + "learning_rate": 2.0109462612159314e-05, + "loss": 1.456763505935669, + "step": 2252 + }, + { + "epoch": 0.5875986177218491, + "grad_norm": 3.671875, + "learning_rate": 2.0094867715087192e-05, + "loss": 1.6487913131713867, + "step": 2253 + }, + { + "epoch": 0.5878594249201278, + "grad_norm": 3.75, + "learning_rate": 2.0080272767494075e-05, + "loss": 1.5697367191314697, + "step": 2254 + }, + { + "epoch": 0.5881202321184065, + "grad_norm": 3.296875, + "learning_rate": 2.0065677777152387e-05, + "loss": 1.5290368795394897, + "step": 2255 + }, + { + "epoch": 0.5883810393166852, + "grad_norm": 3.625, + "learning_rate": 2.0051082751834548e-05, + "loss": 1.5546298027038574, + "step": 2256 + }, + { + "epoch": 0.5886418465149638, + "grad_norm": 3.4375, + "learning_rate": 2.0036487699313035e-05, + "loss": 1.705029845237732, + "step": 2257 + }, + { + "epoch": 0.5889026537132425, + "grad_norm": 3.59375, + "learning_rate": 2.0021892627360313e-05, + "loss": 1.6264777183532715, + "step": 2258 + }, + { + "epoch": 0.5891634609115212, + "grad_norm": 3.359375, + "learning_rate": 2.0007297543748856e-05, + "loss": 1.5446674823760986, + "step": 2259 + }, + { + "epoch": 0.5894242681097999, + "grad_norm": 3.8125, + "learning_rate": 1.999270245625115e-05, + "loss": 1.6087820529937744, + "step": 2260 + }, + { + "epoch": 0.5896850753080785, + "grad_norm": 3.328125, + "learning_rate": 1.9978107372639697e-05, + "loss": 1.6164394617080688, + "step": 2261 + }, + { + "epoch": 0.5899458825063572, + "grad_norm": 3.609375, + "learning_rate": 1.996351230068697e-05, + "loss": 1.3870233297348022, + "step": 2262 + }, + { + "epoch": 0.5902066897046359, + "grad_norm": 3.4375, + "learning_rate": 1.9948917248165452e-05, + "loss": 1.3601336479187012, + "step": 2263 + }, + { + "epoch": 0.5904674969029146, + "grad_norm": 3.515625, + "learning_rate": 1.9934322222847626e-05, + "loss": 1.4149237871170044, + "step": 2264 + }, + { + "epoch": 0.5907283041011931, + "grad_norm": 3.28125, + "learning_rate": 1.991972723250593e-05, + "loss": 1.4162180423736572, + "step": 2265 + }, + { + "epoch": 0.5909891112994718, + "grad_norm": 3.375, + "learning_rate": 1.990513228491281e-05, + "loss": 1.2737969160079956, + "step": 2266 + }, + { + "epoch": 0.5912499184977505, + "grad_norm": 3.578125, + "learning_rate": 1.9890537387840693e-05, + "loss": 1.515159249305725, + "step": 2267 + }, + { + "epoch": 0.5915107256960292, + "grad_norm": 3.546875, + "learning_rate": 1.987594254906194e-05, + "loss": 1.4385640621185303, + "step": 2268 + }, + { + "epoch": 0.5917715328943078, + "grad_norm": 3.6875, + "learning_rate": 1.986134777634893e-05, + "loss": 1.728789210319519, + "step": 2269 + }, + { + "epoch": 0.5920323400925865, + "grad_norm": 3.59375, + "learning_rate": 1.984675307747397e-05, + "loss": 1.5056780576705933, + "step": 2270 + }, + { + "epoch": 0.5922931472908652, + "grad_norm": 3.34375, + "learning_rate": 1.9832158460209346e-05, + "loss": 1.262589693069458, + "step": 2271 + }, + { + "epoch": 0.5925539544891439, + "grad_norm": 3.734375, + "learning_rate": 1.9817563932327312e-05, + "loss": 1.6616630554199219, + "step": 2272 + }, + { + "epoch": 0.5928147616874225, + "grad_norm": 3.65625, + "learning_rate": 1.9802969501600028e-05, + "loss": 1.808161735534668, + "step": 2273 + } + ], + "logging_steps": 1, + "max_steps": 4411, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1324, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.0573094378695557e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}