diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15013 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2519106300508283, + "eval_steps": 500, + "global_step": 2140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005850733170000365, + "grad_norm": 0.3784801935906415, + "learning_rate": 7.999998479354072e-06, + "loss": 1.1815, + "step": 1 + }, + { + "epoch": 0.001170146634000073, + "grad_norm": 0.10369689761982695, + "learning_rate": 7.999993917417574e-06, + "loss": 1.0085, + "step": 2 + }, + { + "epoch": 0.0017552199510001097, + "grad_norm": 0.08165678751844849, + "learning_rate": 7.99998631419436e-06, + "loss": 0.934, + "step": 3 + }, + { + "epoch": 0.002340293268000146, + "grad_norm": 0.06369675215202451, + "learning_rate": 7.999975669690855e-06, + "loss": 1.0323, + "step": 4 + }, + { + "epoch": 0.002925366585000183, + "grad_norm": 0.08076101376684344, + "learning_rate": 7.999961983916048e-06, + "loss": 1.0676, + "step": 5 + }, + { + "epoch": 0.0035104399020002195, + "grad_norm": 0.06288930819751326, + "learning_rate": 7.999945256881502e-06, + "loss": 1.0421, + "step": 6 + }, + { + "epoch": 0.004095513219000256, + "grad_norm": 0.06851051488443692, + "learning_rate": 7.999925488601348e-06, + "loss": 0.9544, + "step": 7 + }, + { + "epoch": 0.004680586536000292, + "grad_norm": 0.06007707301663692, + "learning_rate": 7.999902679092287e-06, + "loss": 1.0349, + "step": 8 + }, + { + "epoch": 0.005265659853000329, + "grad_norm": 0.07674940091671953, + "learning_rate": 7.99987682837359e-06, + "loss": 1.1809, + "step": 9 + }, + { + "epoch": 0.005850733170000366, + "grad_norm": 0.06356726822529891, + "learning_rate": 7.999847936467093e-06, + "loss": 0.9502, + "step": 10 + }, + { + "epoch": 0.006435806487000402, + "grad_norm": 0.060093989725189235, + "learning_rate": 7.999816003397203e-06, + "loss": 1.0007, + "step": 11 + }, + { + "epoch": 0.007020879804000439, + "grad_norm": 0.04797598057222403, + "learning_rate": 7.999781029190902e-06, + "loss": 1.0316, + "step": 12 + }, + { + "epoch": 0.007605953121000476, + "grad_norm": 0.08827709210231136, + "learning_rate": 7.999743013877734e-06, + "loss": 1.0688, + "step": 13 + }, + { + "epoch": 0.008191026438000511, + "grad_norm": 0.04993529689396162, + "learning_rate": 7.999701957489811e-06, + "loss": 0.9695, + "step": 14 + }, + { + "epoch": 0.008776099755000549, + "grad_norm": 0.053685150458545855, + "learning_rate": 7.999657860061823e-06, + "loss": 1.0826, + "step": 15 + }, + { + "epoch": 0.009361173072000585, + "grad_norm": 0.04357397086005245, + "learning_rate": 7.99961072163102e-06, + "loss": 0.9788, + "step": 16 + }, + { + "epoch": 0.009946246389000622, + "grad_norm": 0.051838589957396514, + "learning_rate": 7.999560542237226e-06, + "loss": 1.0222, + "step": 17 + }, + { + "epoch": 0.010531319706000658, + "grad_norm": 0.04948384582310689, + "learning_rate": 7.999507321922833e-06, + "loss": 0.9882, + "step": 18 + }, + { + "epoch": 0.011116393023000696, + "grad_norm": 0.05488935540485584, + "learning_rate": 7.9994510607328e-06, + "loss": 1.0124, + "step": 19 + }, + { + "epoch": 0.011701466340000731, + "grad_norm": 0.04632979465330071, + "learning_rate": 7.999391758714659e-06, + "loss": 0.9646, + "step": 20 + }, + { + "epoch": 0.012286539657000767, + "grad_norm": 0.04102257708769348, + "learning_rate": 7.999329415918508e-06, + "loss": 0.8946, + "step": 21 + }, + { + "epoch": 0.012871612974000805, + "grad_norm": 0.045466366302330634, + "learning_rate": 7.999264032397013e-06, + "loss": 0.9973, + "step": 22 + }, + { + "epoch": 0.01345668629100084, + "grad_norm": 0.04440538659368297, + "learning_rate": 7.99919560820541e-06, + "loss": 1.0596, + "step": 23 + }, + { + "epoch": 0.014041759608000878, + "grad_norm": 0.042500986542824803, + "learning_rate": 7.999124143401507e-06, + "loss": 0.9303, + "step": 24 + }, + { + "epoch": 0.014626832925000914, + "grad_norm": 0.05180121412922999, + "learning_rate": 7.999049638045675e-06, + "loss": 0.9412, + "step": 25 + }, + { + "epoch": 0.015211906242000951, + "grad_norm": 0.06441302150178038, + "learning_rate": 7.998972092200859e-06, + "loss": 0.9986, + "step": 26 + }, + { + "epoch": 0.015796979559000987, + "grad_norm": 0.05297417453471626, + "learning_rate": 7.998891505932565e-06, + "loss": 1.0701, + "step": 27 + }, + { + "epoch": 0.016382052876001023, + "grad_norm": 0.04305692283062352, + "learning_rate": 7.998807879308878e-06, + "loss": 0.9823, + "step": 28 + }, + { + "epoch": 0.016967126193001062, + "grad_norm": 0.048331091733781785, + "learning_rate": 7.998721212400443e-06, + "loss": 0.9145, + "step": 29 + }, + { + "epoch": 0.017552199510001098, + "grad_norm": 0.04249327256576877, + "learning_rate": 7.998631505280477e-06, + "loss": 0.9469, + "step": 30 + }, + { + "epoch": 0.018137272827001134, + "grad_norm": 0.042672714299481426, + "learning_rate": 7.998538758024765e-06, + "loss": 0.9814, + "step": 31 + }, + { + "epoch": 0.01872234614400117, + "grad_norm": 0.04643044969291229, + "learning_rate": 7.998442970711661e-06, + "loss": 0.9342, + "step": 32 + }, + { + "epoch": 0.019307419461001205, + "grad_norm": 0.042411147754236, + "learning_rate": 7.998344143422087e-06, + "loss": 0.8851, + "step": 33 + }, + { + "epoch": 0.019892492778001244, + "grad_norm": 0.04407093546490425, + "learning_rate": 7.998242276239527e-06, + "loss": 1.0026, + "step": 34 + }, + { + "epoch": 0.02047756609500128, + "grad_norm": 0.044039695433035646, + "learning_rate": 7.998137369250046e-06, + "loss": 0.9637, + "step": 35 + }, + { + "epoch": 0.021062639412001316, + "grad_norm": 0.04258725681727216, + "learning_rate": 7.998029422542267e-06, + "loss": 0.9249, + "step": 36 + }, + { + "epoch": 0.021647712729001352, + "grad_norm": 0.045486820543060684, + "learning_rate": 7.997918436207383e-06, + "loss": 1.005, + "step": 37 + }, + { + "epoch": 0.02223278604600139, + "grad_norm": 0.03843585783476988, + "learning_rate": 7.997804410339156e-06, + "loss": 0.9815, + "step": 38 + }, + { + "epoch": 0.022817859363001427, + "grad_norm": 0.05544665751989288, + "learning_rate": 7.997687345033915e-06, + "loss": 0.94, + "step": 39 + }, + { + "epoch": 0.023402932680001463, + "grad_norm": 0.041440042889966715, + "learning_rate": 7.99756724039056e-06, + "loss": 0.9337, + "step": 40 + }, + { + "epoch": 0.0239880059970015, + "grad_norm": 0.040659761048442974, + "learning_rate": 7.997444096510552e-06, + "loss": 0.897, + "step": 41 + }, + { + "epoch": 0.024573079314001534, + "grad_norm": 0.06541743477477353, + "learning_rate": 7.997317913497925e-06, + "loss": 0.9325, + "step": 42 + }, + { + "epoch": 0.025158152631001574, + "grad_norm": 0.04101226832374059, + "learning_rate": 7.997188691459279e-06, + "loss": 0.9323, + "step": 43 + }, + { + "epoch": 0.02574322594800161, + "grad_norm": 0.03954326036578227, + "learning_rate": 7.997056430503782e-06, + "loss": 0.8793, + "step": 44 + }, + { + "epoch": 0.026328299265001645, + "grad_norm": 0.0396268421897809, + "learning_rate": 7.996921130743165e-06, + "loss": 0.9849, + "step": 45 + }, + { + "epoch": 0.02691337258200168, + "grad_norm": 0.03855187149117414, + "learning_rate": 7.996782792291732e-06, + "loss": 0.9328, + "step": 46 + }, + { + "epoch": 0.02749844589900172, + "grad_norm": 0.03791722568157901, + "learning_rate": 7.996641415266355e-06, + "loss": 0.9293, + "step": 47 + }, + { + "epoch": 0.028083519216001756, + "grad_norm": 0.04497262810695708, + "learning_rate": 7.996496999786465e-06, + "loss": 0.9884, + "step": 48 + }, + { + "epoch": 0.02866859253300179, + "grad_norm": 0.041089653577607424, + "learning_rate": 7.996349545974065e-06, + "loss": 0.9305, + "step": 49 + }, + { + "epoch": 0.029253665850001827, + "grad_norm": 0.04162057876282252, + "learning_rate": 7.996199053953729e-06, + "loss": 1.0771, + "step": 50 + }, + { + "epoch": 0.029838739167001863, + "grad_norm": 0.045060287442874644, + "learning_rate": 7.996045523852587e-06, + "loss": 0.9819, + "step": 51 + }, + { + "epoch": 0.030423812484001903, + "grad_norm": 0.03950486400361273, + "learning_rate": 7.995888955800346e-06, + "loss": 0.8911, + "step": 52 + }, + { + "epoch": 0.03100888580100194, + "grad_norm": 0.040638774400984866, + "learning_rate": 7.995729349929275e-06, + "loss": 0.9289, + "step": 53 + }, + { + "epoch": 0.031593959118001974, + "grad_norm": 0.04034434081592363, + "learning_rate": 7.995566706374206e-06, + "loss": 0.9529, + "step": 54 + }, + { + "epoch": 0.03217903243500201, + "grad_norm": 0.04315614004870732, + "learning_rate": 7.995401025272545e-06, + "loss": 0.9554, + "step": 55 + }, + { + "epoch": 0.032764105752002046, + "grad_norm": 0.03862522293897866, + "learning_rate": 7.995232306764258e-06, + "loss": 0.9604, + "step": 56 + }, + { + "epoch": 0.03334917906900208, + "grad_norm": 0.042952485422922036, + "learning_rate": 7.995060550991879e-06, + "loss": 0.9731, + "step": 57 + }, + { + "epoch": 0.033934252386002124, + "grad_norm": 0.04399639647859174, + "learning_rate": 7.994885758100508e-06, + "loss": 0.868, + "step": 58 + }, + { + "epoch": 0.03451932570300216, + "grad_norm": 0.03794998627596821, + "learning_rate": 7.99470792823781e-06, + "loss": 0.9176, + "step": 59 + }, + { + "epoch": 0.035104399020002196, + "grad_norm": 0.046410465067380204, + "learning_rate": 7.99452706155402e-06, + "loss": 0.9456, + "step": 60 + }, + { + "epoch": 0.03568947233700223, + "grad_norm": 0.04773216743183266, + "learning_rate": 7.994343158201927e-06, + "loss": 1.0049, + "step": 61 + }, + { + "epoch": 0.03627454565400227, + "grad_norm": 0.04395331205981261, + "learning_rate": 7.994156218336901e-06, + "loss": 0.9197, + "step": 62 + }, + { + "epoch": 0.0368596189710023, + "grad_norm": 0.04353152162367831, + "learning_rate": 7.993966242116865e-06, + "loss": 0.8978, + "step": 63 + }, + { + "epoch": 0.03744469228800234, + "grad_norm": 0.041195331266407585, + "learning_rate": 7.993773229702312e-06, + "loss": 0.965, + "step": 64 + }, + { + "epoch": 0.038029765605002375, + "grad_norm": 0.050163343033375746, + "learning_rate": 7.993577181256304e-06, + "loss": 0.8876, + "step": 65 + }, + { + "epoch": 0.03861483892200241, + "grad_norm": 0.06660490581550026, + "learning_rate": 7.993378096944456e-06, + "loss": 0.9082, + "step": 66 + }, + { + "epoch": 0.03919991223900245, + "grad_norm": 0.03984407119317513, + "learning_rate": 7.99317597693496e-06, + "loss": 0.9233, + "step": 67 + }, + { + "epoch": 0.03978498555600249, + "grad_norm": 0.04866369054757869, + "learning_rate": 7.992970821398567e-06, + "loss": 0.9349, + "step": 68 + }, + { + "epoch": 0.040370058873002525, + "grad_norm": 0.04990567711730111, + "learning_rate": 7.99276263050859e-06, + "loss": 0.985, + "step": 69 + }, + { + "epoch": 0.04095513219000256, + "grad_norm": 0.17411986574945643, + "learning_rate": 7.992551404440914e-06, + "loss": 1.0208, + "step": 70 + }, + { + "epoch": 0.041540205507002596, + "grad_norm": 0.04353986815925501, + "learning_rate": 7.992337143373981e-06, + "loss": 0.9633, + "step": 71 + }, + { + "epoch": 0.04212527882400263, + "grad_norm": 0.05318272939257367, + "learning_rate": 7.9921198474888e-06, + "loss": 1.0015, + "step": 72 + }, + { + "epoch": 0.04271035214100267, + "grad_norm": 0.059411931897509304, + "learning_rate": 7.991899516968942e-06, + "loss": 1.0224, + "step": 73 + }, + { + "epoch": 0.043295425458002704, + "grad_norm": 0.05746696657441689, + "learning_rate": 7.991676152000545e-06, + "loss": 0.9817, + "step": 74 + }, + { + "epoch": 0.04388049877500274, + "grad_norm": 0.03994063377968752, + "learning_rate": 7.991449752772307e-06, + "loss": 0.899, + "step": 75 + }, + { + "epoch": 0.04446557209200278, + "grad_norm": 0.05114807327866483, + "learning_rate": 7.991220319475492e-06, + "loss": 0.9627, + "step": 76 + }, + { + "epoch": 0.04505064540900282, + "grad_norm": 0.043413594203031196, + "learning_rate": 7.990987852303923e-06, + "loss": 0.9385, + "step": 77 + }, + { + "epoch": 0.045635718726002854, + "grad_norm": 0.044141893159488445, + "learning_rate": 7.990752351453994e-06, + "loss": 0.9214, + "step": 78 + }, + { + "epoch": 0.04622079204300289, + "grad_norm": 0.07098639926950194, + "learning_rate": 7.990513817124652e-06, + "loss": 0.9762, + "step": 79 + }, + { + "epoch": 0.046805865360002925, + "grad_norm": 0.04487096138718826, + "learning_rate": 7.990272249517416e-06, + "loss": 0.9379, + "step": 80 + }, + { + "epoch": 0.04739093867700296, + "grad_norm": 0.040488382771263605, + "learning_rate": 7.990027648836359e-06, + "loss": 0.9563, + "step": 81 + }, + { + "epoch": 0.047976011994003, + "grad_norm": 0.04352730030611419, + "learning_rate": 7.989780015288123e-06, + "loss": 0.9488, + "step": 82 + }, + { + "epoch": 0.04856108531100303, + "grad_norm": 0.04413441845817798, + "learning_rate": 7.98952934908191e-06, + "loss": 1.0336, + "step": 83 + }, + { + "epoch": 0.04914615862800307, + "grad_norm": 0.04193745680850997, + "learning_rate": 7.989275650429482e-06, + "loss": 0.8785, + "step": 84 + }, + { + "epoch": 0.04973123194500311, + "grad_norm": 0.04475381091812719, + "learning_rate": 7.989018919545165e-06, + "loss": 0.9443, + "step": 85 + }, + { + "epoch": 0.05031630526200315, + "grad_norm": 0.04234754821679888, + "learning_rate": 7.988759156645845e-06, + "loss": 0.9564, + "step": 86 + }, + { + "epoch": 0.05090137857900318, + "grad_norm": 0.054324472784088765, + "learning_rate": 7.988496361950972e-06, + "loss": 0.9824, + "step": 87 + }, + { + "epoch": 0.05148645189600322, + "grad_norm": 0.051967709927586946, + "learning_rate": 7.988230535682556e-06, + "loss": 0.914, + "step": 88 + }, + { + "epoch": 0.052071525213003254, + "grad_norm": 0.03996756018971234, + "learning_rate": 7.987961678065169e-06, + "loss": 0.9421, + "step": 89 + }, + { + "epoch": 0.05265659853000329, + "grad_norm": 0.07556612961446735, + "learning_rate": 7.987689789325939e-06, + "loss": 0.9791, + "step": 90 + }, + { + "epoch": 0.053241671847003326, + "grad_norm": 0.04557130060143385, + "learning_rate": 7.987414869694562e-06, + "loss": 0.9318, + "step": 91 + }, + { + "epoch": 0.05382674516400336, + "grad_norm": 0.0424700455078842, + "learning_rate": 7.98713691940329e-06, + "loss": 0.9745, + "step": 92 + }, + { + "epoch": 0.0544118184810034, + "grad_norm": 0.07879025661937863, + "learning_rate": 7.986855938686935e-06, + "loss": 0.9614, + "step": 93 + }, + { + "epoch": 0.05499689179800344, + "grad_norm": 0.04227267907542001, + "learning_rate": 7.986571927782871e-06, + "loss": 0.9317, + "step": 94 + }, + { + "epoch": 0.055581965115003476, + "grad_norm": 0.042275976377898476, + "learning_rate": 7.986284886931033e-06, + "loss": 0.8982, + "step": 95 + }, + { + "epoch": 0.05616703843200351, + "grad_norm": 0.0457764180229334, + "learning_rate": 7.985994816373913e-06, + "loss": 0.8803, + "step": 96 + }, + { + "epoch": 0.05675211174900355, + "grad_norm": 0.04305545472943831, + "learning_rate": 7.985701716356565e-06, + "loss": 0.9786, + "step": 97 + }, + { + "epoch": 0.05733718506600358, + "grad_norm": 0.045580685701444475, + "learning_rate": 7.985405587126597e-06, + "loss": 0.9036, + "step": 98 + }, + { + "epoch": 0.05792225838300362, + "grad_norm": 0.0442896526405325, + "learning_rate": 7.985106428934183e-06, + "loss": 0.9871, + "step": 99 + }, + { + "epoch": 0.058507331700003655, + "grad_norm": 0.04396583794537118, + "learning_rate": 7.984804242032051e-06, + "loss": 1.0145, + "step": 100 + }, + { + "epoch": 0.05909240501700369, + "grad_norm": 0.04488641679254391, + "learning_rate": 7.984499026675494e-06, + "loss": 0.9673, + "step": 101 + }, + { + "epoch": 0.05967747833400373, + "grad_norm": 0.0402258783966036, + "learning_rate": 7.984190783122351e-06, + "loss": 1.0458, + "step": 102 + }, + { + "epoch": 0.06026255165100377, + "grad_norm": 0.048502182555142354, + "learning_rate": 7.983879511633036e-06, + "loss": 0.8879, + "step": 103 + }, + { + "epoch": 0.060847624968003805, + "grad_norm": 0.04818474096979939, + "learning_rate": 7.983565212470504e-06, + "loss": 0.9467, + "step": 104 + }, + { + "epoch": 0.06143269828500384, + "grad_norm": 0.04176125713160911, + "learning_rate": 7.983247885900283e-06, + "loss": 0.9266, + "step": 105 + }, + { + "epoch": 0.06201777160200388, + "grad_norm": 0.04212530605033032, + "learning_rate": 7.982927532190447e-06, + "loss": 0.9179, + "step": 106 + }, + { + "epoch": 0.06260284491900392, + "grad_norm": 0.037198964746819226, + "learning_rate": 7.982604151611633e-06, + "loss": 1.0018, + "step": 107 + }, + { + "epoch": 0.06318791823600395, + "grad_norm": 0.04090799839826158, + "learning_rate": 7.982277744437035e-06, + "loss": 0.8756, + "step": 108 + }, + { + "epoch": 0.06377299155300399, + "grad_norm": 0.03857422273289164, + "learning_rate": 7.981948310942402e-06, + "loss": 0.8855, + "step": 109 + }, + { + "epoch": 0.06435806487000402, + "grad_norm": 0.05241752689478532, + "learning_rate": 7.981615851406039e-06, + "loss": 0.8862, + "step": 110 + }, + { + "epoch": 0.06494313818700406, + "grad_norm": 0.04086973369607326, + "learning_rate": 7.981280366108814e-06, + "loss": 0.9221, + "step": 111 + }, + { + "epoch": 0.06552821150400409, + "grad_norm": 0.03931043694281754, + "learning_rate": 7.98094185533414e-06, + "loss": 0.9417, + "step": 112 + }, + { + "epoch": 0.06611328482100413, + "grad_norm": 0.04169878931857935, + "learning_rate": 7.980600319367995e-06, + "loss": 0.958, + "step": 113 + }, + { + "epoch": 0.06669835813800416, + "grad_norm": 0.044230077430854955, + "learning_rate": 7.980255758498908e-06, + "loss": 0.9265, + "step": 114 + }, + { + "epoch": 0.0672834314550042, + "grad_norm": 0.04488148180330816, + "learning_rate": 7.979908173017968e-06, + "loss": 0.8908, + "step": 115 + }, + { + "epoch": 0.06786850477200425, + "grad_norm": 0.042038370275589085, + "learning_rate": 7.979557563218815e-06, + "loss": 0.8961, + "step": 116 + }, + { + "epoch": 0.06845357808900428, + "grad_norm": 0.04747064970378541, + "learning_rate": 7.979203929397646e-06, + "loss": 1.0609, + "step": 117 + }, + { + "epoch": 0.06903865140600432, + "grad_norm": 0.04392999493678844, + "learning_rate": 7.97884727185321e-06, + "loss": 0.9001, + "step": 118 + }, + { + "epoch": 0.06962372472300435, + "grad_norm": 0.040693633998808994, + "learning_rate": 7.978487590886814e-06, + "loss": 0.8562, + "step": 119 + }, + { + "epoch": 0.07020879804000439, + "grad_norm": 0.08337676841807191, + "learning_rate": 7.978124886802316e-06, + "loss": 0.9344, + "step": 120 + }, + { + "epoch": 0.07079387135700442, + "grad_norm": 0.03951441645023813, + "learning_rate": 7.977759159906134e-06, + "loss": 0.9182, + "step": 121 + }, + { + "epoch": 0.07137894467400446, + "grad_norm": 0.04427536962304041, + "learning_rate": 7.977390410507229e-06, + "loss": 0.9079, + "step": 122 + }, + { + "epoch": 0.07196401799100449, + "grad_norm": 0.047402666476443076, + "learning_rate": 7.977018638917126e-06, + "loss": 0.9442, + "step": 123 + }, + { + "epoch": 0.07254909130800453, + "grad_norm": 0.07806155365092546, + "learning_rate": 7.976643845449897e-06, + "loss": 0.9453, + "step": 124 + }, + { + "epoch": 0.07313416462500458, + "grad_norm": 0.04187989433422361, + "learning_rate": 7.97626603042217e-06, + "loss": 0.9762, + "step": 125 + }, + { + "epoch": 0.0737192379420046, + "grad_norm": 0.038153971027990764, + "learning_rate": 7.975885194153125e-06, + "loss": 0.9377, + "step": 126 + }, + { + "epoch": 0.07430431125900465, + "grad_norm": 0.04398811005912341, + "learning_rate": 7.975501336964492e-06, + "loss": 0.9655, + "step": 127 + }, + { + "epoch": 0.07488938457600468, + "grad_norm": 0.04310577216463342, + "learning_rate": 7.975114459180555e-06, + "loss": 0.9082, + "step": 128 + }, + { + "epoch": 0.07547445789300472, + "grad_norm": 0.04552741227747631, + "learning_rate": 7.97472456112815e-06, + "loss": 0.8667, + "step": 129 + }, + { + "epoch": 0.07605953121000475, + "grad_norm": 0.0406705187810207, + "learning_rate": 7.974331643136666e-06, + "loss": 0.9286, + "step": 130 + }, + { + "epoch": 0.07664460452700479, + "grad_norm": 0.16819457832404855, + "learning_rate": 7.973935705538039e-06, + "loss": 0.9724, + "step": 131 + }, + { + "epoch": 0.07722967784400482, + "grad_norm": 0.04524796652654962, + "learning_rate": 7.973536748666756e-06, + "loss": 0.961, + "step": 132 + }, + { + "epoch": 0.07781475116100486, + "grad_norm": 0.051171072191227115, + "learning_rate": 7.973134772859862e-06, + "loss": 0.9513, + "step": 133 + }, + { + "epoch": 0.0783998244780049, + "grad_norm": 0.044436153110071305, + "learning_rate": 7.972729778456946e-06, + "loss": 0.9363, + "step": 134 + }, + { + "epoch": 0.07898489779500494, + "grad_norm": 0.04485419817375143, + "learning_rate": 7.97232176580015e-06, + "loss": 0.8583, + "step": 135 + }, + { + "epoch": 0.07956997111200498, + "grad_norm": 0.044866876801222304, + "learning_rate": 7.971910735234161e-06, + "loss": 0.9859, + "step": 136 + }, + { + "epoch": 0.080155044429005, + "grad_norm": 0.03934716109284772, + "learning_rate": 7.971496687106219e-06, + "loss": 0.8592, + "step": 137 + }, + { + "epoch": 0.08074011774600505, + "grad_norm": 0.041852675369480244, + "learning_rate": 7.971079621766117e-06, + "loss": 0.9353, + "step": 138 + }, + { + "epoch": 0.08132519106300508, + "grad_norm": 0.3430745253576026, + "learning_rate": 7.97065953956619e-06, + "loss": 0.9602, + "step": 139 + }, + { + "epoch": 0.08191026438000512, + "grad_norm": 0.44783710464834237, + "learning_rate": 7.970236440861327e-06, + "loss": 0.9833, + "step": 140 + }, + { + "epoch": 0.08249533769700515, + "grad_norm": 0.29750847371388817, + "learning_rate": 7.96981032600896e-06, + "loss": 0.8244, + "step": 141 + }, + { + "epoch": 0.08308041101400519, + "grad_norm": 0.169443611740874, + "learning_rate": 7.969381195369076e-06, + "loss": 0.8983, + "step": 142 + }, + { + "epoch": 0.08366548433100524, + "grad_norm": 0.04111262427570532, + "learning_rate": 7.968949049304204e-06, + "loss": 0.9552, + "step": 143 + }, + { + "epoch": 0.08425055764800526, + "grad_norm": 0.04087231642049412, + "learning_rate": 7.968513888179421e-06, + "loss": 0.9051, + "step": 144 + }, + { + "epoch": 0.08483563096500531, + "grad_norm": 0.05663350951363164, + "learning_rate": 7.968075712362356e-06, + "loss": 0.8366, + "step": 145 + }, + { + "epoch": 0.08542070428200534, + "grad_norm": 0.04202895629977615, + "learning_rate": 7.967634522223179e-06, + "loss": 0.8115, + "step": 146 + }, + { + "epoch": 0.08600577759900538, + "grad_norm": 0.04238111014919017, + "learning_rate": 7.96719031813461e-06, + "loss": 0.9086, + "step": 147 + }, + { + "epoch": 0.08659085091600541, + "grad_norm": 0.04226501020383857, + "learning_rate": 7.966743100471913e-06, + "loss": 0.9286, + "step": 148 + }, + { + "epoch": 0.08717592423300545, + "grad_norm": 0.046325797442375215, + "learning_rate": 7.9662928696129e-06, + "loss": 0.9393, + "step": 149 + }, + { + "epoch": 0.08776099755000548, + "grad_norm": 0.05532068772615188, + "learning_rate": 7.965839625937926e-06, + "loss": 0.9202, + "step": 150 + }, + { + "epoch": 0.08834607086700552, + "grad_norm": 0.039360829289514745, + "learning_rate": 7.965383369829894e-06, + "loss": 0.908, + "step": 151 + }, + { + "epoch": 0.08893114418400556, + "grad_norm": 0.04965498407233415, + "learning_rate": 7.964924101674252e-06, + "loss": 0.9406, + "step": 152 + }, + { + "epoch": 0.0895162175010056, + "grad_norm": 0.04177674200968805, + "learning_rate": 7.964461821858987e-06, + "loss": 0.8933, + "step": 153 + }, + { + "epoch": 0.09010129081800564, + "grad_norm": 0.04711456314429998, + "learning_rate": 7.963996530774639e-06, + "loss": 1.0111, + "step": 154 + }, + { + "epoch": 0.09068636413500566, + "grad_norm": 0.045100723994096155, + "learning_rate": 7.963528228814285e-06, + "loss": 0.9806, + "step": 155 + }, + { + "epoch": 0.09127143745200571, + "grad_norm": 0.09448573408012474, + "learning_rate": 7.96305691637355e-06, + "loss": 0.9142, + "step": 156 + }, + { + "epoch": 0.09185651076900574, + "grad_norm": 0.04297907772876167, + "learning_rate": 7.962582593850596e-06, + "loss": 0.8852, + "step": 157 + }, + { + "epoch": 0.09244158408600578, + "grad_norm": 0.05151683390684187, + "learning_rate": 7.962105261646138e-06, + "loss": 0.9975, + "step": 158 + }, + { + "epoch": 0.09302665740300581, + "grad_norm": 0.07242957126413647, + "learning_rate": 7.961624920163423e-06, + "loss": 0.9196, + "step": 159 + }, + { + "epoch": 0.09361173072000585, + "grad_norm": 0.08840439035290122, + "learning_rate": 7.961141569808248e-06, + "loss": 0.92, + "step": 160 + }, + { + "epoch": 0.0941968040370059, + "grad_norm": 0.11047843778949552, + "learning_rate": 7.960655210988948e-06, + "loss": 0.9452, + "step": 161 + }, + { + "epoch": 0.09478187735400592, + "grad_norm": 0.04275273565605597, + "learning_rate": 7.960165844116399e-06, + "loss": 0.9641, + "step": 162 + }, + { + "epoch": 0.09536695067100597, + "grad_norm": 0.05662855927390397, + "learning_rate": 7.959673469604025e-06, + "loss": 0.9354, + "step": 163 + }, + { + "epoch": 0.095952023988006, + "grad_norm": 0.04769002643125012, + "learning_rate": 7.959178087867779e-06, + "loss": 0.9087, + "step": 164 + }, + { + "epoch": 0.09653709730500604, + "grad_norm": 0.050744023655463925, + "learning_rate": 7.958679699326164e-06, + "loss": 0.9561, + "step": 165 + }, + { + "epoch": 0.09712217062200607, + "grad_norm": 0.058451109924341715, + "learning_rate": 7.958178304400222e-06, + "loss": 0.9881, + "step": 166 + }, + { + "epoch": 0.09770724393900611, + "grad_norm": 0.04338399471073774, + "learning_rate": 7.95767390351353e-06, + "loss": 0.9705, + "step": 167 + }, + { + "epoch": 0.09829231725600614, + "grad_norm": 0.04901291931441639, + "learning_rate": 7.95716649709221e-06, + "loss": 0.9229, + "step": 168 + }, + { + "epoch": 0.09887739057300618, + "grad_norm": 0.044284928725944674, + "learning_rate": 7.95665608556492e-06, + "loss": 0.9493, + "step": 169 + }, + { + "epoch": 0.09946246389000622, + "grad_norm": 0.04914804115067185, + "learning_rate": 7.956142669362855e-06, + "loss": 0.9879, + "step": 170 + }, + { + "epoch": 0.10004753720700625, + "grad_norm": 0.04087870881565583, + "learning_rate": 7.955626248919752e-06, + "loss": 0.9435, + "step": 171 + }, + { + "epoch": 0.1006326105240063, + "grad_norm": 0.04938743203389444, + "learning_rate": 7.955106824671888e-06, + "loss": 0.8813, + "step": 172 + }, + { + "epoch": 0.10121768384100632, + "grad_norm": 0.04440441336302364, + "learning_rate": 7.95458439705807e-06, + "loss": 0.954, + "step": 173 + }, + { + "epoch": 0.10180275715800637, + "grad_norm": 0.04052619663535755, + "learning_rate": 7.954058966519649e-06, + "loss": 0.8759, + "step": 174 + }, + { + "epoch": 0.1023878304750064, + "grad_norm": 0.04315943430366373, + "learning_rate": 7.953530533500507e-06, + "loss": 0.8621, + "step": 175 + }, + { + "epoch": 0.10297290379200644, + "grad_norm": 0.07065437183104553, + "learning_rate": 7.952999098447072e-06, + "loss": 0.9796, + "step": 176 + }, + { + "epoch": 0.10355797710900647, + "grad_norm": 0.04286710979601013, + "learning_rate": 7.952464661808297e-06, + "loss": 0.9187, + "step": 177 + }, + { + "epoch": 0.10414305042600651, + "grad_norm": 0.044637305084684484, + "learning_rate": 7.951927224035678e-06, + "loss": 0.8772, + "step": 178 + }, + { + "epoch": 0.10472812374300655, + "grad_norm": 0.04369651860095562, + "learning_rate": 7.951386785583244e-06, + "loss": 0.8969, + "step": 179 + }, + { + "epoch": 0.10531319706000658, + "grad_norm": 0.05048322425196143, + "learning_rate": 7.950843346907559e-06, + "loss": 0.8907, + "step": 180 + }, + { + "epoch": 0.10589827037700662, + "grad_norm": 0.03884181381210202, + "learning_rate": 7.95029690846772e-06, + "loss": 0.931, + "step": 181 + }, + { + "epoch": 0.10648334369400665, + "grad_norm": 0.05060670376656713, + "learning_rate": 7.949747470725362e-06, + "loss": 0.8624, + "step": 182 + }, + { + "epoch": 0.1070684170110067, + "grad_norm": 0.04162121727818916, + "learning_rate": 7.949195034144653e-06, + "loss": 0.9141, + "step": 183 + }, + { + "epoch": 0.10765349032800672, + "grad_norm": 0.04729330740672752, + "learning_rate": 7.94863959919229e-06, + "loss": 0.921, + "step": 184 + }, + { + "epoch": 0.10823856364500677, + "grad_norm": 0.04459204791909808, + "learning_rate": 7.948081166337509e-06, + "loss": 0.8993, + "step": 185 + }, + { + "epoch": 0.1088236369620068, + "grad_norm": 0.04645986236352756, + "learning_rate": 7.947519736052075e-06, + "loss": 0.9158, + "step": 186 + }, + { + "epoch": 0.10940871027900684, + "grad_norm": 0.04275967646092167, + "learning_rate": 7.946955308810285e-06, + "loss": 0.9387, + "step": 187 + }, + { + "epoch": 0.10999378359600688, + "grad_norm": 0.07421648959437231, + "learning_rate": 7.94638788508897e-06, + "loss": 0.8497, + "step": 188 + }, + { + "epoch": 0.11057885691300691, + "grad_norm": 0.04955170539289863, + "learning_rate": 7.945817465367493e-06, + "loss": 0.8525, + "step": 189 + }, + { + "epoch": 0.11116393023000695, + "grad_norm": 0.12013696366466048, + "learning_rate": 7.945244050127744e-06, + "loss": 0.9616, + "step": 190 + }, + { + "epoch": 0.11174900354700698, + "grad_norm": 0.040829512145039985, + "learning_rate": 7.944667639854148e-06, + "loss": 0.8344, + "step": 191 + }, + { + "epoch": 0.11233407686400702, + "grad_norm": 0.05068364591326354, + "learning_rate": 7.944088235033657e-06, + "loss": 0.9403, + "step": 192 + }, + { + "epoch": 0.11291915018100705, + "grad_norm": 0.0499939002215986, + "learning_rate": 7.943505836155753e-06, + "loss": 0.9475, + "step": 193 + }, + { + "epoch": 0.1135042234980071, + "grad_norm": 0.05407026250866459, + "learning_rate": 7.94292044371245e-06, + "loss": 0.9101, + "step": 194 + }, + { + "epoch": 0.11408929681500712, + "grad_norm": 0.06417314751489454, + "learning_rate": 7.94233205819829e-06, + "loss": 0.8787, + "step": 195 + }, + { + "epoch": 0.11467437013200717, + "grad_norm": 0.03871241656337873, + "learning_rate": 7.941740680110343e-06, + "loss": 0.9059, + "step": 196 + }, + { + "epoch": 0.11525944344900721, + "grad_norm": 0.041501526897382096, + "learning_rate": 7.941146309948205e-06, + "loss": 0.8946, + "step": 197 + }, + { + "epoch": 0.11584451676600724, + "grad_norm": 0.037938175729775744, + "learning_rate": 7.940548948214005e-06, + "loss": 0.8534, + "step": 198 + }, + { + "epoch": 0.11642959008300728, + "grad_norm": 0.038697315108935856, + "learning_rate": 7.939948595412394e-06, + "loss": 0.9671, + "step": 199 + }, + { + "epoch": 0.11701466340000731, + "grad_norm": 0.040128164685532784, + "learning_rate": 7.939345252050552e-06, + "loss": 0.9452, + "step": 200 + }, + { + "epoch": 0.11759973671700735, + "grad_norm": 0.04159138776071103, + "learning_rate": 7.938738918638187e-06, + "loss": 0.9525, + "step": 201 + }, + { + "epoch": 0.11818481003400738, + "grad_norm": 0.044356876333774484, + "learning_rate": 7.93812959568753e-06, + "loss": 0.8863, + "step": 202 + }, + { + "epoch": 0.11876988335100742, + "grad_norm": 0.04110203273326361, + "learning_rate": 7.93751728371334e-06, + "loss": 0.8585, + "step": 203 + }, + { + "epoch": 0.11935495666800745, + "grad_norm": 0.03907623380015084, + "learning_rate": 7.9369019832329e-06, + "loss": 0.8901, + "step": 204 + }, + { + "epoch": 0.1199400299850075, + "grad_norm": 0.06841747716076531, + "learning_rate": 7.936283694766016e-06, + "loss": 0.9382, + "step": 205 + }, + { + "epoch": 0.12052510330200754, + "grad_norm": 0.03864267029597622, + "learning_rate": 7.935662418835023e-06, + "loss": 0.9445, + "step": 206 + }, + { + "epoch": 0.12111017661900757, + "grad_norm": 0.04187477888552647, + "learning_rate": 7.935038155964775e-06, + "loss": 0.949, + "step": 207 + }, + { + "epoch": 0.12169524993600761, + "grad_norm": 0.03844046778107278, + "learning_rate": 7.934410906682653e-06, + "loss": 0.8185, + "step": 208 + }, + { + "epoch": 0.12228032325300764, + "grad_norm": 0.03894778215959397, + "learning_rate": 7.933780671518558e-06, + "loss": 0.8226, + "step": 209 + }, + { + "epoch": 0.12286539657000768, + "grad_norm": 0.05316470088962357, + "learning_rate": 7.933147451004914e-06, + "loss": 0.9149, + "step": 210 + }, + { + "epoch": 0.12345046988700771, + "grad_norm": 0.04054196015487159, + "learning_rate": 7.932511245676669e-06, + "loss": 0.907, + "step": 211 + }, + { + "epoch": 0.12403554320400775, + "grad_norm": 0.04197248747401694, + "learning_rate": 7.931872056071292e-06, + "loss": 0.8974, + "step": 212 + }, + { + "epoch": 0.12462061652100778, + "grad_norm": 0.04107563523275673, + "learning_rate": 7.931229882728771e-06, + "loss": 0.8758, + "step": 213 + }, + { + "epoch": 0.12520568983800784, + "grad_norm": 0.04898006391812857, + "learning_rate": 7.930584726191616e-06, + "loss": 0.9015, + "step": 214 + }, + { + "epoch": 0.12579076315500787, + "grad_norm": 0.053828637223887026, + "learning_rate": 7.92993658700486e-06, + "loss": 0.9095, + "step": 215 + }, + { + "epoch": 0.1263758364720079, + "grad_norm": 0.041596264391364365, + "learning_rate": 7.929285465716051e-06, + "loss": 0.9324, + "step": 216 + }, + { + "epoch": 0.12696090978900793, + "grad_norm": 0.04270202089537964, + "learning_rate": 7.928631362875258e-06, + "loss": 0.9712, + "step": 217 + }, + { + "epoch": 0.12754598310600798, + "grad_norm": 0.04508600982672278, + "learning_rate": 7.927974279035069e-06, + "loss": 0.8526, + "step": 218 + }, + { + "epoch": 0.128131056423008, + "grad_norm": 0.04210505656160864, + "learning_rate": 7.927314214750592e-06, + "loss": 0.8483, + "step": 219 + }, + { + "epoch": 0.12871612974000804, + "grad_norm": 0.04228456764136032, + "learning_rate": 7.926651170579451e-06, + "loss": 1.0049, + "step": 220 + }, + { + "epoch": 0.12930120305700807, + "grad_norm": 0.039866451292078504, + "learning_rate": 7.92598514708179e-06, + "loss": 0.9421, + "step": 221 + }, + { + "epoch": 0.12988627637400812, + "grad_norm": 0.037613132725661406, + "learning_rate": 7.925316144820263e-06, + "loss": 0.9474, + "step": 222 + }, + { + "epoch": 0.13047134969100815, + "grad_norm": 0.04168086524600805, + "learning_rate": 7.92464416436005e-06, + "loss": 0.9058, + "step": 223 + }, + { + "epoch": 0.13105642300800818, + "grad_norm": 0.039621964711338775, + "learning_rate": 7.923969206268839e-06, + "loss": 0.9086, + "step": 224 + }, + { + "epoch": 0.13164149632500824, + "grad_norm": 0.05209282643387043, + "learning_rate": 7.923291271116838e-06, + "loss": 0.9298, + "step": 225 + }, + { + "epoch": 0.13222656964200827, + "grad_norm": 0.044258185244179175, + "learning_rate": 7.92261035947677e-06, + "loss": 0.8925, + "step": 226 + }, + { + "epoch": 0.1328116429590083, + "grad_norm": 0.039844108391859055, + "learning_rate": 7.92192647192387e-06, + "loss": 0.8392, + "step": 227 + }, + { + "epoch": 0.13339671627600833, + "grad_norm": 0.044744651789733476, + "learning_rate": 7.92123960903589e-06, + "loss": 0.9329, + "step": 228 + }, + { + "epoch": 0.13398178959300838, + "grad_norm": 0.04494192821446448, + "learning_rate": 7.92054977139309e-06, + "loss": 0.8606, + "step": 229 + }, + { + "epoch": 0.1345668629100084, + "grad_norm": 0.05863838322698434, + "learning_rate": 7.919856959578252e-06, + "loss": 0.9302, + "step": 230 + }, + { + "epoch": 0.13515193622700844, + "grad_norm": 0.04085984556832983, + "learning_rate": 7.919161174176663e-06, + "loss": 0.9993, + "step": 231 + }, + { + "epoch": 0.1357370095440085, + "grad_norm": 0.04500674877406008, + "learning_rate": 7.918462415776125e-06, + "loss": 0.9377, + "step": 232 + }, + { + "epoch": 0.13632208286100853, + "grad_norm": 0.039173528958006375, + "learning_rate": 7.917760684966955e-06, + "loss": 0.813, + "step": 233 + }, + { + "epoch": 0.13690715617800855, + "grad_norm": 0.04298295272921228, + "learning_rate": 7.91705598234197e-06, + "loss": 0.8952, + "step": 234 + }, + { + "epoch": 0.13749222949500858, + "grad_norm": 0.04028408438122686, + "learning_rate": 7.916348308496513e-06, + "loss": 0.9051, + "step": 235 + }, + { + "epoch": 0.13807730281200864, + "grad_norm": 0.06743695497866435, + "learning_rate": 7.915637664028423e-06, + "loss": 0.9475, + "step": 236 + }, + { + "epoch": 0.13866237612900867, + "grad_norm": 0.03949575625475006, + "learning_rate": 7.914924049538061e-06, + "loss": 0.9316, + "step": 237 + }, + { + "epoch": 0.1392474494460087, + "grad_norm": 0.04030623921903529, + "learning_rate": 7.914207465628284e-06, + "loss": 0.885, + "step": 238 + }, + { + "epoch": 0.13983252276300873, + "grad_norm": 0.047528726409039795, + "learning_rate": 7.91348791290447e-06, + "loss": 0.918, + "step": 239 + }, + { + "epoch": 0.14041759608000878, + "grad_norm": 0.04466991571391728, + "learning_rate": 7.912765391974496e-06, + "loss": 0.9305, + "step": 240 + }, + { + "epoch": 0.1410026693970088, + "grad_norm": 0.04186868219039162, + "learning_rate": 7.912039903448752e-06, + "loss": 0.9415, + "step": 241 + }, + { + "epoch": 0.14158774271400884, + "grad_norm": 0.05801571080351748, + "learning_rate": 7.91131144794013e-06, + "loss": 0.8787, + "step": 242 + }, + { + "epoch": 0.1421728160310089, + "grad_norm": 0.045088484806881386, + "learning_rate": 7.910580026064038e-06, + "loss": 1.0604, + "step": 243 + }, + { + "epoch": 0.14275788934800893, + "grad_norm": 0.06574863742707004, + "learning_rate": 7.909845638438377e-06, + "loss": 0.9216, + "step": 244 + }, + { + "epoch": 0.14334296266500896, + "grad_norm": 0.0372095564143721, + "learning_rate": 7.909108285683563e-06, + "loss": 0.9336, + "step": 245 + }, + { + "epoch": 0.14392803598200898, + "grad_norm": 0.04397627871472844, + "learning_rate": 7.908367968422515e-06, + "loss": 0.9261, + "step": 246 + }, + { + "epoch": 0.14451310929900904, + "grad_norm": 0.04276892267104892, + "learning_rate": 7.907624687280654e-06, + "loss": 0.8673, + "step": 247 + }, + { + "epoch": 0.14509818261600907, + "grad_norm": 0.045304865664725656, + "learning_rate": 7.906878442885907e-06, + "loss": 0.9475, + "step": 248 + }, + { + "epoch": 0.1456832559330091, + "grad_norm": 0.04370076931511423, + "learning_rate": 7.906129235868702e-06, + "loss": 0.8433, + "step": 249 + }, + { + "epoch": 0.14626832925000915, + "grad_norm": 0.0672588807453601, + "learning_rate": 7.905377066861973e-06, + "loss": 0.9124, + "step": 250 + }, + { + "epoch": 0.14685340256700918, + "grad_norm": 0.046912566764935076, + "learning_rate": 7.904621936501156e-06, + "loss": 0.9044, + "step": 251 + }, + { + "epoch": 0.1474384758840092, + "grad_norm": 0.07278415346178851, + "learning_rate": 7.903863845424185e-06, + "loss": 0.901, + "step": 252 + }, + { + "epoch": 0.14802354920100924, + "grad_norm": 0.04218918671820467, + "learning_rate": 7.9031027942715e-06, + "loss": 0.9019, + "step": 253 + }, + { + "epoch": 0.1486086225180093, + "grad_norm": 0.03858325311022567, + "learning_rate": 7.90233878368604e-06, + "loss": 0.8601, + "step": 254 + }, + { + "epoch": 0.14919369583500933, + "grad_norm": 0.04209886442727145, + "learning_rate": 7.90157181431324e-06, + "loss": 0.8681, + "step": 255 + }, + { + "epoch": 0.14977876915200936, + "grad_norm": 0.049400812285918, + "learning_rate": 7.90080188680104e-06, + "loss": 0.9494, + "step": 256 + }, + { + "epoch": 0.15036384246900938, + "grad_norm": 0.047941941474994906, + "learning_rate": 7.900029001799882e-06, + "loss": 0.8439, + "step": 257 + }, + { + "epoch": 0.15094891578600944, + "grad_norm": 0.042101153780900284, + "learning_rate": 7.899253159962694e-06, + "loss": 0.917, + "step": 258 + }, + { + "epoch": 0.15153398910300947, + "grad_norm": 0.040583990176968884, + "learning_rate": 7.898474361944915e-06, + "loss": 0.9328, + "step": 259 + }, + { + "epoch": 0.1521190624200095, + "grad_norm": 0.04000469028913075, + "learning_rate": 7.897692608404474e-06, + "loss": 0.9454, + "step": 260 + }, + { + "epoch": 0.15270413573700956, + "grad_norm": 0.04819787772789072, + "learning_rate": 7.8969079000018e-06, + "loss": 1.0061, + "step": 261 + }, + { + "epoch": 0.15328920905400958, + "grad_norm": 0.04670811295251141, + "learning_rate": 7.896120237399817e-06, + "loss": 0.8678, + "step": 262 + }, + { + "epoch": 0.1538742823710096, + "grad_norm": 0.03876277230266676, + "learning_rate": 7.895329621263945e-06, + "loss": 0.9465, + "step": 263 + }, + { + "epoch": 0.15445935568800964, + "grad_norm": 0.06917560168966953, + "learning_rate": 7.894536052262098e-06, + "loss": 0.9114, + "step": 264 + }, + { + "epoch": 0.1550444290050097, + "grad_norm": 0.052197549375990714, + "learning_rate": 7.893739531064688e-06, + "loss": 0.8966, + "step": 265 + }, + { + "epoch": 0.15562950232200973, + "grad_norm": 0.05089520469658166, + "learning_rate": 7.892940058344615e-06, + "loss": 0.8403, + "step": 266 + }, + { + "epoch": 0.15621457563900976, + "grad_norm": 0.04200303069403226, + "learning_rate": 7.89213763477728e-06, + "loss": 0.7954, + "step": 267 + }, + { + "epoch": 0.1567996489560098, + "grad_norm": 0.04045997821257577, + "learning_rate": 7.89133226104057e-06, + "loss": 0.9484, + "step": 268 + }, + { + "epoch": 0.15738472227300984, + "grad_norm": 0.04355833968928891, + "learning_rate": 7.890523937814872e-06, + "loss": 0.8871, + "step": 269 + }, + { + "epoch": 0.15796979559000987, + "grad_norm": 0.038980722774793516, + "learning_rate": 7.889712665783055e-06, + "loss": 0.8242, + "step": 270 + }, + { + "epoch": 0.1585548689070099, + "grad_norm": 0.0417431555190703, + "learning_rate": 7.888898445630486e-06, + "loss": 0.918, + "step": 271 + }, + { + "epoch": 0.15913994222400996, + "grad_norm": 0.04420422353440596, + "learning_rate": 7.888081278045022e-06, + "loss": 0.9345, + "step": 272 + }, + { + "epoch": 0.15972501554100998, + "grad_norm": 0.043453256463112454, + "learning_rate": 7.88726116371701e-06, + "loss": 0.8439, + "step": 273 + }, + { + "epoch": 0.16031008885801, + "grad_norm": 0.04018335492133053, + "learning_rate": 7.88643810333928e-06, + "loss": 0.9024, + "step": 274 + }, + { + "epoch": 0.16089516217501004, + "grad_norm": 0.03885288803364117, + "learning_rate": 7.885612097607161e-06, + "loss": 0.9005, + "step": 275 + }, + { + "epoch": 0.1614802354920101, + "grad_norm": 0.03940189643105726, + "learning_rate": 7.884783147218464e-06, + "loss": 0.8726, + "step": 276 + }, + { + "epoch": 0.16206530880901013, + "grad_norm": 0.04322545289358093, + "learning_rate": 7.88395125287349e-06, + "loss": 0.9309, + "step": 277 + }, + { + "epoch": 0.16265038212601016, + "grad_norm": 0.03991615308835092, + "learning_rate": 7.883116415275022e-06, + "loss": 0.9319, + "step": 278 + }, + { + "epoch": 0.1632354554430102, + "grad_norm": 0.04774591346437184, + "learning_rate": 7.882278635128339e-06, + "loss": 0.9976, + "step": 279 + }, + { + "epoch": 0.16382052876001024, + "grad_norm": 0.043726280435400645, + "learning_rate": 7.881437913141196e-06, + "loss": 0.9041, + "step": 280 + }, + { + "epoch": 0.16440560207701027, + "grad_norm": 0.05349577122745657, + "learning_rate": 7.880594250023842e-06, + "loss": 0.9109, + "step": 281 + }, + { + "epoch": 0.1649906753940103, + "grad_norm": 0.04664425873865413, + "learning_rate": 7.879747646489002e-06, + "loss": 0.8872, + "step": 282 + }, + { + "epoch": 0.16557574871101036, + "grad_norm": 0.04689991827127376, + "learning_rate": 7.878898103251891e-06, + "loss": 0.946, + "step": 283 + }, + { + "epoch": 0.16616082202801039, + "grad_norm": 0.04419788368438809, + "learning_rate": 7.87804562103021e-06, + "loss": 0.8699, + "step": 284 + }, + { + "epoch": 0.16674589534501041, + "grad_norm": 0.0498909584325992, + "learning_rate": 7.877190200544131e-06, + "loss": 0.8396, + "step": 285 + }, + { + "epoch": 0.16733096866201047, + "grad_norm": 0.04446094526551824, + "learning_rate": 7.876331842516323e-06, + "loss": 0.887, + "step": 286 + }, + { + "epoch": 0.1679160419790105, + "grad_norm": 0.06094680175561847, + "learning_rate": 7.875470547671926e-06, + "loss": 0.8834, + "step": 287 + }, + { + "epoch": 0.16850111529601053, + "grad_norm": 0.038876474999689326, + "learning_rate": 7.874606316738566e-06, + "loss": 0.8975, + "step": 288 + }, + { + "epoch": 0.16908618861301056, + "grad_norm": 0.04076135396799628, + "learning_rate": 7.873739150446349e-06, + "loss": 0.9094, + "step": 289 + }, + { + "epoch": 0.16967126193001061, + "grad_norm": 0.04242085203333459, + "learning_rate": 7.872869049527855e-06, + "loss": 0.9346, + "step": 290 + }, + { + "epoch": 0.17025633524701064, + "grad_norm": 0.04192270928126719, + "learning_rate": 7.871996014718154e-06, + "loss": 0.916, + "step": 291 + }, + { + "epoch": 0.17084140856401067, + "grad_norm": 0.06296131776401025, + "learning_rate": 7.871120046754787e-06, + "loss": 0.7869, + "step": 292 + }, + { + "epoch": 0.1714264818810107, + "grad_norm": 0.04208658542590707, + "learning_rate": 7.870241146377773e-06, + "loss": 0.863, + "step": 293 + }, + { + "epoch": 0.17201155519801076, + "grad_norm": 0.04221040890826715, + "learning_rate": 7.869359314329613e-06, + "loss": 0.8125, + "step": 294 + }, + { + "epoch": 0.17259662851501079, + "grad_norm": 0.044062682914531, + "learning_rate": 7.868474551355277e-06, + "loss": 0.8283, + "step": 295 + }, + { + "epoch": 0.17318170183201081, + "grad_norm": 0.053136044791769796, + "learning_rate": 7.867586858202221e-06, + "loss": 0.9321, + "step": 296 + }, + { + "epoch": 0.17376677514901087, + "grad_norm": 0.041998835388755755, + "learning_rate": 7.866696235620367e-06, + "loss": 0.9435, + "step": 297 + }, + { + "epoch": 0.1743518484660109, + "grad_norm": 0.04432061620173052, + "learning_rate": 7.865802684362119e-06, + "loss": 0.944, + "step": 298 + }, + { + "epoch": 0.17493692178301093, + "grad_norm": 0.03816132379303917, + "learning_rate": 7.864906205182347e-06, + "loss": 0.9222, + "step": 299 + }, + { + "epoch": 0.17552199510001096, + "grad_norm": 0.04061878988742196, + "learning_rate": 7.864006798838405e-06, + "loss": 0.9344, + "step": 300 + }, + { + "epoch": 0.17610706841701101, + "grad_norm": 0.038725999488975066, + "learning_rate": 7.863104466090113e-06, + "loss": 0.9477, + "step": 301 + }, + { + "epoch": 0.17669214173401104, + "grad_norm": 0.0675670307551766, + "learning_rate": 7.862199207699763e-06, + "loss": 0.8939, + "step": 302 + }, + { + "epoch": 0.17727721505101107, + "grad_norm": 0.04866290151842816, + "learning_rate": 7.861291024432122e-06, + "loss": 0.944, + "step": 303 + }, + { + "epoch": 0.17786228836801113, + "grad_norm": 0.04231401792052211, + "learning_rate": 7.860379917054426e-06, + "loss": 0.9108, + "step": 304 + }, + { + "epoch": 0.17844736168501116, + "grad_norm": 0.21645206187990054, + "learning_rate": 7.859465886336381e-06, + "loss": 0.9328, + "step": 305 + }, + { + "epoch": 0.1790324350020112, + "grad_norm": 0.04212212891416765, + "learning_rate": 7.858548933050162e-06, + "loss": 0.9755, + "step": 306 + }, + { + "epoch": 0.17961750831901122, + "grad_norm": 0.047331820432207364, + "learning_rate": 7.857629057970417e-06, + "loss": 0.8702, + "step": 307 + }, + { + "epoch": 0.18020258163601127, + "grad_norm": 0.040260553510288316, + "learning_rate": 7.856706261874258e-06, + "loss": 0.8934, + "step": 308 + }, + { + "epoch": 0.1807876549530113, + "grad_norm": 0.04326897093604986, + "learning_rate": 7.855780545541264e-06, + "loss": 0.877, + "step": 309 + }, + { + "epoch": 0.18137272827001133, + "grad_norm": 0.03936456606263684, + "learning_rate": 7.854851909753487e-06, + "loss": 0.9206, + "step": 310 + }, + { + "epoch": 0.18195780158701136, + "grad_norm": 0.040529920237504666, + "learning_rate": 7.853920355295438e-06, + "loss": 0.8469, + "step": 311 + }, + { + "epoch": 0.18254287490401142, + "grad_norm": 0.07984199172508148, + "learning_rate": 7.852985882954102e-06, + "loss": 0.856, + "step": 312 + }, + { + "epoch": 0.18312794822101144, + "grad_norm": 0.04341308255910565, + "learning_rate": 7.85204849351892e-06, + "loss": 0.8975, + "step": 313 + }, + { + "epoch": 0.18371302153801147, + "grad_norm": 0.05163482815007264, + "learning_rate": 7.851108187781802e-06, + "loss": 0.8516, + "step": 314 + }, + { + "epoch": 0.18429809485501153, + "grad_norm": 0.04363599571939765, + "learning_rate": 7.850164966537124e-06, + "loss": 0.9088, + "step": 315 + }, + { + "epoch": 0.18488316817201156, + "grad_norm": 0.043024652012164384, + "learning_rate": 7.84921883058172e-06, + "loss": 0.8291, + "step": 316 + }, + { + "epoch": 0.1854682414890116, + "grad_norm": 0.041659608271311335, + "learning_rate": 7.848269780714892e-06, + "loss": 0.9719, + "step": 317 + }, + { + "epoch": 0.18605331480601162, + "grad_norm": 0.04557805725835434, + "learning_rate": 7.847317817738394e-06, + "loss": 0.9638, + "step": 318 + }, + { + "epoch": 0.18663838812301167, + "grad_norm": 0.04388413396186285, + "learning_rate": 7.846362942456455e-06, + "loss": 0.93, + "step": 319 + }, + { + "epoch": 0.1872234614400117, + "grad_norm": 0.04052950822615268, + "learning_rate": 7.845405155675752e-06, + "loss": 0.8951, + "step": 320 + }, + { + "epoch": 0.18780853475701173, + "grad_norm": 0.03852369264523642, + "learning_rate": 7.844444458205428e-06, + "loss": 0.8521, + "step": 321 + }, + { + "epoch": 0.1883936080740118, + "grad_norm": 0.045799254500923765, + "learning_rate": 7.843480850857083e-06, + "loss": 0.8966, + "step": 322 + }, + { + "epoch": 0.18897868139101182, + "grad_norm": 0.04373024374645557, + "learning_rate": 7.842514334444776e-06, + "loss": 0.973, + "step": 323 + }, + { + "epoch": 0.18956375470801184, + "grad_norm": 0.037372310076302304, + "learning_rate": 7.841544909785022e-06, + "loss": 1.0232, + "step": 324 + }, + { + "epoch": 0.19014882802501187, + "grad_norm": 0.044911079932608254, + "learning_rate": 7.840572577696798e-06, + "loss": 0.8351, + "step": 325 + }, + { + "epoch": 0.19073390134201193, + "grad_norm": 0.04820487217400144, + "learning_rate": 7.839597339001529e-06, + "loss": 0.9381, + "step": 326 + }, + { + "epoch": 0.19131897465901196, + "grad_norm": 0.04460622903204633, + "learning_rate": 7.8386191945231e-06, + "loss": 0.9047, + "step": 327 + }, + { + "epoch": 0.191904047976012, + "grad_norm": 0.04344227987010105, + "learning_rate": 7.837638145087855e-06, + "loss": 0.8882, + "step": 328 + }, + { + "epoch": 0.19248912129301204, + "grad_norm": 0.041162516007716705, + "learning_rate": 7.836654191524583e-06, + "loss": 0.888, + "step": 329 + }, + { + "epoch": 0.19307419461001207, + "grad_norm": 0.050217278727892056, + "learning_rate": 7.835667334664533e-06, + "loss": 0.9425, + "step": 330 + }, + { + "epoch": 0.1936592679270121, + "grad_norm": 0.045239661384039016, + "learning_rate": 7.834677575341407e-06, + "loss": 0.845, + "step": 331 + }, + { + "epoch": 0.19424434124401213, + "grad_norm": 0.04069762643444013, + "learning_rate": 7.833684914391354e-06, + "loss": 0.9045, + "step": 332 + }, + { + "epoch": 0.1948294145610122, + "grad_norm": 0.03659391253836006, + "learning_rate": 7.832689352652978e-06, + "loss": 0.8415, + "step": 333 + }, + { + "epoch": 0.19541448787801222, + "grad_norm": 0.0478253523475305, + "learning_rate": 7.831690890967332e-06, + "loss": 0.9023, + "step": 334 + }, + { + "epoch": 0.19599956119501225, + "grad_norm": 0.03931532297283958, + "learning_rate": 7.830689530177923e-06, + "loss": 0.8757, + "step": 335 + }, + { + "epoch": 0.19658463451201227, + "grad_norm": 0.04480666786944768, + "learning_rate": 7.8296852711307e-06, + "loss": 0.8393, + "step": 336 + }, + { + "epoch": 0.19716970782901233, + "grad_norm": 0.0734058976406723, + "learning_rate": 7.828678114674066e-06, + "loss": 0.9038, + "step": 337 + }, + { + "epoch": 0.19775478114601236, + "grad_norm": 0.03967258990394233, + "learning_rate": 7.827668061658871e-06, + "loss": 0.8009, + "step": 338 + }, + { + "epoch": 0.1983398544630124, + "grad_norm": 0.0418553799297778, + "learning_rate": 7.82665511293841e-06, + "loss": 0.8865, + "step": 339 + }, + { + "epoch": 0.19892492778001244, + "grad_norm": 0.0384561013270465, + "learning_rate": 7.825639269368426e-06, + "loss": 0.872, + "step": 340 + }, + { + "epoch": 0.19951000109701247, + "grad_norm": 0.04182210062250734, + "learning_rate": 7.824620531807106e-06, + "loss": 0.8974, + "step": 341 + }, + { + "epoch": 0.2000950744140125, + "grad_norm": 0.0402445680829306, + "learning_rate": 7.823598901115085e-06, + "loss": 0.8017, + "step": 342 + }, + { + "epoch": 0.20068014773101253, + "grad_norm": 0.03976795416941979, + "learning_rate": 7.822574378155436e-06, + "loss": 0.9298, + "step": 343 + }, + { + "epoch": 0.2012652210480126, + "grad_norm": 0.043031163568613286, + "learning_rate": 7.821546963793683e-06, + "loss": 0.9508, + "step": 344 + }, + { + "epoch": 0.20185029436501262, + "grad_norm": 0.0485982687560341, + "learning_rate": 7.82051665889779e-06, + "loss": 0.8536, + "step": 345 + }, + { + "epoch": 0.20243536768201265, + "grad_norm": 0.04149899672912665, + "learning_rate": 7.819483464338156e-06, + "loss": 0.8767, + "step": 346 + }, + { + "epoch": 0.2030204409990127, + "grad_norm": 0.0971131140419449, + "learning_rate": 7.818447380987634e-06, + "loss": 0.9271, + "step": 347 + }, + { + "epoch": 0.20360551431601273, + "grad_norm": 0.04183099462603424, + "learning_rate": 7.817408409721506e-06, + "loss": 0.8362, + "step": 348 + }, + { + "epoch": 0.20419058763301276, + "grad_norm": 0.03719681766847752, + "learning_rate": 7.8163665514175e-06, + "loss": 0.8544, + "step": 349 + }, + { + "epoch": 0.2047756609500128, + "grad_norm": 0.08061763199476867, + "learning_rate": 7.815321806955782e-06, + "loss": 0.8335, + "step": 350 + }, + { + "epoch": 0.20536073426701285, + "grad_norm": 0.053441055551360056, + "learning_rate": 7.814274177218955e-06, + "loss": 0.8602, + "step": 351 + }, + { + "epoch": 0.20594580758401287, + "grad_norm": 0.1775803458323184, + "learning_rate": 7.81322366309206e-06, + "loss": 0.8998, + "step": 352 + }, + { + "epoch": 0.2065308809010129, + "grad_norm": 0.0637002971995377, + "learning_rate": 7.812170265462573e-06, + "loss": 0.9737, + "step": 353 + }, + { + "epoch": 0.20711595421801293, + "grad_norm": 0.04274639590829472, + "learning_rate": 7.81111398522041e-06, + "loss": 1.009, + "step": 354 + }, + { + "epoch": 0.207701027535013, + "grad_norm": 0.041282961278004064, + "learning_rate": 7.81005482325792e-06, + "loss": 0.9035, + "step": 355 + }, + { + "epoch": 0.20828610085201302, + "grad_norm": 0.04195399372218902, + "learning_rate": 7.808992780469889e-06, + "loss": 0.9128, + "step": 356 + }, + { + "epoch": 0.20887117416901305, + "grad_norm": 0.03698717701423449, + "learning_rate": 7.807927857753527e-06, + "loss": 0.8562, + "step": 357 + }, + { + "epoch": 0.2094562474860131, + "grad_norm": 0.1081591722204418, + "learning_rate": 7.80686005600849e-06, + "loss": 0.8931, + "step": 358 + }, + { + "epoch": 0.21004132080301313, + "grad_norm": 0.04152721891763337, + "learning_rate": 7.80578937613686e-06, + "loss": 0.8633, + "step": 359 + }, + { + "epoch": 0.21062639412001316, + "grad_norm": 0.04200660652374662, + "learning_rate": 7.804715819043148e-06, + "loss": 0.8773, + "step": 360 + }, + { + "epoch": 0.2112114674370132, + "grad_norm": 0.04405111681557889, + "learning_rate": 7.803639385634302e-06, + "loss": 0.8587, + "step": 361 + }, + { + "epoch": 0.21179654075401325, + "grad_norm": 0.04571477197647962, + "learning_rate": 7.802560076819694e-06, + "loss": 0.8334, + "step": 362 + }, + { + "epoch": 0.21238161407101327, + "grad_norm": 0.041607118224879065, + "learning_rate": 7.80147789351113e-06, + "loss": 0.8739, + "step": 363 + }, + { + "epoch": 0.2129666873880133, + "grad_norm": 0.06194034993642153, + "learning_rate": 7.800392836622838e-06, + "loss": 0.8956, + "step": 364 + }, + { + "epoch": 0.21355176070501336, + "grad_norm": 0.03798806008002444, + "learning_rate": 7.79930490707148e-06, + "loss": 0.8966, + "step": 365 + }, + { + "epoch": 0.2141368340220134, + "grad_norm": 0.04739784630899101, + "learning_rate": 7.798214105776146e-06, + "loss": 0.9552, + "step": 366 + }, + { + "epoch": 0.21472190733901342, + "grad_norm": 0.04497114482691456, + "learning_rate": 7.797120433658343e-06, + "loss": 0.8666, + "step": 367 + }, + { + "epoch": 0.21530698065601345, + "grad_norm": 0.04176901867071411, + "learning_rate": 7.796023891642011e-06, + "loss": 0.9051, + "step": 368 + }, + { + "epoch": 0.2158920539730135, + "grad_norm": 0.056698801623745465, + "learning_rate": 7.794924480653513e-06, + "loss": 0.8745, + "step": 369 + }, + { + "epoch": 0.21647712729001353, + "grad_norm": 0.04267067239376988, + "learning_rate": 7.793822201621633e-06, + "loss": 0.9129, + "step": 370 + }, + { + "epoch": 0.21706220060701356, + "grad_norm": 0.03999203881276761, + "learning_rate": 7.79271705547758e-06, + "loss": 0.8814, + "step": 371 + }, + { + "epoch": 0.2176472739240136, + "grad_norm": 0.04206496870855173, + "learning_rate": 7.79160904315499e-06, + "loss": 0.8936, + "step": 372 + }, + { + "epoch": 0.21823234724101365, + "grad_norm": 0.040567992793796616, + "learning_rate": 7.79049816558991e-06, + "loss": 0.8961, + "step": 373 + }, + { + "epoch": 0.21881742055801368, + "grad_norm": 0.04165915715538525, + "learning_rate": 7.789384423720815e-06, + "loss": 0.901, + "step": 374 + }, + { + "epoch": 0.2194024938750137, + "grad_norm": 0.04311161567240108, + "learning_rate": 7.788267818488597e-06, + "loss": 0.8571, + "step": 375 + }, + { + "epoch": 0.21998756719201376, + "grad_norm": 0.04090984120457054, + "learning_rate": 7.78714835083657e-06, + "loss": 0.879, + "step": 376 + }, + { + "epoch": 0.2205726405090138, + "grad_norm": 0.03723653818234615, + "learning_rate": 7.786026021710462e-06, + "loss": 0.8687, + "step": 377 + }, + { + "epoch": 0.22115771382601382, + "grad_norm": 0.040732056413017595, + "learning_rate": 7.78490083205842e-06, + "loss": 0.9033, + "step": 378 + }, + { + "epoch": 0.22174278714301385, + "grad_norm": 0.03755340315603773, + "learning_rate": 7.783772782831008e-06, + "loss": 0.8919, + "step": 379 + }, + { + "epoch": 0.2223278604600139, + "grad_norm": 0.04513035465018611, + "learning_rate": 7.782641874981207e-06, + "loss": 0.8766, + "step": 380 + }, + { + "epoch": 0.22291293377701393, + "grad_norm": 0.042927929340526826, + "learning_rate": 7.78150810946441e-06, + "loss": 0.8692, + "step": 381 + }, + { + "epoch": 0.22349800709401396, + "grad_norm": 0.03548281821425231, + "learning_rate": 7.780371487238428e-06, + "loss": 0.7295, + "step": 382 + }, + { + "epoch": 0.22408308041101402, + "grad_norm": 0.03978070402906236, + "learning_rate": 7.779232009263484e-06, + "loss": 0.8555, + "step": 383 + }, + { + "epoch": 0.22466815372801405, + "grad_norm": 0.036709565386030436, + "learning_rate": 7.778089676502209e-06, + "loss": 0.7492, + "step": 384 + }, + { + "epoch": 0.22525322704501408, + "grad_norm": 0.04547788495521431, + "learning_rate": 7.776944489919649e-06, + "loss": 0.8334, + "step": 385 + }, + { + "epoch": 0.2258383003620141, + "grad_norm": 0.04220442842369723, + "learning_rate": 7.775796450483267e-06, + "loss": 0.8244, + "step": 386 + }, + { + "epoch": 0.22642337367901416, + "grad_norm": 0.04778060193840243, + "learning_rate": 7.774645559162927e-06, + "loss": 0.8511, + "step": 387 + }, + { + "epoch": 0.2270084469960142, + "grad_norm": 0.05014682109980806, + "learning_rate": 7.773491816930904e-06, + "loss": 0.8334, + "step": 388 + }, + { + "epoch": 0.22759352031301422, + "grad_norm": 0.04001986651909544, + "learning_rate": 7.772335224761886e-06, + "loss": 0.8224, + "step": 389 + }, + { + "epoch": 0.22817859363001425, + "grad_norm": 0.040094874463681825, + "learning_rate": 7.771175783632966e-06, + "loss": 0.9069, + "step": 390 + }, + { + "epoch": 0.2287636669470143, + "grad_norm": 0.03678306586668835, + "learning_rate": 7.770013494523641e-06, + "loss": 0.8758, + "step": 391 + }, + { + "epoch": 0.22934874026401433, + "grad_norm": 0.04444193067206238, + "learning_rate": 7.768848358415819e-06, + "loss": 0.946, + "step": 392 + }, + { + "epoch": 0.22993381358101436, + "grad_norm": 0.042386218395158284, + "learning_rate": 7.767680376293811e-06, + "loss": 0.8395, + "step": 393 + }, + { + "epoch": 0.23051888689801442, + "grad_norm": 0.051874603825963005, + "learning_rate": 7.766509549144332e-06, + "loss": 0.8867, + "step": 394 + }, + { + "epoch": 0.23110396021501445, + "grad_norm": 0.04115935591406729, + "learning_rate": 7.765335877956498e-06, + "loss": 0.8509, + "step": 395 + }, + { + "epoch": 0.23168903353201448, + "grad_norm": 0.045377388296697053, + "learning_rate": 7.764159363721833e-06, + "loss": 0.8783, + "step": 396 + }, + { + "epoch": 0.2322741068490145, + "grad_norm": 0.03845331275197764, + "learning_rate": 7.762980007434261e-06, + "loss": 0.8721, + "step": 397 + }, + { + "epoch": 0.23285918016601456, + "grad_norm": 0.06130607399623932, + "learning_rate": 7.761797810090103e-06, + "loss": 0.896, + "step": 398 + }, + { + "epoch": 0.2334442534830146, + "grad_norm": 0.0653248890033106, + "learning_rate": 7.760612772688086e-06, + "loss": 0.9239, + "step": 399 + }, + { + "epoch": 0.23402932680001462, + "grad_norm": 0.04386903271065406, + "learning_rate": 7.759424896229329e-06, + "loss": 0.9055, + "step": 400 + }, + { + "epoch": 0.23461440011701468, + "grad_norm": 0.05112057938888221, + "learning_rate": 7.758234181717359e-06, + "loss": 0.8179, + "step": 401 + }, + { + "epoch": 0.2351994734340147, + "grad_norm": 0.03919834965990091, + "learning_rate": 7.757040630158094e-06, + "loss": 0.9131, + "step": 402 + }, + { + "epoch": 0.23578454675101473, + "grad_norm": 0.04080472273046829, + "learning_rate": 7.75584424255985e-06, + "loss": 0.8772, + "step": 403 + }, + { + "epoch": 0.23636962006801476, + "grad_norm": 0.036834620421617906, + "learning_rate": 7.754645019933338e-06, + "loss": 0.8155, + "step": 404 + }, + { + "epoch": 0.23695469338501482, + "grad_norm": 0.03654317750178369, + "learning_rate": 7.753442963291668e-06, + "loss": 0.8346, + "step": 405 + }, + { + "epoch": 0.23753976670201485, + "grad_norm": 0.04519798929105044, + "learning_rate": 7.752238073650339e-06, + "loss": 0.912, + "step": 406 + }, + { + "epoch": 0.23812484001901488, + "grad_norm": 0.03826695253269902, + "learning_rate": 7.751030352027246e-06, + "loss": 0.8772, + "step": 407 + }, + { + "epoch": 0.2387099133360149, + "grad_norm": 0.04486993387188878, + "learning_rate": 7.749819799442676e-06, + "loss": 0.8826, + "step": 408 + }, + { + "epoch": 0.23929498665301496, + "grad_norm": 0.0776681654995442, + "learning_rate": 7.74860641691931e-06, + "loss": 0.8987, + "step": 409 + }, + { + "epoch": 0.239880059970015, + "grad_norm": 0.05169098865704706, + "learning_rate": 7.747390205482216e-06, + "loss": 0.7904, + "step": 410 + }, + { + "epoch": 0.24046513328701502, + "grad_norm": 0.042053672534972886, + "learning_rate": 7.746171166158855e-06, + "loss": 0.9188, + "step": 411 + }, + { + "epoch": 0.24105020660401508, + "grad_norm": 0.044614916290635534, + "learning_rate": 7.744949299979071e-06, + "loss": 0.9118, + "step": 412 + }, + { + "epoch": 0.2416352799210151, + "grad_norm": 0.04594381480411999, + "learning_rate": 7.743724607975105e-06, + "loss": 0.8547, + "step": 413 + }, + { + "epoch": 0.24222035323801513, + "grad_norm": 0.037664202042716706, + "learning_rate": 7.742497091181578e-06, + "loss": 0.8446, + "step": 414 + }, + { + "epoch": 0.24280542655501516, + "grad_norm": 0.04794778502796113, + "learning_rate": 7.741266750635502e-06, + "loss": 0.897, + "step": 415 + }, + { + "epoch": 0.24339049987201522, + "grad_norm": 0.04188595026753968, + "learning_rate": 7.740033587376275e-06, + "loss": 0.9061, + "step": 416 + }, + { + "epoch": 0.24397557318901525, + "grad_norm": 0.04407359629305258, + "learning_rate": 7.738797602445671e-06, + "loss": 0.9146, + "step": 417 + }, + { + "epoch": 0.24456064650601528, + "grad_norm": 0.04077311126531583, + "learning_rate": 7.73755879688786e-06, + "loss": 0.8515, + "step": 418 + }, + { + "epoch": 0.24514571982301533, + "grad_norm": 0.055784116703384304, + "learning_rate": 7.736317171749385e-06, + "loss": 0.851, + "step": 419 + }, + { + "epoch": 0.24573079314001536, + "grad_norm": 0.049743142106565615, + "learning_rate": 7.735072728079179e-06, + "loss": 0.8718, + "step": 420 + }, + { + "epoch": 0.2463158664570154, + "grad_norm": 0.04470234941233142, + "learning_rate": 7.73382546692855e-06, + "loss": 0.9624, + "step": 421 + }, + { + "epoch": 0.24690093977401542, + "grad_norm": 0.040010409109375616, + "learning_rate": 7.732575389351187e-06, + "loss": 0.8925, + "step": 422 + }, + { + "epoch": 0.24748601309101548, + "grad_norm": 0.04513853855792064, + "learning_rate": 7.731322496403161e-06, + "loss": 0.8163, + "step": 423 + }, + { + "epoch": 0.2480710864080155, + "grad_norm": 0.04305001532204258, + "learning_rate": 7.730066789142922e-06, + "loss": 0.818, + "step": 424 + }, + { + "epoch": 0.24865615972501554, + "grad_norm": 0.0381328984157233, + "learning_rate": 7.728808268631291e-06, + "loss": 0.8655, + "step": 425 + }, + { + "epoch": 0.24924123304201556, + "grad_norm": 0.04141777979618345, + "learning_rate": 7.727546935931473e-06, + "loss": 0.8447, + "step": 426 + }, + { + "epoch": 0.24982630635901562, + "grad_norm": 0.0441532278076972, + "learning_rate": 7.726282792109049e-06, + "loss": 0.7839, + "step": 427 + }, + { + "epoch": 0.2504113796760157, + "grad_norm": 0.04369415131884431, + "learning_rate": 7.725015838231966e-06, + "loss": 0.8375, + "step": 428 + }, + { + "epoch": 0.2509964529930157, + "grad_norm": 0.04101978650087937, + "learning_rate": 7.723746075370553e-06, + "loss": 0.8001, + "step": 429 + }, + { + "epoch": 0.25158152631001574, + "grad_norm": 0.04974984131766339, + "learning_rate": 7.722473504597512e-06, + "loss": 0.8914, + "step": 430 + }, + { + "epoch": 0.25216659962701576, + "grad_norm": 0.042172544165836386, + "learning_rate": 7.721198126987914e-06, + "loss": 0.8266, + "step": 431 + }, + { + "epoch": 0.2527516729440158, + "grad_norm": 0.04137736709510405, + "learning_rate": 7.719919943619202e-06, + "loss": 0.9043, + "step": 432 + }, + { + "epoch": 0.2533367462610158, + "grad_norm": 0.04296090636863713, + "learning_rate": 7.718638955571187e-06, + "loss": 0.8638, + "step": 433 + }, + { + "epoch": 0.25392181957801585, + "grad_norm": 0.04617407215473608, + "learning_rate": 7.717355163926054e-06, + "loss": 0.8223, + "step": 434 + }, + { + "epoch": 0.2545068928950159, + "grad_norm": 0.05470709039494341, + "learning_rate": 7.716068569768357e-06, + "loss": 0.8702, + "step": 435 + }, + { + "epoch": 0.25509196621201596, + "grad_norm": 0.05481374522257052, + "learning_rate": 7.714779174185011e-06, + "loss": 0.8644, + "step": 436 + }, + { + "epoch": 0.255677039529016, + "grad_norm": 0.047154937968022254, + "learning_rate": 7.713486978265303e-06, + "loss": 0.8656, + "step": 437 + }, + { + "epoch": 0.256262112846016, + "grad_norm": 0.04351286538068476, + "learning_rate": 7.712191983100885e-06, + "loss": 0.9139, + "step": 438 + }, + { + "epoch": 0.25684718616301605, + "grad_norm": 0.036225679985398614, + "learning_rate": 7.710894189785773e-06, + "loss": 0.8589, + "step": 439 + }, + { + "epoch": 0.2574322594800161, + "grad_norm": 0.036619323150318614, + "learning_rate": 7.709593599416346e-06, + "loss": 0.8355, + "step": 440 + }, + { + "epoch": 0.2580173327970161, + "grad_norm": 0.052556029410115106, + "learning_rate": 7.708290213091348e-06, + "loss": 0.9407, + "step": 441 + }, + { + "epoch": 0.25860240611401614, + "grad_norm": 0.04045383405114572, + "learning_rate": 7.706984031911884e-06, + "loss": 0.8224, + "step": 442 + }, + { + "epoch": 0.2591874794310162, + "grad_norm": 0.03850958419485924, + "learning_rate": 7.705675056981419e-06, + "loss": 0.8289, + "step": 443 + }, + { + "epoch": 0.25977255274801625, + "grad_norm": 0.042025364903943374, + "learning_rate": 7.704363289405782e-06, + "loss": 0.8295, + "step": 444 + }, + { + "epoch": 0.2603576260650163, + "grad_norm": 0.18295957147270098, + "learning_rate": 7.703048730293156e-06, + "loss": 0.8591, + "step": 445 + }, + { + "epoch": 0.2609426993820163, + "grad_norm": 0.08559741878328085, + "learning_rate": 7.701731380754086e-06, + "loss": 0.8841, + "step": 446 + }, + { + "epoch": 0.26152777269901634, + "grad_norm": 0.04030996945096956, + "learning_rate": 7.700411241901473e-06, + "loss": 0.927, + "step": 447 + }, + { + "epoch": 0.26211284601601637, + "grad_norm": 0.04080969851872162, + "learning_rate": 7.699088314850574e-06, + "loss": 0.8448, + "step": 448 + }, + { + "epoch": 0.2626979193330164, + "grad_norm": 0.0378951860436084, + "learning_rate": 7.697762600719002e-06, + "loss": 0.8806, + "step": 449 + }, + { + "epoch": 0.2632829926500165, + "grad_norm": 0.03623506747860633, + "learning_rate": 7.696434100626727e-06, + "loss": 0.8899, + "step": 450 + }, + { + "epoch": 0.2638680659670165, + "grad_norm": 0.03949638769998007, + "learning_rate": 7.695102815696068e-06, + "loss": 0.896, + "step": 451 + }, + { + "epoch": 0.26445313928401654, + "grad_norm": 0.039703852605046996, + "learning_rate": 7.6937687470517e-06, + "loss": 0.9092, + "step": 452 + }, + { + "epoch": 0.26503821260101657, + "grad_norm": 0.04067815287205374, + "learning_rate": 7.692431895820648e-06, + "loss": 0.8948, + "step": 453 + }, + { + "epoch": 0.2656232859180166, + "grad_norm": 0.0498733499092084, + "learning_rate": 7.691092263132289e-06, + "loss": 0.8407, + "step": 454 + }, + { + "epoch": 0.2662083592350166, + "grad_norm": 0.06147495248603913, + "learning_rate": 7.689749850118347e-06, + "loss": 0.8343, + "step": 455 + }, + { + "epoch": 0.26679343255201665, + "grad_norm": 0.2184373267208327, + "learning_rate": 7.6884046579129e-06, + "loss": 0.909, + "step": 456 + }, + { + "epoch": 0.26737850586901674, + "grad_norm": 0.038431478494613806, + "learning_rate": 7.68705668765237e-06, + "loss": 0.8685, + "step": 457 + }, + { + "epoch": 0.26796357918601676, + "grad_norm": 0.15936476995911328, + "learning_rate": 7.685705940475523e-06, + "loss": 0.8176, + "step": 458 + }, + { + "epoch": 0.2685486525030168, + "grad_norm": 0.04683404608346142, + "learning_rate": 7.68435241752348e-06, + "loss": 0.8523, + "step": 459 + }, + { + "epoch": 0.2691337258200168, + "grad_norm": 0.04603467291785015, + "learning_rate": 7.6829961199397e-06, + "loss": 0.8134, + "step": 460 + }, + { + "epoch": 0.26971879913701685, + "grad_norm": 0.0502568206105835, + "learning_rate": 7.681637048869985e-06, + "loss": 0.9043, + "step": 461 + }, + { + "epoch": 0.2703038724540169, + "grad_norm": 0.03828249024856938, + "learning_rate": 7.680275205462485e-06, + "loss": 0.843, + "step": 462 + }, + { + "epoch": 0.2708889457710169, + "grad_norm": 0.043272538310479496, + "learning_rate": 7.67891059086769e-06, + "loss": 0.9456, + "step": 463 + }, + { + "epoch": 0.271474019088017, + "grad_norm": 0.04015211847803301, + "learning_rate": 7.67754320623843e-06, + "loss": 0.9414, + "step": 464 + }, + { + "epoch": 0.272059092405017, + "grad_norm": 0.04064423849996686, + "learning_rate": 7.676173052729877e-06, + "loss": 0.8705, + "step": 465 + }, + { + "epoch": 0.27264416572201705, + "grad_norm": 0.04969071039366313, + "learning_rate": 7.67480013149954e-06, + "loss": 0.8922, + "step": 466 + }, + { + "epoch": 0.2732292390390171, + "grad_norm": 0.05260125052428257, + "learning_rate": 7.67342444370727e-06, + "loss": 0.9733, + "step": 467 + }, + { + "epoch": 0.2738143123560171, + "grad_norm": 0.040268950739110694, + "learning_rate": 7.672045990515248e-06, + "loss": 0.8806, + "step": 468 + }, + { + "epoch": 0.27439938567301714, + "grad_norm": 0.042577022398831814, + "learning_rate": 7.670664773088e-06, + "loss": 0.9561, + "step": 469 + }, + { + "epoch": 0.27498445899001717, + "grad_norm": 0.04618878912895806, + "learning_rate": 7.669280792592383e-06, + "loss": 0.8403, + "step": 470 + }, + { + "epoch": 0.2755695323070172, + "grad_norm": 0.04343593914167416, + "learning_rate": 7.667894050197583e-06, + "loss": 0.8355, + "step": 471 + }, + { + "epoch": 0.2761546056240173, + "grad_norm": 0.040556959908718666, + "learning_rate": 7.66650454707513e-06, + "loss": 0.8608, + "step": 472 + }, + { + "epoch": 0.2767396789410173, + "grad_norm": 0.054697588596535354, + "learning_rate": 7.665112284398881e-06, + "loss": 0.839, + "step": 473 + }, + { + "epoch": 0.27732475225801734, + "grad_norm": 0.04393307328991595, + "learning_rate": 7.66371726334502e-06, + "loss": 0.8494, + "step": 474 + }, + { + "epoch": 0.27790982557501737, + "grad_norm": 0.04275309783082017, + "learning_rate": 7.662319485092067e-06, + "loss": 0.8259, + "step": 475 + }, + { + "epoch": 0.2784948988920174, + "grad_norm": 0.03953402161699839, + "learning_rate": 7.66091895082087e-06, + "loss": 0.7773, + "step": 476 + }, + { + "epoch": 0.2790799722090174, + "grad_norm": 0.03860846601474072, + "learning_rate": 7.659515661714608e-06, + "loss": 0.7962, + "step": 477 + }, + { + "epoch": 0.27966504552601745, + "grad_norm": 0.05366294387360709, + "learning_rate": 7.658109618958779e-06, + "loss": 0.9233, + "step": 478 + }, + { + "epoch": 0.28025011884301754, + "grad_norm": 0.04420018919513911, + "learning_rate": 7.656700823741216e-06, + "loss": 0.8156, + "step": 479 + }, + { + "epoch": 0.28083519216001757, + "grad_norm": 0.039049327120388326, + "learning_rate": 7.655289277252074e-06, + "loss": 0.8856, + "step": 480 + }, + { + "epoch": 0.2814202654770176, + "grad_norm": 0.26425632822736106, + "learning_rate": 7.653874980683828e-06, + "loss": 0.9503, + "step": 481 + }, + { + "epoch": 0.2820053387940176, + "grad_norm": 0.04289481203658443, + "learning_rate": 7.652457935231285e-06, + "loss": 0.8937, + "step": 482 + }, + { + "epoch": 0.28259041211101765, + "grad_norm": 0.04293700432214028, + "learning_rate": 7.651038142091568e-06, + "loss": 0.8571, + "step": 483 + }, + { + "epoch": 0.2831754854280177, + "grad_norm": 0.038244633724887706, + "learning_rate": 7.649615602464123e-06, + "loss": 0.7906, + "step": 484 + }, + { + "epoch": 0.2837605587450177, + "grad_norm": 0.03944291933937196, + "learning_rate": 7.648190317550717e-06, + "loss": 0.926, + "step": 485 + }, + { + "epoch": 0.2843456320620178, + "grad_norm": 0.10913228375108804, + "learning_rate": 7.646762288555433e-06, + "loss": 0.8592, + "step": 486 + }, + { + "epoch": 0.2849307053790178, + "grad_norm": 0.038286199243376444, + "learning_rate": 7.645331516684676e-06, + "loss": 0.8418, + "step": 487 + }, + { + "epoch": 0.28551577869601785, + "grad_norm": 0.043890441209433355, + "learning_rate": 7.643898003147167e-06, + "loss": 0.9614, + "step": 488 + }, + { + "epoch": 0.2861008520130179, + "grad_norm": 0.0382147078144481, + "learning_rate": 7.642461749153943e-06, + "loss": 0.83, + "step": 489 + }, + { + "epoch": 0.2866859253300179, + "grad_norm": 0.10202631306826099, + "learning_rate": 7.641022755918357e-06, + "loss": 0.9484, + "step": 490 + }, + { + "epoch": 0.28727099864701794, + "grad_norm": 0.04017584960145629, + "learning_rate": 7.639581024656072e-06, + "loss": 0.8278, + "step": 491 + }, + { + "epoch": 0.28785607196401797, + "grad_norm": 0.05571278346236126, + "learning_rate": 7.638136556585071e-06, + "loss": 0.9093, + "step": 492 + }, + { + "epoch": 0.28844114528101805, + "grad_norm": 0.04145030654143878, + "learning_rate": 7.636689352925643e-06, + "loss": 0.913, + "step": 493 + }, + { + "epoch": 0.2890262185980181, + "grad_norm": 0.05058778086388885, + "learning_rate": 7.635239414900393e-06, + "loss": 0.9366, + "step": 494 + }, + { + "epoch": 0.2896112919150181, + "grad_norm": 0.047476857954303966, + "learning_rate": 7.63378674373423e-06, + "loss": 0.8528, + "step": 495 + }, + { + "epoch": 0.29019636523201814, + "grad_norm": 0.043805250682478876, + "learning_rate": 7.632331340654377e-06, + "loss": 0.9953, + "step": 496 + }, + { + "epoch": 0.29078143854901817, + "grad_norm": 0.04691649904039108, + "learning_rate": 7.630873206890365e-06, + "loss": 0.8893, + "step": 497 + }, + { + "epoch": 0.2913665118660182, + "grad_norm": 0.061366723803107565, + "learning_rate": 7.629412343674026e-06, + "loss": 0.8895, + "step": 498 + }, + { + "epoch": 0.2919515851830182, + "grad_norm": 0.052047839285183604, + "learning_rate": 7.627948752239508e-06, + "loss": 0.8322, + "step": 499 + }, + { + "epoch": 0.2925366585000183, + "grad_norm": 0.04858401908225131, + "learning_rate": 7.6264824338232515e-06, + "loss": 0.7962, + "step": 500 + }, + { + "epoch": 0.29312173181701834, + "grad_norm": 0.05167192214353387, + "learning_rate": 7.625013389664012e-06, + "loss": 0.8178, + "step": 501 + }, + { + "epoch": 0.29370680513401837, + "grad_norm": 0.04571347435775933, + "learning_rate": 7.623541621002841e-06, + "loss": 0.9531, + "step": 502 + }, + { + "epoch": 0.2942918784510184, + "grad_norm": 0.05170428092011692, + "learning_rate": 7.622067129083092e-06, + "loss": 0.863, + "step": 503 + }, + { + "epoch": 0.2948769517680184, + "grad_norm": 0.04769020808960418, + "learning_rate": 7.620589915150423e-06, + "loss": 0.8693, + "step": 504 + }, + { + "epoch": 0.29546202508501845, + "grad_norm": 0.036447168096671895, + "learning_rate": 7.619109980452789e-06, + "loss": 0.8263, + "step": 505 + }, + { + "epoch": 0.2960470984020185, + "grad_norm": 0.04697422924818202, + "learning_rate": 7.617627326240441e-06, + "loss": 0.9127, + "step": 506 + }, + { + "epoch": 0.29663217171901857, + "grad_norm": 0.05026264821034648, + "learning_rate": 7.6161419537659345e-06, + "loss": 0.9163, + "step": 507 + }, + { + "epoch": 0.2972172450360186, + "grad_norm": 0.044605972655997896, + "learning_rate": 7.614653864284114e-06, + "loss": 0.8998, + "step": 508 + }, + { + "epoch": 0.2978023183530186, + "grad_norm": 0.06437245062396214, + "learning_rate": 7.613163059052123e-06, + "loss": 0.8039, + "step": 509 + }, + { + "epoch": 0.29838739167001865, + "grad_norm": 0.04554747767372419, + "learning_rate": 7.611669539329398e-06, + "loss": 0.8662, + "step": 510 + }, + { + "epoch": 0.2989724649870187, + "grad_norm": 0.0802330744693163, + "learning_rate": 7.610173306377671e-06, + "loss": 0.8343, + "step": 511 + }, + { + "epoch": 0.2995575383040187, + "grad_norm": 0.04192153945182111, + "learning_rate": 7.608674361460963e-06, + "loss": 0.8983, + "step": 512 + }, + { + "epoch": 0.30014261162101874, + "grad_norm": 0.04902593536059904, + "learning_rate": 7.607172705845589e-06, + "loss": 0.9242, + "step": 513 + }, + { + "epoch": 0.30072768493801877, + "grad_norm": 0.05436876049704265, + "learning_rate": 7.605668340800153e-06, + "loss": 0.834, + "step": 514 + }, + { + "epoch": 0.30131275825501885, + "grad_norm": 0.04047347680328035, + "learning_rate": 7.604161267595545e-06, + "loss": 0.8359, + "step": 515 + }, + { + "epoch": 0.3018978315720189, + "grad_norm": 0.03848909199643286, + "learning_rate": 7.602651487504946e-06, + "loss": 0.8126, + "step": 516 + }, + { + "epoch": 0.3024829048890189, + "grad_norm": 0.046207806514092946, + "learning_rate": 7.601139001803825e-06, + "loss": 0.869, + "step": 517 + }, + { + "epoch": 0.30306797820601894, + "grad_norm": 0.03852808749767389, + "learning_rate": 7.5996238117699344e-06, + "loss": 0.9808, + "step": 518 + }, + { + "epoch": 0.30365305152301897, + "grad_norm": 0.04005257123878126, + "learning_rate": 7.5981059186833114e-06, + "loss": 0.7767, + "step": 519 + }, + { + "epoch": 0.304238124840019, + "grad_norm": 0.05159848088030845, + "learning_rate": 7.596585323826277e-06, + "loss": 0.8932, + "step": 520 + }, + { + "epoch": 0.304823198157019, + "grad_norm": 0.040391024202221286, + "learning_rate": 7.595062028483434e-06, + "loss": 0.9093, + "step": 521 + }, + { + "epoch": 0.3054082714740191, + "grad_norm": 0.040428764785829324, + "learning_rate": 7.593536033941669e-06, + "loss": 0.9087, + "step": 522 + }, + { + "epoch": 0.30599334479101914, + "grad_norm": 0.09158364294178328, + "learning_rate": 7.592007341490145e-06, + "loss": 0.8601, + "step": 523 + }, + { + "epoch": 0.30657841810801917, + "grad_norm": 0.04136317840574456, + "learning_rate": 7.590475952420309e-06, + "loss": 0.8953, + "step": 524 + }, + { + "epoch": 0.3071634914250192, + "grad_norm": 0.06396409974499655, + "learning_rate": 7.588941868025881e-06, + "loss": 0.8297, + "step": 525 + }, + { + "epoch": 0.3077485647420192, + "grad_norm": 0.040333070246341814, + "learning_rate": 7.587405089602862e-06, + "loss": 0.7719, + "step": 526 + }, + { + "epoch": 0.30833363805901925, + "grad_norm": 0.04148918223122552, + "learning_rate": 7.585865618449528e-06, + "loss": 0.8007, + "step": 527 + }, + { + "epoch": 0.3089187113760193, + "grad_norm": 0.048132396423435676, + "learning_rate": 7.584323455866427e-06, + "loss": 0.8579, + "step": 528 + }, + { + "epoch": 0.30950378469301937, + "grad_norm": 0.04328208405834297, + "learning_rate": 7.582778603156387e-06, + "loss": 0.8071, + "step": 529 + }, + { + "epoch": 0.3100888580100194, + "grad_norm": 0.04318172969759895, + "learning_rate": 7.5812310616245e-06, + "loss": 0.83, + "step": 530 + }, + { + "epoch": 0.3106739313270194, + "grad_norm": 0.03768227605900526, + "learning_rate": 7.579680832578137e-06, + "loss": 0.8344, + "step": 531 + }, + { + "epoch": 0.31125900464401945, + "grad_norm": 0.04790982152353994, + "learning_rate": 7.578127917326936e-06, + "loss": 0.8974, + "step": 532 + }, + { + "epoch": 0.3118440779610195, + "grad_norm": 0.038002618502432514, + "learning_rate": 7.576572317182805e-06, + "loss": 0.792, + "step": 533 + }, + { + "epoch": 0.3124291512780195, + "grad_norm": 0.036571897121817694, + "learning_rate": 7.575014033459921e-06, + "loss": 0.8418, + "step": 534 + }, + { + "epoch": 0.31301422459501954, + "grad_norm": 0.03670017559323694, + "learning_rate": 7.573453067474724e-06, + "loss": 0.8834, + "step": 535 + }, + { + "epoch": 0.3135992979120196, + "grad_norm": 0.03621350431630049, + "learning_rate": 7.5718894205459284e-06, + "loss": 0.8549, + "step": 536 + }, + { + "epoch": 0.31418437122901965, + "grad_norm": 0.04296675197248163, + "learning_rate": 7.570323093994503e-06, + "loss": 0.7894, + "step": 537 + }, + { + "epoch": 0.3147694445460197, + "grad_norm": 0.044503765611357125, + "learning_rate": 7.568754089143688e-06, + "loss": 0.8655, + "step": 538 + }, + { + "epoch": 0.3153545178630197, + "grad_norm": 0.03715684392321602, + "learning_rate": 7.5671824073189845e-06, + "loss": 0.8697, + "step": 539 + }, + { + "epoch": 0.31593959118001974, + "grad_norm": 0.04056871817206291, + "learning_rate": 7.5656080498481535e-06, + "loss": 0.8803, + "step": 540 + }, + { + "epoch": 0.31652466449701977, + "grad_norm": 0.08089424036325778, + "learning_rate": 7.564031018061219e-06, + "loss": 0.8098, + "step": 541 + }, + { + "epoch": 0.3171097378140198, + "grad_norm": 0.04027917264460558, + "learning_rate": 7.562451313290459e-06, + "loss": 0.8939, + "step": 542 + }, + { + "epoch": 0.3176948111310199, + "grad_norm": 0.042852152415068484, + "learning_rate": 7.560868936870418e-06, + "loss": 0.8901, + "step": 543 + }, + { + "epoch": 0.3182798844480199, + "grad_norm": 0.05886634493495861, + "learning_rate": 7.559283890137889e-06, + "loss": 0.9286, + "step": 544 + }, + { + "epoch": 0.31886495776501994, + "grad_norm": 0.04216130498342499, + "learning_rate": 7.557696174431927e-06, + "loss": 0.8311, + "step": 545 + }, + { + "epoch": 0.31945003108201997, + "grad_norm": 0.053197981320438834, + "learning_rate": 7.556105791093838e-06, + "loss": 0.8952, + "step": 546 + }, + { + "epoch": 0.32003510439902, + "grad_norm": 0.038738558056749535, + "learning_rate": 7.554512741467183e-06, + "loss": 0.87, + "step": 547 + }, + { + "epoch": 0.32062017771602, + "grad_norm": 0.04255317774837396, + "learning_rate": 7.552917026897778e-06, + "loss": 0.8386, + "step": 548 + }, + { + "epoch": 0.32120525103302006, + "grad_norm": 0.06530091195329979, + "learning_rate": 7.551318648733684e-06, + "loss": 0.8862, + "step": 549 + }, + { + "epoch": 0.3217903243500201, + "grad_norm": 0.04640020878919504, + "learning_rate": 7.549717608325219e-06, + "loss": 0.8615, + "step": 550 + }, + { + "epoch": 0.32237539766702017, + "grad_norm": 0.038491606071055146, + "learning_rate": 7.548113907024948e-06, + "loss": 0.8581, + "step": 551 + }, + { + "epoch": 0.3229604709840202, + "grad_norm": 0.05470482648340031, + "learning_rate": 7.54650754618768e-06, + "loss": 0.8104, + "step": 552 + }, + { + "epoch": 0.3235455443010202, + "grad_norm": 0.04194329994008872, + "learning_rate": 7.544898527170475e-06, + "loss": 0.7725, + "step": 553 + }, + { + "epoch": 0.32413061761802026, + "grad_norm": 0.059518370147140835, + "learning_rate": 7.543286851332641e-06, + "loss": 0.8814, + "step": 554 + }, + { + "epoch": 0.3247156909350203, + "grad_norm": 0.041939649514720574, + "learning_rate": 7.5416725200357215e-06, + "loss": 0.8785, + "step": 555 + }, + { + "epoch": 0.3253007642520203, + "grad_norm": 0.038119584353072236, + "learning_rate": 7.540055534643512e-06, + "loss": 0.8787, + "step": 556 + }, + { + "epoch": 0.32588583756902034, + "grad_norm": 0.35043742170562575, + "learning_rate": 7.538435896522048e-06, + "loss": 0.9425, + "step": 557 + }, + { + "epoch": 0.3264709108860204, + "grad_norm": 0.037804212753556964, + "learning_rate": 7.536813607039603e-06, + "loss": 0.9275, + "step": 558 + }, + { + "epoch": 0.32705598420302046, + "grad_norm": 0.04797991190424691, + "learning_rate": 7.535188667566693e-06, + "loss": 0.8349, + "step": 559 + }, + { + "epoch": 0.3276410575200205, + "grad_norm": 0.07131490446157496, + "learning_rate": 7.533561079476073e-06, + "loss": 0.7922, + "step": 560 + }, + { + "epoch": 0.3282261308370205, + "grad_norm": 0.04273451205950798, + "learning_rate": 7.531930844142734e-06, + "loss": 0.9063, + "step": 561 + }, + { + "epoch": 0.32881120415402054, + "grad_norm": 0.04761650491169362, + "learning_rate": 7.5302979629439044e-06, + "loss": 0.874, + "step": 562 + }, + { + "epoch": 0.32939627747102057, + "grad_norm": 0.053621412028226095, + "learning_rate": 7.528662437259048e-06, + "loss": 0.8908, + "step": 563 + }, + { + "epoch": 0.3299813507880206, + "grad_norm": 0.08172735146274197, + "learning_rate": 7.527024268469862e-06, + "loss": 0.7873, + "step": 564 + }, + { + "epoch": 0.3305664241050207, + "grad_norm": 0.045936897060612714, + "learning_rate": 7.525383457960277e-06, + "loss": 0.8582, + "step": 565 + }, + { + "epoch": 0.3311514974220207, + "grad_norm": 0.043038357564032, + "learning_rate": 7.523740007116453e-06, + "loss": 0.8864, + "step": 566 + }, + { + "epoch": 0.33173657073902074, + "grad_norm": 0.0595338473222242, + "learning_rate": 7.5220939173267855e-06, + "loss": 0.8398, + "step": 567 + }, + { + "epoch": 0.33232164405602077, + "grad_norm": 0.03876298862367819, + "learning_rate": 7.520445189981897e-06, + "loss": 0.7608, + "step": 568 + }, + { + "epoch": 0.3329067173730208, + "grad_norm": 0.04137701061941132, + "learning_rate": 7.518793826474636e-06, + "loss": 0.9109, + "step": 569 + }, + { + "epoch": 0.33349179069002083, + "grad_norm": 0.04054950689452163, + "learning_rate": 7.517139828200079e-06, + "loss": 0.9109, + "step": 570 + }, + { + "epoch": 0.33407686400702086, + "grad_norm": 0.045942175432546556, + "learning_rate": 7.5154831965555315e-06, + "loss": 0.8569, + "step": 571 + }, + { + "epoch": 0.33466193732402094, + "grad_norm": 0.0412369088341243, + "learning_rate": 7.51382393294052e-06, + "loss": 0.837, + "step": 572 + }, + { + "epoch": 0.33524701064102097, + "grad_norm": 0.03984087759584128, + "learning_rate": 7.5121620387567955e-06, + "loss": 0.8173, + "step": 573 + }, + { + "epoch": 0.335832083958021, + "grad_norm": 0.03855595716614889, + "learning_rate": 7.510497515408333e-06, + "loss": 0.7942, + "step": 574 + }, + { + "epoch": 0.33641715727502103, + "grad_norm": 0.04123539453768172, + "learning_rate": 7.508830364301327e-06, + "loss": 0.9384, + "step": 575 + }, + { + "epoch": 0.33700223059202106, + "grad_norm": 0.047137253838708155, + "learning_rate": 7.507160586844191e-06, + "loss": 0.8449, + "step": 576 + }, + { + "epoch": 0.3375873039090211, + "grad_norm": 0.03987110874806033, + "learning_rate": 7.50548818444756e-06, + "loss": 0.8678, + "step": 577 + }, + { + "epoch": 0.3381723772260211, + "grad_norm": 0.05156972953667277, + "learning_rate": 7.503813158524284e-06, + "loss": 0.8369, + "step": 578 + }, + { + "epoch": 0.3387574505430212, + "grad_norm": 0.05429963825180294, + "learning_rate": 7.502135510489432e-06, + "loss": 0.9386, + "step": 579 + }, + { + "epoch": 0.33934252386002123, + "grad_norm": 0.04563390307441152, + "learning_rate": 7.500455241760284e-06, + "loss": 0.7892, + "step": 580 + }, + { + "epoch": 0.33992759717702126, + "grad_norm": 0.03772537720365855, + "learning_rate": 7.4987723537563395e-06, + "loss": 0.8171, + "step": 581 + }, + { + "epoch": 0.3405126704940213, + "grad_norm": 0.14591090698165343, + "learning_rate": 7.497086847899305e-06, + "loss": 0.9374, + "step": 582 + }, + { + "epoch": 0.3410977438110213, + "grad_norm": 0.04274276349409338, + "learning_rate": 7.495398725613103e-06, + "loss": 0.8886, + "step": 583 + }, + { + "epoch": 0.34168281712802134, + "grad_norm": 0.045419293797600416, + "learning_rate": 7.4937079883238644e-06, + "loss": 0.8874, + "step": 584 + }, + { + "epoch": 0.34226789044502137, + "grad_norm": 0.039369245868335276, + "learning_rate": 7.4920146374599305e-06, + "loss": 0.8515, + "step": 585 + }, + { + "epoch": 0.3428529637620214, + "grad_norm": 0.041038361870929824, + "learning_rate": 7.490318674451848e-06, + "loss": 0.8352, + "step": 586 + }, + { + "epoch": 0.3434380370790215, + "grad_norm": 0.03546313415290626, + "learning_rate": 7.488620100732373e-06, + "loss": 0.8126, + "step": 587 + }, + { + "epoch": 0.3440231103960215, + "grad_norm": 0.047686593783821035, + "learning_rate": 7.486918917736467e-06, + "loss": 0.8438, + "step": 588 + }, + { + "epoch": 0.34460818371302154, + "grad_norm": 0.03860874121220213, + "learning_rate": 7.485215126901294e-06, + "loss": 0.796, + "step": 589 + }, + { + "epoch": 0.34519325703002157, + "grad_norm": 0.04131878299287184, + "learning_rate": 7.483508729666222e-06, + "loss": 0.8787, + "step": 590 + }, + { + "epoch": 0.3457783303470216, + "grad_norm": 0.0626727371222094, + "learning_rate": 7.481799727472821e-06, + "loss": 0.8556, + "step": 591 + }, + { + "epoch": 0.34636340366402163, + "grad_norm": 0.046182038395419095, + "learning_rate": 7.480088121764862e-06, + "loss": 0.8362, + "step": 592 + }, + { + "epoch": 0.34694847698102166, + "grad_norm": 0.04057968397212824, + "learning_rate": 7.478373913988314e-06, + "loss": 0.8382, + "step": 593 + }, + { + "epoch": 0.34753355029802174, + "grad_norm": 0.04060993045831877, + "learning_rate": 7.476657105591347e-06, + "loss": 0.844, + "step": 594 + }, + { + "epoch": 0.34811862361502177, + "grad_norm": 0.042757904869809, + "learning_rate": 7.474937698024326e-06, + "loss": 0.8323, + "step": 595 + }, + { + "epoch": 0.3487036969320218, + "grad_norm": 0.042358138791525404, + "learning_rate": 7.4732156927398134e-06, + "loss": 0.8055, + "step": 596 + }, + { + "epoch": 0.34928877024902183, + "grad_norm": 0.04569798776629126, + "learning_rate": 7.4714910911925614e-06, + "loss": 0.7941, + "step": 597 + }, + { + "epoch": 0.34987384356602186, + "grad_norm": 0.0438874811170573, + "learning_rate": 7.469763894839523e-06, + "loss": 0.919, + "step": 598 + }, + { + "epoch": 0.3504589168830219, + "grad_norm": 0.06653725512945165, + "learning_rate": 7.468034105139836e-06, + "loss": 0.8574, + "step": 599 + }, + { + "epoch": 0.3510439902000219, + "grad_norm": 0.04995025638554342, + "learning_rate": 7.466301723554835e-06, + "loss": 0.8496, + "step": 600 + }, + { + "epoch": 0.351629063517022, + "grad_norm": 0.04283576163163911, + "learning_rate": 7.46456675154804e-06, + "loss": 0.8757, + "step": 601 + }, + { + "epoch": 0.35221413683402203, + "grad_norm": 0.039849657265063225, + "learning_rate": 7.462829190585162e-06, + "loss": 0.8945, + "step": 602 + }, + { + "epoch": 0.35279921015102206, + "grad_norm": 0.0440391834022003, + "learning_rate": 7.461089042134098e-06, + "loss": 0.8571, + "step": 603 + }, + { + "epoch": 0.3533842834680221, + "grad_norm": 0.043132549255425784, + "learning_rate": 7.45934630766493e-06, + "loss": 0.8668, + "step": 604 + }, + { + "epoch": 0.3539693567850221, + "grad_norm": 0.04249197374690922, + "learning_rate": 7.4576009886499285e-06, + "loss": 0.8797, + "step": 605 + }, + { + "epoch": 0.35455443010202214, + "grad_norm": 0.05324832438497396, + "learning_rate": 7.455853086563542e-06, + "loss": 0.9684, + "step": 606 + }, + { + "epoch": 0.3551395034190222, + "grad_norm": 0.04904764924574238, + "learning_rate": 7.454102602882405e-06, + "loss": 0.7753, + "step": 607 + }, + { + "epoch": 0.35572457673602226, + "grad_norm": 0.039014384352298336, + "learning_rate": 7.452349539085334e-06, + "loss": 0.8561, + "step": 608 + }, + { + "epoch": 0.3563096500530223, + "grad_norm": 0.03934058564150346, + "learning_rate": 7.4505938966533175e-06, + "loss": 0.8438, + "step": 609 + }, + { + "epoch": 0.3568947233700223, + "grad_norm": 0.05279799201688822, + "learning_rate": 7.448835677069536e-06, + "loss": 0.8912, + "step": 610 + }, + { + "epoch": 0.35747979668702234, + "grad_norm": 0.04001456755669222, + "learning_rate": 7.447074881819332e-06, + "loss": 0.8553, + "step": 611 + }, + { + "epoch": 0.3580648700040224, + "grad_norm": 0.042425970420024564, + "learning_rate": 7.445311512390233e-06, + "loss": 0.8327, + "step": 612 + }, + { + "epoch": 0.3586499433210224, + "grad_norm": 0.061113675404079935, + "learning_rate": 7.443545570271942e-06, + "loss": 0.8842, + "step": 613 + }, + { + "epoch": 0.35923501663802243, + "grad_norm": 0.046390573358408176, + "learning_rate": 7.44177705695633e-06, + "loss": 0.869, + "step": 614 + }, + { + "epoch": 0.3598200899550225, + "grad_norm": 0.04323809663682179, + "learning_rate": 7.440005973937445e-06, + "loss": 0.9184, + "step": 615 + }, + { + "epoch": 0.36040516327202254, + "grad_norm": 0.04915044460354856, + "learning_rate": 7.4382323227115e-06, + "loss": 0.8376, + "step": 616 + }, + { + "epoch": 0.3609902365890226, + "grad_norm": 0.0385546720018955, + "learning_rate": 7.436456104776885e-06, + "loss": 0.8515, + "step": 617 + }, + { + "epoch": 0.3615753099060226, + "grad_norm": 0.042848988320383044, + "learning_rate": 7.4346773216341545e-06, + "loss": 0.867, + "step": 618 + }, + { + "epoch": 0.36216038322302263, + "grad_norm": 0.04584623182025803, + "learning_rate": 7.432895974786029e-06, + "loss": 0.8974, + "step": 619 + }, + { + "epoch": 0.36274545654002266, + "grad_norm": 0.04282342241688822, + "learning_rate": 7.431112065737397e-06, + "loss": 0.8623, + "step": 620 + }, + { + "epoch": 0.3633305298570227, + "grad_norm": 0.04076113650735267, + "learning_rate": 7.429325595995311e-06, + "loss": 0.8682, + "step": 621 + }, + { + "epoch": 0.3639156031740227, + "grad_norm": 0.05126027130856797, + "learning_rate": 7.427536567068985e-06, + "loss": 0.906, + "step": 622 + }, + { + "epoch": 0.3645006764910228, + "grad_norm": 0.049318751888490414, + "learning_rate": 7.4257449804697975e-06, + "loss": 0.8397, + "step": 623 + }, + { + "epoch": 0.36508574980802283, + "grad_norm": 0.043254726523200684, + "learning_rate": 7.423950837711287e-06, + "loss": 0.8622, + "step": 624 + }, + { + "epoch": 0.36567082312502286, + "grad_norm": 0.05038943712513383, + "learning_rate": 7.422154140309151e-06, + "loss": 0.8775, + "step": 625 + }, + { + "epoch": 0.3662558964420229, + "grad_norm": 0.04134166981874639, + "learning_rate": 7.420354889781245e-06, + "loss": 0.8226, + "step": 626 + }, + { + "epoch": 0.3668409697590229, + "grad_norm": 0.03866147658091302, + "learning_rate": 7.418553087647582e-06, + "loss": 0.8664, + "step": 627 + }, + { + "epoch": 0.36742604307602295, + "grad_norm": 0.038894973852294755, + "learning_rate": 7.416748735430332e-06, + "loss": 0.8796, + "step": 628 + }, + { + "epoch": 0.368011116393023, + "grad_norm": 0.05664602547094577, + "learning_rate": 7.4149418346538144e-06, + "loss": 0.9451, + "step": 629 + }, + { + "epoch": 0.36859618971002306, + "grad_norm": 0.03682659768264277, + "learning_rate": 7.413132386844507e-06, + "loss": 0.7877, + "step": 630 + }, + { + "epoch": 0.3691812630270231, + "grad_norm": 0.04070102583713315, + "learning_rate": 7.411320393531038e-06, + "loss": 0.8647, + "step": 631 + }, + { + "epoch": 0.3697663363440231, + "grad_norm": 0.037583772238652974, + "learning_rate": 7.4095058562441835e-06, + "loss": 0.8984, + "step": 632 + }, + { + "epoch": 0.37035140966102315, + "grad_norm": 0.03745618837783022, + "learning_rate": 7.407688776516873e-06, + "loss": 0.8077, + "step": 633 + }, + { + "epoch": 0.3709364829780232, + "grad_norm": 0.04719274188930425, + "learning_rate": 7.405869155884178e-06, + "loss": 0.7846, + "step": 634 + }, + { + "epoch": 0.3715215562950232, + "grad_norm": 0.04726483533088948, + "learning_rate": 7.404046995883322e-06, + "loss": 0.9625, + "step": 635 + }, + { + "epoch": 0.37210662961202323, + "grad_norm": 0.03718098121534363, + "learning_rate": 7.402222298053672e-06, + "loss": 0.7673, + "step": 636 + }, + { + "epoch": 0.3726917029290233, + "grad_norm": 0.041699609984716224, + "learning_rate": 7.400395063936738e-06, + "loss": 0.8846, + "step": 637 + }, + { + "epoch": 0.37327677624602335, + "grad_norm": 0.03781340458428561, + "learning_rate": 7.3985652950761734e-06, + "loss": 0.8589, + "step": 638 + }, + { + "epoch": 0.3738618495630234, + "grad_norm": 0.05169204738245716, + "learning_rate": 7.396732993017774e-06, + "loss": 0.8737, + "step": 639 + }, + { + "epoch": 0.3744469228800234, + "grad_norm": 0.040199437093993116, + "learning_rate": 7.394898159309474e-06, + "loss": 0.8432, + "step": 640 + }, + { + "epoch": 0.37503199619702343, + "grad_norm": 0.04129820957000115, + "learning_rate": 7.393060795501346e-06, + "loss": 0.8363, + "step": 641 + }, + { + "epoch": 0.37561706951402346, + "grad_norm": 0.04346950641443829, + "learning_rate": 7.391220903145602e-06, + "loss": 0.8008, + "step": 642 + }, + { + "epoch": 0.3762021428310235, + "grad_norm": 0.043967535702394124, + "learning_rate": 7.389378483796589e-06, + "loss": 0.8695, + "step": 643 + }, + { + "epoch": 0.3767872161480236, + "grad_norm": 0.037870985542036356, + "learning_rate": 7.387533539010789e-06, + "loss": 0.7874, + "step": 644 + }, + { + "epoch": 0.3773722894650236, + "grad_norm": 0.0394925511025262, + "learning_rate": 7.385686070346818e-06, + "loss": 0.8427, + "step": 645 + }, + { + "epoch": 0.37795736278202363, + "grad_norm": 0.03932506659821688, + "learning_rate": 7.383836079365423e-06, + "loss": 0.8773, + "step": 646 + }, + { + "epoch": 0.37854243609902366, + "grad_norm": 0.04174103140864924, + "learning_rate": 7.381983567629482e-06, + "loss": 0.8532, + "step": 647 + }, + { + "epoch": 0.3791275094160237, + "grad_norm": 0.03777546260763163, + "learning_rate": 7.380128536704003e-06, + "loss": 0.842, + "step": 648 + }, + { + "epoch": 0.3797125827330237, + "grad_norm": 0.038903962254009807, + "learning_rate": 7.378270988156122e-06, + "loss": 0.9141, + "step": 649 + }, + { + "epoch": 0.38029765605002375, + "grad_norm": 0.03931172416456551, + "learning_rate": 7.376410923555104e-06, + "loss": 0.8382, + "step": 650 + }, + { + "epoch": 0.38088272936702383, + "grad_norm": 0.03830193944287262, + "learning_rate": 7.374548344472336e-06, + "loss": 0.913, + "step": 651 + }, + { + "epoch": 0.38146780268402386, + "grad_norm": 0.05689973145188307, + "learning_rate": 7.372683252481333e-06, + "loss": 0.9233, + "step": 652 + }, + { + "epoch": 0.3820528760010239, + "grad_norm": 0.11927191175958904, + "learning_rate": 7.370815649157728e-06, + "loss": 0.8497, + "step": 653 + }, + { + "epoch": 0.3826379493180239, + "grad_norm": 0.04879685015775887, + "learning_rate": 7.36894553607928e-06, + "loss": 0.8902, + "step": 654 + }, + { + "epoch": 0.38322302263502395, + "grad_norm": 0.038545163591213454, + "learning_rate": 7.3670729148258655e-06, + "loss": 0.8101, + "step": 655 + }, + { + "epoch": 0.383808095952024, + "grad_norm": 0.07018595700302209, + "learning_rate": 7.365197786979483e-06, + "loss": 0.8732, + "step": 656 + }, + { + "epoch": 0.384393169269024, + "grad_norm": 0.03883816516578689, + "learning_rate": 7.3633201541242465e-06, + "loss": 0.9438, + "step": 657 + }, + { + "epoch": 0.3849782425860241, + "grad_norm": 0.045681207579253334, + "learning_rate": 7.3614400178463834e-06, + "loss": 0.8083, + "step": 658 + }, + { + "epoch": 0.3855633159030241, + "grad_norm": 0.03536545477300677, + "learning_rate": 7.359557379734242e-06, + "loss": 0.7559, + "step": 659 + }, + { + "epoch": 0.38614838922002415, + "grad_norm": 0.03989482911665265, + "learning_rate": 7.357672241378282e-06, + "loss": 0.8969, + "step": 660 + }, + { + "epoch": 0.3867334625370242, + "grad_norm": 0.03858611847647411, + "learning_rate": 7.355784604371071e-06, + "loss": 0.835, + "step": 661 + }, + { + "epoch": 0.3873185358540242, + "grad_norm": 0.04523699088789738, + "learning_rate": 7.353894470307294e-06, + "loss": 0.8641, + "step": 662 + }, + { + "epoch": 0.38790360917102423, + "grad_norm": 0.041661369856111206, + "learning_rate": 7.352001840783741e-06, + "loss": 0.834, + "step": 663 + }, + { + "epoch": 0.38848868248802426, + "grad_norm": 0.041538283343954825, + "learning_rate": 7.3501067173993115e-06, + "loss": 0.9114, + "step": 664 + }, + { + "epoch": 0.3890737558050243, + "grad_norm": 0.039602491788351755, + "learning_rate": 7.348209101755012e-06, + "loss": 0.8479, + "step": 665 + }, + { + "epoch": 0.3896588291220244, + "grad_norm": 0.04077290004316095, + "learning_rate": 7.346308995453956e-06, + "loss": 0.8511, + "step": 666 + }, + { + "epoch": 0.3902439024390244, + "grad_norm": 0.044595088605505394, + "learning_rate": 7.344406400101358e-06, + "loss": 0.8121, + "step": 667 + }, + { + "epoch": 0.39082897575602443, + "grad_norm": 0.04520053665259006, + "learning_rate": 7.342501317304538e-06, + "loss": 0.8916, + "step": 668 + }, + { + "epoch": 0.39141404907302446, + "grad_norm": 0.04048302305644283, + "learning_rate": 7.340593748672915e-06, + "loss": 0.9092, + "step": 669 + }, + { + "epoch": 0.3919991223900245, + "grad_norm": 0.04037233961111267, + "learning_rate": 7.33868369581801e-06, + "loss": 0.881, + "step": 670 + }, + { + "epoch": 0.3925841957070245, + "grad_norm": 0.05051075945293654, + "learning_rate": 7.336771160353441e-06, + "loss": 0.8109, + "step": 671 + }, + { + "epoch": 0.39316926902402455, + "grad_norm": 0.049551953529107955, + "learning_rate": 7.334856143894927e-06, + "loss": 0.9017, + "step": 672 + }, + { + "epoch": 0.39375434234102463, + "grad_norm": 0.03993075169550292, + "learning_rate": 7.332938648060276e-06, + "loss": 0.8542, + "step": 673 + }, + { + "epoch": 0.39433941565802466, + "grad_norm": 0.038451189858571745, + "learning_rate": 7.331018674469396e-06, + "loss": 0.8112, + "step": 674 + }, + { + "epoch": 0.3949244889750247, + "grad_norm": 0.0505499896286193, + "learning_rate": 7.329096224744291e-06, + "loss": 0.8813, + "step": 675 + }, + { + "epoch": 0.3955095622920247, + "grad_norm": 0.071098260548373, + "learning_rate": 7.3271713005090494e-06, + "loss": 0.7823, + "step": 676 + }, + { + "epoch": 0.39609463560902475, + "grad_norm": 0.039554913835090445, + "learning_rate": 7.325243903389853e-06, + "loss": 0.8741, + "step": 677 + }, + { + "epoch": 0.3966797089260248, + "grad_norm": 0.03860100123135944, + "learning_rate": 7.323314035014974e-06, + "loss": 0.8042, + "step": 678 + }, + { + "epoch": 0.3972647822430248, + "grad_norm": 0.043534126452199624, + "learning_rate": 7.321381697014771e-06, + "loss": 0.8062, + "step": 679 + }, + { + "epoch": 0.3978498555600249, + "grad_norm": 0.04429511697906778, + "learning_rate": 7.319446891021693e-06, + "loss": 0.8726, + "step": 680 + }, + { + "epoch": 0.3984349288770249, + "grad_norm": 0.03812425577668118, + "learning_rate": 7.317509618670267e-06, + "loss": 0.8136, + "step": 681 + }, + { + "epoch": 0.39902000219402495, + "grad_norm": 0.07345134546444418, + "learning_rate": 7.315569881597106e-06, + "loss": 0.8483, + "step": 682 + }, + { + "epoch": 0.399605075511025, + "grad_norm": 0.038808207078641584, + "learning_rate": 7.313627681440909e-06, + "loss": 0.9122, + "step": 683 + }, + { + "epoch": 0.400190148828025, + "grad_norm": 0.03882381554371386, + "learning_rate": 7.311683019842453e-06, + "loss": 0.8767, + "step": 684 + }, + { + "epoch": 0.40077522214502503, + "grad_norm": 0.09532542796696841, + "learning_rate": 7.309735898444593e-06, + "loss": 0.817, + "step": 685 + }, + { + "epoch": 0.40136029546202506, + "grad_norm": 0.0401829342578114, + "learning_rate": 7.307786318892265e-06, + "loss": 0.9071, + "step": 686 + }, + { + "epoch": 0.40194536877902515, + "grad_norm": 0.03903463743661068, + "learning_rate": 7.305834282832478e-06, + "loss": 0.9161, + "step": 687 + }, + { + "epoch": 0.4025304420960252, + "grad_norm": 0.03813349508001005, + "learning_rate": 7.303879791914321e-06, + "loss": 0.7688, + "step": 688 + }, + { + "epoch": 0.4031155154130252, + "grad_norm": 0.03801357430065186, + "learning_rate": 7.301922847788953e-06, + "loss": 0.7473, + "step": 689 + }, + { + "epoch": 0.40370058873002523, + "grad_norm": 0.0391358430740653, + "learning_rate": 7.299963452109607e-06, + "loss": 0.84, + "step": 690 + }, + { + "epoch": 0.40428566204702526, + "grad_norm": 0.04237700699339236, + "learning_rate": 7.298001606531588e-06, + "loss": 0.7707, + "step": 691 + }, + { + "epoch": 0.4048707353640253, + "grad_norm": 0.04484671718594054, + "learning_rate": 7.296037312712267e-06, + "loss": 0.8456, + "step": 692 + }, + { + "epoch": 0.4054558086810253, + "grad_norm": 0.04025195813747376, + "learning_rate": 7.2940705723110895e-06, + "loss": 0.8882, + "step": 693 + }, + { + "epoch": 0.4060408819980254, + "grad_norm": 0.2399660293834239, + "learning_rate": 7.292101386989561e-06, + "loss": 0.8086, + "step": 694 + }, + { + "epoch": 0.40662595531502543, + "grad_norm": 0.03485270510298816, + "learning_rate": 7.290129758411258e-06, + "loss": 0.8997, + "step": 695 + }, + { + "epoch": 0.40721102863202546, + "grad_norm": 0.03819929313036679, + "learning_rate": 7.288155688241819e-06, + "loss": 0.9212, + "step": 696 + }, + { + "epoch": 0.4077961019490255, + "grad_norm": 0.03510738864060292, + "learning_rate": 7.286179178148942e-06, + "loss": 0.8754, + "step": 697 + }, + { + "epoch": 0.4083811752660255, + "grad_norm": 0.043632341643109356, + "learning_rate": 7.284200229802391e-06, + "loss": 0.7717, + "step": 698 + }, + { + "epoch": 0.40896624858302555, + "grad_norm": 0.04089651755929757, + "learning_rate": 7.28221884487399e-06, + "loss": 0.88, + "step": 699 + }, + { + "epoch": 0.4095513219000256, + "grad_norm": 0.037780685942199126, + "learning_rate": 7.280235025037616e-06, + "loss": 0.8197, + "step": 700 + }, + { + "epoch": 0.4101363952170256, + "grad_norm": 0.03655073966270369, + "learning_rate": 7.27824877196921e-06, + "loss": 0.8279, + "step": 701 + }, + { + "epoch": 0.4107214685340257, + "grad_norm": 0.05600910511927575, + "learning_rate": 7.2762600873467624e-06, + "loss": 0.8476, + "step": 702 + }, + { + "epoch": 0.4113065418510257, + "grad_norm": 0.034951743393564536, + "learning_rate": 7.274268972850321e-06, + "loss": 0.7609, + "step": 703 + }, + { + "epoch": 0.41189161516802575, + "grad_norm": 0.09106547558232417, + "learning_rate": 7.272275430161988e-06, + "loss": 0.8996, + "step": 704 + }, + { + "epoch": 0.4124766884850258, + "grad_norm": 0.051359026093647085, + "learning_rate": 7.270279460965912e-06, + "loss": 0.9052, + "step": 705 + }, + { + "epoch": 0.4130617618020258, + "grad_norm": 0.03905089647983441, + "learning_rate": 7.268281066948296e-06, + "loss": 0.918, + "step": 706 + }, + { + "epoch": 0.41364683511902584, + "grad_norm": 0.04206925775516373, + "learning_rate": 7.2662802497973875e-06, + "loss": 0.8053, + "step": 707 + }, + { + "epoch": 0.41423190843602586, + "grad_norm": 0.040474218310822804, + "learning_rate": 7.264277011203488e-06, + "loss": 0.891, + "step": 708 + }, + { + "epoch": 0.41481698175302595, + "grad_norm": 0.0442335071782456, + "learning_rate": 7.262271352858936e-06, + "loss": 0.8593, + "step": 709 + }, + { + "epoch": 0.415402055070026, + "grad_norm": 0.047122148951545366, + "learning_rate": 7.26026327645812e-06, + "loss": 0.844, + "step": 710 + }, + { + "epoch": 0.415987128387026, + "grad_norm": 0.03850054120717149, + "learning_rate": 7.258252783697469e-06, + "loss": 0.7795, + "step": 711 + }, + { + "epoch": 0.41657220170402603, + "grad_norm": 0.050566843036294336, + "learning_rate": 7.2562398762754554e-06, + "loss": 0.7794, + "step": 712 + }, + { + "epoch": 0.41715727502102606, + "grad_norm": 0.058106581796564256, + "learning_rate": 7.254224555892587e-06, + "loss": 0.9735, + "step": 713 + }, + { + "epoch": 0.4177423483380261, + "grad_norm": 0.03831983478639148, + "learning_rate": 7.252206824251416e-06, + "loss": 0.7832, + "step": 714 + }, + { + "epoch": 0.4183274216550261, + "grad_norm": 0.0431158884889512, + "learning_rate": 7.250186683056527e-06, + "loss": 0.7865, + "step": 715 + }, + { + "epoch": 0.4189124949720262, + "grad_norm": 0.046081855255802225, + "learning_rate": 7.248164134014544e-06, + "loss": 0.8512, + "step": 716 + }, + { + "epoch": 0.41949756828902623, + "grad_norm": 0.04502443182272067, + "learning_rate": 7.246139178834119e-06, + "loss": 0.853, + "step": 717 + }, + { + "epoch": 0.42008264160602626, + "grad_norm": 0.045461255479903595, + "learning_rate": 7.244111819225946e-06, + "loss": 0.8262, + "step": 718 + }, + { + "epoch": 0.4206677149230263, + "grad_norm": 0.038558457568846585, + "learning_rate": 7.24208205690274e-06, + "loss": 0.7975, + "step": 719 + }, + { + "epoch": 0.4212527882400263, + "grad_norm": 0.04654917549136812, + "learning_rate": 7.240049893579256e-06, + "loss": 0.8849, + "step": 720 + }, + { + "epoch": 0.42183786155702635, + "grad_norm": 0.04307894480353439, + "learning_rate": 7.238015330972268e-06, + "loss": 0.805, + "step": 721 + }, + { + "epoch": 0.4224229348740264, + "grad_norm": 0.04281636756195477, + "learning_rate": 7.235978370800583e-06, + "loss": 0.8471, + "step": 722 + }, + { + "epoch": 0.42300800819102646, + "grad_norm": 0.04507551309506065, + "learning_rate": 7.233939014785032e-06, + "loss": 0.8468, + "step": 723 + }, + { + "epoch": 0.4235930815080265, + "grad_norm": 0.06918610285412721, + "learning_rate": 7.2318972646484685e-06, + "loss": 0.8655, + "step": 724 + }, + { + "epoch": 0.4241781548250265, + "grad_norm": 0.04721791843997281, + "learning_rate": 7.229853122115772e-06, + "loss": 0.7927, + "step": 725 + }, + { + "epoch": 0.42476322814202655, + "grad_norm": 0.041834283551617495, + "learning_rate": 7.227806588913838e-06, + "loss": 0.8712, + "step": 726 + }, + { + "epoch": 0.4253483014590266, + "grad_norm": 0.05505118490346748, + "learning_rate": 7.225757666771585e-06, + "loss": 0.8584, + "step": 727 + }, + { + "epoch": 0.4259333747760266, + "grad_norm": 0.047773194756069906, + "learning_rate": 7.223706357419951e-06, + "loss": 0.7893, + "step": 728 + }, + { + "epoch": 0.42651844809302664, + "grad_norm": 0.04822106540139186, + "learning_rate": 7.221652662591887e-06, + "loss": 0.8277, + "step": 729 + }, + { + "epoch": 0.4271035214100267, + "grad_norm": 0.03969595772236796, + "learning_rate": 7.219596584022363e-06, + "loss": 0.8394, + "step": 730 + }, + { + "epoch": 0.42768859472702675, + "grad_norm": 0.0422480401855194, + "learning_rate": 7.217538123448359e-06, + "loss": 0.8094, + "step": 731 + }, + { + "epoch": 0.4282736680440268, + "grad_norm": 0.0394391284081235, + "learning_rate": 7.215477282608871e-06, + "loss": 0.8847, + "step": 732 + }, + { + "epoch": 0.4288587413610268, + "grad_norm": 0.05956332380443304, + "learning_rate": 7.213414063244903e-06, + "loss": 0.8427, + "step": 733 + }, + { + "epoch": 0.42944381467802684, + "grad_norm": 0.03974032496622027, + "learning_rate": 7.21134846709947e-06, + "loss": 0.7986, + "step": 734 + }, + { + "epoch": 0.43002888799502687, + "grad_norm": 0.061979287274211155, + "learning_rate": 7.209280495917594e-06, + "loss": 0.8178, + "step": 735 + }, + { + "epoch": 0.4306139613120269, + "grad_norm": 0.04099124105139921, + "learning_rate": 7.2072101514463045e-06, + "loss": 0.7936, + "step": 736 + }, + { + "epoch": 0.4311990346290269, + "grad_norm": 0.04101203671080088, + "learning_rate": 7.205137435434634e-06, + "loss": 0.8607, + "step": 737 + }, + { + "epoch": 0.431784107946027, + "grad_norm": 0.04063679329152861, + "learning_rate": 7.203062349633622e-06, + "loss": 0.8066, + "step": 738 + }, + { + "epoch": 0.43236918126302704, + "grad_norm": 0.045567608816787585, + "learning_rate": 7.200984895796305e-06, + "loss": 0.8558, + "step": 739 + }, + { + "epoch": 0.43295425458002706, + "grad_norm": 0.0514210080543997, + "learning_rate": 7.198905075677726e-06, + "loss": 0.7855, + "step": 740 + }, + { + "epoch": 0.4335393278970271, + "grad_norm": 0.0404180379498117, + "learning_rate": 7.196822891034922e-06, + "loss": 0.9028, + "step": 741 + }, + { + "epoch": 0.4341244012140271, + "grad_norm": 0.03964126441343688, + "learning_rate": 7.1947383436269295e-06, + "loss": 0.872, + "step": 742 + }, + { + "epoch": 0.43470947453102715, + "grad_norm": 0.04863222203139086, + "learning_rate": 7.192651435214781e-06, + "loss": 0.9288, + "step": 743 + }, + { + "epoch": 0.4352945478480272, + "grad_norm": 0.04412474170590896, + "learning_rate": 7.190562167561505e-06, + "loss": 0.836, + "step": 744 + }, + { + "epoch": 0.43587962116502726, + "grad_norm": 0.16048273545352948, + "learning_rate": 7.188470542432119e-06, + "loss": 0.7639, + "step": 745 + }, + { + "epoch": 0.4364646944820273, + "grad_norm": 0.04767484164607536, + "learning_rate": 7.1863765615936375e-06, + "loss": 0.8481, + "step": 746 + }, + { + "epoch": 0.4370497677990273, + "grad_norm": 0.06989126022408862, + "learning_rate": 7.184280226815061e-06, + "loss": 0.8569, + "step": 747 + }, + { + "epoch": 0.43763484111602735, + "grad_norm": 0.041900678395983416, + "learning_rate": 7.18218153986738e-06, + "loss": 0.931, + "step": 748 + }, + { + "epoch": 0.4382199144330274, + "grad_norm": 0.04216846873986442, + "learning_rate": 7.180080502523572e-06, + "loss": 0.8646, + "step": 749 + }, + { + "epoch": 0.4388049877500274, + "grad_norm": 0.050264349955050885, + "learning_rate": 7.177977116558601e-06, + "loss": 0.8199, + "step": 750 + }, + { + "epoch": 0.43939006106702744, + "grad_norm": 0.03843848162959958, + "learning_rate": 7.175871383749415e-06, + "loss": 0.8097, + "step": 751 + }, + { + "epoch": 0.4399751343840275, + "grad_norm": 0.05545552486681674, + "learning_rate": 7.173763305874942e-06, + "loss": 0.9036, + "step": 752 + }, + { + "epoch": 0.44056020770102755, + "grad_norm": 0.040372444436672136, + "learning_rate": 7.1716528847160944e-06, + "loss": 0.7861, + "step": 753 + }, + { + "epoch": 0.4411452810180276, + "grad_norm": 0.03823936143241982, + "learning_rate": 7.169540122055764e-06, + "loss": 0.7976, + "step": 754 + }, + { + "epoch": 0.4417303543350276, + "grad_norm": 0.040595268808395715, + "learning_rate": 7.167425019678817e-06, + "loss": 0.8007, + "step": 755 + }, + { + "epoch": 0.44231542765202764, + "grad_norm": 0.04580612084615541, + "learning_rate": 7.1653075793721e-06, + "loss": 0.741, + "step": 756 + }, + { + "epoch": 0.44290050096902767, + "grad_norm": 0.044516769340694984, + "learning_rate": 7.163187802924435e-06, + "loss": 0.911, + "step": 757 + }, + { + "epoch": 0.4434855742860277, + "grad_norm": 0.04502988747564978, + "learning_rate": 7.161065692126614e-06, + "loss": 0.8775, + "step": 758 + }, + { + "epoch": 0.4440706476030278, + "grad_norm": 0.041288645539491076, + "learning_rate": 7.1589412487714055e-06, + "loss": 0.8393, + "step": 759 + }, + { + "epoch": 0.4446557209200278, + "grad_norm": 0.04219916867709284, + "learning_rate": 7.156814474653542e-06, + "loss": 0.8203, + "step": 760 + }, + { + "epoch": 0.44524079423702784, + "grad_norm": 0.04510176879579754, + "learning_rate": 7.154685371569736e-06, + "loss": 0.7882, + "step": 761 + }, + { + "epoch": 0.44582586755402787, + "grad_norm": 0.03956297558604167, + "learning_rate": 7.152553941318655e-06, + "loss": 0.8313, + "step": 762 + }, + { + "epoch": 0.4464109408710279, + "grad_norm": 0.03781666670962381, + "learning_rate": 7.15042018570094e-06, + "loss": 0.9057, + "step": 763 + }, + { + "epoch": 0.4469960141880279, + "grad_norm": 0.04278185971588543, + "learning_rate": 7.148284106519195e-06, + "loss": 0.842, + "step": 764 + }, + { + "epoch": 0.44758108750502795, + "grad_norm": 0.03726277158845805, + "learning_rate": 7.1461457055779875e-06, + "loss": 0.8003, + "step": 765 + }, + { + "epoch": 0.44816616082202804, + "grad_norm": 0.044396475499465786, + "learning_rate": 7.144004984683844e-06, + "loss": 0.8393, + "step": 766 + }, + { + "epoch": 0.44875123413902807, + "grad_norm": 0.03484609580213948, + "learning_rate": 7.141861945645254e-06, + "loss": 0.8255, + "step": 767 + }, + { + "epoch": 0.4493363074560281, + "grad_norm": 0.04524092857108677, + "learning_rate": 7.139716590272663e-06, + "loss": 0.7811, + "step": 768 + }, + { + "epoch": 0.4499213807730281, + "grad_norm": 0.03734702819768394, + "learning_rate": 7.1375689203784755e-06, + "loss": 0.8644, + "step": 769 + }, + { + "epoch": 0.45050645409002815, + "grad_norm": 0.05019144085295421, + "learning_rate": 7.135418937777049e-06, + "loss": 0.9044, + "step": 770 + }, + { + "epoch": 0.4510915274070282, + "grad_norm": 0.04029746203545264, + "learning_rate": 7.133266644284696e-06, + "loss": 0.7769, + "step": 771 + }, + { + "epoch": 0.4516766007240282, + "grad_norm": 0.04262838461849833, + "learning_rate": 7.131112041719681e-06, + "loss": 0.9143, + "step": 772 + }, + { + "epoch": 0.45226167404102824, + "grad_norm": 0.043876149904780545, + "learning_rate": 7.1289551319022195e-06, + "loss": 0.8828, + "step": 773 + }, + { + "epoch": 0.4528467473580283, + "grad_norm": 0.04711071987260168, + "learning_rate": 7.126795916654477e-06, + "loss": 0.7762, + "step": 774 + }, + { + "epoch": 0.45343182067502835, + "grad_norm": 0.04116549252293238, + "learning_rate": 7.124634397800565e-06, + "loss": 0.7778, + "step": 775 + }, + { + "epoch": 0.4540168939920284, + "grad_norm": 0.0393006386390472, + "learning_rate": 7.1224705771665405e-06, + "loss": 0.8465, + "step": 776 + }, + { + "epoch": 0.4546019673090284, + "grad_norm": 0.03826784736833335, + "learning_rate": 7.120304456580408e-06, + "loss": 0.8359, + "step": 777 + }, + { + "epoch": 0.45518704062602844, + "grad_norm": 0.04985454473324124, + "learning_rate": 7.118136037872112e-06, + "loss": 0.8552, + "step": 778 + }, + { + "epoch": 0.45577211394302847, + "grad_norm": 0.04012812213905606, + "learning_rate": 7.115965322873541e-06, + "loss": 0.8249, + "step": 779 + }, + { + "epoch": 0.4563571872600285, + "grad_norm": 0.03736935441616661, + "learning_rate": 7.113792313418522e-06, + "loss": 0.8399, + "step": 780 + }, + { + "epoch": 0.4569422605770286, + "grad_norm": 0.036669437710784104, + "learning_rate": 7.1116170113428194e-06, + "loss": 0.79, + "step": 781 + }, + { + "epoch": 0.4575273338940286, + "grad_norm": 0.03882415136861196, + "learning_rate": 7.109439418484137e-06, + "loss": 0.8016, + "step": 782 + }, + { + "epoch": 0.45811240721102864, + "grad_norm": 0.0452764493144211, + "learning_rate": 7.107259536682111e-06, + "loss": 0.8138, + "step": 783 + }, + { + "epoch": 0.45869748052802867, + "grad_norm": 0.04729394522582173, + "learning_rate": 7.105077367778313e-06, + "loss": 0.7596, + "step": 784 + }, + { + "epoch": 0.4592825538450287, + "grad_norm": 0.05019146434388651, + "learning_rate": 7.102892913616248e-06, + "loss": 0.8015, + "step": 785 + }, + { + "epoch": 0.4598676271620287, + "grad_norm": 0.039099244411788574, + "learning_rate": 7.100706176041348e-06, + "loss": 0.8098, + "step": 786 + }, + { + "epoch": 0.46045270047902875, + "grad_norm": 0.03895433813179543, + "learning_rate": 7.098517156900978e-06, + "loss": 0.8851, + "step": 787 + }, + { + "epoch": 0.46103777379602884, + "grad_norm": 0.03731969942911145, + "learning_rate": 7.096325858044427e-06, + "loss": 0.7721, + "step": 788 + }, + { + "epoch": 0.46162284711302887, + "grad_norm": 0.04097182225913861, + "learning_rate": 7.094132281322912e-06, + "loss": 0.8223, + "step": 789 + }, + { + "epoch": 0.4622079204300289, + "grad_norm": 0.03696891399238777, + "learning_rate": 7.091936428589576e-06, + "loss": 0.8938, + "step": 790 + }, + { + "epoch": 0.4627929937470289, + "grad_norm": 0.03792585935917287, + "learning_rate": 7.089738301699479e-06, + "loss": 0.8393, + "step": 791 + }, + { + "epoch": 0.46337806706402895, + "grad_norm": 0.10218168425909542, + "learning_rate": 7.087537902509607e-06, + "loss": 0.8016, + "step": 792 + }, + { + "epoch": 0.463963140381029, + "grad_norm": 0.03989943757222258, + "learning_rate": 7.085335232878865e-06, + "loss": 0.7431, + "step": 793 + }, + { + "epoch": 0.464548213698029, + "grad_norm": 0.03900627784304605, + "learning_rate": 7.083130294668076e-06, + "loss": 0.8431, + "step": 794 + }, + { + "epoch": 0.4651332870150291, + "grad_norm": 0.05897148270799823, + "learning_rate": 7.080923089739978e-06, + "loss": 0.83, + "step": 795 + }, + { + "epoch": 0.4657183603320291, + "grad_norm": 0.03888063452127548, + "learning_rate": 7.078713619959228e-06, + "loss": 0.7915, + "step": 796 + }, + { + "epoch": 0.46630343364902915, + "grad_norm": 0.04604225990759911, + "learning_rate": 7.076501887192387e-06, + "loss": 0.8419, + "step": 797 + }, + { + "epoch": 0.4668885069660292, + "grad_norm": 0.040295564919243125, + "learning_rate": 7.074287893307941e-06, + "loss": 0.8764, + "step": 798 + }, + { + "epoch": 0.4674735802830292, + "grad_norm": 0.03769433658091701, + "learning_rate": 7.072071640176274e-06, + "loss": 0.8566, + "step": 799 + }, + { + "epoch": 0.46805865360002924, + "grad_norm": 0.03728814454795478, + "learning_rate": 7.069853129669688e-06, + "loss": 0.771, + "step": 800 + }, + { + "epoch": 0.46864372691702927, + "grad_norm": 0.03547834831253728, + "learning_rate": 7.067632363662386e-06, + "loss": 0.7874, + "step": 801 + }, + { + "epoch": 0.46922880023402935, + "grad_norm": 0.04116629646593246, + "learning_rate": 7.065409344030479e-06, + "loss": 0.8579, + "step": 802 + }, + { + "epoch": 0.4698138735510294, + "grad_norm": 0.04284181743182457, + "learning_rate": 7.063184072651981e-06, + "loss": 0.874, + "step": 803 + }, + { + "epoch": 0.4703989468680294, + "grad_norm": 0.5453076047222728, + "learning_rate": 7.060956551406807e-06, + "loss": 0.8275, + "step": 804 + }, + { + "epoch": 0.47098402018502944, + "grad_norm": 0.04285257240888046, + "learning_rate": 7.058726782176778e-06, + "loss": 0.8748, + "step": 805 + }, + { + "epoch": 0.47156909350202947, + "grad_norm": 0.0426593418584649, + "learning_rate": 7.056494766845606e-06, + "loss": 0.805, + "step": 806 + }, + { + "epoch": 0.4721541668190295, + "grad_norm": 0.03749564999202563, + "learning_rate": 7.05426050729891e-06, + "loss": 0.8289, + "step": 807 + }, + { + "epoch": 0.4727392401360295, + "grad_norm": 0.044556182570763894, + "learning_rate": 7.052024005424194e-06, + "loss": 0.8757, + "step": 808 + }, + { + "epoch": 0.4733243134530296, + "grad_norm": 0.04281051498943111, + "learning_rate": 7.049785263110867e-06, + "loss": 0.8838, + "step": 809 + }, + { + "epoch": 0.47390938677002964, + "grad_norm": 0.04188599336409731, + "learning_rate": 7.047544282250223e-06, + "loss": 0.8371, + "step": 810 + }, + { + "epoch": 0.47449446008702967, + "grad_norm": 0.04079367612711138, + "learning_rate": 7.045301064735451e-06, + "loss": 0.8383, + "step": 811 + }, + { + "epoch": 0.4750795334040297, + "grad_norm": 0.037411483434431944, + "learning_rate": 7.0430556124616294e-06, + "loss": 0.7866, + "step": 812 + }, + { + "epoch": 0.4756646067210297, + "grad_norm": 0.0497395887833391, + "learning_rate": 7.040807927325723e-06, + "loss": 0.836, + "step": 813 + }, + { + "epoch": 0.47624968003802975, + "grad_norm": 0.039340018189295584, + "learning_rate": 7.038558011226583e-06, + "loss": 0.7925, + "step": 814 + }, + { + "epoch": 0.4768347533550298, + "grad_norm": 0.04067266509967937, + "learning_rate": 7.036305866064947e-06, + "loss": 0.8246, + "step": 815 + }, + { + "epoch": 0.4774198266720298, + "grad_norm": 0.03756080114515082, + "learning_rate": 7.0340514937434334e-06, + "loss": 0.8091, + "step": 816 + }, + { + "epoch": 0.4780048999890299, + "grad_norm": 0.04009793142877897, + "learning_rate": 7.031794896166544e-06, + "loss": 0.8367, + "step": 817 + }, + { + "epoch": 0.4785899733060299, + "grad_norm": 0.045719600380512315, + "learning_rate": 7.029536075240659e-06, + "loss": 0.8698, + "step": 818 + }, + { + "epoch": 0.47917504662302995, + "grad_norm": 0.06146708297478866, + "learning_rate": 7.0272750328740394e-06, + "loss": 0.7769, + "step": 819 + }, + { + "epoch": 0.47976011994003, + "grad_norm": 0.0379549936030894, + "learning_rate": 7.025011770976821e-06, + "loss": 0.8307, + "step": 820 + }, + { + "epoch": 0.48034519325703, + "grad_norm": 0.046629898254568806, + "learning_rate": 7.022746291461013e-06, + "loss": 0.8296, + "step": 821 + }, + { + "epoch": 0.48093026657403004, + "grad_norm": 0.04903726722961758, + "learning_rate": 7.020478596240503e-06, + "loss": 0.8578, + "step": 822 + }, + { + "epoch": 0.48151533989103007, + "grad_norm": 0.044482172504589565, + "learning_rate": 7.018208687231045e-06, + "loss": 0.8339, + "step": 823 + }, + { + "epoch": 0.48210041320803015, + "grad_norm": 0.04652606756149258, + "learning_rate": 7.015936566350267e-06, + "loss": 0.8629, + "step": 824 + }, + { + "epoch": 0.4826854865250302, + "grad_norm": 0.03952788329765694, + "learning_rate": 7.013662235517661e-06, + "loss": 0.8851, + "step": 825 + }, + { + "epoch": 0.4832705598420302, + "grad_norm": 0.04737073432575187, + "learning_rate": 7.011385696654594e-06, + "loss": 0.8662, + "step": 826 + }, + { + "epoch": 0.48385563315903024, + "grad_norm": 0.03751881453871839, + "learning_rate": 7.0091069516842915e-06, + "loss": 0.8559, + "step": 827 + }, + { + "epoch": 0.48444070647603027, + "grad_norm": 0.03716116535695275, + "learning_rate": 7.006826002531843e-06, + "loss": 0.7718, + "step": 828 + }, + { + "epoch": 0.4850257797930303, + "grad_norm": 0.056266031939443406, + "learning_rate": 7.004542851124203e-06, + "loss": 0.7714, + "step": 829 + }, + { + "epoch": 0.4856108531100303, + "grad_norm": 0.05535556673447491, + "learning_rate": 7.0022574993901865e-06, + "loss": 0.8676, + "step": 830 + }, + { + "epoch": 0.4861959264270304, + "grad_norm": 0.03845486724104416, + "learning_rate": 6.999969949260464e-06, + "loss": 0.7885, + "step": 831 + }, + { + "epoch": 0.48678099974403044, + "grad_norm": 0.04062056962499875, + "learning_rate": 6.99768020266757e-06, + "loss": 0.8322, + "step": 832 + }, + { + "epoch": 0.48736607306103047, + "grad_norm": 0.06424079693979534, + "learning_rate": 6.995388261545884e-06, + "loss": 0.905, + "step": 833 + }, + { + "epoch": 0.4879511463780305, + "grad_norm": 0.04236693639090112, + "learning_rate": 6.993094127831649e-06, + "loss": 0.8726, + "step": 834 + }, + { + "epoch": 0.4885362196950305, + "grad_norm": 0.038818701632739046, + "learning_rate": 6.990797803462955e-06, + "loss": 0.7753, + "step": 835 + }, + { + "epoch": 0.48912129301203056, + "grad_norm": 0.05513690717963214, + "learning_rate": 6.988499290379746e-06, + "loss": 0.888, + "step": 836 + }, + { + "epoch": 0.4897063663290306, + "grad_norm": 0.040849030445044794, + "learning_rate": 6.986198590523812e-06, + "loss": 0.8466, + "step": 837 + }, + { + "epoch": 0.49029143964603067, + "grad_norm": 0.040919168417802985, + "learning_rate": 6.983895705838793e-06, + "loss": 0.8589, + "step": 838 + }, + { + "epoch": 0.4908765129630307, + "grad_norm": 0.038496795188997325, + "learning_rate": 6.9815906382701725e-06, + "loss": 0.862, + "step": 839 + }, + { + "epoch": 0.4914615862800307, + "grad_norm": 0.043613843698726576, + "learning_rate": 6.97928338976528e-06, + "loss": 0.8702, + "step": 840 + }, + { + "epoch": 0.49204665959703076, + "grad_norm": 0.03941143092937186, + "learning_rate": 6.9769739622732855e-06, + "loss": 0.7831, + "step": 841 + }, + { + "epoch": 0.4926317329140308, + "grad_norm": 0.06878143755782574, + "learning_rate": 6.974662357745203e-06, + "loss": 0.9225, + "step": 842 + }, + { + "epoch": 0.4932168062310308, + "grad_norm": 0.05850718491585764, + "learning_rate": 6.972348578133881e-06, + "loss": 0.8781, + "step": 843 + }, + { + "epoch": 0.49380187954803084, + "grad_norm": 0.039118395131995005, + "learning_rate": 6.9700326253940095e-06, + "loss": 0.7985, + "step": 844 + }, + { + "epoch": 0.4943869528650309, + "grad_norm": 0.057835827757031166, + "learning_rate": 6.967714501482114e-06, + "loss": 0.781, + "step": 845 + }, + { + "epoch": 0.49497202618203096, + "grad_norm": 0.05588852958063585, + "learning_rate": 6.965394208356551e-06, + "loss": 0.8423, + "step": 846 + }, + { + "epoch": 0.495557099499031, + "grad_norm": 0.039374530415438855, + "learning_rate": 6.9630717479775145e-06, + "loss": 0.8456, + "step": 847 + }, + { + "epoch": 0.496142172816031, + "grad_norm": 0.0358193083222334, + "learning_rate": 6.960747122307025e-06, + "loss": 0.7992, + "step": 848 + }, + { + "epoch": 0.49672724613303104, + "grad_norm": 0.03788661691723569, + "learning_rate": 6.9584203333089325e-06, + "loss": 0.8037, + "step": 849 + }, + { + "epoch": 0.49731231945003107, + "grad_norm": 0.04201649774003793, + "learning_rate": 6.956091382948918e-06, + "loss": 0.8882, + "step": 850 + }, + { + "epoch": 0.4978973927670311, + "grad_norm": 0.04434150316820075, + "learning_rate": 6.953760273194487e-06, + "loss": 0.8166, + "step": 851 + }, + { + "epoch": 0.49848246608403113, + "grad_norm": 0.07426388002691353, + "learning_rate": 6.951427006014967e-06, + "loss": 0.8424, + "step": 852 + }, + { + "epoch": 0.4990675394010312, + "grad_norm": 0.037544290713932796, + "learning_rate": 6.949091583381511e-06, + "loss": 0.7957, + "step": 853 + }, + { + "epoch": 0.49965261271803124, + "grad_norm": 0.05489041840699949, + "learning_rate": 6.946754007267091e-06, + "loss": 0.9257, + "step": 854 + }, + { + "epoch": 0.5002376860350313, + "grad_norm": 0.042595883415080985, + "learning_rate": 6.944414279646499e-06, + "loss": 0.8277, + "step": 855 + }, + { + "epoch": 0.5008227593520314, + "grad_norm": 0.08019312917038988, + "learning_rate": 6.942072402496345e-06, + "loss": 0.9115, + "step": 856 + }, + { + "epoch": 0.5014078326690313, + "grad_norm": 0.05086333614690917, + "learning_rate": 6.9397283777950545e-06, + "loss": 0.9041, + "step": 857 + }, + { + "epoch": 0.5019929059860314, + "grad_norm": 0.04214362484163065, + "learning_rate": 6.937382207522867e-06, + "loss": 0.8299, + "step": 858 + }, + { + "epoch": 0.5025779793030314, + "grad_norm": 0.04083088848742396, + "learning_rate": 6.935033893661835e-06, + "loss": 0.8356, + "step": 859 + }, + { + "epoch": 0.5031630526200315, + "grad_norm": 0.04687863044141472, + "learning_rate": 6.932683438195821e-06, + "loss": 0.865, + "step": 860 + }, + { + "epoch": 0.5037481259370314, + "grad_norm": 0.04280142790393131, + "learning_rate": 6.9303308431105e-06, + "loss": 0.8862, + "step": 861 + }, + { + "epoch": 0.5043331992540315, + "grad_norm": 0.04180226845557804, + "learning_rate": 6.92797611039335e-06, + "loss": 0.8046, + "step": 862 + }, + { + "epoch": 0.5049182725710316, + "grad_norm": 0.035262278854158474, + "learning_rate": 6.925619242033656e-06, + "loss": 0.8197, + "step": 863 + }, + { + "epoch": 0.5055033458880316, + "grad_norm": 0.03754079468139451, + "learning_rate": 6.92326024002251e-06, + "loss": 0.8101, + "step": 864 + }, + { + "epoch": 0.5060884192050317, + "grad_norm": 0.039581024469053656, + "learning_rate": 6.9208991063528045e-06, + "loss": 0.8607, + "step": 865 + }, + { + "epoch": 0.5066734925220316, + "grad_norm": 0.0371798589671245, + "learning_rate": 6.918535843019233e-06, + "loss": 0.8102, + "step": 866 + }, + { + "epoch": 0.5072585658390317, + "grad_norm": 0.037768636271352095, + "learning_rate": 6.916170452018288e-06, + "loss": 0.8418, + "step": 867 + }, + { + "epoch": 0.5078436391560317, + "grad_norm": 0.10093759593186688, + "learning_rate": 6.913802935348258e-06, + "loss": 0.8629, + "step": 868 + }, + { + "epoch": 0.5084287124730318, + "grad_norm": 0.03663052824269648, + "learning_rate": 6.911433295009232e-06, + "loss": 0.8162, + "step": 869 + }, + { + "epoch": 0.5090137857900318, + "grad_norm": 0.04855772810380649, + "learning_rate": 6.909061533003088e-06, + "loss": 0.8616, + "step": 870 + }, + { + "epoch": 0.5095988591070318, + "grad_norm": 0.043219673612599416, + "learning_rate": 6.906687651333498e-06, + "loss": 0.7216, + "step": 871 + }, + { + "epoch": 0.5101839324240319, + "grad_norm": 0.042711861232268646, + "learning_rate": 6.904311652005925e-06, + "loss": 0.7547, + "step": 872 + }, + { + "epoch": 0.5107690057410319, + "grad_norm": 0.05574403716352006, + "learning_rate": 6.9019335370276225e-06, + "loss": 0.795, + "step": 873 + }, + { + "epoch": 0.511354079058032, + "grad_norm": 0.06264056674635722, + "learning_rate": 6.899553308407629e-06, + "loss": 0.85, + "step": 874 + }, + { + "epoch": 0.511939152375032, + "grad_norm": 0.04875354726056701, + "learning_rate": 6.89717096815677e-06, + "loss": 0.8407, + "step": 875 + }, + { + "epoch": 0.512524225692032, + "grad_norm": 0.038870649597744084, + "learning_rate": 6.894786518287653e-06, + "loss": 0.8076, + "step": 876 + }, + { + "epoch": 0.513109299009032, + "grad_norm": 0.05118848362688543, + "learning_rate": 6.8923999608146705e-06, + "loss": 0.8363, + "step": 877 + }, + { + "epoch": 0.5136943723260321, + "grad_norm": 0.0376112846207315, + "learning_rate": 6.890011297753994e-06, + "loss": 0.7743, + "step": 878 + }, + { + "epoch": 0.5142794456430322, + "grad_norm": 0.06896727299920731, + "learning_rate": 6.887620531123574e-06, + "loss": 0.8359, + "step": 879 + }, + { + "epoch": 0.5148645189600322, + "grad_norm": 0.03963337869499397, + "learning_rate": 6.885227662943136e-06, + "loss": 0.8206, + "step": 880 + }, + { + "epoch": 0.5154495922770322, + "grad_norm": 0.04870453554962347, + "learning_rate": 6.882832695234186e-06, + "loss": 0.8189, + "step": 881 + }, + { + "epoch": 0.5160346655940322, + "grad_norm": 0.03835351064840454, + "learning_rate": 6.880435630019998e-06, + "loss": 0.8973, + "step": 882 + }, + { + "epoch": 0.5166197389110323, + "grad_norm": 0.058777196577975345, + "learning_rate": 6.8780364693256224e-06, + "loss": 0.8733, + "step": 883 + }, + { + "epoch": 0.5172048122280323, + "grad_norm": 0.04023477251259275, + "learning_rate": 6.875635215177878e-06, + "loss": 0.7674, + "step": 884 + }, + { + "epoch": 0.5177898855450324, + "grad_norm": 0.10180465827282513, + "learning_rate": 6.873231869605351e-06, + "loss": 0.8238, + "step": 885 + }, + { + "epoch": 0.5183749588620324, + "grad_norm": 0.03697008145978301, + "learning_rate": 6.870826434638396e-06, + "loss": 0.8113, + "step": 886 + }, + { + "epoch": 0.5189600321790324, + "grad_norm": 0.03765250989596081, + "learning_rate": 6.868418912309133e-06, + "loss": 0.8409, + "step": 887 + }, + { + "epoch": 0.5195451054960325, + "grad_norm": 0.03863967736287724, + "learning_rate": 6.866009304651444e-06, + "loss": 0.7622, + "step": 888 + }, + { + "epoch": 0.5201301788130325, + "grad_norm": 0.04809769182496589, + "learning_rate": 6.8635976137009735e-06, + "loss": 0.838, + "step": 889 + }, + { + "epoch": 0.5207152521300326, + "grad_norm": 0.04589536972341525, + "learning_rate": 6.861183841495127e-06, + "loss": 0.9291, + "step": 890 + }, + { + "epoch": 0.5213003254470325, + "grad_norm": 0.04495994452664241, + "learning_rate": 6.858767990073066e-06, + "loss": 0.9015, + "step": 891 + }, + { + "epoch": 0.5218853987640326, + "grad_norm": 0.03736417738835852, + "learning_rate": 6.856350061475712e-06, + "loss": 0.7575, + "step": 892 + }, + { + "epoch": 0.5224704720810327, + "grad_norm": 0.039650470820273634, + "learning_rate": 6.853930057745735e-06, + "loss": 0.7939, + "step": 893 + }, + { + "epoch": 0.5230555453980327, + "grad_norm": 0.04072420592259055, + "learning_rate": 6.8515079809275656e-06, + "loss": 0.8026, + "step": 894 + }, + { + "epoch": 0.5236406187150328, + "grad_norm": 0.04022954757166665, + "learning_rate": 6.849083833067381e-06, + "loss": 0.7891, + "step": 895 + }, + { + "epoch": 0.5242256920320327, + "grad_norm": 0.04135409431135059, + "learning_rate": 6.846657616213109e-06, + "loss": 0.758, + "step": 896 + }, + { + "epoch": 0.5248107653490328, + "grad_norm": 0.03960521359539724, + "learning_rate": 6.844229332414427e-06, + "loss": 0.8038, + "step": 897 + }, + { + "epoch": 0.5253958386660328, + "grad_norm": 0.03765896158372491, + "learning_rate": 6.841798983722755e-06, + "loss": 0.7877, + "step": 898 + }, + { + "epoch": 0.5259809119830329, + "grad_norm": 0.058839179066597226, + "learning_rate": 6.839366572191262e-06, + "loss": 0.7523, + "step": 899 + }, + { + "epoch": 0.526565985300033, + "grad_norm": 0.040054239033310936, + "learning_rate": 6.836932099874856e-06, + "loss": 0.8321, + "step": 900 + }, + { + "epoch": 0.5271510586170329, + "grad_norm": 0.0627577307413516, + "learning_rate": 6.834495568830187e-06, + "loss": 0.8389, + "step": 901 + }, + { + "epoch": 0.527736131934033, + "grad_norm": 0.036542541933322875, + "learning_rate": 6.832056981115644e-06, + "loss": 0.8108, + "step": 902 + }, + { + "epoch": 0.528321205251033, + "grad_norm": 0.039969851332150676, + "learning_rate": 6.8296163387913545e-06, + "loss": 0.8144, + "step": 903 + }, + { + "epoch": 0.5289062785680331, + "grad_norm": 0.04091122066442238, + "learning_rate": 6.827173643919181e-06, + "loss": 0.8865, + "step": 904 + }, + { + "epoch": 0.529491351885033, + "grad_norm": 0.04077179519179353, + "learning_rate": 6.824728898562721e-06, + "loss": 0.8353, + "step": 905 + }, + { + "epoch": 0.5300764252020331, + "grad_norm": 0.045472282387591885, + "learning_rate": 6.822282104787305e-06, + "loss": 0.8175, + "step": 906 + }, + { + "epoch": 0.5306614985190332, + "grad_norm": 0.03919660004429457, + "learning_rate": 6.819833264659988e-06, + "loss": 0.7968, + "step": 907 + }, + { + "epoch": 0.5312465718360332, + "grad_norm": 0.0550475630156648, + "learning_rate": 6.81738238024956e-06, + "loss": 0.8005, + "step": 908 + }, + { + "epoch": 0.5318316451530333, + "grad_norm": 0.04044516211789697, + "learning_rate": 6.814929453626538e-06, + "loss": 0.8056, + "step": 909 + }, + { + "epoch": 0.5324167184700332, + "grad_norm": 0.04512920702041598, + "learning_rate": 6.81247448686316e-06, + "loss": 0.8061, + "step": 910 + }, + { + "epoch": 0.5330017917870333, + "grad_norm": 0.06440782132528662, + "learning_rate": 6.810017482033392e-06, + "loss": 0.8471, + "step": 911 + }, + { + "epoch": 0.5335868651040333, + "grad_norm": 0.03695317579916004, + "learning_rate": 6.8075584412129205e-06, + "loss": 0.8222, + "step": 912 + }, + { + "epoch": 0.5341719384210334, + "grad_norm": 0.03666660940321042, + "learning_rate": 6.805097366479148e-06, + "loss": 0.7822, + "step": 913 + }, + { + "epoch": 0.5347570117380335, + "grad_norm": 0.055445109178676184, + "learning_rate": 6.802634259911201e-06, + "loss": 0.844, + "step": 914 + }, + { + "epoch": 0.5353420850550334, + "grad_norm": 0.05279543181613761, + "learning_rate": 6.800169123589919e-06, + "loss": 0.7463, + "step": 915 + }, + { + "epoch": 0.5359271583720335, + "grad_norm": 0.041859624286024466, + "learning_rate": 6.797701959597859e-06, + "loss": 0.8604, + "step": 916 + }, + { + "epoch": 0.5365122316890335, + "grad_norm": 0.0383422427552055, + "learning_rate": 6.795232770019286e-06, + "loss": 0.7703, + "step": 917 + }, + { + "epoch": 0.5370973050060336, + "grad_norm": 0.043028959290976825, + "learning_rate": 6.7927615569401815e-06, + "loss": 0.8212, + "step": 918 + }, + { + "epoch": 0.5376823783230336, + "grad_norm": 0.03804962859282499, + "learning_rate": 6.790288322448235e-06, + "loss": 0.8366, + "step": 919 + }, + { + "epoch": 0.5382674516400336, + "grad_norm": 0.039087833943377275, + "learning_rate": 6.787813068632843e-06, + "loss": 0.7831, + "step": 920 + }, + { + "epoch": 0.5388525249570337, + "grad_norm": 0.04141823524439823, + "learning_rate": 6.785335797585107e-06, + "loss": 0.8828, + "step": 921 + }, + { + "epoch": 0.5394375982740337, + "grad_norm": 0.05007681264530146, + "learning_rate": 6.782856511397835e-06, + "loss": 0.8005, + "step": 922 + }, + { + "epoch": 0.5400226715910338, + "grad_norm": 0.0358413269090128, + "learning_rate": 6.780375212165535e-06, + "loss": 0.8488, + "step": 923 + }, + { + "epoch": 0.5406077449080338, + "grad_norm": 0.039662947137159635, + "learning_rate": 6.777891901984417e-06, + "loss": 0.8269, + "step": 924 + }, + { + "epoch": 0.5411928182250338, + "grad_norm": 0.03708918191821332, + "learning_rate": 6.775406582952389e-06, + "loss": 0.8561, + "step": 925 + }, + { + "epoch": 0.5417778915420338, + "grad_norm": 0.0512337126087228, + "learning_rate": 6.772919257169059e-06, + "loss": 0.8225, + "step": 926 + }, + { + "epoch": 0.5423629648590339, + "grad_norm": 0.15259566476683822, + "learning_rate": 6.770429926735727e-06, + "loss": 0.7892, + "step": 927 + }, + { + "epoch": 0.542948038176034, + "grad_norm": 0.036454692983525445, + "learning_rate": 6.767938593755386e-06, + "loss": 0.87, + "step": 928 + }, + { + "epoch": 0.543533111493034, + "grad_norm": 0.08503045559657693, + "learning_rate": 6.765445260332723e-06, + "loss": 0.878, + "step": 929 + }, + { + "epoch": 0.544118184810034, + "grad_norm": 0.04291403088850566, + "learning_rate": 6.7629499285741155e-06, + "loss": 0.8633, + "step": 930 + }, + { + "epoch": 0.544703258127034, + "grad_norm": 0.04417233325326183, + "learning_rate": 6.7604526005876265e-06, + "loss": 0.7777, + "step": 931 + }, + { + "epoch": 0.5452883314440341, + "grad_norm": 0.04685514307801011, + "learning_rate": 6.7579532784830075e-06, + "loss": 0.8233, + "step": 932 + }, + { + "epoch": 0.5458734047610341, + "grad_norm": 0.04216821721923777, + "learning_rate": 6.755451964371696e-06, + "loss": 0.8055, + "step": 933 + }, + { + "epoch": 0.5464584780780342, + "grad_norm": 0.05899439548668003, + "learning_rate": 6.752948660366807e-06, + "loss": 0.7423, + "step": 934 + }, + { + "epoch": 0.5470435513950342, + "grad_norm": 0.039547439339394544, + "learning_rate": 6.750443368583141e-06, + "loss": 0.7959, + "step": 935 + }, + { + "epoch": 0.5476286247120342, + "grad_norm": 0.046082967838638036, + "learning_rate": 6.747936091137179e-06, + "loss": 0.7691, + "step": 936 + }, + { + "epoch": 0.5482136980290343, + "grad_norm": 0.04724571857592231, + "learning_rate": 6.745426830147074e-06, + "loss": 0.7716, + "step": 937 + }, + { + "epoch": 0.5487987713460343, + "grad_norm": 0.049126540248396626, + "learning_rate": 6.74291558773266e-06, + "loss": 0.8051, + "step": 938 + }, + { + "epoch": 0.5493838446630344, + "grad_norm": 0.038162530207834874, + "learning_rate": 6.740402366015442e-06, + "loss": 0.8182, + "step": 939 + }, + { + "epoch": 0.5499689179800343, + "grad_norm": 0.07362395009585133, + "learning_rate": 6.737887167118597e-06, + "loss": 0.8025, + "step": 940 + }, + { + "epoch": 0.5505539912970344, + "grad_norm": 0.03855880618683095, + "learning_rate": 6.735369993166977e-06, + "loss": 0.8257, + "step": 941 + }, + { + "epoch": 0.5511390646140344, + "grad_norm": 0.04403565508981498, + "learning_rate": 6.732850846287096e-06, + "loss": 0.7377, + "step": 942 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.039173991395047666, + "learning_rate": 6.730329728607137e-06, + "loss": 0.8063, + "step": 943 + }, + { + "epoch": 0.5523092112480346, + "grad_norm": 0.042531239336748856, + "learning_rate": 6.72780664225695e-06, + "loss": 0.8237, + "step": 944 + }, + { + "epoch": 0.5528942845650345, + "grad_norm": 0.039715480086954504, + "learning_rate": 6.725281589368046e-06, + "loss": 0.8229, + "step": 945 + }, + { + "epoch": 0.5534793578820346, + "grad_norm": 0.04910369460983757, + "learning_rate": 6.722754572073599e-06, + "loss": 0.8503, + "step": 946 + }, + { + "epoch": 0.5540644311990346, + "grad_norm": 0.03827200257320983, + "learning_rate": 6.720225592508439e-06, + "loss": 0.9069, + "step": 947 + }, + { + "epoch": 0.5546495045160347, + "grad_norm": 0.06511818499208381, + "learning_rate": 6.7176946528090585e-06, + "loss": 0.9065, + "step": 948 + }, + { + "epoch": 0.5552345778330346, + "grad_norm": 0.03694464014581456, + "learning_rate": 6.715161755113604e-06, + "loss": 0.8588, + "step": 949 + }, + { + "epoch": 0.5558196511500347, + "grad_norm": 0.03684150463907197, + "learning_rate": 6.712626901561876e-06, + "loss": 0.8272, + "step": 950 + }, + { + "epoch": 0.5564047244670348, + "grad_norm": 0.04096608730346328, + "learning_rate": 6.710090094295323e-06, + "loss": 0.9031, + "step": 951 + }, + { + "epoch": 0.5569897977840348, + "grad_norm": 0.04008860398814125, + "learning_rate": 6.707551335457054e-06, + "loss": 0.8452, + "step": 952 + }, + { + "epoch": 0.5575748711010349, + "grad_norm": 0.06233285275932101, + "learning_rate": 6.705010627191816e-06, + "loss": 0.8813, + "step": 953 + }, + { + "epoch": 0.5581599444180348, + "grad_norm": 0.04077134247799943, + "learning_rate": 6.7024679716460114e-06, + "loss": 0.8493, + "step": 954 + }, + { + "epoch": 0.5587450177350349, + "grad_norm": 0.046352641350363474, + "learning_rate": 6.699923370967682e-06, + "loss": 0.8309, + "step": 955 + }, + { + "epoch": 0.5593300910520349, + "grad_norm": 0.041407416461248854, + "learning_rate": 6.6973768273065145e-06, + "loss": 0.8149, + "step": 956 + }, + { + "epoch": 0.559915164369035, + "grad_norm": 0.03903915190219945, + "learning_rate": 6.694828342813839e-06, + "loss": 0.8669, + "step": 957 + }, + { + "epoch": 0.5605002376860351, + "grad_norm": 0.05659630453049191, + "learning_rate": 6.692277919642623e-06, + "loss": 0.8291, + "step": 958 + }, + { + "epoch": 0.561085311003035, + "grad_norm": 0.048419920107990906, + "learning_rate": 6.6897255599474705e-06, + "loss": 0.7891, + "step": 959 + }, + { + "epoch": 0.5616703843200351, + "grad_norm": 0.04526622141241147, + "learning_rate": 6.6871712658846255e-06, + "loss": 0.82, + "step": 960 + }, + { + "epoch": 0.5622554576370351, + "grad_norm": 0.05616992844910552, + "learning_rate": 6.684615039611963e-06, + "loss": 0.819, + "step": 961 + }, + { + "epoch": 0.5628405309540352, + "grad_norm": 0.05533671965109322, + "learning_rate": 6.682056883288993e-06, + "loss": 0.8278, + "step": 962 + }, + { + "epoch": 0.5634256042710352, + "grad_norm": 0.04723246972770138, + "learning_rate": 6.679496799076853e-06, + "loss": 0.8255, + "step": 963 + }, + { + "epoch": 0.5640106775880352, + "grad_norm": 0.04762545006180037, + "learning_rate": 6.67693478913831e-06, + "loss": 0.8867, + "step": 964 + }, + { + "epoch": 0.5645957509050353, + "grad_norm": 0.04368900157724721, + "learning_rate": 6.674370855637759e-06, + "loss": 0.8527, + "step": 965 + }, + { + "epoch": 0.5651808242220353, + "grad_norm": 0.04639910967539116, + "learning_rate": 6.671805000741221e-06, + "loss": 0.8147, + "step": 966 + }, + { + "epoch": 0.5657658975390354, + "grad_norm": 0.04122171566933528, + "learning_rate": 6.6692372266163365e-06, + "loss": 0.8176, + "step": 967 + }, + { + "epoch": 0.5663509708560354, + "grad_norm": 0.04160364438101482, + "learning_rate": 6.666667535432371e-06, + "loss": 0.8588, + "step": 968 + }, + { + "epoch": 0.5669360441730354, + "grad_norm": 0.04172707626334754, + "learning_rate": 6.664095929360207e-06, + "loss": 0.8315, + "step": 969 + }, + { + "epoch": 0.5675211174900354, + "grad_norm": 0.04646561250778188, + "learning_rate": 6.661522410572346e-06, + "loss": 0.7923, + "step": 970 + }, + { + "epoch": 0.5681061908070355, + "grad_norm": 0.04188298952966753, + "learning_rate": 6.658946981242906e-06, + "loss": 0.7711, + "step": 971 + }, + { + "epoch": 0.5686912641240356, + "grad_norm": 0.03908796787695422, + "learning_rate": 6.656369643547617e-06, + "loss": 0.8856, + "step": 972 + }, + { + "epoch": 0.5692763374410356, + "grad_norm": 0.042788844509774104, + "learning_rate": 6.653790399663823e-06, + "loss": 0.7808, + "step": 973 + }, + { + "epoch": 0.5698614107580356, + "grad_norm": 0.05201703823511508, + "learning_rate": 6.651209251770478e-06, + "loss": 0.8618, + "step": 974 + }, + { + "epoch": 0.5704464840750356, + "grad_norm": 0.04585801993937681, + "learning_rate": 6.648626202048144e-06, + "loss": 0.8373, + "step": 975 + }, + { + "epoch": 0.5710315573920357, + "grad_norm": 0.040661578495326846, + "learning_rate": 6.646041252678989e-06, + "loss": 0.7641, + "step": 976 + }, + { + "epoch": 0.5716166307090357, + "grad_norm": 0.035144726376477184, + "learning_rate": 6.643454405846788e-06, + "loss": 0.7272, + "step": 977 + }, + { + "epoch": 0.5722017040260358, + "grad_norm": 0.04251759434748344, + "learning_rate": 6.640865663736917e-06, + "loss": 0.8063, + "step": 978 + }, + { + "epoch": 0.5727867773430358, + "grad_norm": 0.037521500861554015, + "learning_rate": 6.638275028536356e-06, + "loss": 0.7797, + "step": 979 + }, + { + "epoch": 0.5733718506600358, + "grad_norm": 0.041622067012318506, + "learning_rate": 6.6356825024336784e-06, + "loss": 0.8679, + "step": 980 + }, + { + "epoch": 0.5739569239770359, + "grad_norm": 0.04290995619424604, + "learning_rate": 6.63308808761906e-06, + "loss": 0.8477, + "step": 981 + }, + { + "epoch": 0.5745419972940359, + "grad_norm": 0.0429619795472441, + "learning_rate": 6.630491786284273e-06, + "loss": 0.8145, + "step": 982 + }, + { + "epoch": 0.575127070611036, + "grad_norm": 0.03771407315818989, + "learning_rate": 6.6278936006226795e-06, + "loss": 0.796, + "step": 983 + }, + { + "epoch": 0.5757121439280359, + "grad_norm": 0.0720666677058357, + "learning_rate": 6.625293532829236e-06, + "loss": 0.7856, + "step": 984 + }, + { + "epoch": 0.576297217245036, + "grad_norm": 0.05035693118475441, + "learning_rate": 6.622691585100488e-06, + "loss": 0.8212, + "step": 985 + }, + { + "epoch": 0.5768822905620361, + "grad_norm": 0.0661002129965687, + "learning_rate": 6.620087759634569e-06, + "loss": 0.8248, + "step": 986 + }, + { + "epoch": 0.5774673638790361, + "grad_norm": 0.043048564438655845, + "learning_rate": 6.617482058631201e-06, + "loss": 0.7865, + "step": 987 + }, + { + "epoch": 0.5780524371960362, + "grad_norm": 0.04716626817210766, + "learning_rate": 6.614874484291688e-06, + "loss": 0.7806, + "step": 988 + }, + { + "epoch": 0.5786375105130361, + "grad_norm": 0.043537127489980836, + "learning_rate": 6.612265038818915e-06, + "loss": 0.8248, + "step": 989 + }, + { + "epoch": 0.5792225838300362, + "grad_norm": 0.036528767071679374, + "learning_rate": 6.609653724417354e-06, + "loss": 0.8464, + "step": 990 + }, + { + "epoch": 0.5798076571470362, + "grad_norm": 0.041985098940740775, + "learning_rate": 6.6070405432930495e-06, + "loss": 0.8371, + "step": 991 + }, + { + "epoch": 0.5803927304640363, + "grad_norm": 0.037661629039364965, + "learning_rate": 6.604425497653627e-06, + "loss": 0.8133, + "step": 992 + }, + { + "epoch": 0.5809778037810364, + "grad_norm": 0.04709785126617865, + "learning_rate": 6.6018085897082845e-06, + "loss": 0.7926, + "step": 993 + }, + { + "epoch": 0.5815628770980363, + "grad_norm": 0.04920382998315035, + "learning_rate": 6.5991898216677945e-06, + "loss": 0.8511, + "step": 994 + }, + { + "epoch": 0.5821479504150364, + "grad_norm": 0.0384231269239162, + "learning_rate": 6.596569195744502e-06, + "loss": 0.8767, + "step": 995 + }, + { + "epoch": 0.5827330237320364, + "grad_norm": 0.0469334210563224, + "learning_rate": 6.59394671415232e-06, + "loss": 0.8445, + "step": 996 + }, + { + "epoch": 0.5833180970490365, + "grad_norm": 0.07961405813390293, + "learning_rate": 6.591322379106728e-06, + "loss": 0.7951, + "step": 997 + }, + { + "epoch": 0.5839031703660365, + "grad_norm": 0.039593707752364676, + "learning_rate": 6.588696192824775e-06, + "loss": 0.8325, + "step": 998 + }, + { + "epoch": 0.5844882436830365, + "grad_norm": 0.04184294201712494, + "learning_rate": 6.5860681575250706e-06, + "loss": 0.8599, + "step": 999 + }, + { + "epoch": 0.5850733170000366, + "grad_norm": 0.10237775503964221, + "learning_rate": 6.5834382754277885e-06, + "loss": 0.8159, + "step": 1000 + }, + { + "epoch": 0.5856583903170366, + "grad_norm": 0.03881481319706578, + "learning_rate": 6.580806548754661e-06, + "loss": 0.7969, + "step": 1001 + }, + { + "epoch": 0.5862434636340367, + "grad_norm": 0.043457660890875695, + "learning_rate": 6.578172979728979e-06, + "loss": 0.8012, + "step": 1002 + }, + { + "epoch": 0.5868285369510367, + "grad_norm": 0.037934756068100706, + "learning_rate": 6.5755375705755924e-06, + "loss": 0.8248, + "step": 1003 + }, + { + "epoch": 0.5874136102680367, + "grad_norm": 0.12610177824785532, + "learning_rate": 6.572900323520901e-06, + "loss": 0.7342, + "step": 1004 + }, + { + "epoch": 0.5879986835850367, + "grad_norm": 0.03661390969286209, + "learning_rate": 6.570261240792861e-06, + "loss": 0.7682, + "step": 1005 + }, + { + "epoch": 0.5885837569020368, + "grad_norm": 0.06798106192365762, + "learning_rate": 6.5676203246209785e-06, + "loss": 0.793, + "step": 1006 + }, + { + "epoch": 0.5891688302190369, + "grad_norm": 0.05106297642369706, + "learning_rate": 6.564977577236309e-06, + "loss": 0.881, + "step": 1007 + }, + { + "epoch": 0.5897539035360368, + "grad_norm": 0.04384540012407079, + "learning_rate": 6.5623330008714505e-06, + "loss": 0.7453, + "step": 1008 + }, + { + "epoch": 0.5903389768530369, + "grad_norm": 0.10758927483483961, + "learning_rate": 6.559686597760555e-06, + "loss": 0.8367, + "step": 1009 + }, + { + "epoch": 0.5909240501700369, + "grad_norm": 0.04079443571028819, + "learning_rate": 6.557038370139307e-06, + "loss": 0.8972, + "step": 1010 + }, + { + "epoch": 0.591509123487037, + "grad_norm": 0.04314093124710063, + "learning_rate": 6.554388320244943e-06, + "loss": 0.8316, + "step": 1011 + }, + { + "epoch": 0.592094196804037, + "grad_norm": 0.037939172015033606, + "learning_rate": 6.5517364503162315e-06, + "loss": 0.8587, + "step": 1012 + }, + { + "epoch": 0.592679270121037, + "grad_norm": 0.08510073848850286, + "learning_rate": 6.549082762593481e-06, + "loss": 0.8269, + "step": 1013 + }, + { + "epoch": 0.5932643434380371, + "grad_norm": 0.04708547345385738, + "learning_rate": 6.546427259318535e-06, + "loss": 0.8102, + "step": 1014 + }, + { + "epoch": 0.5938494167550371, + "grad_norm": 0.053444431590032856, + "learning_rate": 6.543769942734772e-06, + "loss": 0.7774, + "step": 1015 + }, + { + "epoch": 0.5944344900720372, + "grad_norm": 0.04005959184610443, + "learning_rate": 6.541110815087104e-06, + "loss": 0.7808, + "step": 1016 + }, + { + "epoch": 0.5950195633890372, + "grad_norm": 0.05077795167298267, + "learning_rate": 6.538449878621966e-06, + "loss": 0.8758, + "step": 1017 + }, + { + "epoch": 0.5956046367060372, + "grad_norm": 0.03833355241397989, + "learning_rate": 6.535787135587331e-06, + "loss": 0.8331, + "step": 1018 + }, + { + "epoch": 0.5961897100230372, + "grad_norm": 0.053233926527690635, + "learning_rate": 6.533122588232689e-06, + "loss": 0.8339, + "step": 1019 + }, + { + "epoch": 0.5967747833400373, + "grad_norm": 0.044734695814882965, + "learning_rate": 6.530456238809062e-06, + "loss": 0.7773, + "step": 1020 + }, + { + "epoch": 0.5973598566570373, + "grad_norm": 0.05575094862913427, + "learning_rate": 6.527788089568987e-06, + "loss": 0.8303, + "step": 1021 + }, + { + "epoch": 0.5979449299740374, + "grad_norm": 0.04961953370443646, + "learning_rate": 6.525118142766527e-06, + "loss": 0.7392, + "step": 1022 + }, + { + "epoch": 0.5985300032910374, + "grad_norm": 0.09625684788536548, + "learning_rate": 6.522446400657264e-06, + "loss": 0.9054, + "step": 1023 + }, + { + "epoch": 0.5991150766080374, + "grad_norm": 0.05229897404990849, + "learning_rate": 6.519772865498291e-06, + "loss": 0.7961, + "step": 1024 + }, + { + "epoch": 0.5997001499250375, + "grad_norm": 0.043742861050743945, + "learning_rate": 6.51709753954822e-06, + "loss": 0.8543, + "step": 1025 + }, + { + "epoch": 0.6002852232420375, + "grad_norm": 0.04421257742872319, + "learning_rate": 6.514420425067179e-06, + "loss": 0.9387, + "step": 1026 + }, + { + "epoch": 0.6008702965590376, + "grad_norm": 0.05074707736420169, + "learning_rate": 6.511741524316798e-06, + "loss": 0.6896, + "step": 1027 + }, + { + "epoch": 0.6014553698760375, + "grad_norm": 0.038800133730654246, + "learning_rate": 6.509060839560223e-06, + "loss": 0.7429, + "step": 1028 + }, + { + "epoch": 0.6020404431930376, + "grad_norm": 0.0672245152896617, + "learning_rate": 6.506378373062107e-06, + "loss": 0.8246, + "step": 1029 + }, + { + "epoch": 0.6026255165100377, + "grad_norm": 0.04853749164209948, + "learning_rate": 6.503694127088604e-06, + "loss": 0.8006, + "step": 1030 + }, + { + "epoch": 0.6032105898270377, + "grad_norm": 0.0423510296147158, + "learning_rate": 6.501008103907376e-06, + "loss": 0.8068, + "step": 1031 + }, + { + "epoch": 0.6037956631440378, + "grad_norm": 0.04168402528442862, + "learning_rate": 6.498320305787583e-06, + "loss": 0.747, + "step": 1032 + }, + { + "epoch": 0.6043807364610377, + "grad_norm": 0.05536933839665471, + "learning_rate": 6.495630734999885e-06, + "loss": 0.8214, + "step": 1033 + }, + { + "epoch": 0.6049658097780378, + "grad_norm": 0.03706453349529763, + "learning_rate": 6.4929393938164425e-06, + "loss": 0.7507, + "step": 1034 + }, + { + "epoch": 0.6055508830950378, + "grad_norm": 0.03569665816245634, + "learning_rate": 6.490246284510907e-06, + "loss": 0.8255, + "step": 1035 + }, + { + "epoch": 0.6061359564120379, + "grad_norm": 0.04630173604608656, + "learning_rate": 6.487551409358428e-06, + "loss": 0.8046, + "step": 1036 + }, + { + "epoch": 0.606721029729038, + "grad_norm": 0.04723153392312322, + "learning_rate": 6.4848547706356444e-06, + "loss": 0.8256, + "step": 1037 + }, + { + "epoch": 0.6073061030460379, + "grad_norm": 0.03827183486478635, + "learning_rate": 6.482156370620683e-06, + "loss": 0.7563, + "step": 1038 + }, + { + "epoch": 0.607891176363038, + "grad_norm": 0.0388015622476578, + "learning_rate": 6.479456211593165e-06, + "loss": 0.7176, + "step": 1039 + }, + { + "epoch": 0.608476249680038, + "grad_norm": 0.03999779366433317, + "learning_rate": 6.476754295834191e-06, + "loss": 0.8224, + "step": 1040 + }, + { + "epoch": 0.6090613229970381, + "grad_norm": 0.039892311683218856, + "learning_rate": 6.47405062562635e-06, + "loss": 0.8236, + "step": 1041 + }, + { + "epoch": 0.609646396314038, + "grad_norm": 0.03973227428998665, + "learning_rate": 6.471345203253711e-06, + "loss": 0.8184, + "step": 1042 + }, + { + "epoch": 0.6102314696310381, + "grad_norm": 0.04150106217436074, + "learning_rate": 6.468638031001823e-06, + "loss": 0.8804, + "step": 1043 + }, + { + "epoch": 0.6108165429480382, + "grad_norm": 0.04239329779716534, + "learning_rate": 6.465929111157714e-06, + "loss": 0.7935, + "step": 1044 + }, + { + "epoch": 0.6114016162650382, + "grad_norm": 0.05192716135637802, + "learning_rate": 6.463218446009888e-06, + "loss": 0.8526, + "step": 1045 + }, + { + "epoch": 0.6119866895820383, + "grad_norm": 0.04095824415870296, + "learning_rate": 6.4605060378483255e-06, + "loss": 0.901, + "step": 1046 + }, + { + "epoch": 0.6125717628990383, + "grad_norm": 0.03546640975061806, + "learning_rate": 6.457791888964478e-06, + "loss": 0.7811, + "step": 1047 + }, + { + "epoch": 0.6131568362160383, + "grad_norm": 0.040296634693903335, + "learning_rate": 6.455076001651265e-06, + "loss": 0.7403, + "step": 1048 + }, + { + "epoch": 0.6137419095330383, + "grad_norm": 0.039361319712739894, + "learning_rate": 6.452358378203079e-06, + "loss": 0.8359, + "step": 1049 + }, + { + "epoch": 0.6143269828500384, + "grad_norm": 0.04449717698536887, + "learning_rate": 6.449639020915777e-06, + "loss": 0.7877, + "step": 1050 + }, + { + "epoch": 0.6149120561670385, + "grad_norm": 0.05075898809213348, + "learning_rate": 6.446917932086681e-06, + "loss": 0.7867, + "step": 1051 + }, + { + "epoch": 0.6154971294840385, + "grad_norm": 0.05427508854998823, + "learning_rate": 6.444195114014573e-06, + "loss": 0.9079, + "step": 1052 + }, + { + "epoch": 0.6160822028010385, + "grad_norm": 0.039038596627602784, + "learning_rate": 6.441470568999704e-06, + "loss": 0.8449, + "step": 1053 + }, + { + "epoch": 0.6166672761180385, + "grad_norm": 0.05495139920274083, + "learning_rate": 6.438744299343774e-06, + "loss": 0.8195, + "step": 1054 + }, + { + "epoch": 0.6172523494350386, + "grad_norm": 0.04036928882191201, + "learning_rate": 6.436016307349947e-06, + "loss": 0.9269, + "step": 1055 + }, + { + "epoch": 0.6178374227520386, + "grad_norm": 0.03982823893586514, + "learning_rate": 6.4332865953228395e-06, + "loss": 0.7358, + "step": 1056 + }, + { + "epoch": 0.6184224960690387, + "grad_norm": 0.06481474497909782, + "learning_rate": 6.430555165568521e-06, + "loss": 0.7267, + "step": 1057 + }, + { + "epoch": 0.6190075693860387, + "grad_norm": 0.05875784906821891, + "learning_rate": 6.427822020394512e-06, + "loss": 0.8439, + "step": 1058 + }, + { + "epoch": 0.6195926427030387, + "grad_norm": 0.040087218682744265, + "learning_rate": 6.425087162109781e-06, + "loss": 0.7822, + "step": 1059 + }, + { + "epoch": 0.6201777160200388, + "grad_norm": 0.04225947062502592, + "learning_rate": 6.422350593024747e-06, + "loss": 0.8365, + "step": 1060 + }, + { + "epoch": 0.6207627893370388, + "grad_norm": 0.03505701993198131, + "learning_rate": 6.419612315451275e-06, + "loss": 0.7465, + "step": 1061 + }, + { + "epoch": 0.6213478626540389, + "grad_norm": 0.03617054781781428, + "learning_rate": 6.4168723317026655e-06, + "loss": 0.7628, + "step": 1062 + }, + { + "epoch": 0.6219329359710388, + "grad_norm": 0.04149790973744168, + "learning_rate": 6.414130644093669e-06, + "loss": 0.7846, + "step": 1063 + }, + { + "epoch": 0.6225180092880389, + "grad_norm": 0.08318622079767567, + "learning_rate": 6.411387254940473e-06, + "loss": 0.8865, + "step": 1064 + }, + { + "epoch": 0.623103082605039, + "grad_norm": 0.04061744209355914, + "learning_rate": 6.4086421665607e-06, + "loss": 0.7673, + "step": 1065 + }, + { + "epoch": 0.623688155922039, + "grad_norm": 0.04460594650043864, + "learning_rate": 6.405895381273411e-06, + "loss": 0.7735, + "step": 1066 + }, + { + "epoch": 0.624273229239039, + "grad_norm": 0.05550280064698044, + "learning_rate": 6.403146901399098e-06, + "loss": 0.8569, + "step": 1067 + }, + { + "epoch": 0.624858302556039, + "grad_norm": 0.04163827309254167, + "learning_rate": 6.400396729259685e-06, + "loss": 0.8951, + "step": 1068 + }, + { + "epoch": 0.6254433758730391, + "grad_norm": 0.1509844863070891, + "learning_rate": 6.39764486717853e-06, + "loss": 0.8082, + "step": 1069 + }, + { + "epoch": 0.6260284491900391, + "grad_norm": 0.04492131767454413, + "learning_rate": 6.394891317480412e-06, + "loss": 0.7518, + "step": 1070 + }, + { + "epoch": 0.6266135225070392, + "grad_norm": 0.0471963162082604, + "learning_rate": 6.39213608249154e-06, + "loss": 0.8127, + "step": 1071 + }, + { + "epoch": 0.6271985958240393, + "grad_norm": 0.038627758596647896, + "learning_rate": 6.389379164539545e-06, + "loss": 0.7781, + "step": 1072 + }, + { + "epoch": 0.6277836691410392, + "grad_norm": 0.04265598384978139, + "learning_rate": 6.386620565953482e-06, + "loss": 0.7698, + "step": 1073 + }, + { + "epoch": 0.6283687424580393, + "grad_norm": 0.040866964196317926, + "learning_rate": 6.383860289063821e-06, + "loss": 0.741, + "step": 1074 + }, + { + "epoch": 0.6289538157750393, + "grad_norm": 0.04488242542194653, + "learning_rate": 6.3810983362024575e-06, + "loss": 0.8618, + "step": 1075 + }, + { + "epoch": 0.6295388890920394, + "grad_norm": 0.04181264245748804, + "learning_rate": 6.3783347097026935e-06, + "loss": 0.8318, + "step": 1076 + }, + { + "epoch": 0.6301239624090393, + "grad_norm": 0.057019285327405245, + "learning_rate": 6.375569411899253e-06, + "loss": 0.822, + "step": 1077 + }, + { + "epoch": 0.6307090357260394, + "grad_norm": 0.04075452383662032, + "learning_rate": 6.3728024451282675e-06, + "loss": 0.7854, + "step": 1078 + }, + { + "epoch": 0.6312941090430395, + "grad_norm": 0.04369789370291459, + "learning_rate": 6.37003381172728e-06, + "loss": 0.7976, + "step": 1079 + }, + { + "epoch": 0.6318791823600395, + "grad_norm": 0.05849968444823343, + "learning_rate": 6.367263514035242e-06, + "loss": 0.8309, + "step": 1080 + }, + { + "epoch": 0.6324642556770396, + "grad_norm": 0.07398315736379607, + "learning_rate": 6.364491554392508e-06, + "loss": 0.8695, + "step": 1081 + }, + { + "epoch": 0.6330493289940395, + "grad_norm": 0.14149329637182187, + "learning_rate": 6.36171793514084e-06, + "loss": 0.9035, + "step": 1082 + }, + { + "epoch": 0.6336344023110396, + "grad_norm": 0.09431782156363094, + "learning_rate": 6.358942658623402e-06, + "loss": 0.7978, + "step": 1083 + }, + { + "epoch": 0.6342194756280396, + "grad_norm": 0.11444686544474032, + "learning_rate": 6.356165727184753e-06, + "loss": 0.8075, + "step": 1084 + }, + { + "epoch": 0.6348045489450397, + "grad_norm": 0.07683296171985984, + "learning_rate": 6.353387143170856e-06, + "loss": 0.7619, + "step": 1085 + }, + { + "epoch": 0.6353896222620398, + "grad_norm": 0.03611601798693106, + "learning_rate": 6.3506069089290705e-06, + "loss": 0.7886, + "step": 1086 + }, + { + "epoch": 0.6359746955790397, + "grad_norm": 0.055915745635355715, + "learning_rate": 6.3478250268081435e-06, + "loss": 0.8511, + "step": 1087 + }, + { + "epoch": 0.6365597688960398, + "grad_norm": 0.048663447635497695, + "learning_rate": 6.34504149915822e-06, + "loss": 0.9001, + "step": 1088 + }, + { + "epoch": 0.6371448422130398, + "grad_norm": 0.04226309960545497, + "learning_rate": 6.342256328330833e-06, + "loss": 0.7945, + "step": 1089 + }, + { + "epoch": 0.6377299155300399, + "grad_norm": 0.0425747978871558, + "learning_rate": 6.339469516678903e-06, + "loss": 0.7945, + "step": 1090 + }, + { + "epoch": 0.6383149888470399, + "grad_norm": 0.049540193290765706, + "learning_rate": 6.33668106655674e-06, + "loss": 0.8424, + "step": 1091 + }, + { + "epoch": 0.6389000621640399, + "grad_norm": 0.04361089778932384, + "learning_rate": 6.333890980320033e-06, + "loss": 0.8775, + "step": 1092 + }, + { + "epoch": 0.6394851354810399, + "grad_norm": 0.04952042923998371, + "learning_rate": 6.331099260325858e-06, + "loss": 0.7921, + "step": 1093 + }, + { + "epoch": 0.64007020879804, + "grad_norm": 0.08510241121137069, + "learning_rate": 6.32830590893267e-06, + "loss": 0.8449, + "step": 1094 + }, + { + "epoch": 0.6406552821150401, + "grad_norm": 0.052581634750892337, + "learning_rate": 6.325510928500298e-06, + "loss": 0.8504, + "step": 1095 + }, + { + "epoch": 0.64124035543204, + "grad_norm": 0.06971844661189186, + "learning_rate": 6.322714321389955e-06, + "loss": 0.8049, + "step": 1096 + }, + { + "epoch": 0.6418254287490401, + "grad_norm": 0.0425225993758099, + "learning_rate": 6.319916089964221e-06, + "loss": 0.7374, + "step": 1097 + }, + { + "epoch": 0.6424105020660401, + "grad_norm": 0.04070882005149654, + "learning_rate": 6.317116236587052e-06, + "loss": 0.747, + "step": 1098 + }, + { + "epoch": 0.6429955753830402, + "grad_norm": 0.04748840753374928, + "learning_rate": 6.314314763623775e-06, + "loss": 0.7728, + "step": 1099 + }, + { + "epoch": 0.6435806487000402, + "grad_norm": 0.08783073493232298, + "learning_rate": 6.31151167344108e-06, + "loss": 0.8031, + "step": 1100 + }, + { + "epoch": 0.6441657220170403, + "grad_norm": 0.12757918242470023, + "learning_rate": 6.308706968407029e-06, + "loss": 0.8275, + "step": 1101 + }, + { + "epoch": 0.6447507953340403, + "grad_norm": 0.04265066246559783, + "learning_rate": 6.305900650891045e-06, + "loss": 0.8143, + "step": 1102 + }, + { + "epoch": 0.6453358686510403, + "grad_norm": 0.05532213512913742, + "learning_rate": 6.303092723263917e-06, + "loss": 0.7623, + "step": 1103 + }, + { + "epoch": 0.6459209419680404, + "grad_norm": 0.05575276568946241, + "learning_rate": 6.300283187897788e-06, + "loss": 0.7578, + "step": 1104 + }, + { + "epoch": 0.6465060152850404, + "grad_norm": 0.04004488402472709, + "learning_rate": 6.297472047166164e-06, + "loss": 0.8287, + "step": 1105 + }, + { + "epoch": 0.6470910886020405, + "grad_norm": 0.07040462606702534, + "learning_rate": 6.294659303443907e-06, + "loss": 0.802, + "step": 1106 + }, + { + "epoch": 0.6476761619190404, + "grad_norm": 0.04991605035702622, + "learning_rate": 6.291844959107231e-06, + "loss": 0.78, + "step": 1107 + }, + { + "epoch": 0.6482612352360405, + "grad_norm": 0.04810590450360042, + "learning_rate": 6.289029016533705e-06, + "loss": 0.7531, + "step": 1108 + }, + { + "epoch": 0.6488463085530406, + "grad_norm": 0.04521072775635856, + "learning_rate": 6.286211478102243e-06, + "loss": 0.773, + "step": 1109 + }, + { + "epoch": 0.6494313818700406, + "grad_norm": 0.04141710815776832, + "learning_rate": 6.283392346193114e-06, + "loss": 0.813, + "step": 1110 + }, + { + "epoch": 0.6500164551870407, + "grad_norm": 0.043835473627139195, + "learning_rate": 6.280571623187929e-06, + "loss": 0.793, + "step": 1111 + }, + { + "epoch": 0.6506015285040406, + "grad_norm": 0.05669397811371989, + "learning_rate": 6.277749311469643e-06, + "loss": 0.7317, + "step": 1112 + }, + { + "epoch": 0.6511866018210407, + "grad_norm": 0.08646975004658393, + "learning_rate": 6.274925413422558e-06, + "loss": 0.8447, + "step": 1113 + }, + { + "epoch": 0.6517716751380407, + "grad_norm": 0.039267068048581634, + "learning_rate": 6.272099931432308e-06, + "loss": 0.8171, + "step": 1114 + }, + { + "epoch": 0.6523567484550408, + "grad_norm": 0.04626551882560527, + "learning_rate": 6.2692728678858705e-06, + "loss": 0.7719, + "step": 1115 + }, + { + "epoch": 0.6529418217720409, + "grad_norm": 0.08786584886983898, + "learning_rate": 6.26644422517156e-06, + "loss": 0.7863, + "step": 1116 + }, + { + "epoch": 0.6535268950890408, + "grad_norm": 0.07658703901409003, + "learning_rate": 6.26361400567902e-06, + "loss": 0.8893, + "step": 1117 + }, + { + "epoch": 0.6541119684060409, + "grad_norm": 0.03799609243525315, + "learning_rate": 6.2607822117992326e-06, + "loss": 0.7575, + "step": 1118 + }, + { + "epoch": 0.6546970417230409, + "grad_norm": 0.04335037636508533, + "learning_rate": 6.257948845924505e-06, + "loss": 0.7564, + "step": 1119 + }, + { + "epoch": 0.655282115040041, + "grad_norm": 0.04285395891669155, + "learning_rate": 6.2551139104484755e-06, + "loss": 0.8482, + "step": 1120 + }, + { + "epoch": 0.6558671883570409, + "grad_norm": 0.03461985415221269, + "learning_rate": 6.252277407766103e-06, + "loss": 0.7411, + "step": 1121 + }, + { + "epoch": 0.656452261674041, + "grad_norm": 0.06081410937727396, + "learning_rate": 6.249439340273679e-06, + "loss": 0.8753, + "step": 1122 + }, + { + "epoch": 0.6570373349910411, + "grad_norm": 0.08982630298329526, + "learning_rate": 6.246599710368809e-06, + "loss": 0.7514, + "step": 1123 + }, + { + "epoch": 0.6576224083080411, + "grad_norm": 0.04564349868186195, + "learning_rate": 6.243758520450423e-06, + "loss": 0.7989, + "step": 1124 + }, + { + "epoch": 0.6582074816250412, + "grad_norm": 0.043054553052619716, + "learning_rate": 6.240915772918768e-06, + "loss": 0.7454, + "step": 1125 + }, + { + "epoch": 0.6587925549420411, + "grad_norm": 0.03677134636684232, + "learning_rate": 6.238071470175405e-06, + "loss": 0.8528, + "step": 1126 + }, + { + "epoch": 0.6593776282590412, + "grad_norm": 0.04172483135818754, + "learning_rate": 6.235225614623212e-06, + "loss": 0.812, + "step": 1127 + }, + { + "epoch": 0.6599627015760412, + "grad_norm": 0.05277877562040755, + "learning_rate": 6.232378208666376e-06, + "loss": 0.8283, + "step": 1128 + }, + { + "epoch": 0.6605477748930413, + "grad_norm": 0.04454598931616706, + "learning_rate": 6.229529254710396e-06, + "loss": 0.8537, + "step": 1129 + }, + { + "epoch": 0.6611328482100414, + "grad_norm": 0.047900592019452154, + "learning_rate": 6.226678755162076e-06, + "loss": 0.825, + "step": 1130 + }, + { + "epoch": 0.6617179215270413, + "grad_norm": 0.05963183612092722, + "learning_rate": 6.223826712429529e-06, + "loss": 0.8042, + "step": 1131 + }, + { + "epoch": 0.6623029948440414, + "grad_norm": 0.046383510242271296, + "learning_rate": 6.220973128922168e-06, + "loss": 0.865, + "step": 1132 + }, + { + "epoch": 0.6628880681610414, + "grad_norm": 0.04417415078522508, + "learning_rate": 6.218118007050713e-06, + "loss": 0.8235, + "step": 1133 + }, + { + "epoch": 0.6634731414780415, + "grad_norm": 0.04207639503924024, + "learning_rate": 6.215261349227178e-06, + "loss": 0.7858, + "step": 1134 + }, + { + "epoch": 0.6640582147950415, + "grad_norm": 0.04649797867374506, + "learning_rate": 6.212403157864878e-06, + "loss": 0.868, + "step": 1135 + }, + { + "epoch": 0.6646432881120415, + "grad_norm": 0.09407948533657494, + "learning_rate": 6.209543435378422e-06, + "loss": 0.8818, + "step": 1136 + }, + { + "epoch": 0.6652283614290416, + "grad_norm": 0.037478109364168094, + "learning_rate": 6.206682184183712e-06, + "loss": 0.812, + "step": 1137 + }, + { + "epoch": 0.6658134347460416, + "grad_norm": 0.05288676232169846, + "learning_rate": 6.203819406697945e-06, + "loss": 0.7548, + "step": 1138 + }, + { + "epoch": 0.6663985080630417, + "grad_norm": 0.04535893610318677, + "learning_rate": 6.200955105339603e-06, + "loss": 0.8772, + "step": 1139 + }, + { + "epoch": 0.6669835813800417, + "grad_norm": 0.04773693536693857, + "learning_rate": 6.198089282528456e-06, + "loss": 0.7763, + "step": 1140 + }, + { + "epoch": 0.6675686546970417, + "grad_norm": 0.0518520434245037, + "learning_rate": 6.195221940685563e-06, + "loss": 0.7668, + "step": 1141 + }, + { + "epoch": 0.6681537280140417, + "grad_norm": 0.05462892968699057, + "learning_rate": 6.192353082233263e-06, + "loss": 0.7096, + "step": 1142 + }, + { + "epoch": 0.6687388013310418, + "grad_norm": 0.04122645056649732, + "learning_rate": 6.189482709595177e-06, + "loss": 0.7839, + "step": 1143 + }, + { + "epoch": 0.6693238746480419, + "grad_norm": 0.058891294040750164, + "learning_rate": 6.186610825196204e-06, + "loss": 0.7504, + "step": 1144 + }, + { + "epoch": 0.6699089479650419, + "grad_norm": 0.05296752593762354, + "learning_rate": 6.183737431462524e-06, + "loss": 0.7591, + "step": 1145 + }, + { + "epoch": 0.6704940212820419, + "grad_norm": 0.04766421657837364, + "learning_rate": 6.180862530821588e-06, + "loss": 0.742, + "step": 1146 + }, + { + "epoch": 0.6710790945990419, + "grad_norm": 0.0915137633830507, + "learning_rate": 6.177986125702121e-06, + "loss": 0.8167, + "step": 1147 + }, + { + "epoch": 0.671664167916042, + "grad_norm": 0.04064619759463224, + "learning_rate": 6.17510821853412e-06, + "loss": 0.755, + "step": 1148 + }, + { + "epoch": 0.672249241233042, + "grad_norm": 0.03962465637676519, + "learning_rate": 6.17222881174885e-06, + "loss": 0.7952, + "step": 1149 + }, + { + "epoch": 0.6728343145500421, + "grad_norm": 0.0475379390885668, + "learning_rate": 6.169347907778846e-06, + "loss": 0.7889, + "step": 1150 + }, + { + "epoch": 0.6734193878670421, + "grad_norm": 0.04260205050833479, + "learning_rate": 6.166465509057902e-06, + "loss": 0.7094, + "step": 1151 + }, + { + "epoch": 0.6740044611840421, + "grad_norm": 0.0452503260334743, + "learning_rate": 6.163581618021079e-06, + "loss": 0.8137, + "step": 1152 + }, + { + "epoch": 0.6745895345010422, + "grad_norm": 0.049045892950316486, + "learning_rate": 6.1606962371046975e-06, + "loss": 0.7476, + "step": 1153 + }, + { + "epoch": 0.6751746078180422, + "grad_norm": 0.1839546472224546, + "learning_rate": 6.157809368746337e-06, + "loss": 0.7341, + "step": 1154 + }, + { + "epoch": 0.6757596811350423, + "grad_norm": 0.04737636667405579, + "learning_rate": 6.154921015384833e-06, + "loss": 0.8772, + "step": 1155 + }, + { + "epoch": 0.6763447544520422, + "grad_norm": 0.039827421277940374, + "learning_rate": 6.152031179460276e-06, + "loss": 0.8184, + "step": 1156 + }, + { + "epoch": 0.6769298277690423, + "grad_norm": 0.03980204662807788, + "learning_rate": 6.14913986341401e-06, + "loss": 0.8504, + "step": 1157 + }, + { + "epoch": 0.6775149010860424, + "grad_norm": 0.06377067252825656, + "learning_rate": 6.146247069688627e-06, + "loss": 0.8496, + "step": 1158 + }, + { + "epoch": 0.6780999744030424, + "grad_norm": 0.5555285996614837, + "learning_rate": 6.14335280072797e-06, + "loss": 0.8131, + "step": 1159 + }, + { + "epoch": 0.6786850477200425, + "grad_norm": 0.03909895393006111, + "learning_rate": 6.140457058977125e-06, + "loss": 0.7756, + "step": 1160 + }, + { + "epoch": 0.6792701210370424, + "grad_norm": 0.03969052355186135, + "learning_rate": 6.137559846882426e-06, + "loss": 0.8209, + "step": 1161 + }, + { + "epoch": 0.6798551943540425, + "grad_norm": 0.04103151559227765, + "learning_rate": 6.134661166891445e-06, + "loss": 0.8015, + "step": 1162 + }, + { + "epoch": 0.6804402676710425, + "grad_norm": 0.05136493470713414, + "learning_rate": 6.131761021453e-06, + "loss": 0.8234, + "step": 1163 + }, + { + "epoch": 0.6810253409880426, + "grad_norm": 0.04189728013078323, + "learning_rate": 6.128859413017141e-06, + "loss": 0.7964, + "step": 1164 + }, + { + "epoch": 0.6816104143050427, + "grad_norm": 0.04337964675631273, + "learning_rate": 6.1259563440351564e-06, + "loss": 0.8434, + "step": 1165 + }, + { + "epoch": 0.6821954876220426, + "grad_norm": 0.09451825900326687, + "learning_rate": 6.123051816959569e-06, + "loss": 0.8254, + "step": 1166 + }, + { + "epoch": 0.6827805609390427, + "grad_norm": 0.045248262569469115, + "learning_rate": 6.120145834244133e-06, + "loss": 0.7613, + "step": 1167 + }, + { + "epoch": 0.6833656342560427, + "grad_norm": 0.03837960459677211, + "learning_rate": 6.117238398343831e-06, + "loss": 0.7958, + "step": 1168 + }, + { + "epoch": 0.6839507075730428, + "grad_norm": 0.05242659218061473, + "learning_rate": 6.114329511714876e-06, + "loss": 0.8356, + "step": 1169 + }, + { + "epoch": 0.6845357808900427, + "grad_norm": 0.06860429447590187, + "learning_rate": 6.111419176814704e-06, + "loss": 0.802, + "step": 1170 + }, + { + "epoch": 0.6851208542070428, + "grad_norm": 0.04508456559483142, + "learning_rate": 6.108507396101975e-06, + "loss": 0.8444, + "step": 1171 + }, + { + "epoch": 0.6857059275240428, + "grad_norm": 0.06393026794280061, + "learning_rate": 6.105594172036572e-06, + "loss": 0.7585, + "step": 1172 + }, + { + "epoch": 0.6862910008410429, + "grad_norm": 0.05435654372858688, + "learning_rate": 6.102679507079597e-06, + "loss": 0.9379, + "step": 1173 + }, + { + "epoch": 0.686876074158043, + "grad_norm": 0.057231278163684694, + "learning_rate": 6.099763403693366e-06, + "loss": 0.8431, + "step": 1174 + }, + { + "epoch": 0.6874611474750429, + "grad_norm": 0.0414940103750233, + "learning_rate": 6.096845864341415e-06, + "loss": 0.8247, + "step": 1175 + }, + { + "epoch": 0.688046220792043, + "grad_norm": 0.03753965623304419, + "learning_rate": 6.09392689148849e-06, + "loss": 0.7384, + "step": 1176 + }, + { + "epoch": 0.688631294109043, + "grad_norm": 0.0445855037853274, + "learning_rate": 6.09100648760055e-06, + "loss": 0.8913, + "step": 1177 + }, + { + "epoch": 0.6892163674260431, + "grad_norm": 0.03616019636034817, + "learning_rate": 6.08808465514476e-06, + "loss": 0.7741, + "step": 1178 + }, + { + "epoch": 0.6898014407430431, + "grad_norm": 0.03958351633767278, + "learning_rate": 6.085161396589493e-06, + "loss": 0.6991, + "step": 1179 + }, + { + "epoch": 0.6903865140600431, + "grad_norm": 0.03958973007993701, + "learning_rate": 6.082236714404331e-06, + "loss": 0.8455, + "step": 1180 + }, + { + "epoch": 0.6909715873770432, + "grad_norm": 0.04165937145938639, + "learning_rate": 6.079310611060052e-06, + "loss": 0.8072, + "step": 1181 + }, + { + "epoch": 0.6915566606940432, + "grad_norm": 0.045951396419511376, + "learning_rate": 6.07638308902864e-06, + "loss": 0.8313, + "step": 1182 + }, + { + "epoch": 0.6921417340110433, + "grad_norm": 0.04752344885301804, + "learning_rate": 6.073454150783274e-06, + "loss": 0.8802, + "step": 1183 + }, + { + "epoch": 0.6927268073280433, + "grad_norm": 0.05117696052721477, + "learning_rate": 6.070523798798329e-06, + "loss": 0.9472, + "step": 1184 + }, + { + "epoch": 0.6933118806450433, + "grad_norm": 0.040955259256951086, + "learning_rate": 6.06759203554938e-06, + "loss": 0.8126, + "step": 1185 + }, + { + "epoch": 0.6938969539620433, + "grad_norm": 0.034916342462867116, + "learning_rate": 6.064658863513186e-06, + "loss": 0.7091, + "step": 1186 + }, + { + "epoch": 0.6944820272790434, + "grad_norm": 0.042140933363091035, + "learning_rate": 6.061724285167704e-06, + "loss": 0.8323, + "step": 1187 + }, + { + "epoch": 0.6950671005960435, + "grad_norm": 0.04413794065710716, + "learning_rate": 6.058788302992072e-06, + "loss": 0.8419, + "step": 1188 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.042514536823805356, + "learning_rate": 6.055850919466621e-06, + "loss": 0.8863, + "step": 1189 + }, + { + "epoch": 0.6962372472300435, + "grad_norm": 0.04885967786149231, + "learning_rate": 6.05291213707286e-06, + "loss": 0.7658, + "step": 1190 + }, + { + "epoch": 0.6968223205470435, + "grad_norm": 0.04978541890148103, + "learning_rate": 6.0499719582934815e-06, + "loss": 0.7496, + "step": 1191 + }, + { + "epoch": 0.6974073938640436, + "grad_norm": 0.05231245753746865, + "learning_rate": 6.047030385612362e-06, + "loss": 0.784, + "step": 1192 + }, + { + "epoch": 0.6979924671810436, + "grad_norm": 0.04028385347084005, + "learning_rate": 6.0440874215145465e-06, + "loss": 0.6913, + "step": 1193 + }, + { + "epoch": 0.6985775404980437, + "grad_norm": 0.062297108276536735, + "learning_rate": 6.041143068486264e-06, + "loss": 0.7337, + "step": 1194 + }, + { + "epoch": 0.6991626138150437, + "grad_norm": 0.03928043528877265, + "learning_rate": 6.038197329014914e-06, + "loss": 0.8415, + "step": 1195 + }, + { + "epoch": 0.6997476871320437, + "grad_norm": 0.046537841954999465, + "learning_rate": 6.035250205589064e-06, + "loss": 0.7873, + "step": 1196 + }, + { + "epoch": 0.7003327604490438, + "grad_norm": 0.042222747639775425, + "learning_rate": 6.032301700698458e-06, + "loss": 0.7629, + "step": 1197 + }, + { + "epoch": 0.7009178337660438, + "grad_norm": 0.05930943404435324, + "learning_rate": 6.029351816833998e-06, + "loss": 0.8273, + "step": 1198 + }, + { + "epoch": 0.7015029070830439, + "grad_norm": 0.05874167359504905, + "learning_rate": 6.026400556487758e-06, + "loss": 0.7159, + "step": 1199 + }, + { + "epoch": 0.7020879804000438, + "grad_norm": 0.052410757851244154, + "learning_rate": 6.023447922152972e-06, + "loss": 0.7887, + "step": 1200 + }, + { + "epoch": 0.7026730537170439, + "grad_norm": 0.03785761302792468, + "learning_rate": 6.020493916324037e-06, + "loss": 0.7942, + "step": 1201 + }, + { + "epoch": 0.703258127034044, + "grad_norm": 0.0392431216621593, + "learning_rate": 6.017538541496503e-06, + "loss": 0.7885, + "step": 1202 + }, + { + "epoch": 0.703843200351044, + "grad_norm": 0.08892449717880838, + "learning_rate": 6.014581800167085e-06, + "loss": 0.8001, + "step": 1203 + }, + { + "epoch": 0.7044282736680441, + "grad_norm": 0.06747014549686459, + "learning_rate": 6.011623694833644e-06, + "loss": 0.727, + "step": 1204 + }, + { + "epoch": 0.705013346985044, + "grad_norm": 0.04543757611973289, + "learning_rate": 6.008664227995198e-06, + "loss": 0.8129, + "step": 1205 + }, + { + "epoch": 0.7055984203020441, + "grad_norm": 0.039104300131473785, + "learning_rate": 6.005703402151916e-06, + "loss": 0.8098, + "step": 1206 + }, + { + "epoch": 0.7061834936190441, + "grad_norm": 0.038265510416551914, + "learning_rate": 6.0027412198051114e-06, + "loss": 0.8279, + "step": 1207 + }, + { + "epoch": 0.7067685669360442, + "grad_norm": 0.03588239116909633, + "learning_rate": 5.999777683457247e-06, + "loss": 0.8096, + "step": 1208 + }, + { + "epoch": 0.7073536402530443, + "grad_norm": 0.03971284808692058, + "learning_rate": 5.996812795611928e-06, + "loss": 0.8331, + "step": 1209 + }, + { + "epoch": 0.7079387135700442, + "grad_norm": 0.03963917599976568, + "learning_rate": 5.9938465587739e-06, + "loss": 0.7908, + "step": 1210 + }, + { + "epoch": 0.7085237868870443, + "grad_norm": 0.045273875457725724, + "learning_rate": 5.990878975449051e-06, + "loss": 0.8111, + "step": 1211 + }, + { + "epoch": 0.7091088602040443, + "grad_norm": 0.04337467569823838, + "learning_rate": 5.9879100481444055e-06, + "loss": 0.8222, + "step": 1212 + }, + { + "epoch": 0.7096939335210444, + "grad_norm": 0.039603085337812066, + "learning_rate": 5.984939779368122e-06, + "loss": 0.7721, + "step": 1213 + }, + { + "epoch": 0.7102790068380443, + "grad_norm": 0.04156772105291293, + "learning_rate": 5.981968171629494e-06, + "loss": 0.8197, + "step": 1214 + }, + { + "epoch": 0.7108640801550444, + "grad_norm": 0.04856521644044076, + "learning_rate": 5.978995227438944e-06, + "loss": 0.8677, + "step": 1215 + }, + { + "epoch": 0.7114491534720445, + "grad_norm": 0.03956445076057253, + "learning_rate": 5.976020949308027e-06, + "loss": 0.7877, + "step": 1216 + }, + { + "epoch": 0.7120342267890445, + "grad_norm": 0.04862552856496891, + "learning_rate": 5.973045339749422e-06, + "loss": 0.8109, + "step": 1217 + }, + { + "epoch": 0.7126193001060446, + "grad_norm": 0.045295501112575506, + "learning_rate": 5.970068401276935e-06, + "loss": 0.791, + "step": 1218 + }, + { + "epoch": 0.7132043734230445, + "grad_norm": 0.03916519269752596, + "learning_rate": 5.967090136405491e-06, + "loss": 0.8131, + "step": 1219 + }, + { + "epoch": 0.7137894467400446, + "grad_norm": 0.038383406339616045, + "learning_rate": 5.96411054765114e-06, + "loss": 0.8102, + "step": 1220 + }, + { + "epoch": 0.7143745200570446, + "grad_norm": 0.06824384503876316, + "learning_rate": 5.961129637531047e-06, + "loss": 0.8104, + "step": 1221 + }, + { + "epoch": 0.7149595933740447, + "grad_norm": 0.07316382930858244, + "learning_rate": 5.958147408563497e-06, + "loss": 0.7862, + "step": 1222 + }, + { + "epoch": 0.7155446666910448, + "grad_norm": 0.04737080928686243, + "learning_rate": 5.9551638632678835e-06, + "loss": 0.8513, + "step": 1223 + }, + { + "epoch": 0.7161297400080447, + "grad_norm": 0.0374937969952467, + "learning_rate": 5.952179004164718e-06, + "loss": 0.8002, + "step": 1224 + }, + { + "epoch": 0.7167148133250448, + "grad_norm": 0.04053689173262939, + "learning_rate": 5.949192833775618e-06, + "loss": 0.7657, + "step": 1225 + }, + { + "epoch": 0.7172998866420448, + "grad_norm": 0.0398778756693423, + "learning_rate": 5.946205354623312e-06, + "loss": 0.7725, + "step": 1226 + }, + { + "epoch": 0.7178849599590449, + "grad_norm": 0.04506339056501446, + "learning_rate": 5.943216569231629e-06, + "loss": 0.7931, + "step": 1227 + }, + { + "epoch": 0.7184700332760449, + "grad_norm": 0.04982830890931998, + "learning_rate": 5.940226480125508e-06, + "loss": 0.8265, + "step": 1228 + }, + { + "epoch": 0.719055106593045, + "grad_norm": 0.0397312539469508, + "learning_rate": 5.937235089830984e-06, + "loss": 0.769, + "step": 1229 + }, + { + "epoch": 0.719640179910045, + "grad_norm": 0.03837231212563197, + "learning_rate": 5.934242400875195e-06, + "loss": 0.8259, + "step": 1230 + }, + { + "epoch": 0.720225253227045, + "grad_norm": 0.03612472247568365, + "learning_rate": 5.931248415786371e-06, + "loss": 0.7637, + "step": 1231 + }, + { + "epoch": 0.7208103265440451, + "grad_norm": 0.03537723098243959, + "learning_rate": 5.928253137093844e-06, + "loss": 0.738, + "step": 1232 + }, + { + "epoch": 0.7213953998610451, + "grad_norm": 0.0345658269866166, + "learning_rate": 5.925256567328036e-06, + "loss": 0.7422, + "step": 1233 + }, + { + "epoch": 0.7219804731780451, + "grad_norm": 0.041550218114552985, + "learning_rate": 5.922258709020456e-06, + "loss": 0.8158, + "step": 1234 + }, + { + "epoch": 0.7225655464950451, + "grad_norm": 0.04738915180347003, + "learning_rate": 5.919259564703705e-06, + "loss": 0.7693, + "step": 1235 + }, + { + "epoch": 0.7231506198120452, + "grad_norm": 0.22798373109574022, + "learning_rate": 5.916259136911472e-06, + "loss": 0.7964, + "step": 1236 + }, + { + "epoch": 0.7237356931290453, + "grad_norm": 0.03960961306940275, + "learning_rate": 5.913257428178526e-06, + "loss": 0.8529, + "step": 1237 + }, + { + "epoch": 0.7243207664460453, + "grad_norm": 0.03762982041843176, + "learning_rate": 5.910254441040723e-06, + "loss": 0.7978, + "step": 1238 + }, + { + "epoch": 0.7249058397630453, + "grad_norm": 0.03967585434913458, + "learning_rate": 5.907250178034994e-06, + "loss": 0.7715, + "step": 1239 + }, + { + "epoch": 0.7254909130800453, + "grad_norm": 0.035499686476307656, + "learning_rate": 5.904244641699352e-06, + "loss": 0.7821, + "step": 1240 + }, + { + "epoch": 0.7260759863970454, + "grad_norm": 0.04146813756606358, + "learning_rate": 5.9012378345728824e-06, + "loss": 0.7832, + "step": 1241 + }, + { + "epoch": 0.7266610597140454, + "grad_norm": 0.035835229174829814, + "learning_rate": 5.8982297591957465e-06, + "loss": 0.7951, + "step": 1242 + }, + { + "epoch": 0.7272461330310455, + "grad_norm": 0.040752453651612, + "learning_rate": 5.8952204181091775e-06, + "loss": 0.7898, + "step": 1243 + }, + { + "epoch": 0.7278312063480454, + "grad_norm": 0.04254122433979048, + "learning_rate": 5.8922098138554745e-06, + "loss": 0.8384, + "step": 1244 + }, + { + "epoch": 0.7284162796650455, + "grad_norm": 0.048324438293500575, + "learning_rate": 5.889197948978008e-06, + "loss": 0.7273, + "step": 1245 + }, + { + "epoch": 0.7290013529820456, + "grad_norm": 0.04876106635437417, + "learning_rate": 5.886184826021208e-06, + "loss": 0.7078, + "step": 1246 + }, + { + "epoch": 0.7295864262990456, + "grad_norm": 0.047909628199116906, + "learning_rate": 5.883170447530575e-06, + "loss": 0.7293, + "step": 1247 + }, + { + "epoch": 0.7301714996160457, + "grad_norm": 0.04759781973779755, + "learning_rate": 5.880154816052666e-06, + "loss": 0.8114, + "step": 1248 + }, + { + "epoch": 0.7307565729330456, + "grad_norm": 0.03579815989872979, + "learning_rate": 5.8771379341350905e-06, + "loss": 0.7843, + "step": 1249 + }, + { + "epoch": 0.7313416462500457, + "grad_norm": 0.048181696474966534, + "learning_rate": 5.874119804326525e-06, + "loss": 0.85, + "step": 1250 + }, + { + "epoch": 0.7319267195670457, + "grad_norm": 0.040629567336692485, + "learning_rate": 5.871100429176694e-06, + "loss": 0.8121, + "step": 1251 + }, + { + "epoch": 0.7325117928840458, + "grad_norm": 0.039483264479736525, + "learning_rate": 5.8680798112363784e-06, + "loss": 0.8365, + "step": 1252 + }, + { + "epoch": 0.7330968662010459, + "grad_norm": 0.05089919878146447, + "learning_rate": 5.865057953057401e-06, + "loss": 0.8076, + "step": 1253 + }, + { + "epoch": 0.7336819395180458, + "grad_norm": 0.035397442355980586, + "learning_rate": 5.862034857192642e-06, + "loss": 0.7798, + "step": 1254 + }, + { + "epoch": 0.7342670128350459, + "grad_norm": 0.03687328400855783, + "learning_rate": 5.859010526196021e-06, + "loss": 0.7859, + "step": 1255 + }, + { + "epoch": 0.7348520861520459, + "grad_norm": 0.06502490332457671, + "learning_rate": 5.855984962622504e-06, + "loss": 0.7927, + "step": 1256 + }, + { + "epoch": 0.735437159469046, + "grad_norm": 0.04157588146277028, + "learning_rate": 5.852958169028094e-06, + "loss": 0.7907, + "step": 1257 + }, + { + "epoch": 0.736022232786046, + "grad_norm": 0.04254658384010035, + "learning_rate": 5.849930147969839e-06, + "loss": 0.7903, + "step": 1258 + }, + { + "epoch": 0.736607306103046, + "grad_norm": 0.04207256800472794, + "learning_rate": 5.846900902005822e-06, + "loss": 0.7673, + "step": 1259 + }, + { + "epoch": 0.7371923794200461, + "grad_norm": 0.03928632402273387, + "learning_rate": 5.843870433695156e-06, + "loss": 0.7548, + "step": 1260 + }, + { + "epoch": 0.7377774527370461, + "grad_norm": 0.039954700127276935, + "learning_rate": 5.8408387455979946e-06, + "loss": 0.7986, + "step": 1261 + }, + { + "epoch": 0.7383625260540462, + "grad_norm": 0.03783285103775797, + "learning_rate": 5.837805840275515e-06, + "loss": 0.8107, + "step": 1262 + }, + { + "epoch": 0.7389475993710461, + "grad_norm": 0.040841957568584206, + "learning_rate": 5.834771720289929e-06, + "loss": 0.8005, + "step": 1263 + }, + { + "epoch": 0.7395326726880462, + "grad_norm": 0.044981013127008924, + "learning_rate": 5.831736388204467e-06, + "loss": 0.7682, + "step": 1264 + }, + { + "epoch": 0.7401177460050462, + "grad_norm": 0.038332859477558344, + "learning_rate": 5.828699846583389e-06, + "loss": 0.7548, + "step": 1265 + }, + { + "epoch": 0.7407028193220463, + "grad_norm": 0.04385966631432292, + "learning_rate": 5.825662097991978e-06, + "loss": 0.8131, + "step": 1266 + }, + { + "epoch": 0.7412878926390464, + "grad_norm": 0.038909792748055394, + "learning_rate": 5.82262314499653e-06, + "loss": 0.8741, + "step": 1267 + }, + { + "epoch": 0.7418729659560463, + "grad_norm": 0.05612505051259805, + "learning_rate": 5.8195829901643655e-06, + "loss": 0.8022, + "step": 1268 + }, + { + "epoch": 0.7424580392730464, + "grad_norm": 0.041230289913411614, + "learning_rate": 5.816541636063816e-06, + "loss": 0.7588, + "step": 1269 + }, + { + "epoch": 0.7430431125900464, + "grad_norm": 0.053472936644260796, + "learning_rate": 5.813499085264229e-06, + "loss": 0.733, + "step": 1270 + }, + { + "epoch": 0.7436281859070465, + "grad_norm": 0.05949960419609776, + "learning_rate": 5.8104553403359586e-06, + "loss": 0.8247, + "step": 1271 + }, + { + "epoch": 0.7442132592240465, + "grad_norm": 0.04317467554051099, + "learning_rate": 5.807410403850371e-06, + "loss": 0.8015, + "step": 1272 + }, + { + "epoch": 0.7447983325410465, + "grad_norm": 0.040008771003069384, + "learning_rate": 5.804364278379842e-06, + "loss": 0.7975, + "step": 1273 + }, + { + "epoch": 0.7453834058580466, + "grad_norm": 0.039926332581406196, + "learning_rate": 5.801316966497744e-06, + "loss": 0.7404, + "step": 1274 + }, + { + "epoch": 0.7459684791750466, + "grad_norm": 0.03922271741776474, + "learning_rate": 5.798268470778461e-06, + "loss": 0.8157, + "step": 1275 + }, + { + "epoch": 0.7465535524920467, + "grad_norm": 0.03804854554685112, + "learning_rate": 5.795218793797367e-06, + "loss": 0.6987, + "step": 1276 + }, + { + "epoch": 0.7471386258090467, + "grad_norm": 0.03876336199264868, + "learning_rate": 5.792167938130842e-06, + "loss": 0.8456, + "step": 1277 + }, + { + "epoch": 0.7477236991260467, + "grad_norm": 0.03771896812051095, + "learning_rate": 5.78911590635626e-06, + "loss": 0.726, + "step": 1278 + }, + { + "epoch": 0.7483087724430467, + "grad_norm": 0.036735705010982965, + "learning_rate": 5.786062701051983e-06, + "loss": 0.9274, + "step": 1279 + }, + { + "epoch": 0.7488938457600468, + "grad_norm": 0.03943361612412777, + "learning_rate": 5.783008324797375e-06, + "loss": 0.7146, + "step": 1280 + }, + { + "epoch": 0.7494789190770469, + "grad_norm": 0.03647462654976768, + "learning_rate": 5.779952780172777e-06, + "loss": 0.7589, + "step": 1281 + }, + { + "epoch": 0.7500639923940469, + "grad_norm": 0.03462458239120716, + "learning_rate": 5.776896069759528e-06, + "loss": 0.7397, + "step": 1282 + }, + { + "epoch": 0.750649065711047, + "grad_norm": 0.03495722370436699, + "learning_rate": 5.773838196139946e-06, + "loss": 0.6993, + "step": 1283 + }, + { + "epoch": 0.7512341390280469, + "grad_norm": 0.05321435758446983, + "learning_rate": 5.770779161897329e-06, + "loss": 0.8397, + "step": 1284 + }, + { + "epoch": 0.751819212345047, + "grad_norm": 0.07760562554265127, + "learning_rate": 5.767718969615965e-06, + "loss": 0.8, + "step": 1285 + }, + { + "epoch": 0.752404285662047, + "grad_norm": 0.044871732227994655, + "learning_rate": 5.7646576218811095e-06, + "loss": 0.7373, + "step": 1286 + }, + { + "epoch": 0.7529893589790471, + "grad_norm": 0.040559575159430344, + "learning_rate": 5.761595121279002e-06, + "loss": 0.8097, + "step": 1287 + }, + { + "epoch": 0.7535744322960471, + "grad_norm": 0.03837125490071528, + "learning_rate": 5.758531470396853e-06, + "loss": 0.8207, + "step": 1288 + }, + { + "epoch": 0.7541595056130471, + "grad_norm": 0.0417380184844728, + "learning_rate": 5.7554666718228435e-06, + "loss": 0.7322, + "step": 1289 + }, + { + "epoch": 0.7547445789300472, + "grad_norm": 0.04177068987632399, + "learning_rate": 5.752400728146127e-06, + "loss": 0.7949, + "step": 1290 + }, + { + "epoch": 0.7553296522470472, + "grad_norm": 0.03750486470453801, + "learning_rate": 5.74933364195682e-06, + "loss": 0.713, + "step": 1291 + }, + { + "epoch": 0.7559147255640473, + "grad_norm": 0.04456432301299169, + "learning_rate": 5.7462654158460105e-06, + "loss": 0.8051, + "step": 1292 + }, + { + "epoch": 0.7564997988810472, + "grad_norm": 0.03721091596694818, + "learning_rate": 5.743196052405745e-06, + "loss": 0.8427, + "step": 1293 + }, + { + "epoch": 0.7570848721980473, + "grad_norm": 0.04755935327200047, + "learning_rate": 5.74012555422903e-06, + "loss": 0.765, + "step": 1294 + }, + { + "epoch": 0.7576699455150474, + "grad_norm": 0.038790617615815835, + "learning_rate": 5.737053923909832e-06, + "loss": 0.7368, + "step": 1295 + }, + { + "epoch": 0.7582550188320474, + "grad_norm": 0.03835922133019433, + "learning_rate": 5.7339811640430805e-06, + "loss": 0.738, + "step": 1296 + }, + { + "epoch": 0.7588400921490475, + "grad_norm": 0.04054793146664343, + "learning_rate": 5.730907277224645e-06, + "loss": 0.8177, + "step": 1297 + }, + { + "epoch": 0.7594251654660474, + "grad_norm": 0.04247379928605068, + "learning_rate": 5.727832266051361e-06, + "loss": 0.8173, + "step": 1298 + }, + { + "epoch": 0.7600102387830475, + "grad_norm": 0.0445025614285575, + "learning_rate": 5.724756133121007e-06, + "loss": 0.8927, + "step": 1299 + }, + { + "epoch": 0.7605953121000475, + "grad_norm": 0.0926458972504, + "learning_rate": 5.721678881032307e-06, + "loss": 0.9005, + "step": 1300 + }, + { + "epoch": 0.7611803854170476, + "grad_norm": 0.03742317171404953, + "learning_rate": 5.718600512384938e-06, + "loss": 0.8046, + "step": 1301 + }, + { + "epoch": 0.7617654587340477, + "grad_norm": 0.03916116385336752, + "learning_rate": 5.715521029779511e-06, + "loss": 0.8223, + "step": 1302 + }, + { + "epoch": 0.7623505320510476, + "grad_norm": 0.05168394725564997, + "learning_rate": 5.71244043581759e-06, + "loss": 0.8097, + "step": 1303 + }, + { + "epoch": 0.7629356053680477, + "grad_norm": 0.04369686184899199, + "learning_rate": 5.709358733101665e-06, + "loss": 0.7624, + "step": 1304 + }, + { + "epoch": 0.7635206786850477, + "grad_norm": 0.052165784587932, + "learning_rate": 5.7062759242351705e-06, + "loss": 0.8156, + "step": 1305 + }, + { + "epoch": 0.7641057520020478, + "grad_norm": 0.04082442609405472, + "learning_rate": 5.703192011822474e-06, + "loss": 0.8779, + "step": 1306 + }, + { + "epoch": 0.7646908253190478, + "grad_norm": 0.0416953556866423, + "learning_rate": 5.700106998468874e-06, + "loss": 0.8806, + "step": 1307 + }, + { + "epoch": 0.7652758986360478, + "grad_norm": 0.03820299209890676, + "learning_rate": 5.697020886780601e-06, + "loss": 0.7473, + "step": 1308 + }, + { + "epoch": 0.7658609719530479, + "grad_norm": 0.036763238071425314, + "learning_rate": 5.693933679364812e-06, + "loss": 0.7509, + "step": 1309 + }, + { + "epoch": 0.7664460452700479, + "grad_norm": 0.03779190565018098, + "learning_rate": 5.690845378829589e-06, + "loss": 0.8345, + "step": 1310 + }, + { + "epoch": 0.767031118587048, + "grad_norm": 0.03632799411852302, + "learning_rate": 5.68775598778394e-06, + "loss": 0.7592, + "step": 1311 + }, + { + "epoch": 0.767616191904048, + "grad_norm": 0.04011020299884528, + "learning_rate": 5.684665508837792e-06, + "loss": 0.7438, + "step": 1312 + }, + { + "epoch": 0.768201265221048, + "grad_norm": 0.03991606487653607, + "learning_rate": 5.681573944601991e-06, + "loss": 0.8176, + "step": 1313 + }, + { + "epoch": 0.768786338538048, + "grad_norm": 0.03597560209452427, + "learning_rate": 5.678481297688301e-06, + "loss": 0.7547, + "step": 1314 + }, + { + "epoch": 0.7693714118550481, + "grad_norm": 0.04169549636775327, + "learning_rate": 5.675387570709403e-06, + "loss": 0.8889, + "step": 1315 + }, + { + "epoch": 0.7699564851720482, + "grad_norm": 0.04170869376840914, + "learning_rate": 5.672292766278882e-06, + "loss": 0.8606, + "step": 1316 + }, + { + "epoch": 0.7705415584890482, + "grad_norm": 0.044938956861301685, + "learning_rate": 5.669196887011245e-06, + "loss": 0.8402, + "step": 1317 + }, + { + "epoch": 0.7711266318060482, + "grad_norm": 0.035153707331906876, + "learning_rate": 5.666099935521897e-06, + "loss": 0.7729, + "step": 1318 + }, + { + "epoch": 0.7717117051230482, + "grad_norm": 0.06118327685992558, + "learning_rate": 5.663001914427155e-06, + "loss": 0.7555, + "step": 1319 + }, + { + "epoch": 0.7722967784400483, + "grad_norm": 0.03968655862023267, + "learning_rate": 5.659902826344237e-06, + "loss": 0.7255, + "step": 1320 + }, + { + "epoch": 0.7728818517570483, + "grad_norm": 0.04086333995138192, + "learning_rate": 5.656802673891261e-06, + "loss": 0.7239, + "step": 1321 + }, + { + "epoch": 0.7734669250740484, + "grad_norm": 0.04463158217642648, + "learning_rate": 5.6537014596872496e-06, + "loss": 0.7722, + "step": 1322 + }, + { + "epoch": 0.7740519983910483, + "grad_norm": 0.06012694877055056, + "learning_rate": 5.650599186352116e-06, + "loss": 0.8534, + "step": 1323 + }, + { + "epoch": 0.7746370717080484, + "grad_norm": 0.03696498183729998, + "learning_rate": 5.6474958565066705e-06, + "loss": 0.8262, + "step": 1324 + }, + { + "epoch": 0.7752221450250485, + "grad_norm": 0.0413037337153161, + "learning_rate": 5.644391472772617e-06, + "loss": 0.7777, + "step": 1325 + }, + { + "epoch": 0.7758072183420485, + "grad_norm": 0.03916694037724885, + "learning_rate": 5.641286037772551e-06, + "loss": 0.7261, + "step": 1326 + }, + { + "epoch": 0.7763922916590486, + "grad_norm": 0.042884361024367525, + "learning_rate": 5.638179554129953e-06, + "loss": 0.8306, + "step": 1327 + }, + { + "epoch": 0.7769773649760485, + "grad_norm": 0.04318783701390697, + "learning_rate": 5.635072024469189e-06, + "loss": 0.7502, + "step": 1328 + }, + { + "epoch": 0.7775624382930486, + "grad_norm": 0.04238539765681867, + "learning_rate": 5.631963451415512e-06, + "loss": 0.8651, + "step": 1329 + }, + { + "epoch": 0.7781475116100486, + "grad_norm": 0.0606324772137342, + "learning_rate": 5.628853837595055e-06, + "loss": 0.8407, + "step": 1330 + }, + { + "epoch": 0.7787325849270487, + "grad_norm": 0.03973726867422112, + "learning_rate": 5.62574318563483e-06, + "loss": 0.7456, + "step": 1331 + }, + { + "epoch": 0.7793176582440487, + "grad_norm": 0.04553233971802566, + "learning_rate": 5.622631498162723e-06, + "loss": 0.7778, + "step": 1332 + }, + { + "epoch": 0.7799027315610487, + "grad_norm": 0.045327548192169206, + "learning_rate": 5.619518777807502e-06, + "loss": 0.7996, + "step": 1333 + }, + { + "epoch": 0.7804878048780488, + "grad_norm": 0.05867372409357833, + "learning_rate": 5.616405027198802e-06, + "loss": 0.8378, + "step": 1334 + }, + { + "epoch": 0.7810728781950488, + "grad_norm": 0.03626782443560853, + "learning_rate": 5.613290248967128e-06, + "loss": 0.7915, + "step": 1335 + }, + { + "epoch": 0.7816579515120489, + "grad_norm": 0.03991616366449104, + "learning_rate": 5.610174445743857e-06, + "loss": 0.7521, + "step": 1336 + }, + { + "epoch": 0.7822430248290488, + "grad_norm": 0.03644002616250411, + "learning_rate": 5.6070576201612286e-06, + "loss": 0.8306, + "step": 1337 + }, + { + "epoch": 0.7828280981460489, + "grad_norm": 0.03526533642183784, + "learning_rate": 5.603939774852349e-06, + "loss": 0.7778, + "step": 1338 + }, + { + "epoch": 0.783413171463049, + "grad_norm": 0.05592830536988608, + "learning_rate": 5.6008209124511825e-06, + "loss": 0.804, + "step": 1339 + }, + { + "epoch": 0.783998244780049, + "grad_norm": 0.04740521258730855, + "learning_rate": 5.597701035592555e-06, + "loss": 0.8071, + "step": 1340 + }, + { + "epoch": 0.7845833180970491, + "grad_norm": 0.22162476710778273, + "learning_rate": 5.59458014691215e-06, + "loss": 0.8066, + "step": 1341 + }, + { + "epoch": 0.785168391414049, + "grad_norm": 0.09169014540583856, + "learning_rate": 5.5914582490465015e-06, + "loss": 0.8464, + "step": 1342 + }, + { + "epoch": 0.7857534647310491, + "grad_norm": 0.039007767079512076, + "learning_rate": 5.588335344633001e-06, + "loss": 0.779, + "step": 1343 + }, + { + "epoch": 0.7863385380480491, + "grad_norm": 0.04485954456072006, + "learning_rate": 5.585211436309887e-06, + "loss": 0.777, + "step": 1344 + }, + { + "epoch": 0.7869236113650492, + "grad_norm": 0.039288691984984164, + "learning_rate": 5.58208652671625e-06, + "loss": 0.8205, + "step": 1345 + }, + { + "epoch": 0.7875086846820493, + "grad_norm": 0.04480005712178459, + "learning_rate": 5.578960618492022e-06, + "loss": 0.8083, + "step": 1346 + }, + { + "epoch": 0.7880937579990492, + "grad_norm": 0.03846704640576893, + "learning_rate": 5.575833714277983e-06, + "loss": 0.7769, + "step": 1347 + }, + { + "epoch": 0.7886788313160493, + "grad_norm": 0.0414560185138673, + "learning_rate": 5.572705816715748e-06, + "loss": 0.8654, + "step": 1348 + }, + { + "epoch": 0.7892639046330493, + "grad_norm": 0.0501606429389028, + "learning_rate": 5.569576928447778e-06, + "loss": 0.7904, + "step": 1349 + }, + { + "epoch": 0.7898489779500494, + "grad_norm": 0.041891454878211096, + "learning_rate": 5.566447052117368e-06, + "loss": 0.7469, + "step": 1350 + }, + { + "epoch": 0.7904340512670494, + "grad_norm": 0.04101885937248216, + "learning_rate": 5.563316190368648e-06, + "loss": 0.759, + "step": 1351 + }, + { + "epoch": 0.7910191245840494, + "grad_norm": 0.036683416502670804, + "learning_rate": 5.560184345846577e-06, + "loss": 0.8344, + "step": 1352 + }, + { + "epoch": 0.7916041979010495, + "grad_norm": 0.045836001062305856, + "learning_rate": 5.557051521196952e-06, + "loss": 0.8326, + "step": 1353 + }, + { + "epoch": 0.7921892712180495, + "grad_norm": 0.0397925205994133, + "learning_rate": 5.553917719066391e-06, + "loss": 0.8104, + "step": 1354 + }, + { + "epoch": 0.7927743445350496, + "grad_norm": 0.041024359015282515, + "learning_rate": 5.5507829421023415e-06, + "loss": 0.8124, + "step": 1355 + }, + { + "epoch": 0.7933594178520496, + "grad_norm": 0.03779172917164305, + "learning_rate": 5.547647192953073e-06, + "loss": 0.788, + "step": 1356 + }, + { + "epoch": 0.7939444911690496, + "grad_norm": 0.03788055346324226, + "learning_rate": 5.544510474267677e-06, + "loss": 0.7715, + "step": 1357 + }, + { + "epoch": 0.7945295644860496, + "grad_norm": 0.03853302326944159, + "learning_rate": 5.541372788696063e-06, + "loss": 0.7159, + "step": 1358 + }, + { + "epoch": 0.7951146378030497, + "grad_norm": 0.03754302624204286, + "learning_rate": 5.538234138888959e-06, + "loss": 0.8184, + "step": 1359 + }, + { + "epoch": 0.7956997111200498, + "grad_norm": 0.0394348592493888, + "learning_rate": 5.535094527497905e-06, + "loss": 0.8367, + "step": 1360 + }, + { + "epoch": 0.7962847844370498, + "grad_norm": 0.03906040463168052, + "learning_rate": 5.531953957175258e-06, + "loss": 0.7447, + "step": 1361 + }, + { + "epoch": 0.7968698577540498, + "grad_norm": 0.03608720224204406, + "learning_rate": 5.528812430574179e-06, + "loss": 0.7342, + "step": 1362 + }, + { + "epoch": 0.7974549310710498, + "grad_norm": 0.04229914698861472, + "learning_rate": 5.525669950348644e-06, + "loss": 0.7747, + "step": 1363 + }, + { + "epoch": 0.7980400043880499, + "grad_norm": 0.037880293992017665, + "learning_rate": 5.522526519153425e-06, + "loss": 0.806, + "step": 1364 + }, + { + "epoch": 0.7986250777050499, + "grad_norm": 0.03984285750551966, + "learning_rate": 5.519382139644108e-06, + "loss": 0.7931, + "step": 1365 + }, + { + "epoch": 0.79921015102205, + "grad_norm": 0.08742568964292106, + "learning_rate": 5.516236814477073e-06, + "loss": 0.783, + "step": 1366 + }, + { + "epoch": 0.79979522433905, + "grad_norm": 0.05924902432803569, + "learning_rate": 5.513090546309501e-06, + "loss": 0.8574, + "step": 1367 + }, + { + "epoch": 0.80038029765605, + "grad_norm": 0.03846930309455261, + "learning_rate": 5.50994333779937e-06, + "loss": 0.7829, + "step": 1368 + }, + { + "epoch": 0.8009653709730501, + "grad_norm": 0.03668377181993435, + "learning_rate": 5.506795191605454e-06, + "loss": 0.7873, + "step": 1369 + }, + { + "epoch": 0.8015504442900501, + "grad_norm": 0.04173707671452499, + "learning_rate": 5.503646110387313e-06, + "loss": 0.8333, + "step": 1370 + }, + { + "epoch": 0.8021355176070502, + "grad_norm": 0.03842538857985326, + "learning_rate": 5.500496096805305e-06, + "loss": 0.6669, + "step": 1371 + }, + { + "epoch": 0.8027205909240501, + "grad_norm": 0.045318530527092046, + "learning_rate": 5.49734515352057e-06, + "loss": 0.7893, + "step": 1372 + }, + { + "epoch": 0.8033056642410502, + "grad_norm": 0.042283011239155184, + "learning_rate": 5.494193283195036e-06, + "loss": 0.7635, + "step": 1373 + }, + { + "epoch": 0.8038907375580503, + "grad_norm": 0.042565824937185165, + "learning_rate": 5.491040488491412e-06, + "loss": 0.7387, + "step": 1374 + }, + { + "epoch": 0.8044758108750503, + "grad_norm": 0.040462074923080384, + "learning_rate": 5.487886772073193e-06, + "loss": 0.7739, + "step": 1375 + }, + { + "epoch": 0.8050608841920504, + "grad_norm": 0.03792367430136744, + "learning_rate": 5.484732136604644e-06, + "loss": 0.7566, + "step": 1376 + }, + { + "epoch": 0.8056459575090503, + "grad_norm": 0.03692580293915857, + "learning_rate": 5.481576584750816e-06, + "loss": 0.8481, + "step": 1377 + }, + { + "epoch": 0.8062310308260504, + "grad_norm": 0.046451787891487024, + "learning_rate": 5.478420119177527e-06, + "loss": 0.8445, + "step": 1378 + }, + { + "epoch": 0.8068161041430504, + "grad_norm": 0.04084326621384236, + "learning_rate": 5.47526274255137e-06, + "loss": 0.7751, + "step": 1379 + }, + { + "epoch": 0.8074011774600505, + "grad_norm": 0.04059410095244892, + "learning_rate": 5.4721044575397074e-06, + "loss": 0.8631, + "step": 1380 + }, + { + "epoch": 0.8079862507770506, + "grad_norm": 0.04272865593875444, + "learning_rate": 5.468945266810668e-06, + "loss": 0.7363, + "step": 1381 + }, + { + "epoch": 0.8085713240940505, + "grad_norm": 0.040838508223671875, + "learning_rate": 5.4657851730331475e-06, + "loss": 0.7909, + "step": 1382 + }, + { + "epoch": 0.8091563974110506, + "grad_norm": 0.03888480720535378, + "learning_rate": 5.462624178876804e-06, + "loss": 0.7638, + "step": 1383 + }, + { + "epoch": 0.8097414707280506, + "grad_norm": 0.04210282976595793, + "learning_rate": 5.459462287012054e-06, + "loss": 0.7131, + "step": 1384 + }, + { + "epoch": 0.8103265440450507, + "grad_norm": 0.041245050616825804, + "learning_rate": 5.456299500110075e-06, + "loss": 0.798, + "step": 1385 + }, + { + "epoch": 0.8109116173620506, + "grad_norm": 0.03765154790140343, + "learning_rate": 5.453135820842801e-06, + "loss": 0.7585, + "step": 1386 + }, + { + "epoch": 0.8114966906790507, + "grad_norm": 0.04000694111877368, + "learning_rate": 5.449971251882917e-06, + "loss": 0.7911, + "step": 1387 + }, + { + "epoch": 0.8120817639960508, + "grad_norm": 0.03476195859648224, + "learning_rate": 5.4468057959038604e-06, + "loss": 0.761, + "step": 1388 + }, + { + "epoch": 0.8126668373130508, + "grad_norm": 0.03902654854791974, + "learning_rate": 5.4436394555798215e-06, + "loss": 0.7841, + "step": 1389 + }, + { + "epoch": 0.8132519106300509, + "grad_norm": 0.04263837806916918, + "learning_rate": 5.4404722335857345e-06, + "loss": 0.8001, + "step": 1390 + }, + { + "epoch": 0.8138369839470508, + "grad_norm": 0.04109922825698143, + "learning_rate": 5.437304132597278e-06, + "loss": 0.7333, + "step": 1391 + }, + { + "epoch": 0.8144220572640509, + "grad_norm": 0.06739804898618297, + "learning_rate": 5.4341351552908744e-06, + "loss": 0.7495, + "step": 1392 + }, + { + "epoch": 0.8150071305810509, + "grad_norm": 0.04932649676543138, + "learning_rate": 5.430965304343685e-06, + "loss": 0.8227, + "step": 1393 + }, + { + "epoch": 0.815592203898051, + "grad_norm": 0.0394930305814403, + "learning_rate": 5.427794582433614e-06, + "loss": 0.7627, + "step": 1394 + }, + { + "epoch": 0.816177277215051, + "grad_norm": 0.03667171398926786, + "learning_rate": 5.424622992239294e-06, + "loss": 0.7338, + "step": 1395 + }, + { + "epoch": 0.816762350532051, + "grad_norm": 0.03845361319571211, + "learning_rate": 5.421450536440095e-06, + "loss": 0.8347, + "step": 1396 + }, + { + "epoch": 0.8173474238490511, + "grad_norm": 0.042037246421225145, + "learning_rate": 5.41827721771612e-06, + "loss": 0.749, + "step": 1397 + }, + { + "epoch": 0.8179324971660511, + "grad_norm": 0.03891125248418732, + "learning_rate": 5.415103038748198e-06, + "loss": 0.744, + "step": 1398 + }, + { + "epoch": 0.8185175704830512, + "grad_norm": 0.06129253436260354, + "learning_rate": 5.411928002217888e-06, + "loss": 0.7449, + "step": 1399 + }, + { + "epoch": 0.8191026438000512, + "grad_norm": 0.038946849210413866, + "learning_rate": 5.408752110807467e-06, + "loss": 0.7716, + "step": 1400 + }, + { + "epoch": 0.8196877171170512, + "grad_norm": 0.033497315808280095, + "learning_rate": 5.405575367199942e-06, + "loss": 0.8, + "step": 1401 + }, + { + "epoch": 0.8202727904340512, + "grad_norm": 0.042207165872255904, + "learning_rate": 5.402397774079036e-06, + "loss": 0.7393, + "step": 1402 + }, + { + "epoch": 0.8208578637510513, + "grad_norm": 0.03955442411585693, + "learning_rate": 5.3992193341291886e-06, + "loss": 0.7736, + "step": 1403 + }, + { + "epoch": 0.8214429370680514, + "grad_norm": 0.05003446882200215, + "learning_rate": 5.396040050035557e-06, + "loss": 0.7875, + "step": 1404 + }, + { + "epoch": 0.8220280103850514, + "grad_norm": 1.585774424689007, + "learning_rate": 5.392859924484012e-06, + "loss": 0.7529, + "step": 1405 + }, + { + "epoch": 0.8226130837020514, + "grad_norm": 0.03926754895363517, + "learning_rate": 5.389678960161134e-06, + "loss": 0.7126, + "step": 1406 + }, + { + "epoch": 0.8231981570190514, + "grad_norm": 0.045548787620002675, + "learning_rate": 5.3864971597542095e-06, + "loss": 0.8089, + "step": 1407 + }, + { + "epoch": 0.8237832303360515, + "grad_norm": 0.050549309003184176, + "learning_rate": 5.383314525951239e-06, + "loss": 0.7794, + "step": 1408 + }, + { + "epoch": 0.8243683036530515, + "grad_norm": 0.03761672158310273, + "learning_rate": 5.380131061440916e-06, + "loss": 0.813, + "step": 1409 + }, + { + "epoch": 0.8249533769700516, + "grad_norm": 0.06445609012820583, + "learning_rate": 5.376946768912645e-06, + "loss": 0.7872, + "step": 1410 + }, + { + "epoch": 0.8255384502870516, + "grad_norm": 0.04256126719536265, + "learning_rate": 5.3737616510565285e-06, + "loss": 0.8119, + "step": 1411 + }, + { + "epoch": 0.8261235236040516, + "grad_norm": 0.041183658130345274, + "learning_rate": 5.370575710563362e-06, + "loss": 0.6822, + "step": 1412 + }, + { + "epoch": 0.8267085969210517, + "grad_norm": 0.04394347764957121, + "learning_rate": 5.367388950124638e-06, + "loss": 0.8018, + "step": 1413 + }, + { + "epoch": 0.8272936702380517, + "grad_norm": 0.0489504060626976, + "learning_rate": 5.364201372432544e-06, + "loss": 0.7585, + "step": 1414 + }, + { + "epoch": 0.8278787435550518, + "grad_norm": 0.03696771639000695, + "learning_rate": 5.361012980179955e-06, + "loss": 0.7659, + "step": 1415 + }, + { + "epoch": 0.8284638168720517, + "grad_norm": 0.03914887066611441, + "learning_rate": 5.357823776060436e-06, + "loss": 0.7116, + "step": 1416 + }, + { + "epoch": 0.8290488901890518, + "grad_norm": 0.0509069459164667, + "learning_rate": 5.3546337627682375e-06, + "loss": 0.8464, + "step": 1417 + }, + { + "epoch": 0.8296339635060519, + "grad_norm": 0.03774518281104574, + "learning_rate": 5.351442942998291e-06, + "loss": 0.7202, + "step": 1418 + }, + { + "epoch": 0.8302190368230519, + "grad_norm": 0.04736561860882581, + "learning_rate": 5.348251319446215e-06, + "loss": 0.7846, + "step": 1419 + }, + { + "epoch": 0.830804110140052, + "grad_norm": 0.05148202522600593, + "learning_rate": 5.3450588948083e-06, + "loss": 0.8305, + "step": 1420 + }, + { + "epoch": 0.8313891834570519, + "grad_norm": 0.043173058947526516, + "learning_rate": 5.341865671781521e-06, + "loss": 0.7154, + "step": 1421 + }, + { + "epoch": 0.831974256774052, + "grad_norm": 0.07628502093896318, + "learning_rate": 5.338671653063519e-06, + "loss": 0.7989, + "step": 1422 + }, + { + "epoch": 0.832559330091052, + "grad_norm": 0.055710961638667414, + "learning_rate": 5.335476841352614e-06, + "loss": 0.8053, + "step": 1423 + }, + { + "epoch": 0.8331444034080521, + "grad_norm": 0.03907449804352977, + "learning_rate": 5.3322812393477926e-06, + "loss": 0.768, + "step": 1424 + }, + { + "epoch": 0.8337294767250522, + "grad_norm": 0.05390227526387696, + "learning_rate": 5.32908484974871e-06, + "loss": 0.8034, + "step": 1425 + }, + { + "epoch": 0.8343145500420521, + "grad_norm": 0.0398118255725142, + "learning_rate": 5.325887675255688e-06, + "loss": 0.813, + "step": 1426 + }, + { + "epoch": 0.8348996233590522, + "grad_norm": 0.04145492964896595, + "learning_rate": 5.3226897185697085e-06, + "loss": 0.7782, + "step": 1427 + }, + { + "epoch": 0.8354846966760522, + "grad_norm": 0.045142573535406814, + "learning_rate": 5.319490982392416e-06, + "loss": 0.7923, + "step": 1428 + }, + { + "epoch": 0.8360697699930523, + "grad_norm": 0.0417853187675979, + "learning_rate": 5.316291469426113e-06, + "loss": 0.7188, + "step": 1429 + }, + { + "epoch": 0.8366548433100522, + "grad_norm": 0.03824273821943993, + "learning_rate": 5.313091182373761e-06, + "loss": 0.8268, + "step": 1430 + }, + { + "epoch": 0.8372399166270523, + "grad_norm": 0.04265515858265742, + "learning_rate": 5.309890123938971e-06, + "loss": 0.7841, + "step": 1431 + }, + { + "epoch": 0.8378249899440524, + "grad_norm": 0.04319663661457448, + "learning_rate": 5.306688296826007e-06, + "loss": 0.8278, + "step": 1432 + }, + { + "epoch": 0.8384100632610524, + "grad_norm": 0.04289615774239121, + "learning_rate": 5.3034857037397855e-06, + "loss": 0.7259, + "step": 1433 + }, + { + "epoch": 0.8389951365780525, + "grad_norm": 0.042931577411094914, + "learning_rate": 5.3002823473858645e-06, + "loss": 0.8723, + "step": 1434 + }, + { + "epoch": 0.8395802098950524, + "grad_norm": 0.06290471800979577, + "learning_rate": 5.297078230470454e-06, + "loss": 0.8477, + "step": 1435 + }, + { + "epoch": 0.8401652832120525, + "grad_norm": 0.03920031661089491, + "learning_rate": 5.293873355700399e-06, + "loss": 0.7391, + "step": 1436 + }, + { + "epoch": 0.8407503565290525, + "grad_norm": 0.04019727257985879, + "learning_rate": 5.290667725783189e-06, + "loss": 0.7166, + "step": 1437 + }, + { + "epoch": 0.8413354298460526, + "grad_norm": 0.05482096600060075, + "learning_rate": 5.287461343426953e-06, + "loss": 0.7527, + "step": 1438 + }, + { + "epoch": 0.8419205031630527, + "grad_norm": 0.08758101157201069, + "learning_rate": 5.28425421134045e-06, + "loss": 0.8409, + "step": 1439 + }, + { + "epoch": 0.8425055764800526, + "grad_norm": 0.042702212748573325, + "learning_rate": 5.281046332233078e-06, + "loss": 0.7841, + "step": 1440 + }, + { + "epoch": 0.8430906497970527, + "grad_norm": 0.058403994145729214, + "learning_rate": 5.277837708814864e-06, + "loss": 0.8108, + "step": 1441 + }, + { + "epoch": 0.8436757231140527, + "grad_norm": 0.04768258856736286, + "learning_rate": 5.274628343796462e-06, + "loss": 0.8224, + "step": 1442 + }, + { + "epoch": 0.8442607964310528, + "grad_norm": 0.036438567596678934, + "learning_rate": 5.271418239889155e-06, + "loss": 0.8098, + "step": 1443 + }, + { + "epoch": 0.8448458697480528, + "grad_norm": 0.038906276549651175, + "learning_rate": 5.268207399804851e-06, + "loss": 0.7441, + "step": 1444 + }, + { + "epoch": 0.8454309430650528, + "grad_norm": 0.04178256613424276, + "learning_rate": 5.264995826256074e-06, + "loss": 0.8095, + "step": 1445 + }, + { + "epoch": 0.8460160163820529, + "grad_norm": 0.0567861747607305, + "learning_rate": 5.2617835219559775e-06, + "loss": 0.7407, + "step": 1446 + }, + { + "epoch": 0.8466010896990529, + "grad_norm": 0.04554919492814466, + "learning_rate": 5.258570489618321e-06, + "loss": 0.7694, + "step": 1447 + }, + { + "epoch": 0.847186163016053, + "grad_norm": 0.048208046364352455, + "learning_rate": 5.255356731957488e-06, + "loss": 0.7255, + "step": 1448 + }, + { + "epoch": 0.847771236333053, + "grad_norm": 0.040443603231303464, + "learning_rate": 5.252142251688474e-06, + "loss": 0.8018, + "step": 1449 + }, + { + "epoch": 0.848356309650053, + "grad_norm": 0.06530684120775691, + "learning_rate": 5.248927051526877e-06, + "loss": 0.7677, + "step": 1450 + }, + { + "epoch": 0.848941382967053, + "grad_norm": 0.039867530196060805, + "learning_rate": 5.245711134188912e-06, + "loss": 0.769, + "step": 1451 + }, + { + "epoch": 0.8495264562840531, + "grad_norm": 0.04207996814905851, + "learning_rate": 5.242494502391395e-06, + "loss": 0.7442, + "step": 1452 + }, + { + "epoch": 0.8501115296010532, + "grad_norm": 0.04561472811714227, + "learning_rate": 5.239277158851749e-06, + "loss": 0.7115, + "step": 1453 + }, + { + "epoch": 0.8506966029180532, + "grad_norm": 0.04360996606853927, + "learning_rate": 5.236059106287996e-06, + "loss": 0.8547, + "step": 1454 + }, + { + "epoch": 0.8512816762350532, + "grad_norm": 0.04267227083806566, + "learning_rate": 5.232840347418756e-06, + "loss": 0.7316, + "step": 1455 + }, + { + "epoch": 0.8518667495520532, + "grad_norm": 0.04424954078490177, + "learning_rate": 5.229620884963249e-06, + "loss": 0.7864, + "step": 1456 + }, + { + "epoch": 0.8524518228690533, + "grad_norm": 0.043831224967070745, + "learning_rate": 5.226400721641285e-06, + "loss": 0.7831, + "step": 1457 + }, + { + "epoch": 0.8530368961860533, + "grad_norm": 0.041068348295436305, + "learning_rate": 5.223179860173271e-06, + "loss": 0.864, + "step": 1458 + }, + { + "epoch": 0.8536219695030534, + "grad_norm": 0.03891142553337627, + "learning_rate": 5.219958303280201e-06, + "loss": 0.8002, + "step": 1459 + }, + { + "epoch": 0.8542070428200534, + "grad_norm": 0.038910550390794815, + "learning_rate": 5.216736053683655e-06, + "loss": 0.6932, + "step": 1460 + }, + { + "epoch": 0.8547921161370534, + "grad_norm": 0.08030189721000423, + "learning_rate": 5.213513114105802e-06, + "loss": 0.7286, + "step": 1461 + }, + { + "epoch": 0.8553771894540535, + "grad_norm": 0.0376740189158807, + "learning_rate": 5.210289487269392e-06, + "loss": 0.7832, + "step": 1462 + }, + { + "epoch": 0.8559622627710535, + "grad_norm": 0.03877582322418199, + "learning_rate": 5.207065175897753e-06, + "loss": 0.8055, + "step": 1463 + }, + { + "epoch": 0.8565473360880536, + "grad_norm": 0.041349052303152146, + "learning_rate": 5.203840182714796e-06, + "loss": 0.6915, + "step": 1464 + }, + { + "epoch": 0.8571324094050535, + "grad_norm": 0.04301186186037183, + "learning_rate": 5.200614510445007e-06, + "loss": 0.7925, + "step": 1465 + }, + { + "epoch": 0.8577174827220536, + "grad_norm": 0.04787283684151834, + "learning_rate": 5.197388161813443e-06, + "loss": 0.7505, + "step": 1466 + }, + { + "epoch": 0.8583025560390537, + "grad_norm": 0.09574674570840018, + "learning_rate": 5.194161139545733e-06, + "loss": 0.8076, + "step": 1467 + }, + { + "epoch": 0.8588876293560537, + "grad_norm": 0.04155223740576836, + "learning_rate": 5.190933446368078e-06, + "loss": 0.7496, + "step": 1468 + }, + { + "epoch": 0.8594727026730538, + "grad_norm": 0.03701742254247571, + "learning_rate": 5.187705085007243e-06, + "loss": 0.7563, + "step": 1469 + }, + { + "epoch": 0.8600577759900537, + "grad_norm": 0.04678171662949799, + "learning_rate": 5.184476058190558e-06, + "loss": 0.7159, + "step": 1470 + }, + { + "epoch": 0.8606428493070538, + "grad_norm": 0.04884408311526059, + "learning_rate": 5.181246368645916e-06, + "loss": 0.9046, + "step": 1471 + }, + { + "epoch": 0.8612279226240538, + "grad_norm": 0.04557695869573877, + "learning_rate": 5.1780160191017685e-06, + "loss": 0.8452, + "step": 1472 + }, + { + "epoch": 0.8618129959410539, + "grad_norm": 0.04072103320834028, + "learning_rate": 5.1747850122871275e-06, + "loss": 0.7661, + "step": 1473 + }, + { + "epoch": 0.8623980692580538, + "grad_norm": 0.062070335429827074, + "learning_rate": 5.171553350931555e-06, + "loss": 0.7456, + "step": 1474 + }, + { + "epoch": 0.8629831425750539, + "grad_norm": 0.04041423716100811, + "learning_rate": 5.1683210377651716e-06, + "loss": 0.6993, + "step": 1475 + }, + { + "epoch": 0.863568215892054, + "grad_norm": 0.037496335851542203, + "learning_rate": 5.165088075518646e-06, + "loss": 0.7977, + "step": 1476 + }, + { + "epoch": 0.864153289209054, + "grad_norm": 0.04125529850667817, + "learning_rate": 5.161854466923194e-06, + "loss": 0.803, + "step": 1477 + }, + { + "epoch": 0.8647383625260541, + "grad_norm": 0.06648674433353301, + "learning_rate": 5.15862021471058e-06, + "loss": 0.7707, + "step": 1478 + }, + { + "epoch": 0.865323435843054, + "grad_norm": 0.04189383564593303, + "learning_rate": 5.155385321613111e-06, + "loss": 0.8043, + "step": 1479 + }, + { + "epoch": 0.8659085091600541, + "grad_norm": 0.04233513271829965, + "learning_rate": 5.1521497903636335e-06, + "loss": 0.7909, + "step": 1480 + }, + { + "epoch": 0.8664935824770541, + "grad_norm": 0.03892645909618685, + "learning_rate": 5.148913623695536e-06, + "loss": 0.7712, + "step": 1481 + }, + { + "epoch": 0.8670786557940542, + "grad_norm": 0.042788389662538735, + "learning_rate": 5.145676824342745e-06, + "loss": 0.8628, + "step": 1482 + }, + { + "epoch": 0.8676637291110543, + "grad_norm": 0.03339414866019395, + "learning_rate": 5.142439395039715e-06, + "loss": 0.8101, + "step": 1483 + }, + { + "epoch": 0.8682488024280542, + "grad_norm": 0.04123013617638013, + "learning_rate": 5.139201338521442e-06, + "loss": 0.7808, + "step": 1484 + }, + { + "epoch": 0.8688338757450543, + "grad_norm": 0.15209820057805395, + "learning_rate": 5.135962657523442e-06, + "loss": 0.8155, + "step": 1485 + }, + { + "epoch": 0.8694189490620543, + "grad_norm": 0.038333113007498906, + "learning_rate": 5.1327233547817655e-06, + "loss": 0.8114, + "step": 1486 + }, + { + "epoch": 0.8700040223790544, + "grad_norm": 0.1091230339415769, + "learning_rate": 5.129483433032988e-06, + "loss": 0.7413, + "step": 1487 + }, + { + "epoch": 0.8705890956960544, + "grad_norm": 0.04419253223096684, + "learning_rate": 5.126242895014202e-06, + "loss": 0.8002, + "step": 1488 + }, + { + "epoch": 0.8711741690130544, + "grad_norm": 0.03647244403949355, + "learning_rate": 5.123001743463026e-06, + "loss": 0.7514, + "step": 1489 + }, + { + "epoch": 0.8717592423300545, + "grad_norm": 0.040974797817733756, + "learning_rate": 5.119759981117599e-06, + "loss": 0.7391, + "step": 1490 + }, + { + "epoch": 0.8723443156470545, + "grad_norm": 0.03756329508778396, + "learning_rate": 5.116517610716568e-06, + "loss": 0.7665, + "step": 1491 + }, + { + "epoch": 0.8729293889640546, + "grad_norm": 0.03838778975594434, + "learning_rate": 5.113274634999101e-06, + "loss": 0.7946, + "step": 1492 + }, + { + "epoch": 0.8735144622810546, + "grad_norm": 0.04331410260349596, + "learning_rate": 5.110031056704872e-06, + "loss": 0.8715, + "step": 1493 + }, + { + "epoch": 0.8740995355980546, + "grad_norm": 0.0394075894195183, + "learning_rate": 5.10678687857407e-06, + "loss": 0.7063, + "step": 1494 + }, + { + "epoch": 0.8746846089150546, + "grad_norm": 0.04199482290559279, + "learning_rate": 5.103542103347385e-06, + "loss": 0.7819, + "step": 1495 + }, + { + "epoch": 0.8752696822320547, + "grad_norm": 0.03998823252954996, + "learning_rate": 5.100296733766013e-06, + "loss": 0.6424, + "step": 1496 + }, + { + "epoch": 0.8758547555490548, + "grad_norm": 0.04740849060105482, + "learning_rate": 5.0970507725716545e-06, + "loss": 0.8076, + "step": 1497 + }, + { + "epoch": 0.8764398288660548, + "grad_norm": 0.03967591518671566, + "learning_rate": 5.09380422250651e-06, + "loss": 0.7159, + "step": 1498 + }, + { + "epoch": 0.8770249021830548, + "grad_norm": 0.040850522852139234, + "learning_rate": 5.090557086313272e-06, + "loss": 0.767, + "step": 1499 + }, + { + "epoch": 0.8776099755000548, + "grad_norm": 0.15770863380367606, + "learning_rate": 5.087309366735134e-06, + "loss": 0.8364, + "step": 1500 + }, + { + "epoch": 0.8781950488170549, + "grad_norm": 0.03898410562227323, + "learning_rate": 5.0840610665157806e-06, + "loss": 0.7527, + "step": 1501 + }, + { + "epoch": 0.8787801221340549, + "grad_norm": 0.040414642012746985, + "learning_rate": 5.080812188399384e-06, + "loss": 0.7949, + "step": 1502 + }, + { + "epoch": 0.879365195451055, + "grad_norm": 0.04083305581647033, + "learning_rate": 5.0775627351306094e-06, + "loss": 0.7524, + "step": 1503 + }, + { + "epoch": 0.879950268768055, + "grad_norm": 0.032514118714502355, + "learning_rate": 5.074312709454604e-06, + "loss": 0.729, + "step": 1504 + }, + { + "epoch": 0.880535342085055, + "grad_norm": 0.038019163282452334, + "learning_rate": 5.0710621141170015e-06, + "loss": 0.7649, + "step": 1505 + }, + { + "epoch": 0.8811204154020551, + "grad_norm": 0.03465271277149766, + "learning_rate": 5.0678109518639164e-06, + "loss": 0.7869, + "step": 1506 + }, + { + "epoch": 0.8817054887190551, + "grad_norm": 0.03772455496689458, + "learning_rate": 5.06455922544194e-06, + "loss": 0.7613, + "step": 1507 + }, + { + "epoch": 0.8822905620360552, + "grad_norm": 0.037177660173090675, + "learning_rate": 5.06130693759814e-06, + "loss": 0.7243, + "step": 1508 + }, + { + "epoch": 0.8828756353530551, + "grad_norm": 0.039221859770761806, + "learning_rate": 5.058054091080063e-06, + "loss": 0.7136, + "step": 1509 + }, + { + "epoch": 0.8834607086700552, + "grad_norm": 0.048694373606749476, + "learning_rate": 5.054800688635721e-06, + "loss": 0.7674, + "step": 1510 + }, + { + "epoch": 0.8840457819870553, + "grad_norm": 0.04046266225913142, + "learning_rate": 5.051546733013602e-06, + "loss": 0.7526, + "step": 1511 + }, + { + "epoch": 0.8846308553040553, + "grad_norm": 0.04165158889305381, + "learning_rate": 5.048292226962659e-06, + "loss": 0.7713, + "step": 1512 + }, + { + "epoch": 0.8852159286210554, + "grad_norm": 0.03903357348513953, + "learning_rate": 5.045037173232306e-06, + "loss": 0.7989, + "step": 1513 + }, + { + "epoch": 0.8858010019380553, + "grad_norm": 0.03893629709023498, + "learning_rate": 5.041781574572425e-06, + "loss": 0.7602, + "step": 1514 + }, + { + "epoch": 0.8863860752550554, + "grad_norm": 0.053074781240617146, + "learning_rate": 5.038525433733358e-06, + "loss": 0.7197, + "step": 1515 + }, + { + "epoch": 0.8869711485720554, + "grad_norm": 0.03986757723797883, + "learning_rate": 5.035268753465901e-06, + "loss": 0.7613, + "step": 1516 + }, + { + "epoch": 0.8875562218890555, + "grad_norm": 0.03768647901063803, + "learning_rate": 5.032011536521309e-06, + "loss": 0.8723, + "step": 1517 + }, + { + "epoch": 0.8881412952060556, + "grad_norm": 0.04233597893650536, + "learning_rate": 5.0287537856512874e-06, + "loss": 0.829, + "step": 1518 + }, + { + "epoch": 0.8887263685230555, + "grad_norm": 0.03833944818184599, + "learning_rate": 5.025495503608001e-06, + "loss": 0.7874, + "step": 1519 + }, + { + "epoch": 0.8893114418400556, + "grad_norm": 0.038092427816242835, + "learning_rate": 5.0222366931440515e-06, + "loss": 0.7448, + "step": 1520 + }, + { + "epoch": 0.8898965151570556, + "grad_norm": 0.03305492326165736, + "learning_rate": 5.018977357012494e-06, + "loss": 0.6638, + "step": 1521 + }, + { + "epoch": 0.8904815884740557, + "grad_norm": 0.04283989577661302, + "learning_rate": 5.015717497966829e-06, + "loss": 0.773, + "step": 1522 + }, + { + "epoch": 0.8910666617910556, + "grad_norm": 0.03767830677413742, + "learning_rate": 5.012457118760993e-06, + "loss": 0.746, + "step": 1523 + }, + { + "epoch": 0.8916517351080557, + "grad_norm": 0.31951539666885187, + "learning_rate": 5.009196222149366e-06, + "loss": 0.8181, + "step": 1524 + }, + { + "epoch": 0.8922368084250558, + "grad_norm": 0.04141411707140542, + "learning_rate": 5.005934810886765e-06, + "loss": 0.7699, + "step": 1525 + }, + { + "epoch": 0.8928218817420558, + "grad_norm": 0.04045968293380357, + "learning_rate": 5.0026728877284415e-06, + "loss": 0.7258, + "step": 1526 + }, + { + "epoch": 0.8934069550590559, + "grad_norm": 0.039404297272624296, + "learning_rate": 4.999410455430076e-06, + "loss": 0.7537, + "step": 1527 + }, + { + "epoch": 0.8939920283760558, + "grad_norm": 0.041014176601729714, + "learning_rate": 4.9961475167477856e-06, + "loss": 0.7699, + "step": 1528 + }, + { + "epoch": 0.8945771016930559, + "grad_norm": 0.038058343537700656, + "learning_rate": 4.992884074438108e-06, + "loss": 0.7459, + "step": 1529 + }, + { + "epoch": 0.8951621750100559, + "grad_norm": 0.038810739375882135, + "learning_rate": 4.989620131258013e-06, + "loss": 0.7668, + "step": 1530 + }, + { + "epoch": 0.895747248327056, + "grad_norm": 0.042807540000545895, + "learning_rate": 4.98635568996489e-06, + "loss": 0.7196, + "step": 1531 + }, + { + "epoch": 0.8963323216440561, + "grad_norm": 0.04859312187849364, + "learning_rate": 4.9830907533165485e-06, + "loss": 0.743, + "step": 1532 + }, + { + "epoch": 0.896917394961056, + "grad_norm": 0.04633256062984912, + "learning_rate": 4.979825324071219e-06, + "loss": 0.788, + "step": 1533 + }, + { + "epoch": 0.8975024682780561, + "grad_norm": 0.03792342373881924, + "learning_rate": 4.976559404987544e-06, + "loss": 0.8347, + "step": 1534 + }, + { + "epoch": 0.8980875415950561, + "grad_norm": 0.043360533681012677, + "learning_rate": 4.973292998824588e-06, + "loss": 0.7931, + "step": 1535 + }, + { + "epoch": 0.8986726149120562, + "grad_norm": 0.03665588059872115, + "learning_rate": 4.970026108341818e-06, + "loss": 0.8341, + "step": 1536 + }, + { + "epoch": 0.8992576882290562, + "grad_norm": 0.03911321895257535, + "learning_rate": 4.966758736299114e-06, + "loss": 0.7022, + "step": 1537 + }, + { + "epoch": 0.8998427615460562, + "grad_norm": 0.03807679640234228, + "learning_rate": 4.963490885456764e-06, + "loss": 0.7606, + "step": 1538 + }, + { + "epoch": 0.9004278348630563, + "grad_norm": 0.047813376297185586, + "learning_rate": 4.9602225585754575e-06, + "loss": 0.851, + "step": 1539 + }, + { + "epoch": 0.9010129081800563, + "grad_norm": 0.04059937455713067, + "learning_rate": 4.95695375841629e-06, + "loss": 0.8127, + "step": 1540 + }, + { + "epoch": 0.9015979814970564, + "grad_norm": 0.05021922046354935, + "learning_rate": 4.953684487740751e-06, + "loss": 0.8058, + "step": 1541 + }, + { + "epoch": 0.9021830548140564, + "grad_norm": 0.036688823164468184, + "learning_rate": 4.950414749310736e-06, + "loss": 0.7953, + "step": 1542 + }, + { + "epoch": 0.9027681281310564, + "grad_norm": 0.07379482387202636, + "learning_rate": 4.947144545888528e-06, + "loss": 0.7334, + "step": 1543 + }, + { + "epoch": 0.9033532014480564, + "grad_norm": 0.0436885604988347, + "learning_rate": 4.943873880236805e-06, + "loss": 0.7843, + "step": 1544 + }, + { + "epoch": 0.9039382747650565, + "grad_norm": 0.037138950603287514, + "learning_rate": 4.940602755118637e-06, + "loss": 0.7426, + "step": 1545 + }, + { + "epoch": 0.9045233480820565, + "grad_norm": 0.06299316081615561, + "learning_rate": 4.93733117329748e-06, + "loss": 0.7667, + "step": 1546 + }, + { + "epoch": 0.9051084213990566, + "grad_norm": 0.14368489621412747, + "learning_rate": 4.934059137537182e-06, + "loss": 0.7187, + "step": 1547 + }, + { + "epoch": 0.9056934947160566, + "grad_norm": 0.04204980565272586, + "learning_rate": 4.9307866506019625e-06, + "loss": 0.775, + "step": 1548 + }, + { + "epoch": 0.9062785680330566, + "grad_norm": 0.04379760553699639, + "learning_rate": 4.927513715256434e-06, + "loss": 0.7758, + "step": 1549 + }, + { + "epoch": 0.9068636413500567, + "grad_norm": 0.03874093982602192, + "learning_rate": 4.9242403342655815e-06, + "loss": 0.8522, + "step": 1550 + }, + { + "epoch": 0.9074487146670567, + "grad_norm": 0.16908575016119626, + "learning_rate": 4.920966510394769e-06, + "loss": 0.7786, + "step": 1551 + }, + { + "epoch": 0.9080337879840568, + "grad_norm": 0.03659836060994663, + "learning_rate": 4.917692246409733e-06, + "loss": 0.7795, + "step": 1552 + }, + { + "epoch": 0.9086188613010567, + "grad_norm": 0.040782376168304955, + "learning_rate": 4.9144175450765814e-06, + "loss": 0.7682, + "step": 1553 + }, + { + "epoch": 0.9092039346180568, + "grad_norm": 0.04146338306008847, + "learning_rate": 4.911142409161795e-06, + "loss": 0.7366, + "step": 1554 + }, + { + "epoch": 0.9097890079350569, + "grad_norm": 0.04286248778324747, + "learning_rate": 4.9078668414322176e-06, + "loss": 0.7368, + "step": 1555 + }, + { + "epoch": 0.9103740812520569, + "grad_norm": 0.04888887313290673, + "learning_rate": 4.9045908446550595e-06, + "loss": 0.7896, + "step": 1556 + }, + { + "epoch": 0.910959154569057, + "grad_norm": 0.03802578544146469, + "learning_rate": 4.901314421597895e-06, + "loss": 0.7867, + "step": 1557 + }, + { + "epoch": 0.9115442278860569, + "grad_norm": 0.0697904135176372, + "learning_rate": 4.898037575028657e-06, + "loss": 0.7006, + "step": 1558 + }, + { + "epoch": 0.912129301203057, + "grad_norm": 0.03992583542856418, + "learning_rate": 4.894760307715634e-06, + "loss": 0.7847, + "step": 1559 + }, + { + "epoch": 0.912714374520057, + "grad_norm": 0.04168078061737128, + "learning_rate": 4.891482622427473e-06, + "loss": 0.7257, + "step": 1560 + }, + { + "epoch": 0.9132994478370571, + "grad_norm": 0.03886703909354667, + "learning_rate": 4.888204521933178e-06, + "loss": 0.7405, + "step": 1561 + }, + { + "epoch": 0.9138845211540572, + "grad_norm": 0.038571266137394804, + "learning_rate": 4.8849260090020916e-06, + "loss": 0.7847, + "step": 1562 + }, + { + "epoch": 0.9144695944710571, + "grad_norm": 0.039942483875795645, + "learning_rate": 4.881647086403918e-06, + "loss": 0.7541, + "step": 1563 + }, + { + "epoch": 0.9150546677880572, + "grad_norm": 0.04050391982359178, + "learning_rate": 4.878367756908698e-06, + "loss": 0.8032, + "step": 1564 + }, + { + "epoch": 0.9156397411050572, + "grad_norm": 0.06213525861224018, + "learning_rate": 4.875088023286823e-06, + "loss": 0.7917, + "step": 1565 + }, + { + "epoch": 0.9162248144220573, + "grad_norm": 0.04040488630209474, + "learning_rate": 4.871807888309021e-06, + "loss": 0.7906, + "step": 1566 + }, + { + "epoch": 0.9168098877390572, + "grad_norm": 0.04159089864338253, + "learning_rate": 4.868527354746361e-06, + "loss": 0.8193, + "step": 1567 + }, + { + "epoch": 0.9173949610560573, + "grad_norm": 0.06139536834412755, + "learning_rate": 4.86524642537025e-06, + "loss": 0.7674, + "step": 1568 + }, + { + "epoch": 0.9179800343730574, + "grad_norm": 0.03947767749509195, + "learning_rate": 4.861965102952426e-06, + "loss": 0.8447, + "step": 1569 + }, + { + "epoch": 0.9185651076900574, + "grad_norm": 0.043695480643704855, + "learning_rate": 4.858683390264963e-06, + "loss": 0.87, + "step": 1570 + }, + { + "epoch": 0.9191501810070575, + "grad_norm": 0.03962500281019953, + "learning_rate": 4.855401290080259e-06, + "loss": 0.7799, + "step": 1571 + }, + { + "epoch": 0.9197352543240574, + "grad_norm": 0.03883602861594149, + "learning_rate": 4.8521188051710465e-06, + "loss": 0.7198, + "step": 1572 + }, + { + "epoch": 0.9203203276410575, + "grad_norm": 0.038996303115135654, + "learning_rate": 4.84883593831038e-06, + "loss": 0.8369, + "step": 1573 + }, + { + "epoch": 0.9209054009580575, + "grad_norm": 0.036243906078929175, + "learning_rate": 4.845552692271633e-06, + "loss": 0.7819, + "step": 1574 + }, + { + "epoch": 0.9214904742750576, + "grad_norm": 0.03598026113530175, + "learning_rate": 4.8422690698285055e-06, + "loss": 0.7848, + "step": 1575 + }, + { + "epoch": 0.9220755475920577, + "grad_norm": 0.03927177230794752, + "learning_rate": 4.83898507375501e-06, + "loss": 0.8389, + "step": 1576 + }, + { + "epoch": 0.9226606209090576, + "grad_norm": 0.0385535598643195, + "learning_rate": 4.83570070682548e-06, + "loss": 0.8231, + "step": 1577 + }, + { + "epoch": 0.9232456942260577, + "grad_norm": 0.03748485100626454, + "learning_rate": 4.832415971814558e-06, + "loss": 0.7769, + "step": 1578 + }, + { + "epoch": 0.9238307675430577, + "grad_norm": 0.04406376389798333, + "learning_rate": 4.829130871497198e-06, + "loss": 0.8501, + "step": 1579 + }, + { + "epoch": 0.9244158408600578, + "grad_norm": 0.04181053014039125, + "learning_rate": 4.825845408648666e-06, + "loss": 0.7151, + "step": 1580 + }, + { + "epoch": 0.9250009141770578, + "grad_norm": 0.04012827948350568, + "learning_rate": 4.82255958604453e-06, + "loss": 0.7475, + "step": 1581 + }, + { + "epoch": 0.9255859874940578, + "grad_norm": 0.04066339251456573, + "learning_rate": 4.819273406460664e-06, + "loss": 0.7764, + "step": 1582 + }, + { + "epoch": 0.9261710608110579, + "grad_norm": 0.03654201108348175, + "learning_rate": 4.815986872673245e-06, + "loss": 0.7002, + "step": 1583 + }, + { + "epoch": 0.9267561341280579, + "grad_norm": 0.05574027310447493, + "learning_rate": 4.812699987458744e-06, + "loss": 0.7396, + "step": 1584 + }, + { + "epoch": 0.927341207445058, + "grad_norm": 0.043906342122207766, + "learning_rate": 4.809412753593935e-06, + "loss": 0.842, + "step": 1585 + }, + { + "epoch": 0.927926280762058, + "grad_norm": 0.03608562466923883, + "learning_rate": 4.806125173855884e-06, + "loss": 0.879, + "step": 1586 + }, + { + "epoch": 0.928511354079058, + "grad_norm": 0.053877179770256106, + "learning_rate": 4.802837251021948e-06, + "loss": 0.7109, + "step": 1587 + }, + { + "epoch": 0.929096427396058, + "grad_norm": 0.044680943459160405, + "learning_rate": 4.799548987869778e-06, + "loss": 0.7893, + "step": 1588 + }, + { + "epoch": 0.9296815007130581, + "grad_norm": 0.051152646979938876, + "learning_rate": 4.7962603871773066e-06, + "loss": 0.8483, + "step": 1589 + }, + { + "epoch": 0.9302665740300582, + "grad_norm": 0.03887470736846696, + "learning_rate": 4.7929714517227524e-06, + "loss": 0.7322, + "step": 1590 + }, + { + "epoch": 0.9308516473470582, + "grad_norm": 0.037211850030407195, + "learning_rate": 4.789682184284625e-06, + "loss": 0.7299, + "step": 1591 + }, + { + "epoch": 0.9314367206640582, + "grad_norm": 0.037429526440719174, + "learning_rate": 4.786392587641704e-06, + "loss": 0.8137, + "step": 1592 + }, + { + "epoch": 0.9320217939810582, + "grad_norm": 0.16993450311970792, + "learning_rate": 4.783102664573052e-06, + "loss": 0.7883, + "step": 1593 + }, + { + "epoch": 0.9326068672980583, + "grad_norm": 0.04297358226397087, + "learning_rate": 4.779812417858008e-06, + "loss": 0.8226, + "step": 1594 + }, + { + "epoch": 0.9331919406150583, + "grad_norm": 0.03667744575467755, + "learning_rate": 4.776521850276182e-06, + "loss": 0.7385, + "step": 1595 + }, + { + "epoch": 0.9337770139320584, + "grad_norm": 0.03913765169009846, + "learning_rate": 4.7732309646074585e-06, + "loss": 0.8733, + "step": 1596 + }, + { + "epoch": 0.9343620872490584, + "grad_norm": 0.03736629949079511, + "learning_rate": 4.769939763631986e-06, + "loss": 0.7932, + "step": 1597 + }, + { + "epoch": 0.9349471605660584, + "grad_norm": 0.037356612963331204, + "learning_rate": 4.766648250130183e-06, + "loss": 0.8071, + "step": 1598 + }, + { + "epoch": 0.9355322338830585, + "grad_norm": 0.04829265429843901, + "learning_rate": 4.76335642688273e-06, + "loss": 0.7536, + "step": 1599 + }, + { + "epoch": 0.9361173072000585, + "grad_norm": 0.03663643287071603, + "learning_rate": 4.760064296670573e-06, + "loss": 0.7419, + "step": 1600 + }, + { + "epoch": 0.9367023805170586, + "grad_norm": 0.03851622839931266, + "learning_rate": 4.756771862274911e-06, + "loss": 0.7391, + "step": 1601 + }, + { + "epoch": 0.9372874538340585, + "grad_norm": 0.058935243933442134, + "learning_rate": 4.753479126477207e-06, + "loss": 0.8036, + "step": 1602 + }, + { + "epoch": 0.9378725271510586, + "grad_norm": 0.05643207099843358, + "learning_rate": 4.750186092059171e-06, + "loss": 0.7764, + "step": 1603 + }, + { + "epoch": 0.9384576004680587, + "grad_norm": 0.038227121653592495, + "learning_rate": 4.746892761802771e-06, + "loss": 0.6986, + "step": 1604 + }, + { + "epoch": 0.9390426737850587, + "grad_norm": 0.041109384181244574, + "learning_rate": 4.743599138490224e-06, + "loss": 0.6453, + "step": 1605 + }, + { + "epoch": 0.9396277471020588, + "grad_norm": 0.04596157162267912, + "learning_rate": 4.740305224903993e-06, + "loss": 0.8228, + "step": 1606 + }, + { + "epoch": 0.9402128204190587, + "grad_norm": 0.04749314030842733, + "learning_rate": 4.73701102382679e-06, + "loss": 0.8627, + "step": 1607 + }, + { + "epoch": 0.9407978937360588, + "grad_norm": 0.04073293983059644, + "learning_rate": 4.733716538041561e-06, + "loss": 0.7932, + "step": 1608 + }, + { + "epoch": 0.9413829670530588, + "grad_norm": 0.03695351421055921, + "learning_rate": 4.730421770331503e-06, + "loss": 0.8384, + "step": 1609 + }, + { + "epoch": 0.9419680403700589, + "grad_norm": 0.041338479689365416, + "learning_rate": 4.727126723480046e-06, + "loss": 0.7826, + "step": 1610 + }, + { + "epoch": 0.942553113687059, + "grad_norm": 0.039567218912003696, + "learning_rate": 4.723831400270854e-06, + "loss": 0.71, + "step": 1611 + }, + { + "epoch": 0.9431381870040589, + "grad_norm": 0.038765181273196586, + "learning_rate": 4.7205358034878295e-06, + "loss": 0.7528, + "step": 1612 + }, + { + "epoch": 0.943723260321059, + "grad_norm": 0.03857144409516923, + "learning_rate": 4.717239935915105e-06, + "loss": 0.8085, + "step": 1613 + }, + { + "epoch": 0.944308333638059, + "grad_norm": 0.050546795531220844, + "learning_rate": 4.713943800337034e-06, + "loss": 0.814, + "step": 1614 + }, + { + "epoch": 0.9448934069550591, + "grad_norm": 0.03877844831137139, + "learning_rate": 4.710647399538209e-06, + "loss": 0.8016, + "step": 1615 + }, + { + "epoch": 0.945478480272059, + "grad_norm": 0.04286623826991479, + "learning_rate": 4.707350736303439e-06, + "loss": 0.7046, + "step": 1616 + }, + { + "epoch": 0.9460635535890591, + "grad_norm": 0.041695250387251796, + "learning_rate": 4.704053813417753e-06, + "loss": 0.7558, + "step": 1617 + }, + { + "epoch": 0.9466486269060592, + "grad_norm": 0.03686599980639858, + "learning_rate": 4.700756633666404e-06, + "loss": 0.7133, + "step": 1618 + }, + { + "epoch": 0.9472337002230592, + "grad_norm": 0.05197679285294168, + "learning_rate": 4.6974591998348605e-06, + "loss": 0.7537, + "step": 1619 + }, + { + "epoch": 0.9478187735400593, + "grad_norm": 0.04448660325795372, + "learning_rate": 4.694161514708806e-06, + "loss": 0.7808, + "step": 1620 + }, + { + "epoch": 0.9484038468570593, + "grad_norm": 0.04186213228118794, + "learning_rate": 4.690863581074137e-06, + "loss": 0.7767, + "step": 1621 + }, + { + "epoch": 0.9489889201740593, + "grad_norm": 0.0400854810264849, + "learning_rate": 4.687565401716953e-06, + "loss": 0.7603, + "step": 1622 + }, + { + "epoch": 0.9495739934910593, + "grad_norm": 0.03701384913843474, + "learning_rate": 4.684266979423573e-06, + "loss": 0.633, + "step": 1623 + }, + { + "epoch": 0.9501590668080594, + "grad_norm": 0.04591450855041653, + "learning_rate": 4.680968316980514e-06, + "loss": 0.7848, + "step": 1624 + }, + { + "epoch": 0.9507441401250594, + "grad_norm": 0.04476228996175277, + "learning_rate": 4.677669417174493e-06, + "loss": 0.7866, + "step": 1625 + }, + { + "epoch": 0.9513292134420595, + "grad_norm": 0.03707636228363127, + "learning_rate": 4.674370282792435e-06, + "loss": 0.7799, + "step": 1626 + }, + { + "epoch": 0.9519142867590595, + "grad_norm": 0.03969252104571136, + "learning_rate": 4.671070916621458e-06, + "loss": 0.7713, + "step": 1627 + }, + { + "epoch": 0.9524993600760595, + "grad_norm": 0.07811310064791632, + "learning_rate": 4.66777132144888e-06, + "loss": 0.72, + "step": 1628 + }, + { + "epoch": 0.9530844333930596, + "grad_norm": 0.03719654474999105, + "learning_rate": 4.6644715000622065e-06, + "loss": 0.7672, + "step": 1629 + }, + { + "epoch": 0.9536695067100596, + "grad_norm": 0.041072520733934526, + "learning_rate": 4.6611714552491385e-06, + "loss": 0.7603, + "step": 1630 + }, + { + "epoch": 0.9542545800270597, + "grad_norm": 0.07853658068992475, + "learning_rate": 4.657871189797565e-06, + "loss": 0.8341, + "step": 1631 + }, + { + "epoch": 0.9548396533440596, + "grad_norm": 0.0388996416456516, + "learning_rate": 4.654570706495562e-06, + "loss": 0.6744, + "step": 1632 + }, + { + "epoch": 0.9554247266610597, + "grad_norm": 0.04381785661377394, + "learning_rate": 4.651270008131388e-06, + "loss": 0.7378, + "step": 1633 + }, + { + "epoch": 0.9560097999780598, + "grad_norm": 0.04201885634367334, + "learning_rate": 4.647969097493483e-06, + "loss": 0.7474, + "step": 1634 + }, + { + "epoch": 0.9565948732950598, + "grad_norm": 0.03726848394227385, + "learning_rate": 4.644667977370468e-06, + "loss": 0.832, + "step": 1635 + }, + { + "epoch": 0.9571799466120599, + "grad_norm": 0.04564574185484768, + "learning_rate": 4.64136665055114e-06, + "loss": 0.754, + "step": 1636 + }, + { + "epoch": 0.9577650199290598, + "grad_norm": 0.03922811287637288, + "learning_rate": 4.638065119824471e-06, + "loss": 0.7385, + "step": 1637 + }, + { + "epoch": 0.9583500932460599, + "grad_norm": 0.04617849099423774, + "learning_rate": 4.634763387979606e-06, + "loss": 0.8023, + "step": 1638 + }, + { + "epoch": 0.9589351665630599, + "grad_norm": 0.051954956199202075, + "learning_rate": 4.631461457805855e-06, + "loss": 0.731, + "step": 1639 + }, + { + "epoch": 0.95952023988006, + "grad_norm": 0.03670055716355514, + "learning_rate": 4.628159332092705e-06, + "loss": 0.7279, + "step": 1640 + }, + { + "epoch": 0.96010531319706, + "grad_norm": 0.03879162525758375, + "learning_rate": 4.624857013629797e-06, + "loss": 0.7359, + "step": 1641 + }, + { + "epoch": 0.96069038651406, + "grad_norm": 0.03884240382534952, + "learning_rate": 4.621554505206946e-06, + "loss": 0.7002, + "step": 1642 + }, + { + "epoch": 0.9612754598310601, + "grad_norm": 0.03949437881641414, + "learning_rate": 4.618251809614119e-06, + "loss": 0.8259, + "step": 1643 + }, + { + "epoch": 0.9618605331480601, + "grad_norm": 0.03808396641461775, + "learning_rate": 4.614948929641444e-06, + "loss": 0.7026, + "step": 1644 + }, + { + "epoch": 0.9624456064650602, + "grad_norm": 0.040652640698166743, + "learning_rate": 4.611645868079205e-06, + "loss": 0.7399, + "step": 1645 + }, + { + "epoch": 0.9630306797820601, + "grad_norm": 0.041448998916834554, + "learning_rate": 4.60834262771784e-06, + "loss": 0.7924, + "step": 1646 + }, + { + "epoch": 0.9636157530990602, + "grad_norm": 0.03414951748294008, + "learning_rate": 4.605039211347937e-06, + "loss": 0.694, + "step": 1647 + }, + { + "epoch": 0.9642008264160603, + "grad_norm": 0.042150214675898114, + "learning_rate": 4.601735621760234e-06, + "loss": 0.8276, + "step": 1648 + }, + { + "epoch": 0.9647858997330603, + "grad_norm": 0.04420133355554904, + "learning_rate": 4.598431861745613e-06, + "loss": 0.7827, + "step": 1649 + }, + { + "epoch": 0.9653709730500604, + "grad_norm": 0.04509650606205026, + "learning_rate": 4.5951279340951e-06, + "loss": 0.7904, + "step": 1650 + }, + { + "epoch": 0.9659560463670603, + "grad_norm": 0.038633078744749694, + "learning_rate": 4.591823841599868e-06, + "loss": 0.7758, + "step": 1651 + }, + { + "epoch": 0.9665411196840604, + "grad_norm": 0.038236691470263345, + "learning_rate": 4.588519587051222e-06, + "loss": 0.8076, + "step": 1652 + }, + { + "epoch": 0.9671261930010604, + "grad_norm": 0.03695431719529444, + "learning_rate": 4.585215173240608e-06, + "loss": 0.7743, + "step": 1653 + }, + { + "epoch": 0.9677112663180605, + "grad_norm": 0.036468585815911905, + "learning_rate": 4.581910602959607e-06, + "loss": 0.8153, + "step": 1654 + }, + { + "epoch": 0.9682963396350606, + "grad_norm": 0.03926208587753877, + "learning_rate": 4.5786058789999275e-06, + "loss": 0.7298, + "step": 1655 + }, + { + "epoch": 0.9688814129520605, + "grad_norm": 0.04187645286136086, + "learning_rate": 4.575301004153415e-06, + "loss": 0.7198, + "step": 1656 + }, + { + "epoch": 0.9694664862690606, + "grad_norm": 0.0413499471995496, + "learning_rate": 4.571995981212035e-06, + "loss": 0.7575, + "step": 1657 + }, + { + "epoch": 0.9700515595860606, + "grad_norm": 0.03415914885523872, + "learning_rate": 4.568690812967886e-06, + "loss": 0.8051, + "step": 1658 + }, + { + "epoch": 0.9706366329030607, + "grad_norm": 0.03725068359340925, + "learning_rate": 4.565385502213181e-06, + "loss": 0.735, + "step": 1659 + }, + { + "epoch": 0.9712217062200607, + "grad_norm": 0.03893390573113403, + "learning_rate": 4.562080051740261e-06, + "loss": 0.7236, + "step": 1660 + }, + { + "epoch": 0.9718067795370607, + "grad_norm": 0.03672862046599132, + "learning_rate": 4.558774464341577e-06, + "loss": 0.8214, + "step": 1661 + }, + { + "epoch": 0.9723918528540608, + "grad_norm": 0.035620814744529494, + "learning_rate": 4.555468742809705e-06, + "loss": 0.8607, + "step": 1662 + }, + { + "epoch": 0.9729769261710608, + "grad_norm": 0.045219348615134776, + "learning_rate": 4.552162889937326e-06, + "loss": 0.793, + "step": 1663 + }, + { + "epoch": 0.9735619994880609, + "grad_norm": 0.04120637349080747, + "learning_rate": 4.548856908517236e-06, + "loss": 0.761, + "step": 1664 + }, + { + "epoch": 0.9741470728050609, + "grad_norm": 0.037426001969961896, + "learning_rate": 4.545550801342341e-06, + "loss": 0.7146, + "step": 1665 + }, + { + "epoch": 0.9747321461220609, + "grad_norm": 0.04095353340190115, + "learning_rate": 4.542244571205649e-06, + "loss": 0.7821, + "step": 1666 + }, + { + "epoch": 0.9753172194390609, + "grad_norm": 0.035000026858081075, + "learning_rate": 4.538938220900275e-06, + "loss": 0.7288, + "step": 1667 + }, + { + "epoch": 0.975902292756061, + "grad_norm": 0.03646938717887608, + "learning_rate": 4.535631753219433e-06, + "loss": 0.7056, + "step": 1668 + }, + { + "epoch": 0.9764873660730611, + "grad_norm": 0.14152781507488307, + "learning_rate": 4.53232517095644e-06, + "loss": 0.8503, + "step": 1669 + }, + { + "epoch": 0.977072439390061, + "grad_norm": 0.040871359096417335, + "learning_rate": 4.529018476904708e-06, + "loss": 0.6784, + "step": 1670 + }, + { + "epoch": 0.9776575127070611, + "grad_norm": 0.03608347625330109, + "learning_rate": 4.52571167385774e-06, + "loss": 0.7103, + "step": 1671 + }, + { + "epoch": 0.9782425860240611, + "grad_norm": 0.038805523132996685, + "learning_rate": 4.522404764609138e-06, + "loss": 0.7475, + "step": 1672 + }, + { + "epoch": 0.9788276593410612, + "grad_norm": 0.05674110098536558, + "learning_rate": 4.5190977519525864e-06, + "loss": 0.8209, + "step": 1673 + }, + { + "epoch": 0.9794127326580612, + "grad_norm": 0.03773221772259324, + "learning_rate": 4.5157906386818625e-06, + "loss": 0.8054, + "step": 1674 + }, + { + "epoch": 0.9799978059750613, + "grad_norm": 0.03641333096994463, + "learning_rate": 4.512483427590826e-06, + "loss": 0.8017, + "step": 1675 + }, + { + "epoch": 0.9805828792920613, + "grad_norm": 0.03773409152086726, + "learning_rate": 4.509176121473418e-06, + "loss": 0.7771, + "step": 1676 + }, + { + "epoch": 0.9811679526090613, + "grad_norm": 0.03675860145260396, + "learning_rate": 4.505868723123664e-06, + "loss": 0.7667, + "step": 1677 + }, + { + "epoch": 0.9817530259260614, + "grad_norm": 0.03759372778875698, + "learning_rate": 4.502561235335665e-06, + "loss": 0.7969, + "step": 1678 + }, + { + "epoch": 0.9823380992430614, + "grad_norm": 0.0375414007747319, + "learning_rate": 4.499253660903595e-06, + "loss": 0.7952, + "step": 1679 + }, + { + "epoch": 0.9829231725600615, + "grad_norm": 0.03422344712918065, + "learning_rate": 4.495946002621706e-06, + "loss": 0.7594, + "step": 1680 + }, + { + "epoch": 0.9835082458770614, + "grad_norm": 0.056821433139655465, + "learning_rate": 4.492638263284318e-06, + "loss": 0.749, + "step": 1681 + }, + { + "epoch": 0.9840933191940615, + "grad_norm": 0.11252954598380907, + "learning_rate": 4.489330445685821e-06, + "loss": 0.7391, + "step": 1682 + }, + { + "epoch": 0.9846783925110616, + "grad_norm": 0.042469382100373515, + "learning_rate": 4.486022552620667e-06, + "loss": 0.7923, + "step": 1683 + }, + { + "epoch": 0.9852634658280616, + "grad_norm": 0.048834820135742855, + "learning_rate": 4.482714586883381e-06, + "loss": 0.7472, + "step": 1684 + }, + { + "epoch": 0.9858485391450617, + "grad_norm": 0.03586075302565133, + "learning_rate": 4.479406551268536e-06, + "loss": 0.7405, + "step": 1685 + }, + { + "epoch": 0.9864336124620616, + "grad_norm": 0.033808844435251076, + "learning_rate": 4.476098448570778e-06, + "loss": 0.7668, + "step": 1686 + }, + { + "epoch": 0.9870186857790617, + "grad_norm": 0.03839203972551903, + "learning_rate": 4.472790281584798e-06, + "loss": 0.7892, + "step": 1687 + }, + { + "epoch": 0.9876037590960617, + "grad_norm": 0.041714468819081474, + "learning_rate": 4.469482053105348e-06, + "loss": 0.6825, + "step": 1688 + }, + { + "epoch": 0.9881888324130618, + "grad_norm": 0.04639986807865863, + "learning_rate": 4.4661737659272335e-06, + "loss": 0.77, + "step": 1689 + }, + { + "epoch": 0.9887739057300619, + "grad_norm": 0.03542864575710173, + "learning_rate": 4.462865422845301e-06, + "loss": 0.7578, + "step": 1690 + }, + { + "epoch": 0.9893589790470618, + "grad_norm": 0.03475434478064272, + "learning_rate": 4.459557026654453e-06, + "loss": 0.7783, + "step": 1691 + }, + { + "epoch": 0.9899440523640619, + "grad_norm": 0.15800557603413382, + "learning_rate": 4.456248580149634e-06, + "loss": 0.8381, + "step": 1692 + }, + { + "epoch": 0.9905291256810619, + "grad_norm": 0.03764241378382179, + "learning_rate": 4.452940086125827e-06, + "loss": 0.7773, + "step": 1693 + }, + { + "epoch": 0.991114198998062, + "grad_norm": 0.036921463016592035, + "learning_rate": 4.449631547378062e-06, + "loss": 0.7178, + "step": 1694 + }, + { + "epoch": 0.9916992723150619, + "grad_norm": 0.04206493996925596, + "learning_rate": 4.446322966701402e-06, + "loss": 0.8461, + "step": 1695 + }, + { + "epoch": 0.992284345632062, + "grad_norm": 0.03684149770322195, + "learning_rate": 4.443014346890949e-06, + "loss": 0.7421, + "step": 1696 + }, + { + "epoch": 0.992869418949062, + "grad_norm": 0.03706235629788984, + "learning_rate": 4.439705690741832e-06, + "loss": 0.747, + "step": 1697 + }, + { + "epoch": 0.9934544922660621, + "grad_norm": 0.03593179042253731, + "learning_rate": 4.436397001049217e-06, + "loss": 0.7973, + "step": 1698 + }, + { + "epoch": 0.9940395655830622, + "grad_norm": 0.03758870754317747, + "learning_rate": 4.4330882806082965e-06, + "loss": 0.752, + "step": 1699 + }, + { + "epoch": 0.9946246389000621, + "grad_norm": 0.039890034089472524, + "learning_rate": 4.429779532214288e-06, + "loss": 0.8217, + "step": 1700 + }, + { + "epoch": 0.9952097122170622, + "grad_norm": 0.038591012254334464, + "learning_rate": 4.4264707586624305e-06, + "loss": 0.7909, + "step": 1701 + }, + { + "epoch": 0.9957947855340622, + "grad_norm": 0.037417734290621826, + "learning_rate": 4.4231619627479895e-06, + "loss": 0.8227, + "step": 1702 + }, + { + "epoch": 0.9963798588510623, + "grad_norm": 0.036754661863113706, + "learning_rate": 4.419853147266248e-06, + "loss": 0.8273, + "step": 1703 + }, + { + "epoch": 0.9969649321680623, + "grad_norm": 0.03545112711834771, + "learning_rate": 4.4165443150125e-06, + "loss": 0.7733, + "step": 1704 + }, + { + "epoch": 0.9975500054850623, + "grad_norm": 0.0453520557159223, + "learning_rate": 4.413235468782063e-06, + "loss": 0.7291, + "step": 1705 + }, + { + "epoch": 0.9981350788020624, + "grad_norm": 0.03434680179451016, + "learning_rate": 4.409926611370255e-06, + "loss": 0.7253, + "step": 1706 + }, + { + "epoch": 0.9987201521190624, + "grad_norm": 0.03386357517901744, + "learning_rate": 4.406617745572415e-06, + "loss": 0.7843, + "step": 1707 + }, + { + "epoch": 0.9993052254360625, + "grad_norm": 0.03935904245669636, + "learning_rate": 4.403308874183881e-06, + "loss": 0.7553, + "step": 1708 + }, + { + "epoch": 0.9998902987530625, + "grad_norm": 0.03529453299196854, + "learning_rate": 4.4e-06, + "loss": 0.8177, + "step": 1709 + }, + { + "epoch": 1.0003291037408126, + "grad_norm": 0.0618709966791903, + "learning_rate": 4.3966911258161184e-06, + "loss": 1.2009, + "step": 1710 + }, + { + "epoch": 1.0009141770578125, + "grad_norm": 0.05054959075407372, + "learning_rate": 4.393382254427584e-06, + "loss": 0.7522, + "step": 1711 + }, + { + "epoch": 1.0014992503748126, + "grad_norm": 0.046546989029328134, + "learning_rate": 4.390073388629746e-06, + "loss": 0.6742, + "step": 1712 + }, + { + "epoch": 1.0020843236918127, + "grad_norm": 0.0394482669118922, + "learning_rate": 4.386764531217939e-06, + "loss": 0.6234, + "step": 1713 + }, + { + "epoch": 1.0026693970088127, + "grad_norm": 0.04036631292187251, + "learning_rate": 4.3834556849875e-06, + "loss": 0.5347, + "step": 1714 + }, + { + "epoch": 1.0032544703258126, + "grad_norm": 0.0683784483426072, + "learning_rate": 4.380146852733752e-06, + "loss": 0.6644, + "step": 1715 + }, + { + "epoch": 1.0038395436428127, + "grad_norm": 0.04024887608738653, + "learning_rate": 4.376838037252009e-06, + "loss": 0.5429, + "step": 1716 + }, + { + "epoch": 1.0044246169598128, + "grad_norm": 0.04001998958059999, + "learning_rate": 4.37352924133757e-06, + "loss": 0.6346, + "step": 1717 + }, + { + "epoch": 1.0050096902768129, + "grad_norm": 0.03981740214221622, + "learning_rate": 4.370220467785714e-06, + "loss": 0.6179, + "step": 1718 + }, + { + "epoch": 1.005594763593813, + "grad_norm": 0.04221085502304135, + "learning_rate": 4.366911719391705e-06, + "loss": 0.6728, + "step": 1719 + }, + { + "epoch": 1.0061798369108128, + "grad_norm": 0.041008201601693474, + "learning_rate": 4.363602998950783e-06, + "loss": 0.664, + "step": 1720 + }, + { + "epoch": 1.006764910227813, + "grad_norm": 0.04267520538484461, + "learning_rate": 4.360294309258169e-06, + "loss": 0.5861, + "step": 1721 + }, + { + "epoch": 1.007349983544813, + "grad_norm": 0.0380878408560243, + "learning_rate": 4.3569856531090524e-06, + "loss": 0.6581, + "step": 1722 + }, + { + "epoch": 1.007935056861813, + "grad_norm": 0.044581500184632424, + "learning_rate": 4.353677033298599e-06, + "loss": 0.6328, + "step": 1723 + }, + { + "epoch": 1.008520130178813, + "grad_norm": 0.04959235942274387, + "learning_rate": 4.350368452621938e-06, + "loss": 0.7268, + "step": 1724 + }, + { + "epoch": 1.009105203495813, + "grad_norm": 0.2678728322882004, + "learning_rate": 4.347059913874174e-06, + "loss": 0.5815, + "step": 1725 + }, + { + "epoch": 1.009690276812813, + "grad_norm": 0.04227602325404652, + "learning_rate": 4.343751419850368e-06, + "loss": 0.6351, + "step": 1726 + }, + { + "epoch": 1.0102753501298132, + "grad_norm": 0.04538776787193514, + "learning_rate": 4.340442973345546e-06, + "loss": 0.5471, + "step": 1727 + }, + { + "epoch": 1.0108604234468133, + "grad_norm": 0.06267951676327517, + "learning_rate": 4.337134577154699e-06, + "loss": 0.6463, + "step": 1728 + }, + { + "epoch": 1.0114454967638131, + "grad_norm": 0.04037151798930933, + "learning_rate": 4.333826234072769e-06, + "loss": 0.6353, + "step": 1729 + }, + { + "epoch": 1.0120305700808132, + "grad_norm": 0.03689888346419047, + "learning_rate": 4.330517946894651e-06, + "loss": 0.5699, + "step": 1730 + }, + { + "epoch": 1.0126156433978133, + "grad_norm": 0.03497818177140738, + "learning_rate": 4.3272097184152026e-06, + "loss": 0.5889, + "step": 1731 + }, + { + "epoch": 1.0132007167148134, + "grad_norm": 0.11544161330826186, + "learning_rate": 4.3239015514292225e-06, + "loss": 0.6106, + "step": 1732 + }, + { + "epoch": 1.0137857900318135, + "grad_norm": 0.04017346774193498, + "learning_rate": 4.320593448731463e-06, + "loss": 0.6093, + "step": 1733 + }, + { + "epoch": 1.0143708633488133, + "grad_norm": 0.06993118467504732, + "learning_rate": 4.317285413116621e-06, + "loss": 0.6178, + "step": 1734 + }, + { + "epoch": 1.0149559366658134, + "grad_norm": 0.042065264751811854, + "learning_rate": 4.313977447379332e-06, + "loss": 0.6502, + "step": 1735 + }, + { + "epoch": 1.0155410099828135, + "grad_norm": 0.041084510994879864, + "learning_rate": 4.310669554314179e-06, + "loss": 0.5617, + "step": 1736 + }, + { + "epoch": 1.0161260832998136, + "grad_norm": 0.049710466173704275, + "learning_rate": 4.307361736715682e-06, + "loss": 0.6016, + "step": 1737 + }, + { + "epoch": 1.0167111566168134, + "grad_norm": 0.03634779224936381, + "learning_rate": 4.304053997378294e-06, + "loss": 0.6092, + "step": 1738 + }, + { + "epoch": 1.0172962299338135, + "grad_norm": 0.05093471107840585, + "learning_rate": 4.300746339096404e-06, + "loss": 0.6604, + "step": 1739 + }, + { + "epoch": 1.0178813032508136, + "grad_norm": 0.052249007194382246, + "learning_rate": 4.2974387646643366e-06, + "loss": 0.6268, + "step": 1740 + }, + { + "epoch": 1.0184663765678137, + "grad_norm": 0.04168400074209779, + "learning_rate": 4.294131276876336e-06, + "loss": 0.6615, + "step": 1741 + }, + { + "epoch": 1.0190514498848138, + "grad_norm": 0.10284332944913842, + "learning_rate": 4.290823878526582e-06, + "loss": 0.6481, + "step": 1742 + }, + { + "epoch": 1.0196365232018136, + "grad_norm": 0.06107877656234674, + "learning_rate": 4.287516572409174e-06, + "loss": 0.5273, + "step": 1743 + }, + { + "epoch": 1.0202215965188137, + "grad_norm": 0.039593505867561385, + "learning_rate": 4.284209361318137e-06, + "loss": 0.5541, + "step": 1744 + }, + { + "epoch": 1.0208066698358138, + "grad_norm": 0.03815708858382624, + "learning_rate": 4.280902248047414e-06, + "loss": 0.6568, + "step": 1745 + }, + { + "epoch": 1.021391743152814, + "grad_norm": 0.045350052500293725, + "learning_rate": 4.277595235390863e-06, + "loss": 0.4949, + "step": 1746 + }, + { + "epoch": 1.021976816469814, + "grad_norm": 0.062366644080380174, + "learning_rate": 4.27428832614226e-06, + "loss": 0.6412, + "step": 1747 + }, + { + "epoch": 1.0225618897868138, + "grad_norm": 0.038514364508854086, + "learning_rate": 4.270981523095292e-06, + "loss": 0.5954, + "step": 1748 + }, + { + "epoch": 1.023146963103814, + "grad_norm": 0.040934523714329187, + "learning_rate": 4.267674829043559e-06, + "loss": 0.6202, + "step": 1749 + }, + { + "epoch": 1.023732036420814, + "grad_norm": 0.04396368047960114, + "learning_rate": 4.2643682467805675e-06, + "loss": 0.5798, + "step": 1750 + }, + { + "epoch": 1.024317109737814, + "grad_norm": 0.05011201210029028, + "learning_rate": 4.261061779099726e-06, + "loss": 0.6321, + "step": 1751 + }, + { + "epoch": 1.024902183054814, + "grad_norm": 0.037782483274636854, + "learning_rate": 4.2577554287943515e-06, + "loss": 0.6632, + "step": 1752 + }, + { + "epoch": 1.025487256371814, + "grad_norm": 0.04152834895888912, + "learning_rate": 4.2544491986576595e-06, + "loss": 0.5945, + "step": 1753 + }, + { + "epoch": 1.0260723296888141, + "grad_norm": 0.05806333477040265, + "learning_rate": 4.2511430914827634e-06, + "loss": 0.6637, + "step": 1754 + }, + { + "epoch": 1.0266574030058142, + "grad_norm": 0.10117835297951212, + "learning_rate": 4.247837110062674e-06, + "loss": 0.6018, + "step": 1755 + }, + { + "epoch": 1.0272424763228143, + "grad_norm": 0.03885375577058434, + "learning_rate": 4.244531257190297e-06, + "loss": 0.5957, + "step": 1756 + }, + { + "epoch": 1.0278275496398142, + "grad_norm": 0.03754400735025208, + "learning_rate": 4.2412255356584226e-06, + "loss": 0.6496, + "step": 1757 + }, + { + "epoch": 1.0284126229568142, + "grad_norm": 0.04039354810616503, + "learning_rate": 4.23791994825974e-06, + "loss": 0.636, + "step": 1758 + }, + { + "epoch": 1.0289976962738143, + "grad_norm": 0.03960082134430345, + "learning_rate": 4.2346144977868195e-06, + "loss": 0.6303, + "step": 1759 + }, + { + "epoch": 1.0295827695908144, + "grad_norm": 0.038169466661217184, + "learning_rate": 4.231309187032114e-06, + "loss": 0.667, + "step": 1760 + }, + { + "epoch": 1.0301678429078145, + "grad_norm": 0.036760609517840784, + "learning_rate": 4.228004018787964e-06, + "loss": 0.5326, + "step": 1761 + }, + { + "epoch": 1.0307529162248144, + "grad_norm": 0.05224287627733055, + "learning_rate": 4.224698995846585e-06, + "loss": 0.5847, + "step": 1762 + }, + { + "epoch": 1.0313379895418144, + "grad_norm": 0.04647643862134824, + "learning_rate": 4.221394121000072e-06, + "loss": 0.6364, + "step": 1763 + }, + { + "epoch": 1.0319230628588145, + "grad_norm": 0.039427816196321165, + "learning_rate": 4.218089397040393e-06, + "loss": 0.6066, + "step": 1764 + }, + { + "epoch": 1.0325081361758146, + "grad_norm": 0.03480024025349239, + "learning_rate": 4.214784826759391e-06, + "loss": 0.5554, + "step": 1765 + }, + { + "epoch": 1.0330932094928145, + "grad_norm": 0.08040264519097115, + "learning_rate": 4.211480412948778e-06, + "loss": 0.5786, + "step": 1766 + }, + { + "epoch": 1.0336782828098146, + "grad_norm": 0.04884829861187939, + "learning_rate": 4.208176158400132e-06, + "loss": 0.6147, + "step": 1767 + }, + { + "epoch": 1.0342633561268146, + "grad_norm": 0.03732626269644399, + "learning_rate": 4.2048720659049e-06, + "loss": 0.6238, + "step": 1768 + }, + { + "epoch": 1.0348484294438147, + "grad_norm": 0.03742496299770894, + "learning_rate": 4.201568138254388e-06, + "loss": 0.6473, + "step": 1769 + }, + { + "epoch": 1.0354335027608148, + "grad_norm": 0.037259663998191445, + "learning_rate": 4.1982643782397665e-06, + "loss": 0.5993, + "step": 1770 + }, + { + "epoch": 1.0360185760778147, + "grad_norm": 0.03806318673198438, + "learning_rate": 4.194960788652061e-06, + "loss": 0.5744, + "step": 1771 + }, + { + "epoch": 1.0366036493948148, + "grad_norm": 0.04271776837607931, + "learning_rate": 4.191657372282161e-06, + "loss": 0.6405, + "step": 1772 + }, + { + "epoch": 1.0371887227118148, + "grad_norm": 0.1193247723794853, + "learning_rate": 4.188354131920795e-06, + "loss": 0.6305, + "step": 1773 + }, + { + "epoch": 1.037773796028815, + "grad_norm": 0.041408678944841584, + "learning_rate": 4.185051070358556e-06, + "loss": 0.672, + "step": 1774 + }, + { + "epoch": 1.038358869345815, + "grad_norm": 0.04305727303228828, + "learning_rate": 4.181748190385882e-06, + "loss": 0.574, + "step": 1775 + }, + { + "epoch": 1.0389439426628149, + "grad_norm": 0.03600148051856665, + "learning_rate": 4.178445494793054e-06, + "loss": 0.5927, + "step": 1776 + }, + { + "epoch": 1.039529015979815, + "grad_norm": 0.036257379509770316, + "learning_rate": 4.1751429863702024e-06, + "loss": 0.6351, + "step": 1777 + }, + { + "epoch": 1.040114089296815, + "grad_norm": 0.05508170928413832, + "learning_rate": 4.171840667907297e-06, + "loss": 0.6693, + "step": 1778 + }, + { + "epoch": 1.0406991626138151, + "grad_norm": 0.0359589616622991, + "learning_rate": 4.168538542194145e-06, + "loss": 0.643, + "step": 1779 + }, + { + "epoch": 1.041284235930815, + "grad_norm": 0.04574135624876753, + "learning_rate": 4.1652366120203955e-06, + "loss": 0.6592, + "step": 1780 + }, + { + "epoch": 1.041869309247815, + "grad_norm": 0.04576629116297331, + "learning_rate": 4.161934880175529e-06, + "loss": 0.6442, + "step": 1781 + }, + { + "epoch": 1.0424543825648152, + "grad_norm": 0.0390727224308255, + "learning_rate": 4.15863334944886e-06, + "loss": 0.6548, + "step": 1782 + }, + { + "epoch": 1.0430394558818152, + "grad_norm": 0.041392206592811995, + "learning_rate": 4.155332022629533e-06, + "loss": 0.6029, + "step": 1783 + }, + { + "epoch": 1.0436245291988153, + "grad_norm": 0.03762867728092406, + "learning_rate": 4.152030902506518e-06, + "loss": 0.6379, + "step": 1784 + }, + { + "epoch": 1.0442096025158152, + "grad_norm": 0.04399197262339416, + "learning_rate": 4.148729991868613e-06, + "loss": 0.6502, + "step": 1785 + }, + { + "epoch": 1.0447946758328153, + "grad_norm": 0.034378949993079645, + "learning_rate": 4.145429293504438e-06, + "loss": 0.5196, + "step": 1786 + }, + { + "epoch": 1.0453797491498154, + "grad_norm": 0.08443203067098491, + "learning_rate": 4.142128810202434e-06, + "loss": 0.5683, + "step": 1787 + }, + { + "epoch": 1.0459648224668154, + "grad_norm": 0.0379632191291091, + "learning_rate": 4.138828544750861e-06, + "loss": 0.592, + "step": 1788 + }, + { + "epoch": 1.0465498957838153, + "grad_norm": 0.038094993376842946, + "learning_rate": 4.135528499937794e-06, + "loss": 0.6709, + "step": 1789 + }, + { + "epoch": 1.0471349691008154, + "grad_norm": 0.04377752430482512, + "learning_rate": 4.13222867855112e-06, + "loss": 0.5478, + "step": 1790 + }, + { + "epoch": 1.0477200424178155, + "grad_norm": 0.040127517533422315, + "learning_rate": 4.128929083378542e-06, + "loss": 0.6338, + "step": 1791 + }, + { + "epoch": 1.0483051157348156, + "grad_norm": 0.041540955383264945, + "learning_rate": 4.125629717207565e-06, + "loss": 0.6282, + "step": 1792 + }, + { + "epoch": 1.0488901890518156, + "grad_norm": 0.04313573791852679, + "learning_rate": 4.122330582825507e-06, + "loss": 0.5929, + "step": 1793 + }, + { + "epoch": 1.0494752623688155, + "grad_norm": 0.04920349769302968, + "learning_rate": 4.119031683019487e-06, + "loss": 0.642, + "step": 1794 + }, + { + "epoch": 1.0500603356858156, + "grad_norm": 0.04118717172286108, + "learning_rate": 4.115733020576427e-06, + "loss": 0.6701, + "step": 1795 + }, + { + "epoch": 1.0506454090028157, + "grad_norm": 0.03762242324561743, + "learning_rate": 4.112434598283046e-06, + "loss": 0.5557, + "step": 1796 + }, + { + "epoch": 1.0512304823198158, + "grad_norm": 0.04863139641918998, + "learning_rate": 4.1091364189258645e-06, + "loss": 0.6696, + "step": 1797 + }, + { + "epoch": 1.0518155556368158, + "grad_norm": 0.03800325798941253, + "learning_rate": 4.105838485291194e-06, + "loss": 0.5444, + "step": 1798 + }, + { + "epoch": 1.0524006289538157, + "grad_norm": 0.04250406776849721, + "learning_rate": 4.102540800165137e-06, + "loss": 0.6006, + "step": 1799 + }, + { + "epoch": 1.0529857022708158, + "grad_norm": 0.03566004409727574, + "learning_rate": 4.099243366333598e-06, + "loss": 0.5474, + "step": 1800 + }, + { + "epoch": 1.0535707755878159, + "grad_norm": 0.03920829695942249, + "learning_rate": 4.0959461865822476e-06, + "loss": 0.6428, + "step": 1801 + }, + { + "epoch": 1.054155848904816, + "grad_norm": 0.04622201693753423, + "learning_rate": 4.092649263696562e-06, + "loss": 0.5943, + "step": 1802 + }, + { + "epoch": 1.054740922221816, + "grad_norm": 0.05615949708259005, + "learning_rate": 4.089352600461791e-06, + "loss": 0.5458, + "step": 1803 + }, + { + "epoch": 1.055325995538816, + "grad_norm": 0.04045765925529075, + "learning_rate": 4.086056199662964e-06, + "loss": 0.635, + "step": 1804 + }, + { + "epoch": 1.055911068855816, + "grad_norm": 0.04630492862429359, + "learning_rate": 4.0827600640848965e-06, + "loss": 0.6339, + "step": 1805 + }, + { + "epoch": 1.056496142172816, + "grad_norm": 0.04703035260473936, + "learning_rate": 4.079464196512169e-06, + "loss": 0.6374, + "step": 1806 + }, + { + "epoch": 1.0570812154898162, + "grad_norm": 0.03739049283716097, + "learning_rate": 4.0761685997291465e-06, + "loss": 0.6206, + "step": 1807 + }, + { + "epoch": 1.057666288806816, + "grad_norm": 0.036569806915412575, + "learning_rate": 4.072873276519955e-06, + "loss": 0.5138, + "step": 1808 + }, + { + "epoch": 1.058251362123816, + "grad_norm": 0.04051229683477904, + "learning_rate": 4.069578229668497e-06, + "loss": 0.6494, + "step": 1809 + }, + { + "epoch": 1.0588364354408162, + "grad_norm": 0.035018447172935556, + "learning_rate": 4.0662834619584385e-06, + "loss": 0.5486, + "step": 1810 + }, + { + "epoch": 1.0594215087578163, + "grad_norm": 0.04259663408302132, + "learning_rate": 4.062988976173212e-06, + "loss": 0.5984, + "step": 1811 + }, + { + "epoch": 1.0600065820748164, + "grad_norm": 0.04168057194830636, + "learning_rate": 4.059694775096007e-06, + "loss": 0.6659, + "step": 1812 + }, + { + "epoch": 1.0605916553918162, + "grad_norm": 0.03900712312055412, + "learning_rate": 4.056400861509775e-06, + "loss": 0.6419, + "step": 1813 + }, + { + "epoch": 1.0611767287088163, + "grad_norm": 0.034201541655564785, + "learning_rate": 4.053107238197229e-06, + "loss": 0.5947, + "step": 1814 + }, + { + "epoch": 1.0617618020258164, + "grad_norm": 0.03574462040512478, + "learning_rate": 4.0498139079408295e-06, + "loss": 0.6378, + "step": 1815 + }, + { + "epoch": 1.0623468753428165, + "grad_norm": 0.038067071655048315, + "learning_rate": 4.046520873522795e-06, + "loss": 0.5626, + "step": 1816 + }, + { + "epoch": 1.0629319486598163, + "grad_norm": 0.04383692370100588, + "learning_rate": 4.043228137725088e-06, + "loss": 0.6189, + "step": 1817 + }, + { + "epoch": 1.0635170219768164, + "grad_norm": 0.03887409023229734, + "learning_rate": 4.039935703329427e-06, + "loss": 0.6488, + "step": 1818 + }, + { + "epoch": 1.0641020952938165, + "grad_norm": 0.037631106644317916, + "learning_rate": 4.03664357311727e-06, + "loss": 0.5685, + "step": 1819 + }, + { + "epoch": 1.0646871686108166, + "grad_norm": 0.03915394810150019, + "learning_rate": 4.033351749869816e-06, + "loss": 0.5143, + "step": 1820 + }, + { + "epoch": 1.0652722419278167, + "grad_norm": 0.23381999090629155, + "learning_rate": 4.030060236368015e-06, + "loss": 0.748, + "step": 1821 + }, + { + "epoch": 1.0658573152448165, + "grad_norm": 0.03673527214266167, + "learning_rate": 4.026769035392542e-06, + "loss": 0.6073, + "step": 1822 + }, + { + "epoch": 1.0664423885618166, + "grad_norm": 0.04415310297747642, + "learning_rate": 4.0234781497238175e-06, + "loss": 0.5535, + "step": 1823 + }, + { + "epoch": 1.0670274618788167, + "grad_norm": 0.03714005735947303, + "learning_rate": 4.020187582141993e-06, + "loss": 0.5971, + "step": 1824 + }, + { + "epoch": 1.0676125351958168, + "grad_norm": 0.03766490691087146, + "learning_rate": 4.016897335426948e-06, + "loss": 0.6188, + "step": 1825 + }, + { + "epoch": 1.0681976085128169, + "grad_norm": 0.04033316810065516, + "learning_rate": 4.013607412358296e-06, + "loss": 0.6111, + "step": 1826 + }, + { + "epoch": 1.0687826818298167, + "grad_norm": 0.05353718654581134, + "learning_rate": 4.010317815715376e-06, + "loss": 0.6179, + "step": 1827 + }, + { + "epoch": 1.0693677551468168, + "grad_norm": 0.046608435545504985, + "learning_rate": 4.007028548277247e-06, + "loss": 0.6532, + "step": 1828 + }, + { + "epoch": 1.069952828463817, + "grad_norm": 0.03697488633055468, + "learning_rate": 4.003739612822695e-06, + "loss": 0.5901, + "step": 1829 + }, + { + "epoch": 1.070537901780817, + "grad_norm": 0.036722760608521324, + "learning_rate": 4.000451012130223e-06, + "loss": 0.605, + "step": 1830 + }, + { + "epoch": 1.0711229750978168, + "grad_norm": 0.038328830780971296, + "learning_rate": 3.997162748978052e-06, + "loss": 0.6481, + "step": 1831 + }, + { + "epoch": 1.071708048414817, + "grad_norm": 0.03873004486583279, + "learning_rate": 3.993874826144117e-06, + "loss": 0.636, + "step": 1832 + }, + { + "epoch": 1.072293121731817, + "grad_norm": 0.03337836188489294, + "learning_rate": 3.990587246406065e-06, + "loss": 0.5294, + "step": 1833 + }, + { + "epoch": 1.072878195048817, + "grad_norm": 0.039178449513301655, + "learning_rate": 3.987300012541256e-06, + "loss": 0.6828, + "step": 1834 + }, + { + "epoch": 1.0734632683658172, + "grad_norm": 0.03482979527366474, + "learning_rate": 3.984013127326756e-06, + "loss": 0.6274, + "step": 1835 + }, + { + "epoch": 1.074048341682817, + "grad_norm": 0.03869433482507194, + "learning_rate": 3.980726593539336e-06, + "loss": 0.6027, + "step": 1836 + }, + { + "epoch": 1.0746334149998171, + "grad_norm": 0.04937610492549535, + "learning_rate": 3.9774404139554695e-06, + "loss": 0.6049, + "step": 1837 + }, + { + "epoch": 1.0752184883168172, + "grad_norm": 0.04799441643569765, + "learning_rate": 3.974154591351334e-06, + "loss": 0.6649, + "step": 1838 + }, + { + "epoch": 1.0758035616338173, + "grad_norm": 0.05456302001307194, + "learning_rate": 3.9708691285028015e-06, + "loss": 0.5912, + "step": 1839 + }, + { + "epoch": 1.0763886349508174, + "grad_norm": 0.03519964787380995, + "learning_rate": 3.967584028185443e-06, + "loss": 0.5268, + "step": 1840 + }, + { + "epoch": 1.0769737082678172, + "grad_norm": 0.038771677450854726, + "learning_rate": 3.9642992931745195e-06, + "loss": 0.6219, + "step": 1841 + }, + { + "epoch": 1.0775587815848173, + "grad_norm": 0.03752406445751352, + "learning_rate": 3.961014926244989e-06, + "loss": 0.5636, + "step": 1842 + }, + { + "epoch": 1.0781438549018174, + "grad_norm": 0.04781643644337222, + "learning_rate": 3.957730930171496e-06, + "loss": 0.5356, + "step": 1843 + }, + { + "epoch": 1.0787289282188175, + "grad_norm": 0.03852154632487492, + "learning_rate": 3.954447307728367e-06, + "loss": 0.7087, + "step": 1844 + }, + { + "epoch": 1.0793140015358174, + "grad_norm": 0.03440089125130851, + "learning_rate": 3.951164061689621e-06, + "loss": 0.5732, + "step": 1845 + }, + { + "epoch": 1.0798990748528174, + "grad_norm": 0.10561960370834975, + "learning_rate": 3.947881194828953e-06, + "loss": 0.5871, + "step": 1846 + }, + { + "epoch": 1.0804841481698175, + "grad_norm": 0.03365711719667564, + "learning_rate": 3.944598709919741e-06, + "loss": 0.5541, + "step": 1847 + }, + { + "epoch": 1.0810692214868176, + "grad_norm": 0.04080392556587059, + "learning_rate": 3.941316609735038e-06, + "loss": 0.6567, + "step": 1848 + }, + { + "epoch": 1.0816542948038177, + "grad_norm": 0.03668175589524903, + "learning_rate": 3.9380348970475745e-06, + "loss": 0.6026, + "step": 1849 + }, + { + "epoch": 1.0822393681208176, + "grad_norm": 0.039068033523461096, + "learning_rate": 3.934753574629751e-06, + "loss": 0.6756, + "step": 1850 + }, + { + "epoch": 1.0828244414378176, + "grad_norm": 0.03600473203520014, + "learning_rate": 3.931472645253638e-06, + "loss": 0.5998, + "step": 1851 + }, + { + "epoch": 1.0834095147548177, + "grad_norm": 0.04025386130635245, + "learning_rate": 3.928192111690978e-06, + "loss": 0.5597, + "step": 1852 + }, + { + "epoch": 1.0839945880718178, + "grad_norm": 0.036383871440146785, + "learning_rate": 3.924911976713177e-06, + "loss": 0.6221, + "step": 1853 + }, + { + "epoch": 1.0845796613888177, + "grad_norm": 0.04167477133917602, + "learning_rate": 3.9216322430913026e-06, + "loss": 0.6218, + "step": 1854 + }, + { + "epoch": 1.0851647347058178, + "grad_norm": 0.04451384993527383, + "learning_rate": 3.9183529135960836e-06, + "loss": 0.6512, + "step": 1855 + }, + { + "epoch": 1.0857498080228178, + "grad_norm": 0.043002906810256446, + "learning_rate": 3.915073990997909e-06, + "loss": 0.6139, + "step": 1856 + }, + { + "epoch": 1.086334881339818, + "grad_norm": 0.04885435791726515, + "learning_rate": 3.911795478066823e-06, + "loss": 0.5525, + "step": 1857 + }, + { + "epoch": 1.086919954656818, + "grad_norm": 0.04165286964749901, + "learning_rate": 3.908517377572525e-06, + "loss": 0.5547, + "step": 1858 + }, + { + "epoch": 1.0875050279738179, + "grad_norm": 0.1274643153042942, + "learning_rate": 3.905239692284366e-06, + "loss": 0.7246, + "step": 1859 + }, + { + "epoch": 1.088090101290818, + "grad_norm": 0.03908126994536125, + "learning_rate": 3.9019624249713445e-06, + "loss": 0.5542, + "step": 1860 + }, + { + "epoch": 1.088675174607818, + "grad_norm": 0.03773075708542036, + "learning_rate": 3.898685578402105e-06, + "loss": 0.5707, + "step": 1861 + }, + { + "epoch": 1.0892602479248181, + "grad_norm": 0.036829088849046586, + "learning_rate": 3.89540915534494e-06, + "loss": 0.5886, + "step": 1862 + }, + { + "epoch": 1.0898453212418182, + "grad_norm": 0.04130199746948449, + "learning_rate": 3.892133158567783e-06, + "loss": 0.6131, + "step": 1863 + }, + { + "epoch": 1.090430394558818, + "grad_norm": 0.04746832213288144, + "learning_rate": 3.888857590838205e-06, + "loss": 0.6648, + "step": 1864 + }, + { + "epoch": 1.0910154678758182, + "grad_norm": 0.041508017388942386, + "learning_rate": 3.885582454923419e-06, + "loss": 0.6277, + "step": 1865 + }, + { + "epoch": 1.0916005411928182, + "grad_norm": 0.0409505246603534, + "learning_rate": 3.882307753590267e-06, + "loss": 0.6125, + "step": 1866 + }, + { + "epoch": 1.0921856145098183, + "grad_norm": 0.03566935377869805, + "learning_rate": 3.879033489605231e-06, + "loss": 0.5809, + "step": 1867 + }, + { + "epoch": 1.0927706878268184, + "grad_norm": 0.04208144075756873, + "learning_rate": 3.875759665734418e-06, + "loss": 0.6503, + "step": 1868 + }, + { + "epoch": 1.0933557611438183, + "grad_norm": 0.05319025544123129, + "learning_rate": 3.872486284743565e-06, + "loss": 0.6612, + "step": 1869 + }, + { + "epoch": 1.0939408344608184, + "grad_norm": 0.036973295820488555, + "learning_rate": 3.869213349398037e-06, + "loss": 0.6477, + "step": 1870 + }, + { + "epoch": 1.0945259077778184, + "grad_norm": 0.03394449055816878, + "learning_rate": 3.8659408624628196e-06, + "loss": 0.5772, + "step": 1871 + }, + { + "epoch": 1.0951109810948185, + "grad_norm": 0.036674864070854414, + "learning_rate": 3.8626688267025195e-06, + "loss": 0.6398, + "step": 1872 + }, + { + "epoch": 1.0956960544118184, + "grad_norm": 0.03929339407053973, + "learning_rate": 3.859397244881364e-06, + "loss": 0.5085, + "step": 1873 + }, + { + "epoch": 1.0962811277288185, + "grad_norm": 0.040046901855836564, + "learning_rate": 3.856126119763195e-06, + "loss": 0.6364, + "step": 1874 + }, + { + "epoch": 1.0968662010458186, + "grad_norm": 0.035486379801793984, + "learning_rate": 3.8528554541114725e-06, + "loss": 0.5641, + "step": 1875 + }, + { + "epoch": 1.0974512743628186, + "grad_norm": 0.03936545902658793, + "learning_rate": 3.849585250689265e-06, + "loss": 0.5571, + "step": 1876 + }, + { + "epoch": 1.0980363476798187, + "grad_norm": 0.03913539174688499, + "learning_rate": 3.846315512259249e-06, + "loss": 0.5749, + "step": 1877 + }, + { + "epoch": 1.0986214209968186, + "grad_norm": 0.05225621206411416, + "learning_rate": 3.843046241583711e-06, + "loss": 0.6485, + "step": 1878 + }, + { + "epoch": 1.0992064943138187, + "grad_norm": 0.038252002337340885, + "learning_rate": 3.839777441424543e-06, + "loss": 0.5985, + "step": 1879 + }, + { + "epoch": 1.0997915676308188, + "grad_norm": 0.07992087638883832, + "learning_rate": 3.8365091145432355e-06, + "loss": 0.5751, + "step": 1880 + }, + { + "epoch": 1.1003766409478188, + "grad_norm": 0.041079963315258525, + "learning_rate": 3.833241263700887e-06, + "loss": 0.5941, + "step": 1881 + }, + { + "epoch": 1.1009617142648187, + "grad_norm": 0.03572906188540022, + "learning_rate": 3.829973891658183e-06, + "loss": 0.597, + "step": 1882 + }, + { + "epoch": 1.1015467875818188, + "grad_norm": 0.03454484109641548, + "learning_rate": 3.8267070011754125e-06, + "loss": 0.522, + "step": 1883 + }, + { + "epoch": 1.1021318608988189, + "grad_norm": 0.03529581537504829, + "learning_rate": 3.823440595012456e-06, + "loss": 0.5534, + "step": 1884 + }, + { + "epoch": 1.102716934215819, + "grad_norm": 0.03860812108580398, + "learning_rate": 3.820174675928782e-06, + "loss": 0.6171, + "step": 1885 + }, + { + "epoch": 1.103302007532819, + "grad_norm": 0.03891069294549701, + "learning_rate": 3.816909246683452e-06, + "loss": 0.6313, + "step": 1886 + }, + { + "epoch": 1.103887080849819, + "grad_norm": 0.04460994600621541, + "learning_rate": 3.8136443100351114e-06, + "loss": 0.6144, + "step": 1887 + }, + { + "epoch": 1.104472154166819, + "grad_norm": 0.03793462372864508, + "learning_rate": 3.8103798687419875e-06, + "loss": 0.6043, + "step": 1888 + }, + { + "epoch": 1.105057227483819, + "grad_norm": 0.03789544727466346, + "learning_rate": 3.8071159255618916e-06, + "loss": 0.5585, + "step": 1889 + }, + { + "epoch": 1.1056423008008192, + "grad_norm": 0.03902301020371876, + "learning_rate": 3.803852483252215e-06, + "loss": 0.5883, + "step": 1890 + }, + { + "epoch": 1.1062273741178192, + "grad_norm": 0.04123331452349562, + "learning_rate": 3.800589544569924e-06, + "loss": 0.5997, + "step": 1891 + }, + { + "epoch": 1.106812447434819, + "grad_norm": 0.0384499004027348, + "learning_rate": 3.79732711227156e-06, + "loss": 0.5425, + "step": 1892 + }, + { + "epoch": 1.1073975207518192, + "grad_norm": 0.03931159943079186, + "learning_rate": 3.794065189113235e-06, + "loss": 0.6912, + "step": 1893 + }, + { + "epoch": 1.1079825940688193, + "grad_norm": 0.042235818217412, + "learning_rate": 3.790803777850633e-06, + "loss": 0.536, + "step": 1894 + }, + { + "epoch": 1.1085676673858194, + "grad_norm": 0.07854013217181985, + "learning_rate": 3.787542881239007e-06, + "loss": 0.6006, + "step": 1895 + }, + { + "epoch": 1.1091527407028194, + "grad_norm": 0.03802962654136962, + "learning_rate": 3.7842825020331704e-06, + "loss": 0.5405, + "step": 1896 + }, + { + "epoch": 1.1097378140198193, + "grad_norm": 0.03899078017028906, + "learning_rate": 3.781022642987504e-06, + "loss": 0.606, + "step": 1897 + }, + { + "epoch": 1.1103228873368194, + "grad_norm": 0.05928727770359705, + "learning_rate": 3.7777633068559485e-06, + "loss": 0.625, + "step": 1898 + }, + { + "epoch": 1.1109079606538195, + "grad_norm": 0.036332098995253276, + "learning_rate": 3.7745044963919995e-06, + "loss": 0.563, + "step": 1899 + }, + { + "epoch": 1.1114930339708196, + "grad_norm": 0.04254812999743139, + "learning_rate": 3.7712462143487117e-06, + "loss": 0.5929, + "step": 1900 + }, + { + "epoch": 1.1120781072878194, + "grad_norm": 0.047049542101719036, + "learning_rate": 3.7679884634786917e-06, + "loss": 0.6332, + "step": 1901 + }, + { + "epoch": 1.1126631806048195, + "grad_norm": 0.05287536643154779, + "learning_rate": 3.7647312465340998e-06, + "loss": 0.6421, + "step": 1902 + }, + { + "epoch": 1.1132482539218196, + "grad_norm": 0.04037132747201506, + "learning_rate": 3.7614745662666428e-06, + "loss": 0.6103, + "step": 1903 + }, + { + "epoch": 1.1138333272388197, + "grad_norm": 0.04432953106580269, + "learning_rate": 3.758218425427575e-06, + "loss": 0.614, + "step": 1904 + }, + { + "epoch": 1.1144184005558198, + "grad_norm": 0.040942366129102785, + "learning_rate": 3.754962826767695e-06, + "loss": 0.5639, + "step": 1905 + }, + { + "epoch": 1.1150034738728196, + "grad_norm": 0.043564413168081004, + "learning_rate": 3.751707773037341e-06, + "loss": 0.6363, + "step": 1906 + }, + { + "epoch": 1.1155885471898197, + "grad_norm": 0.04136755699281491, + "learning_rate": 3.748453266986397e-06, + "loss": 0.573, + "step": 1907 + }, + { + "epoch": 1.1161736205068198, + "grad_norm": 0.03824782580749499, + "learning_rate": 3.745199311364278e-06, + "loss": 0.5565, + "step": 1908 + }, + { + "epoch": 1.1167586938238199, + "grad_norm": 0.03974476800325834, + "learning_rate": 3.7419459089199386e-06, + "loss": 0.5743, + "step": 1909 + }, + { + "epoch": 1.1173437671408197, + "grad_norm": 0.040366618049435816, + "learning_rate": 3.73869306240186e-06, + "loss": 0.6937, + "step": 1910 + }, + { + "epoch": 1.1179288404578198, + "grad_norm": 0.05249900276814925, + "learning_rate": 3.7354407745580606e-06, + "loss": 0.6519, + "step": 1911 + }, + { + "epoch": 1.11851391377482, + "grad_norm": 0.036071772850342874, + "learning_rate": 3.7321890481360836e-06, + "loss": 0.6146, + "step": 1912 + }, + { + "epoch": 1.11909898709182, + "grad_norm": 0.03497854387051239, + "learning_rate": 3.7289378858829972e-06, + "loss": 0.5502, + "step": 1913 + }, + { + "epoch": 1.11968406040882, + "grad_norm": 0.03656290869848137, + "learning_rate": 3.725687290545396e-06, + "loss": 0.5734, + "step": 1914 + }, + { + "epoch": 1.12026913372582, + "grad_norm": 0.04595919088109381, + "learning_rate": 3.7224372648693914e-06, + "loss": 0.6477, + "step": 1915 + }, + { + "epoch": 1.12085420704282, + "grad_norm": 0.03665383639665251, + "learning_rate": 3.7191878116006165e-06, + "loss": 0.6453, + "step": 1916 + }, + { + "epoch": 1.12143928035982, + "grad_norm": 0.036312925482254364, + "learning_rate": 3.71593893348422e-06, + "loss": 0.5398, + "step": 1917 + }, + { + "epoch": 1.1220243536768202, + "grad_norm": 0.040072606927414364, + "learning_rate": 3.7126906332648654e-06, + "loss": 0.6523, + "step": 1918 + }, + { + "epoch": 1.12260942699382, + "grad_norm": 0.039060685599701495, + "learning_rate": 3.709442913686727e-06, + "loss": 0.5473, + "step": 1919 + }, + { + "epoch": 1.1231945003108201, + "grad_norm": 0.039666654769551724, + "learning_rate": 3.7061957774934907e-06, + "loss": 0.594, + "step": 1920 + }, + { + "epoch": 1.1237795736278202, + "grad_norm": 0.043648362600625844, + "learning_rate": 3.7029492274283447e-06, + "loss": 0.5124, + "step": 1921 + }, + { + "epoch": 1.1243646469448203, + "grad_norm": 0.03845451561120115, + "learning_rate": 3.699703266233986e-06, + "loss": 0.6418, + "step": 1922 + }, + { + "epoch": 1.1249497202618204, + "grad_norm": 0.0376236779158915, + "learning_rate": 3.696457896652616e-06, + "loss": 0.6533, + "step": 1923 + }, + { + "epoch": 1.1255347935788205, + "grad_norm": 0.05306744881879573, + "learning_rate": 3.6932131214259306e-06, + "loss": 0.6145, + "step": 1924 + }, + { + "epoch": 1.1261198668958203, + "grad_norm": 0.042646944784004735, + "learning_rate": 3.6899689432951282e-06, + "loss": 0.5696, + "step": 1925 + }, + { + "epoch": 1.1267049402128204, + "grad_norm": 0.038706359647602315, + "learning_rate": 3.6867253650008998e-06, + "loss": 0.6775, + "step": 1926 + }, + { + "epoch": 1.1272900135298205, + "grad_norm": 0.03479027884450534, + "learning_rate": 3.6834823892834316e-06, + "loss": 0.5519, + "step": 1927 + }, + { + "epoch": 1.1278750868468206, + "grad_norm": 0.038169378097629214, + "learning_rate": 3.6802400188824017e-06, + "loss": 0.62, + "step": 1928 + }, + { + "epoch": 1.1284601601638204, + "grad_norm": 0.04283058271473719, + "learning_rate": 3.676998256536972e-06, + "loss": 0.6568, + "step": 1929 + }, + { + "epoch": 1.1290452334808205, + "grad_norm": 0.03536662278612379, + "learning_rate": 3.673757104985798e-06, + "loss": 0.5256, + "step": 1930 + }, + { + "epoch": 1.1296303067978206, + "grad_norm": 0.052506764522120546, + "learning_rate": 3.6705165669670126e-06, + "loss": 0.6033, + "step": 1931 + }, + { + "epoch": 1.1302153801148207, + "grad_norm": 0.034900074971183304, + "learning_rate": 3.6672766452182346e-06, + "loss": 0.5209, + "step": 1932 + }, + { + "epoch": 1.1308004534318208, + "grad_norm": 0.035938101531711, + "learning_rate": 3.6640373424765587e-06, + "loss": 0.6279, + "step": 1933 + }, + { + "epoch": 1.1313855267488206, + "grad_norm": 0.0392912667983424, + "learning_rate": 3.660798661478558e-06, + "loss": 0.5815, + "step": 1934 + }, + { + "epoch": 1.1319706000658207, + "grad_norm": 0.0398303511009714, + "learning_rate": 3.6575606049602835e-06, + "loss": 0.626, + "step": 1935 + }, + { + "epoch": 1.1325556733828208, + "grad_norm": 0.03611889807550921, + "learning_rate": 3.6543231756572554e-06, + "loss": 0.6771, + "step": 1936 + }, + { + "epoch": 1.133140746699821, + "grad_norm": 0.03532028188902697, + "learning_rate": 3.651086376304464e-06, + "loss": 0.5152, + "step": 1937 + }, + { + "epoch": 1.1337258200168208, + "grad_norm": 0.03967164811503248, + "learning_rate": 3.647850209636367e-06, + "loss": 0.6211, + "step": 1938 + }, + { + "epoch": 1.1343108933338208, + "grad_norm": 0.038476418430884396, + "learning_rate": 3.6446146783868902e-06, + "loss": 0.6209, + "step": 1939 + }, + { + "epoch": 1.134895966650821, + "grad_norm": 0.04068455632446945, + "learning_rate": 3.6413797852894204e-06, + "loss": 0.5772, + "step": 1940 + }, + { + "epoch": 1.135481039967821, + "grad_norm": 0.04555373439429878, + "learning_rate": 3.6381455330768054e-06, + "loss": 0.7082, + "step": 1941 + }, + { + "epoch": 1.136066113284821, + "grad_norm": 0.03521342334256349, + "learning_rate": 3.634911924481355e-06, + "loss": 0.5159, + "step": 1942 + }, + { + "epoch": 1.136651186601821, + "grad_norm": 0.04258017514314027, + "learning_rate": 3.6316789622348285e-06, + "loss": 0.6753, + "step": 1943 + }, + { + "epoch": 1.137236259918821, + "grad_norm": 0.03516384606881102, + "learning_rate": 3.6284466490684453e-06, + "loss": 0.6492, + "step": 1944 + }, + { + "epoch": 1.1378213332358211, + "grad_norm": 0.04116371229255566, + "learning_rate": 3.625214987712873e-06, + "loss": 0.6333, + "step": 1945 + }, + { + "epoch": 1.1384064065528212, + "grad_norm": 0.04111092828869013, + "learning_rate": 3.621983980898231e-06, + "loss": 0.5967, + "step": 1946 + }, + { + "epoch": 1.138991479869821, + "grad_norm": 0.04510171654268671, + "learning_rate": 3.618753631354085e-06, + "loss": 0.6787, + "step": 1947 + }, + { + "epoch": 1.1395765531868212, + "grad_norm": 0.03695584919253051, + "learning_rate": 3.6155239418094423e-06, + "loss": 0.6041, + "step": 1948 + }, + { + "epoch": 1.1401616265038212, + "grad_norm": 0.03536563522774939, + "learning_rate": 3.6122949149927583e-06, + "loss": 0.5044, + "step": 1949 + }, + { + "epoch": 1.1407466998208213, + "grad_norm": 0.03650916289855313, + "learning_rate": 3.6090665536319217e-06, + "loss": 0.6112, + "step": 1950 + }, + { + "epoch": 1.1413317731378214, + "grad_norm": 0.039407880387507706, + "learning_rate": 3.605838860454266e-06, + "loss": 0.6436, + "step": 1951 + }, + { + "epoch": 1.1419168464548213, + "grad_norm": 0.039216656690735256, + "learning_rate": 3.6026118381865577e-06, + "loss": 0.7106, + "step": 1952 + }, + { + "epoch": 1.1425019197718214, + "grad_norm": 0.03688297014066584, + "learning_rate": 3.5993854895549933e-06, + "loss": 0.6011, + "step": 1953 + }, + { + "epoch": 1.1430869930888214, + "grad_norm": 0.042465561939299803, + "learning_rate": 3.596159817285203e-06, + "loss": 0.5543, + "step": 1954 + }, + { + "epoch": 1.1436720664058215, + "grad_norm": 0.03618763455291692, + "learning_rate": 3.592934824102247e-06, + "loss": 0.5578, + "step": 1955 + }, + { + "epoch": 1.1442571397228214, + "grad_norm": 0.036402015787460676, + "learning_rate": 3.5897105127306093e-06, + "loss": 0.6402, + "step": 1956 + }, + { + "epoch": 1.1448422130398215, + "grad_norm": 0.06377588782844341, + "learning_rate": 3.5864868858941975e-06, + "loss": 0.5915, + "step": 1957 + }, + { + "epoch": 1.1454272863568216, + "grad_norm": 0.03955675423854017, + "learning_rate": 3.5832639463163456e-06, + "loss": 0.5695, + "step": 1958 + }, + { + "epoch": 1.1460123596738216, + "grad_norm": 0.03883728695837122, + "learning_rate": 3.5800416967197994e-06, + "loss": 0.6688, + "step": 1959 + }, + { + "epoch": 1.1465974329908217, + "grad_norm": 0.0398645229902971, + "learning_rate": 3.5768201398267292e-06, + "loss": 0.5659, + "step": 1960 + }, + { + "epoch": 1.1471825063078218, + "grad_norm": 0.0411957993109104, + "learning_rate": 3.573599278358715e-06, + "loss": 0.5577, + "step": 1961 + }, + { + "epoch": 1.1477675796248217, + "grad_norm": 0.034151574742763646, + "learning_rate": 3.5703791150367505e-06, + "loss": 0.594, + "step": 1962 + }, + { + "epoch": 1.1483526529418218, + "grad_norm": 0.03855527081580829, + "learning_rate": 3.5671596525812443e-06, + "loss": 0.5414, + "step": 1963 + }, + { + "epoch": 1.1489377262588218, + "grad_norm": 0.0409230119543168, + "learning_rate": 3.5639408937120043e-06, + "loss": 0.6178, + "step": 1964 + }, + { + "epoch": 1.149522799575822, + "grad_norm": 0.042664078843859005, + "learning_rate": 3.5607228411482505e-06, + "loss": 0.6063, + "step": 1965 + }, + { + "epoch": 1.1501078728928218, + "grad_norm": 0.04213966156831369, + "learning_rate": 3.557505497608604e-06, + "loss": 0.5389, + "step": 1966 + }, + { + "epoch": 1.1506929462098219, + "grad_norm": 0.037379317263132615, + "learning_rate": 3.554288865811088e-06, + "loss": 0.6255, + "step": 1967 + }, + { + "epoch": 1.151278019526822, + "grad_norm": 0.03742555979907474, + "learning_rate": 3.5510729484731233e-06, + "loss": 0.5866, + "step": 1968 + }, + { + "epoch": 1.151863092843822, + "grad_norm": 0.03923026388121522, + "learning_rate": 3.5478577483115277e-06, + "loss": 0.6159, + "step": 1969 + }, + { + "epoch": 1.1524481661608221, + "grad_norm": 0.04249118193800539, + "learning_rate": 3.5446432680425115e-06, + "loss": 0.6039, + "step": 1970 + }, + { + "epoch": 1.153033239477822, + "grad_norm": 0.07675275707771452, + "learning_rate": 3.5414295103816788e-06, + "loss": 0.5494, + "step": 1971 + }, + { + "epoch": 1.153618312794822, + "grad_norm": 0.03437489090682111, + "learning_rate": 3.538216478044024e-06, + "loss": 0.5604, + "step": 1972 + }, + { + "epoch": 1.1542033861118222, + "grad_norm": 0.10483435145121636, + "learning_rate": 3.5350041737439247e-06, + "loss": 0.6421, + "step": 1973 + }, + { + "epoch": 1.1547884594288222, + "grad_norm": 0.047824790606816193, + "learning_rate": 3.5317926001951505e-06, + "loss": 0.6048, + "step": 1974 + }, + { + "epoch": 1.155373532745822, + "grad_norm": 0.03972268704341939, + "learning_rate": 3.5285817601108443e-06, + "loss": 0.5955, + "step": 1975 + }, + { + "epoch": 1.1559586060628222, + "grad_norm": 0.043115977687794516, + "learning_rate": 3.5253716562035385e-06, + "loss": 0.6081, + "step": 1976 + }, + { + "epoch": 1.1565436793798223, + "grad_norm": 0.039959761660668336, + "learning_rate": 3.522162291185136e-06, + "loss": 0.648, + "step": 1977 + }, + { + "epoch": 1.1571287526968224, + "grad_norm": 0.3381087189100133, + "learning_rate": 3.5189536677669213e-06, + "loss": 0.6018, + "step": 1978 + }, + { + "epoch": 1.1577138260138224, + "grad_norm": 0.03675281433214776, + "learning_rate": 3.515745788659549e-06, + "loss": 0.5625, + "step": 1979 + }, + { + "epoch": 1.1582988993308223, + "grad_norm": 0.03556127214421104, + "learning_rate": 3.5125386565730473e-06, + "loss": 0.5644, + "step": 1980 + }, + { + "epoch": 1.1588839726478224, + "grad_norm": 0.03888401533955742, + "learning_rate": 3.5093322742168103e-06, + "loss": 0.6228, + "step": 1981 + }, + { + "epoch": 1.1594690459648225, + "grad_norm": 0.07138672279489928, + "learning_rate": 3.5061266442996013e-06, + "loss": 0.6444, + "step": 1982 + }, + { + "epoch": 1.1600541192818226, + "grad_norm": 0.04367774811390162, + "learning_rate": 3.5029217695295464e-06, + "loss": 0.5675, + "step": 1983 + }, + { + "epoch": 1.1606391925988224, + "grad_norm": 0.033577525885149805, + "learning_rate": 3.4997176526141347e-06, + "loss": 0.5912, + "step": 1984 + }, + { + "epoch": 1.1612242659158225, + "grad_norm": 0.041101043350749655, + "learning_rate": 3.496514296260216e-06, + "loss": 0.5663, + "step": 1985 + }, + { + "epoch": 1.1618093392328226, + "grad_norm": 0.036838996439545135, + "learning_rate": 3.493311703173993e-06, + "loss": 0.6567, + "step": 1986 + }, + { + "epoch": 1.1623944125498227, + "grad_norm": 0.04300438023283441, + "learning_rate": 3.4901098760610292e-06, + "loss": 0.5611, + "step": 1987 + }, + { + "epoch": 1.1629794858668228, + "grad_norm": 0.03514099210411253, + "learning_rate": 3.4869088176262383e-06, + "loss": 0.5295, + "step": 1988 + }, + { + "epoch": 1.1635645591838228, + "grad_norm": 0.04249663621333313, + "learning_rate": 3.483708530573886e-06, + "loss": 0.6283, + "step": 1989 + }, + { + "epoch": 1.1641496325008227, + "grad_norm": 0.03613780881600909, + "learning_rate": 3.4805090176075835e-06, + "loss": 0.6074, + "step": 1990 + }, + { + "epoch": 1.1647347058178228, + "grad_norm": 0.04959791048909453, + "learning_rate": 3.4773102814302915e-06, + "loss": 0.6174, + "step": 1991 + }, + { + "epoch": 1.1653197791348229, + "grad_norm": 0.037749992452740126, + "learning_rate": 3.474112324744312e-06, + "loss": 0.5907, + "step": 1992 + }, + { + "epoch": 1.165904852451823, + "grad_norm": 0.0351608550697518, + "learning_rate": 3.470915150251289e-06, + "loss": 0.5905, + "step": 1993 + }, + { + "epoch": 1.1664899257688228, + "grad_norm": 0.03893625789488356, + "learning_rate": 3.4677187606522066e-06, + "loss": 0.5675, + "step": 1994 + }, + { + "epoch": 1.167074999085823, + "grad_norm": 0.04059108907664171, + "learning_rate": 3.4645231586473856e-06, + "loss": 0.5728, + "step": 1995 + }, + { + "epoch": 1.167660072402823, + "grad_norm": 0.04113287183937823, + "learning_rate": 3.4613283469364812e-06, + "loss": 0.6118, + "step": 1996 + }, + { + "epoch": 1.168245145719823, + "grad_norm": 0.03719646254418526, + "learning_rate": 3.45813432821848e-06, + "loss": 0.6406, + "step": 1997 + }, + { + "epoch": 1.1688302190368232, + "grad_norm": 0.06040366792983708, + "learning_rate": 3.4549411051916993e-06, + "loss": 0.6843, + "step": 1998 + }, + { + "epoch": 1.169415292353823, + "grad_norm": 0.039091200896895856, + "learning_rate": 3.451748680553785e-06, + "loss": 0.5895, + "step": 1999 + }, + { + "epoch": 1.170000365670823, + "grad_norm": 0.03719442419223529, + "learning_rate": 3.4485570570017083e-06, + "loss": 0.6165, + "step": 2000 + }, + { + "epoch": 1.1705854389878232, + "grad_norm": 0.036207964808247124, + "learning_rate": 3.445366237231762e-06, + "loss": 0.5825, + "step": 2001 + }, + { + "epoch": 1.1711705123048233, + "grad_norm": 0.037815350275091156, + "learning_rate": 3.4421762239395646e-06, + "loss": 0.5934, + "step": 2002 + }, + { + "epoch": 1.1717555856218231, + "grad_norm": 0.054653596037471275, + "learning_rate": 3.438987019820045e-06, + "loss": 0.6826, + "step": 2003 + }, + { + "epoch": 1.1723406589388232, + "grad_norm": 0.03400992901791061, + "learning_rate": 3.4357986275674566e-06, + "loss": 0.4815, + "step": 2004 + }, + { + "epoch": 1.1729257322558233, + "grad_norm": 0.0466833061216275, + "learning_rate": 3.432611049875362e-06, + "loss": 0.6306, + "step": 2005 + }, + { + "epoch": 1.1735108055728234, + "grad_norm": 0.03423491044593052, + "learning_rate": 3.429424289436638e-06, + "loss": 0.5756, + "step": 2006 + }, + { + "epoch": 1.1740958788898235, + "grad_norm": 0.04055240441842592, + "learning_rate": 3.426238348943472e-06, + "loss": 0.5772, + "step": 2007 + }, + { + "epoch": 1.1746809522068233, + "grad_norm": 0.1689145341911383, + "learning_rate": 3.423053231087354e-06, + "loss": 0.6136, + "step": 2008 + }, + { + "epoch": 1.1752660255238234, + "grad_norm": 0.036623976167559434, + "learning_rate": 3.4198689385590848e-06, + "loss": 0.5839, + "step": 2009 + }, + { + "epoch": 1.1758510988408235, + "grad_norm": 0.038221379263121774, + "learning_rate": 3.416685474048761e-06, + "loss": 0.7034, + "step": 2010 + }, + { + "epoch": 1.1764361721578236, + "grad_norm": 0.03539988785174503, + "learning_rate": 3.4135028402457888e-06, + "loss": 0.5319, + "step": 2011 + }, + { + "epoch": 1.1770212454748235, + "grad_norm": 0.03800403001773324, + "learning_rate": 3.410321039838866e-06, + "loss": 0.6349, + "step": 2012 + }, + { + "epoch": 1.1776063187918235, + "grad_norm": 0.04227520585992831, + "learning_rate": 3.407140075515988e-06, + "loss": 0.6129, + "step": 2013 + }, + { + "epoch": 1.1781913921088236, + "grad_norm": 0.03831788774127032, + "learning_rate": 3.4039599499644434e-06, + "loss": 0.5848, + "step": 2014 + }, + { + "epoch": 1.1787764654258237, + "grad_norm": 0.04284762288768014, + "learning_rate": 3.400780665870811e-06, + "loss": 0.6488, + "step": 2015 + }, + { + "epoch": 1.1793615387428238, + "grad_norm": 0.041288161978273966, + "learning_rate": 3.3976022259209643e-06, + "loss": 0.627, + "step": 2016 + }, + { + "epoch": 1.1799466120598237, + "grad_norm": 0.039481075603532444, + "learning_rate": 3.3944246328000574e-06, + "loss": 0.5888, + "step": 2017 + }, + { + "epoch": 1.1805316853768237, + "grad_norm": 0.04335101317156415, + "learning_rate": 3.3912478891925338e-06, + "loss": 0.6734, + "step": 2018 + }, + { + "epoch": 1.1811167586938238, + "grad_norm": 0.04116415093116544, + "learning_rate": 3.3880719977821127e-06, + "loss": 0.6432, + "step": 2019 + }, + { + "epoch": 1.181701832010824, + "grad_norm": 0.036848265334549324, + "learning_rate": 3.384896961251802e-06, + "loss": 0.5763, + "step": 2020 + }, + { + "epoch": 1.182286905327824, + "grad_norm": 0.042651651929545335, + "learning_rate": 3.38172278228388e-06, + "loss": 0.5687, + "step": 2021 + }, + { + "epoch": 1.1828719786448239, + "grad_norm": 0.03635888340073088, + "learning_rate": 3.3785494635599048e-06, + "loss": 0.5954, + "step": 2022 + }, + { + "epoch": 1.183457051961824, + "grad_norm": 0.039839665329704434, + "learning_rate": 3.3753770077607076e-06, + "loss": 0.6383, + "step": 2023 + }, + { + "epoch": 1.184042125278824, + "grad_norm": 0.043332517759083845, + "learning_rate": 3.372205417566387e-06, + "loss": 0.6532, + "step": 2024 + }, + { + "epoch": 1.184627198595824, + "grad_norm": 0.03562735039321648, + "learning_rate": 3.3690346956563146e-06, + "loss": 0.6788, + "step": 2025 + }, + { + "epoch": 1.1852122719128242, + "grad_norm": 0.04660842986654955, + "learning_rate": 3.365864844709127e-06, + "loss": 0.6477, + "step": 2026 + }, + { + "epoch": 1.185797345229824, + "grad_norm": 0.03429494053167668, + "learning_rate": 3.3626958674027213e-06, + "loss": 0.5504, + "step": 2027 + }, + { + "epoch": 1.1863824185468241, + "grad_norm": 0.0680307436226198, + "learning_rate": 3.359527766414265e-06, + "loss": 0.668, + "step": 2028 + }, + { + "epoch": 1.1869674918638242, + "grad_norm": 0.05407036614682854, + "learning_rate": 3.3563605444201785e-06, + "loss": 0.6085, + "step": 2029 + }, + { + "epoch": 1.1875525651808243, + "grad_norm": 0.03428910211347692, + "learning_rate": 3.35319420409614e-06, + "loss": 0.5544, + "step": 2030 + }, + { + "epoch": 1.1881376384978242, + "grad_norm": 0.04221499184751478, + "learning_rate": 3.3500287481170847e-06, + "loss": 0.7486, + "step": 2031 + }, + { + "epoch": 1.1887227118148243, + "grad_norm": 0.03921755539646231, + "learning_rate": 3.3468641791572003e-06, + "loss": 0.6926, + "step": 2032 + }, + { + "epoch": 1.1893077851318243, + "grad_norm": 0.0366270608635313, + "learning_rate": 3.343700499889926e-06, + "loss": 0.6307, + "step": 2033 + }, + { + "epoch": 1.1898928584488244, + "grad_norm": 0.036660916253845384, + "learning_rate": 3.340537712987948e-06, + "loss": 0.5853, + "step": 2034 + }, + { + "epoch": 1.1904779317658245, + "grad_norm": 0.03505156447428476, + "learning_rate": 3.3373758211231974e-06, + "loss": 0.5874, + "step": 2035 + }, + { + "epoch": 1.1910630050828244, + "grad_norm": 0.040178249649878345, + "learning_rate": 3.3342148269668525e-06, + "loss": 0.7006, + "step": 2036 + }, + { + "epoch": 1.1916480783998245, + "grad_norm": 0.03533674390624903, + "learning_rate": 3.331054733189332e-06, + "loss": 0.5886, + "step": 2037 + }, + { + "epoch": 1.1922331517168245, + "grad_norm": 0.03692572558788983, + "learning_rate": 3.327895542460292e-06, + "loss": 0.6155, + "step": 2038 + }, + { + "epoch": 1.1928182250338246, + "grad_norm": 0.03938265009175071, + "learning_rate": 3.3247372574486302e-06, + "loss": 0.6368, + "step": 2039 + }, + { + "epoch": 1.1934032983508245, + "grad_norm": 0.04043208798531885, + "learning_rate": 3.321579880822473e-06, + "loss": 0.541, + "step": 2040 + }, + { + "epoch": 1.1939883716678246, + "grad_norm": 0.049286317781066166, + "learning_rate": 3.318423415249184e-06, + "loss": 0.5622, + "step": 2041 + }, + { + "epoch": 1.1945734449848247, + "grad_norm": 0.035803834914162815, + "learning_rate": 3.3152678633953554e-06, + "loss": 0.6379, + "step": 2042 + }, + { + "epoch": 1.1951585183018247, + "grad_norm": 0.03534002984635534, + "learning_rate": 3.312113227926807e-06, + "loss": 0.5832, + "step": 2043 + }, + { + "epoch": 1.1957435916188248, + "grad_norm": 0.03595263497698763, + "learning_rate": 3.3089595115085863e-06, + "loss": 0.6027, + "step": 2044 + }, + { + "epoch": 1.1963286649358247, + "grad_norm": 0.04310890199047902, + "learning_rate": 3.305806716804964e-06, + "loss": 0.6981, + "step": 2045 + }, + { + "epoch": 1.1969137382528248, + "grad_norm": 0.034609593715224284, + "learning_rate": 3.302654846479431e-06, + "loss": 0.5436, + "step": 2046 + }, + { + "epoch": 1.1974988115698249, + "grad_norm": 0.038358840771114136, + "learning_rate": 3.2995039031946954e-06, + "loss": 0.7055, + "step": 2047 + }, + { + "epoch": 1.198083884886825, + "grad_norm": 0.04563367988331309, + "learning_rate": 3.2963538896126863e-06, + "loss": 0.5472, + "step": 2048 + }, + { + "epoch": 1.1986689582038248, + "grad_norm": 0.03704695342032978, + "learning_rate": 3.293204808394547e-06, + "loss": 0.5715, + "step": 2049 + }, + { + "epoch": 1.1992540315208249, + "grad_norm": 0.03741146134749107, + "learning_rate": 3.2900566622006284e-06, + "loss": 0.6012, + "step": 2050 + }, + { + "epoch": 1.199839104837825, + "grad_norm": 0.038195831443333515, + "learning_rate": 3.2869094536904997e-06, + "loss": 0.5778, + "step": 2051 + }, + { + "epoch": 1.200424178154825, + "grad_norm": 0.03838310724213011, + "learning_rate": 3.283763185522927e-06, + "loss": 0.6203, + "step": 2052 + }, + { + "epoch": 1.2010092514718251, + "grad_norm": 0.03679246721866951, + "learning_rate": 3.2806178603558923e-06, + "loss": 0.5866, + "step": 2053 + }, + { + "epoch": 1.2015943247888252, + "grad_norm": 0.037329068284633296, + "learning_rate": 3.2774734808465746e-06, + "loss": 0.5712, + "step": 2054 + }, + { + "epoch": 1.202179398105825, + "grad_norm": 0.03824528624286622, + "learning_rate": 3.2743300496513567e-06, + "loss": 0.6278, + "step": 2055 + }, + { + "epoch": 1.2027644714228252, + "grad_norm": 0.036655970928988306, + "learning_rate": 3.2711875694258202e-06, + "loss": 0.6234, + "step": 2056 + }, + { + "epoch": 1.2033495447398252, + "grad_norm": 0.03552385584165378, + "learning_rate": 3.268046042824742e-06, + "loss": 0.4884, + "step": 2057 + }, + { + "epoch": 1.2039346180568253, + "grad_norm": 0.03841046001130388, + "learning_rate": 3.2649054725020947e-06, + "loss": 0.664, + "step": 2058 + }, + { + "epoch": 1.2045196913738252, + "grad_norm": 0.04118795268419084, + "learning_rate": 3.2617658611110405e-06, + "loss": 0.6416, + "step": 2059 + }, + { + "epoch": 1.2051047646908253, + "grad_norm": 0.03850987838668136, + "learning_rate": 3.2586272113039368e-06, + "loss": 0.5786, + "step": 2060 + }, + { + "epoch": 1.2056898380078254, + "grad_norm": 0.038494984464432946, + "learning_rate": 3.255489525732323e-06, + "loss": 0.6438, + "step": 2061 + }, + { + "epoch": 1.2062749113248254, + "grad_norm": 0.035177849537891506, + "learning_rate": 3.252352807046927e-06, + "loss": 0.5675, + "step": 2062 + }, + { + "epoch": 1.2068599846418255, + "grad_norm": 0.04015731464340186, + "learning_rate": 3.249217057897658e-06, + "loss": 0.5964, + "step": 2063 + }, + { + "epoch": 1.2074450579588254, + "grad_norm": 0.04066687077935284, + "learning_rate": 3.246082280933609e-06, + "loss": 0.5642, + "step": 2064 + }, + { + "epoch": 1.2080301312758255, + "grad_norm": 0.050549965114778064, + "learning_rate": 3.242948478803048e-06, + "loss": 0.5849, + "step": 2065 + }, + { + "epoch": 1.2086152045928256, + "grad_norm": 0.04128169248463715, + "learning_rate": 3.239815654153422e-06, + "loss": 0.6244, + "step": 2066 + }, + { + "epoch": 1.2092002779098256, + "grad_norm": 0.037146084532940146, + "learning_rate": 3.236683809631354e-06, + "loss": 0.5625, + "step": 2067 + }, + { + "epoch": 1.2097853512268255, + "grad_norm": 0.037953855830558866, + "learning_rate": 3.2335529478826324e-06, + "loss": 0.6, + "step": 2068 + }, + { + "epoch": 1.2103704245438256, + "grad_norm": 0.04943862913081636, + "learning_rate": 3.230423071552222e-06, + "loss": 0.5952, + "step": 2069 + }, + { + "epoch": 1.2109554978608257, + "grad_norm": 0.03936477462539683, + "learning_rate": 3.227294183284252e-06, + "loss": 0.6401, + "step": 2070 + }, + { + "epoch": 1.2115405711778258, + "grad_norm": 0.04126544367881334, + "learning_rate": 3.2241662857220176e-06, + "loss": 0.6159, + "step": 2071 + }, + { + "epoch": 1.2121256444948258, + "grad_norm": 0.041020884786539924, + "learning_rate": 3.2210393815079773e-06, + "loss": 0.635, + "step": 2072 + }, + { + "epoch": 1.2127107178118257, + "grad_norm": 0.0358878906382118, + "learning_rate": 3.21791347328375e-06, + "loss": 0.6414, + "step": 2073 + }, + { + "epoch": 1.2132957911288258, + "grad_norm": 0.03516770521480586, + "learning_rate": 3.2147885636901136e-06, + "loss": 0.6641, + "step": 2074 + }, + { + "epoch": 1.2138808644458259, + "grad_norm": 0.03926231427900321, + "learning_rate": 3.2116646553669994e-06, + "loss": 0.626, + "step": 2075 + }, + { + "epoch": 1.214465937762826, + "grad_norm": 0.03869243094148781, + "learning_rate": 3.2085417509534994e-06, + "loss": 0.6433, + "step": 2076 + }, + { + "epoch": 1.2150510110798258, + "grad_norm": 0.03914650880562719, + "learning_rate": 3.205419853087851e-06, + "loss": 0.6486, + "step": 2077 + }, + { + "epoch": 1.215636084396826, + "grad_norm": 0.03719572794194729, + "learning_rate": 3.202298964407445e-06, + "loss": 0.6049, + "step": 2078 + }, + { + "epoch": 1.216221157713826, + "grad_norm": 0.039960704683083076, + "learning_rate": 3.199179087548818e-06, + "loss": 0.5408, + "step": 2079 + }, + { + "epoch": 1.216806231030826, + "grad_norm": 0.038852419722138556, + "learning_rate": 3.1960602251476508e-06, + "loss": 0.6398, + "step": 2080 + }, + { + "epoch": 1.2173913043478262, + "grad_norm": 0.043605000020103986, + "learning_rate": 3.1929423798387706e-06, + "loss": 0.6421, + "step": 2081 + }, + { + "epoch": 1.217976377664826, + "grad_norm": 0.03878520292515712, + "learning_rate": 3.189825554256142e-06, + "loss": 0.6067, + "step": 2082 + }, + { + "epoch": 1.218561450981826, + "grad_norm": 0.041501370594404216, + "learning_rate": 3.186709751032871e-06, + "loss": 0.6132, + "step": 2083 + }, + { + "epoch": 1.2191465242988262, + "grad_norm": 0.03951034345012451, + "learning_rate": 3.183594972801199e-06, + "loss": 0.6016, + "step": 2084 + }, + { + "epoch": 1.2197315976158263, + "grad_norm": 0.04111002191035941, + "learning_rate": 3.1804812221924973e-06, + "loss": 0.7306, + "step": 2085 + }, + { + "epoch": 1.2203166709328264, + "grad_norm": 0.04145625722473185, + "learning_rate": 3.177368501837277e-06, + "loss": 0.6854, + "step": 2086 + }, + { + "epoch": 1.2209017442498262, + "grad_norm": 0.037269331694012336, + "learning_rate": 3.1742568143651704e-06, + "loss": 0.6833, + "step": 2087 + }, + { + "epoch": 1.2214868175668263, + "grad_norm": 0.039255496027705915, + "learning_rate": 3.171146162404945e-06, + "loss": 0.5482, + "step": 2088 + }, + { + "epoch": 1.2220718908838264, + "grad_norm": 0.037070583225945676, + "learning_rate": 3.168036548584488e-06, + "loss": 0.5844, + "step": 2089 + }, + { + "epoch": 1.2226569642008265, + "grad_norm": 0.11493255093128565, + "learning_rate": 3.164927975530812e-06, + "loss": 0.6224, + "step": 2090 + }, + { + "epoch": 1.2232420375178266, + "grad_norm": 0.037232680426261475, + "learning_rate": 3.161820445870048e-06, + "loss": 0.6469, + "step": 2091 + }, + { + "epoch": 1.2238271108348264, + "grad_norm": 0.04032812893153942, + "learning_rate": 3.1587139622274482e-06, + "loss": 0.6057, + "step": 2092 + }, + { + "epoch": 1.2244121841518265, + "grad_norm": 0.03634438517480883, + "learning_rate": 3.1556085272273826e-06, + "loss": 0.544, + "step": 2093 + }, + { + "epoch": 1.2249972574688266, + "grad_norm": 0.04037981016869187, + "learning_rate": 3.1525041434933304e-06, + "loss": 0.6455, + "step": 2094 + }, + { + "epoch": 1.2255823307858267, + "grad_norm": 0.038882617943542315, + "learning_rate": 3.149400813647886e-06, + "loss": 0.6386, + "step": 2095 + }, + { + "epoch": 1.2261674041028265, + "grad_norm": 0.051431593131162474, + "learning_rate": 3.1462985403127505e-06, + "loss": 0.6475, + "step": 2096 + }, + { + "epoch": 1.2267524774198266, + "grad_norm": 0.03733875116363724, + "learning_rate": 3.143197326108738e-06, + "loss": 0.5444, + "step": 2097 + }, + { + "epoch": 1.2273375507368267, + "grad_norm": 0.035252501575400275, + "learning_rate": 3.140097173655763e-06, + "loss": 0.5779, + "step": 2098 + }, + { + "epoch": 1.2279226240538268, + "grad_norm": 0.05240477506652852, + "learning_rate": 3.136998085572844e-06, + "loss": 0.5887, + "step": 2099 + }, + { + "epoch": 1.2285076973708269, + "grad_norm": 0.03805752290907818, + "learning_rate": 3.1339000644781022e-06, + "loss": 0.5838, + "step": 2100 + }, + { + "epoch": 1.2290927706878267, + "grad_norm": 0.04383435394432525, + "learning_rate": 3.130803112988755e-06, + "loss": 0.6787, + "step": 2101 + }, + { + "epoch": 1.2296778440048268, + "grad_norm": 0.03732331672312392, + "learning_rate": 3.1277072337211177e-06, + "loss": 0.6056, + "step": 2102 + }, + { + "epoch": 1.230262917321827, + "grad_norm": 0.036913976358249244, + "learning_rate": 3.1246124292905977e-06, + "loss": 0.5794, + "step": 2103 + }, + { + "epoch": 1.230847990638827, + "grad_norm": 0.045315422806353534, + "learning_rate": 3.1215187023116983e-06, + "loss": 0.6712, + "step": 2104 + }, + { + "epoch": 1.2314330639558269, + "grad_norm": 0.03796939196362949, + "learning_rate": 3.11842605539801e-06, + "loss": 0.5266, + "step": 2105 + }, + { + "epoch": 1.232018137272827, + "grad_norm": 0.26883650914769597, + "learning_rate": 3.1153344911622093e-06, + "loss": 0.5122, + "step": 2106 + }, + { + "epoch": 1.232603210589827, + "grad_norm": 0.046560578154564256, + "learning_rate": 3.1122440122160603e-06, + "loss": 0.6583, + "step": 2107 + }, + { + "epoch": 1.233188283906827, + "grad_norm": 0.039487212735197195, + "learning_rate": 3.10915462117041e-06, + "loss": 0.566, + "step": 2108 + }, + { + "epoch": 1.2337733572238272, + "grad_norm": 0.044996273718641414, + "learning_rate": 3.106066320635188e-06, + "loss": 0.6701, + "step": 2109 + }, + { + "epoch": 1.234358430540827, + "grad_norm": 0.03714598377688584, + "learning_rate": 3.1029791132193978e-06, + "loss": 0.5602, + "step": 2110 + }, + { + "epoch": 1.2349435038578271, + "grad_norm": 0.04302535784595455, + "learning_rate": 3.099893001531126e-06, + "loss": 0.7246, + "step": 2111 + }, + { + "epoch": 1.2355285771748272, + "grad_norm": 0.0377049325672757, + "learning_rate": 3.0968079881775257e-06, + "loss": 0.6343, + "step": 2112 + }, + { + "epoch": 1.2361136504918273, + "grad_norm": 0.04228612856033121, + "learning_rate": 3.0937240757648287e-06, + "loss": 0.6611, + "step": 2113 + }, + { + "epoch": 1.2366987238088272, + "grad_norm": 0.035689681356042134, + "learning_rate": 3.090641266898335e-06, + "loss": 0.5975, + "step": 2114 + }, + { + "epoch": 1.2372837971258273, + "grad_norm": 0.03471425835558048, + "learning_rate": 3.0875595641824096e-06, + "loss": 0.5574, + "step": 2115 + }, + { + "epoch": 1.2378688704428273, + "grad_norm": 0.03692356893026278, + "learning_rate": 3.0844789702204886e-06, + "loss": 0.595, + "step": 2116 + }, + { + "epoch": 1.2384539437598274, + "grad_norm": 0.03922116778653568, + "learning_rate": 3.081399487615062e-06, + "loss": 0.6189, + "step": 2117 + }, + { + "epoch": 1.2390390170768275, + "grad_norm": 0.05993653316174921, + "learning_rate": 3.078321118967693e-06, + "loss": 0.6765, + "step": 2118 + }, + { + "epoch": 1.2396240903938276, + "grad_norm": 0.04167106571338825, + "learning_rate": 3.0752438668789927e-06, + "loss": 0.6151, + "step": 2119 + }, + { + "epoch": 1.2402091637108275, + "grad_norm": 0.03821887002642105, + "learning_rate": 3.072167733948638e-06, + "loss": 0.607, + "step": 2120 + }, + { + "epoch": 1.2407942370278275, + "grad_norm": 0.03807112943742371, + "learning_rate": 3.0690927227753537e-06, + "loss": 0.6594, + "step": 2121 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 0.040714922346881566, + "learning_rate": 3.066018835956921e-06, + "loss": 0.5834, + "step": 2122 + }, + { + "epoch": 1.2419643836618277, + "grad_norm": 0.039133521242853846, + "learning_rate": 3.0629460760901673e-06, + "loss": 0.597, + "step": 2123 + }, + { + "epoch": 1.2425494569788276, + "grad_norm": 0.0400356957836742, + "learning_rate": 3.0598744457709707e-06, + "loss": 0.6055, + "step": 2124 + }, + { + "epoch": 1.2431345302958277, + "grad_norm": 0.04123802601032258, + "learning_rate": 3.0568039475942556e-06, + "loss": 0.5654, + "step": 2125 + }, + { + "epoch": 1.2437196036128277, + "grad_norm": 0.03639596855605065, + "learning_rate": 3.0537345841539895e-06, + "loss": 0.6412, + "step": 2126 + }, + { + "epoch": 1.2443046769298278, + "grad_norm": 0.04019621315125225, + "learning_rate": 3.0506663580431805e-06, + "loss": 0.5935, + "step": 2127 + }, + { + "epoch": 1.244889750246828, + "grad_norm": 0.04079355192375721, + "learning_rate": 3.0475992718538734e-06, + "loss": 0.5552, + "step": 2128 + }, + { + "epoch": 1.2454748235638278, + "grad_norm": 0.041408451026744786, + "learning_rate": 3.044533328177157e-06, + "loss": 0.584, + "step": 2129 + }, + { + "epoch": 1.2460598968808279, + "grad_norm": 0.05090259880775753, + "learning_rate": 3.041468529603147e-06, + "loss": 0.6054, + "step": 2130 + }, + { + "epoch": 1.246644970197828, + "grad_norm": 0.03868926058783337, + "learning_rate": 3.038404878720997e-06, + "loss": 0.5618, + "step": 2131 + }, + { + "epoch": 1.247230043514828, + "grad_norm": 0.06601222584424662, + "learning_rate": 3.0353423781188897e-06, + "loss": 0.5544, + "step": 2132 + }, + { + "epoch": 1.2478151168318279, + "grad_norm": 0.03702617313486051, + "learning_rate": 3.032281030384036e-06, + "loss": 0.5989, + "step": 2133 + }, + { + "epoch": 1.248400190148828, + "grad_norm": 0.03705800256241544, + "learning_rate": 3.0292208381026703e-06, + "loss": 0.6298, + "step": 2134 + }, + { + "epoch": 1.248985263465828, + "grad_norm": 0.038560461227645317, + "learning_rate": 3.0261618038600556e-06, + "loss": 0.5769, + "step": 2135 + }, + { + "epoch": 1.2495703367828281, + "grad_norm": 0.04189426245677959, + "learning_rate": 3.023103930240471e-06, + "loss": 0.5847, + "step": 2136 + }, + { + "epoch": 1.2501554100998282, + "grad_norm": 0.05575302954020764, + "learning_rate": 3.020047219827222e-06, + "loss": 0.5339, + "step": 2137 + }, + { + "epoch": 1.2507404834168283, + "grad_norm": 0.04057133712962797, + "learning_rate": 3.016991675202627e-06, + "loss": 0.5519, + "step": 2138 + }, + { + "epoch": 1.2513255567338282, + "grad_norm": 0.03597036946672713, + "learning_rate": 3.0139372989480167e-06, + "loss": 0.5791, + "step": 2139 + }, + { + "epoch": 1.2519106300508283, + "grad_norm": 0.04334307004734275, + "learning_rate": 3.0108840936437413e-06, + "loss": 0.5256, + "step": 2140 + } + ], + "logging_steps": 1, + "max_steps": 3418, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 428, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8878468884332544.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}