diff --git "a/marques/outputs/checkpoint-1000/trainer_state.json" "b/marques/outputs/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/marques/outputs/checkpoint-1000/trainer_state.json" @@ -0,0 +1,7034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.000351009252516144, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.5100925251614403e-07, + "grad_norm": 0.53782719373703, + "learning_rate": 0.0, + "loss": 0.5835, + "step": 1 + }, + { + "epoch": 7.020185050322881e-07, + "grad_norm": 0.6201626062393188, + "learning_rate": 4e-05, + "loss": 0.5242, + "step": 2 + }, + { + "epoch": 1.053027757548432e-06, + "grad_norm": 0.7571901082992554, + "learning_rate": 8e-05, + "loss": 0.5642, + "step": 3 + }, + { + "epoch": 1.4040370100645761e-06, + "grad_norm": 0.5588695406913757, + "learning_rate": 0.00012, + "loss": 0.4859, + "step": 4 + }, + { + "epoch": 1.75504626258072e-06, + "grad_norm": 0.7208331227302551, + "learning_rate": 0.00016, + "loss": 0.4645, + "step": 5 + }, + { + "epoch": 2.106055515096864e-06, + "grad_norm": 0.8169743418693542, + "learning_rate": 0.0002, + "loss": 0.3702, + "step": 6 + }, + { + "epoch": 2.4570647676130083e-06, + "grad_norm": 2.051530599594116, + "learning_rate": 0.00019993322203672788, + "loss": 0.4856, + "step": 7 + }, + { + "epoch": 2.8080740201291522e-06, + "grad_norm": 1.2310550212860107, + "learning_rate": 0.00019986644407345576, + "loss": 0.5192, + "step": 8 + }, + { + "epoch": 3.1590832726452962e-06, + "grad_norm": 1.612046241760254, + "learning_rate": 0.00019979966611018366, + "loss": 0.4719, + "step": 9 + }, + { + "epoch": 3.51009252516144e-06, + "grad_norm": 1.4484680891036987, + "learning_rate": 0.00019973288814691153, + "loss": 0.4416, + "step": 10 + }, + { + "epoch": 3.861101777677584e-06, + "grad_norm": 1.4529719352722168, + "learning_rate": 0.0001996661101836394, + "loss": 0.6275, + "step": 11 + }, + { + "epoch": 4.212111030193728e-06, + "grad_norm": 1.3963671922683716, + "learning_rate": 0.00019959933222036728, + "loss": 0.5874, + "step": 12 + }, + { + "epoch": 4.563120282709872e-06, + "grad_norm": 1.4744153022766113, + "learning_rate": 0.00019953255425709515, + "loss": 0.6422, + "step": 13 + }, + { + "epoch": 4.9141295352260165e-06, + "grad_norm": 0.8640050888061523, + "learning_rate": 0.00019946577629382305, + "loss": 0.5064, + "step": 14 + }, + { + "epoch": 5.26513878774216e-06, + "grad_norm": 0.7137419581413269, + "learning_rate": 0.00019939899833055092, + "loss": 0.5218, + "step": 15 + }, + { + "epoch": 5.6161480402583045e-06, + "grad_norm": 0.7769026756286621, + "learning_rate": 0.00019933222036727882, + "loss": 0.5377, + "step": 16 + }, + { + "epoch": 5.967157292774448e-06, + "grad_norm": 0.7558479905128479, + "learning_rate": 0.0001992654424040067, + "loss": 0.5054, + "step": 17 + }, + { + "epoch": 6.3181665452905924e-06, + "grad_norm": 0.8237054347991943, + "learning_rate": 0.00019919866444073457, + "loss": 0.5094, + "step": 18 + }, + { + "epoch": 6.669175797806736e-06, + "grad_norm": 1.0375059843063354, + "learning_rate": 0.00019913188647746244, + "loss": 0.5751, + "step": 19 + }, + { + "epoch": 7.02018505032288e-06, + "grad_norm": 1.075869083404541, + "learning_rate": 0.00019906510851419034, + "loss": 0.594, + "step": 20 + }, + { + "epoch": 7.371194302839024e-06, + "grad_norm": 0.8041358590126038, + "learning_rate": 0.00019899833055091822, + "loss": 0.553, + "step": 21 + }, + { + "epoch": 7.722203555355168e-06, + "grad_norm": 0.9264736771583557, + "learning_rate": 0.0001989315525876461, + "loss": 0.5555, + "step": 22 + }, + { + "epoch": 8.073212807871313e-06, + "grad_norm": 1.0074031352996826, + "learning_rate": 0.00019886477462437396, + "loss": 0.5353, + "step": 23 + }, + { + "epoch": 8.424222060387455e-06, + "grad_norm": 0.8725020885467529, + "learning_rate": 0.00019879799666110183, + "loss": 0.5557, + "step": 24 + }, + { + "epoch": 8.7752313129036e-06, + "grad_norm": 0.8867582678794861, + "learning_rate": 0.00019873121869782974, + "loss": 0.5992, + "step": 25 + }, + { + "epoch": 9.126240565419744e-06, + "grad_norm": 0.9235608577728271, + "learning_rate": 0.0001986644407345576, + "loss": 0.516, + "step": 26 + }, + { + "epoch": 9.477249817935889e-06, + "grad_norm": 0.8653218150138855, + "learning_rate": 0.00019859766277128548, + "loss": 0.5249, + "step": 27 + }, + { + "epoch": 9.828259070452033e-06, + "grad_norm": 0.7479026913642883, + "learning_rate": 0.00019853088480801335, + "loss": 0.5037, + "step": 28 + }, + { + "epoch": 1.0179268322968176e-05, + "grad_norm": 0.9531452655792236, + "learning_rate": 0.00019846410684474123, + "loss": 0.5896, + "step": 29 + }, + { + "epoch": 1.053027757548432e-05, + "grad_norm": 1.1012492179870605, + "learning_rate": 0.00019839732888146913, + "loss": 0.5139, + "step": 30 + }, + { + "epoch": 1.0881286828000465e-05, + "grad_norm": 1.0198887586593628, + "learning_rate": 0.000198330550918197, + "loss": 0.5587, + "step": 31 + }, + { + "epoch": 1.1232296080516609e-05, + "grad_norm": 0.8081266283988953, + "learning_rate": 0.00019826377295492487, + "loss": 0.4762, + "step": 32 + }, + { + "epoch": 1.1583305333032752e-05, + "grad_norm": 1.1965891122817993, + "learning_rate": 0.00019819699499165277, + "loss": 0.5719, + "step": 33 + }, + { + "epoch": 1.1934314585548896e-05, + "grad_norm": 1.214903473854065, + "learning_rate": 0.00019813021702838065, + "loss": 0.5756, + "step": 34 + }, + { + "epoch": 1.228532383806504e-05, + "grad_norm": 0.8360006213188171, + "learning_rate": 0.00019806343906510852, + "loss": 0.5688, + "step": 35 + }, + { + "epoch": 1.2636333090581185e-05, + "grad_norm": 0.8328489065170288, + "learning_rate": 0.00019799666110183642, + "loss": 0.6418, + "step": 36 + }, + { + "epoch": 1.298734234309733e-05, + "grad_norm": 1.1427714824676514, + "learning_rate": 0.0001979298831385643, + "loss": 0.6531, + "step": 37 + }, + { + "epoch": 1.3338351595613472e-05, + "grad_norm": 1.0145376920700073, + "learning_rate": 0.00019786310517529217, + "loss": 0.6473, + "step": 38 + }, + { + "epoch": 1.3689360848129616e-05, + "grad_norm": 0.8427861928939819, + "learning_rate": 0.00019779632721202004, + "loss": 0.5882, + "step": 39 + }, + { + "epoch": 1.404037010064576e-05, + "grad_norm": 0.8792659044265747, + "learning_rate": 0.00019772954924874791, + "loss": 0.608, + "step": 40 + }, + { + "epoch": 1.4391379353161905e-05, + "grad_norm": 0.9338463544845581, + "learning_rate": 0.00019766277128547581, + "loss": 0.7118, + "step": 41 + }, + { + "epoch": 1.4742388605678048e-05, + "grad_norm": 0.7554420232772827, + "learning_rate": 0.0001975959933222037, + "loss": 0.5898, + "step": 42 + }, + { + "epoch": 1.5093397858194192e-05, + "grad_norm": 0.7700084447860718, + "learning_rate": 0.00019752921535893156, + "loss": 0.6466, + "step": 43 + }, + { + "epoch": 1.5444407110710337e-05, + "grad_norm": 0.8639333248138428, + "learning_rate": 0.00019746243739565943, + "loss": 0.7253, + "step": 44 + }, + { + "epoch": 1.579541636322648e-05, + "grad_norm": 0.7760612964630127, + "learning_rate": 0.0001973956594323873, + "loss": 0.7099, + "step": 45 + }, + { + "epoch": 1.6146425615742626e-05, + "grad_norm": 0.7319066524505615, + "learning_rate": 0.0001973288814691152, + "loss": 0.6664, + "step": 46 + }, + { + "epoch": 1.6497434868258768e-05, + "grad_norm": 0.7557100057601929, + "learning_rate": 0.00019726210350584308, + "loss": 0.6318, + "step": 47 + }, + { + "epoch": 1.684844412077491e-05, + "grad_norm": 0.6420389413833618, + "learning_rate": 0.00019719532554257095, + "loss": 0.6688, + "step": 48 + }, + { + "epoch": 1.7199453373291057e-05, + "grad_norm": 0.660383939743042, + "learning_rate": 0.00019712854757929883, + "loss": 0.6204, + "step": 49 + }, + { + "epoch": 1.75504626258072e-05, + "grad_norm": 0.5614909529685974, + "learning_rate": 0.00019706176961602673, + "loss": 0.664, + "step": 50 + }, + { + "epoch": 1.7901471878323346e-05, + "grad_norm": 0.502738356590271, + "learning_rate": 0.0001969949916527546, + "loss": 0.6918, + "step": 51 + }, + { + "epoch": 1.825248113083949e-05, + "grad_norm": 0.47578102350234985, + "learning_rate": 0.0001969282136894825, + "loss": 0.6747, + "step": 52 + }, + { + "epoch": 1.860349038335563e-05, + "grad_norm": 0.5528931617736816, + "learning_rate": 0.00019686143572621037, + "loss": 0.765, + "step": 53 + }, + { + "epoch": 1.8954499635871777e-05, + "grad_norm": 0.6176997423171997, + "learning_rate": 0.00019679465776293825, + "loss": 0.5959, + "step": 54 + }, + { + "epoch": 1.930550888838792e-05, + "grad_norm": 0.43425047397613525, + "learning_rate": 0.00019672787979966612, + "loss": 0.6437, + "step": 55 + }, + { + "epoch": 1.9656518140904066e-05, + "grad_norm": 0.5135884881019592, + "learning_rate": 0.000196661101836394, + "loss": 0.7019, + "step": 56 + }, + { + "epoch": 2.000752739342021e-05, + "grad_norm": 0.4628916084766388, + "learning_rate": 0.0001965943238731219, + "loss": 0.5722, + "step": 57 + }, + { + "epoch": 2.035853664593635e-05, + "grad_norm": 0.48201897740364075, + "learning_rate": 0.00019652754590984977, + "loss": 0.6288, + "step": 58 + }, + { + "epoch": 2.0709545898452498e-05, + "grad_norm": 0.5772811770439148, + "learning_rate": 0.00019646076794657764, + "loss": 0.6067, + "step": 59 + }, + { + "epoch": 2.106055515096864e-05, + "grad_norm": 0.4976802170276642, + "learning_rate": 0.0001963939899833055, + "loss": 0.4722, + "step": 60 + }, + { + "epoch": 2.1411564403484786e-05, + "grad_norm": 0.4842129051685333, + "learning_rate": 0.00019632721202003339, + "loss": 0.5876, + "step": 61 + }, + { + "epoch": 2.176257365600093e-05, + "grad_norm": 0.46149536967277527, + "learning_rate": 0.00019626043405676129, + "loss": 0.6373, + "step": 62 + }, + { + "epoch": 2.2113582908517072e-05, + "grad_norm": 0.47199445962905884, + "learning_rate": 0.00019619365609348916, + "loss": 0.5546, + "step": 63 + }, + { + "epoch": 2.2464592161033218e-05, + "grad_norm": 0.6109340190887451, + "learning_rate": 0.00019612687813021703, + "loss": 0.6069, + "step": 64 + }, + { + "epoch": 2.281560141354936e-05, + "grad_norm": 0.5529135465621948, + "learning_rate": 0.0001960601001669449, + "loss": 0.553, + "step": 65 + }, + { + "epoch": 2.3166610666065503e-05, + "grad_norm": 0.500245213508606, + "learning_rate": 0.00019599332220367278, + "loss": 0.6149, + "step": 66 + }, + { + "epoch": 2.351761991858165e-05, + "grad_norm": 0.4841914474964142, + "learning_rate": 0.00019592654424040068, + "loss": 0.6509, + "step": 67 + }, + { + "epoch": 2.3868629171097792e-05, + "grad_norm": 0.5308504104614258, + "learning_rate": 0.00019585976627712855, + "loss": 0.7017, + "step": 68 + }, + { + "epoch": 2.4219638423613938e-05, + "grad_norm": 0.5157874822616577, + "learning_rate": 0.00019579298831385645, + "loss": 0.7125, + "step": 69 + }, + { + "epoch": 2.457064767613008e-05, + "grad_norm": 0.47787800431251526, + "learning_rate": 0.00019572621035058433, + "loss": 0.5792, + "step": 70 + }, + { + "epoch": 2.4921656928646224e-05, + "grad_norm": 0.46792763471603394, + "learning_rate": 0.0001956594323873122, + "loss": 0.7, + "step": 71 + }, + { + "epoch": 2.527266618116237e-05, + "grad_norm": 0.5394675135612488, + "learning_rate": 0.00019559265442404007, + "loss": 0.5549, + "step": 72 + }, + { + "epoch": 2.5623675433678512e-05, + "grad_norm": 0.45065200328826904, + "learning_rate": 0.00019552587646076797, + "loss": 0.6663, + "step": 73 + }, + { + "epoch": 2.597468468619466e-05, + "grad_norm": 0.4026688039302826, + "learning_rate": 0.00019545909849749584, + "loss": 0.6315, + "step": 74 + }, + { + "epoch": 2.63256939387108e-05, + "grad_norm": 0.42353659868240356, + "learning_rate": 0.00019539232053422372, + "loss": 0.5419, + "step": 75 + }, + { + "epoch": 2.6676703191226944e-05, + "grad_norm": 0.45561954379081726, + "learning_rate": 0.0001953255425709516, + "loss": 0.6624, + "step": 76 + }, + { + "epoch": 2.702771244374309e-05, + "grad_norm": 0.3954075574874878, + "learning_rate": 0.00019525876460767946, + "loss": 0.5479, + "step": 77 + }, + { + "epoch": 2.7378721696259233e-05, + "grad_norm": 0.4994329512119293, + "learning_rate": 0.00019519198664440736, + "loss": 0.7224, + "step": 78 + }, + { + "epoch": 2.7729730948775375e-05, + "grad_norm": 0.41149672865867615, + "learning_rate": 0.00019512520868113524, + "loss": 0.5621, + "step": 79 + }, + { + "epoch": 2.808074020129152e-05, + "grad_norm": 0.4199008345603943, + "learning_rate": 0.0001950584307178631, + "loss": 0.7038, + "step": 80 + }, + { + "epoch": 2.8431749453807664e-05, + "grad_norm": 0.4378969371318817, + "learning_rate": 0.00019499165275459098, + "loss": 0.6654, + "step": 81 + }, + { + "epoch": 2.878275870632381e-05, + "grad_norm": 0.4653928279876709, + "learning_rate": 0.00019492487479131886, + "loss": 0.6241, + "step": 82 + }, + { + "epoch": 2.9133767958839953e-05, + "grad_norm": 0.5166454911231995, + "learning_rate": 0.00019485809682804673, + "loss": 0.5366, + "step": 83 + }, + { + "epoch": 2.9484777211356096e-05, + "grad_norm": 0.43180733919143677, + "learning_rate": 0.00019479131886477463, + "loss": 0.6178, + "step": 84 + }, + { + "epoch": 2.9835786463872242e-05, + "grad_norm": 0.44828200340270996, + "learning_rate": 0.0001947245409015025, + "loss": 0.6706, + "step": 85 + }, + { + "epoch": 3.0186795716388385e-05, + "grad_norm": 0.384175181388855, + "learning_rate": 0.0001946577629382304, + "loss": 0.5551, + "step": 86 + }, + { + "epoch": 3.053780496890453e-05, + "grad_norm": 0.4359772503376007, + "learning_rate": 0.00019459098497495828, + "loss": 0.5626, + "step": 87 + }, + { + "epoch": 3.0888814221420673e-05, + "grad_norm": 0.4177016615867615, + "learning_rate": 0.00019452420701168615, + "loss": 0.6023, + "step": 88 + }, + { + "epoch": 3.1239823473936816e-05, + "grad_norm": 0.43592438101768494, + "learning_rate": 0.00019445742904841405, + "loss": 0.682, + "step": 89 + }, + { + "epoch": 3.159083272645296e-05, + "grad_norm": 0.48027974367141724, + "learning_rate": 0.00019439065108514192, + "loss": 0.7596, + "step": 90 + }, + { + "epoch": 3.194184197896911e-05, + "grad_norm": 0.35989537835121155, + "learning_rate": 0.0001943238731218698, + "loss": 0.6018, + "step": 91 + }, + { + "epoch": 3.229285123148525e-05, + "grad_norm": 0.48477092385292053, + "learning_rate": 0.00019425709515859767, + "loss": 0.512, + "step": 92 + }, + { + "epoch": 3.2643860484001394e-05, + "grad_norm": 0.38858646154403687, + "learning_rate": 0.00019419031719532554, + "loss": 0.6371, + "step": 93 + }, + { + "epoch": 3.2994869736517536e-05, + "grad_norm": 0.5323147177696228, + "learning_rate": 0.00019412353923205344, + "loss": 0.5221, + "step": 94 + }, + { + "epoch": 3.334587898903368e-05, + "grad_norm": 0.3784274160861969, + "learning_rate": 0.00019405676126878132, + "loss": 0.6158, + "step": 95 + }, + { + "epoch": 3.369688824154982e-05, + "grad_norm": 0.4076334834098816, + "learning_rate": 0.0001939899833055092, + "loss": 0.5535, + "step": 96 + }, + { + "epoch": 3.404789749406597e-05, + "grad_norm": 0.43930479884147644, + "learning_rate": 0.00019392320534223706, + "loss": 0.6482, + "step": 97 + }, + { + "epoch": 3.4398906746582114e-05, + "grad_norm": 0.4266909658908844, + "learning_rate": 0.00019385642737896494, + "loss": 0.6, + "step": 98 + }, + { + "epoch": 3.474991599909826e-05, + "grad_norm": 0.45353513956069946, + "learning_rate": 0.0001937896494156928, + "loss": 0.6596, + "step": 99 + }, + { + "epoch": 3.51009252516144e-05, + "grad_norm": 0.3424838185310364, + "learning_rate": 0.0001937228714524207, + "loss": 0.555, + "step": 100 + }, + { + "epoch": 3.545193450413054e-05, + "grad_norm": 0.40126165747642517, + "learning_rate": 0.00019365609348914858, + "loss": 0.6921, + "step": 101 + }, + { + "epoch": 3.580294375664669e-05, + "grad_norm": 0.36572012305259705, + "learning_rate": 0.00019358931552587646, + "loss": 0.5485, + "step": 102 + }, + { + "epoch": 3.6153953009162834e-05, + "grad_norm": 0.3972407281398773, + "learning_rate": 0.00019352253756260436, + "loss": 0.5884, + "step": 103 + }, + { + "epoch": 3.650496226167898e-05, + "grad_norm": 0.3900579512119293, + "learning_rate": 0.00019345575959933223, + "loss": 0.6664, + "step": 104 + }, + { + "epoch": 3.685597151419512e-05, + "grad_norm": 0.31666621565818787, + "learning_rate": 0.00019338898163606013, + "loss": 0.5009, + "step": 105 + }, + { + "epoch": 3.720698076671126e-05, + "grad_norm": 0.5269597172737122, + "learning_rate": 0.000193322203672788, + "loss": 0.6292, + "step": 106 + }, + { + "epoch": 3.755799001922741e-05, + "grad_norm": 0.4645126163959503, + "learning_rate": 0.00019325542570951588, + "loss": 0.636, + "step": 107 + }, + { + "epoch": 3.7908999271743555e-05, + "grad_norm": 0.3900754153728485, + "learning_rate": 0.00019318864774624375, + "loss": 0.5367, + "step": 108 + }, + { + "epoch": 3.82600085242597e-05, + "grad_norm": 0.42533883452415466, + "learning_rate": 0.00019312186978297162, + "loss": 0.6862, + "step": 109 + }, + { + "epoch": 3.861101777677584e-05, + "grad_norm": 0.6809422969818115, + "learning_rate": 0.00019305509181969952, + "loss": 0.6434, + "step": 110 + }, + { + "epoch": 3.896202702929198e-05, + "grad_norm": 0.5127860307693481, + "learning_rate": 0.0001929883138564274, + "loss": 0.6266, + "step": 111 + }, + { + "epoch": 3.931303628180813e-05, + "grad_norm": 0.5254234671592712, + "learning_rate": 0.00019292153589315527, + "loss": 0.6982, + "step": 112 + }, + { + "epoch": 3.9664045534324275e-05, + "grad_norm": 0.3699031472206116, + "learning_rate": 0.00019285475792988314, + "loss": 0.6037, + "step": 113 + }, + { + "epoch": 4.001505478684042e-05, + "grad_norm": 0.3807130455970764, + "learning_rate": 0.00019278797996661101, + "loss": 0.5861, + "step": 114 + }, + { + "epoch": 4.036606403935656e-05, + "grad_norm": 0.4455645978450775, + "learning_rate": 0.0001927212020033389, + "loss": 0.5658, + "step": 115 + }, + { + "epoch": 4.07170732918727e-05, + "grad_norm": 0.3830210864543915, + "learning_rate": 0.0001926544240400668, + "loss": 0.606, + "step": 116 + }, + { + "epoch": 4.106808254438885e-05, + "grad_norm": 0.41419631242752075, + "learning_rate": 0.00019258764607679466, + "loss": 0.6095, + "step": 117 + }, + { + "epoch": 4.1419091796904995e-05, + "grad_norm": 0.3929574489593506, + "learning_rate": 0.00019252086811352253, + "loss": 0.6464, + "step": 118 + }, + { + "epoch": 4.177010104942114e-05, + "grad_norm": 0.35958629846572876, + "learning_rate": 0.0001924540901502504, + "loss": 0.5185, + "step": 119 + }, + { + "epoch": 4.212111030193728e-05, + "grad_norm": 0.3790556490421295, + "learning_rate": 0.0001923873121869783, + "loss": 0.5156, + "step": 120 + }, + { + "epoch": 4.2472119554453423e-05, + "grad_norm": 0.37452438473701477, + "learning_rate": 0.00019232053422370618, + "loss": 0.5711, + "step": 121 + }, + { + "epoch": 4.282312880696957e-05, + "grad_norm": 0.38976770639419556, + "learning_rate": 0.00019225375626043408, + "loss": 0.6075, + "step": 122 + }, + { + "epoch": 4.3174138059485716e-05, + "grad_norm": 0.4098513424396515, + "learning_rate": 0.00019218697829716195, + "loss": 0.5312, + "step": 123 + }, + { + "epoch": 4.352514731200186e-05, + "grad_norm": 0.33890047669410706, + "learning_rate": 0.00019212020033388983, + "loss": 0.4984, + "step": 124 + }, + { + "epoch": 4.3876156564518e-05, + "grad_norm": 0.49077001214027405, + "learning_rate": 0.0001920534223706177, + "loss": 0.7159, + "step": 125 + }, + { + "epoch": 4.4227165817034144e-05, + "grad_norm": 0.41653814911842346, + "learning_rate": 0.0001919866444073456, + "loss": 0.5642, + "step": 126 + }, + { + "epoch": 4.4578175069550286e-05, + "grad_norm": 0.45710283517837524, + "learning_rate": 0.00019191986644407347, + "loss": 0.6936, + "step": 127 + }, + { + "epoch": 4.4929184322066436e-05, + "grad_norm": 0.36976873874664307, + "learning_rate": 0.00019185308848080135, + "loss": 0.5407, + "step": 128 + }, + { + "epoch": 4.528019357458258e-05, + "grad_norm": 0.42852675914764404, + "learning_rate": 0.00019178631051752922, + "loss": 0.6731, + "step": 129 + }, + { + "epoch": 4.563120282709872e-05, + "grad_norm": 0.5426310300827026, + "learning_rate": 0.0001917195325542571, + "loss": 0.5775, + "step": 130 + }, + { + "epoch": 4.5982212079614864e-05, + "grad_norm": 0.38442543148994446, + "learning_rate": 0.00019165275459098497, + "loss": 0.5994, + "step": 131 + }, + { + "epoch": 4.633322133213101e-05, + "grad_norm": 0.4298035502433777, + "learning_rate": 0.00019158597662771287, + "loss": 0.5563, + "step": 132 + }, + { + "epoch": 4.6684230584647156e-05, + "grad_norm": 0.40397605299949646, + "learning_rate": 0.00019151919866444074, + "loss": 0.6924, + "step": 133 + }, + { + "epoch": 4.70352398371633e-05, + "grad_norm": 0.4338497519493103, + "learning_rate": 0.0001914524207011686, + "loss": 0.5739, + "step": 134 + }, + { + "epoch": 4.738624908967944e-05, + "grad_norm": 0.39713653922080994, + "learning_rate": 0.0001913856427378965, + "loss": 0.4529, + "step": 135 + }, + { + "epoch": 4.7737258342195584e-05, + "grad_norm": 0.31409478187561035, + "learning_rate": 0.0001913188647746244, + "loss": 0.562, + "step": 136 + }, + { + "epoch": 4.808826759471173e-05, + "grad_norm": 0.371624618768692, + "learning_rate": 0.00019125208681135226, + "loss": 0.5288, + "step": 137 + }, + { + "epoch": 4.8439276847227877e-05, + "grad_norm": 0.4600190818309784, + "learning_rate": 0.00019118530884808016, + "loss": 0.6215, + "step": 138 + }, + { + "epoch": 4.879028609974402e-05, + "grad_norm": 0.45351359248161316, + "learning_rate": 0.00019111853088480803, + "loss": 0.686, + "step": 139 + }, + { + "epoch": 4.914129535226016e-05, + "grad_norm": 0.42282962799072266, + "learning_rate": 0.0001910517529215359, + "loss": 0.5966, + "step": 140 + }, + { + "epoch": 4.9492304604776305e-05, + "grad_norm": 0.41479986906051636, + "learning_rate": 0.00019098497495826378, + "loss": 0.5948, + "step": 141 + }, + { + "epoch": 4.984331385729245e-05, + "grad_norm": 0.40453553199768066, + "learning_rate": 0.00019091819699499168, + "loss": 0.6411, + "step": 142 + }, + { + "epoch": 5.01943231098086e-05, + "grad_norm": 0.3939369320869446, + "learning_rate": 0.00019085141903171955, + "loss": 0.5513, + "step": 143 + }, + { + "epoch": 5.054533236232474e-05, + "grad_norm": 0.3700481653213501, + "learning_rate": 0.00019078464106844743, + "loss": 0.5459, + "step": 144 + }, + { + "epoch": 5.089634161484088e-05, + "grad_norm": 0.4377487897872925, + "learning_rate": 0.0001907178631051753, + "loss": 0.6076, + "step": 145 + }, + { + "epoch": 5.1247350867357025e-05, + "grad_norm": 0.37919673323631287, + "learning_rate": 0.00019065108514190317, + "loss": 0.5207, + "step": 146 + }, + { + "epoch": 5.159836011987317e-05, + "grad_norm": 0.3841630816459656, + "learning_rate": 0.00019058430717863107, + "loss": 0.614, + "step": 147 + }, + { + "epoch": 5.194936937238932e-05, + "grad_norm": 0.43541714549064636, + "learning_rate": 0.00019051752921535895, + "loss": 0.6283, + "step": 148 + }, + { + "epoch": 5.230037862490546e-05, + "grad_norm": 0.4853285253047943, + "learning_rate": 0.00019045075125208682, + "loss": 0.5807, + "step": 149 + }, + { + "epoch": 5.26513878774216e-05, + "grad_norm": 0.3572970926761627, + "learning_rate": 0.0001903839732888147, + "loss": 0.6866, + "step": 150 + }, + { + "epoch": 5.3002397129937745e-05, + "grad_norm": 0.3674347698688507, + "learning_rate": 0.00019031719532554257, + "loss": 0.5552, + "step": 151 + }, + { + "epoch": 5.335340638245389e-05, + "grad_norm": 0.37748461961746216, + "learning_rate": 0.00019025041736227044, + "loss": 0.6278, + "step": 152 + }, + { + "epoch": 5.370441563497003e-05, + "grad_norm": 0.3788503408432007, + "learning_rate": 0.00019018363939899834, + "loss": 0.622, + "step": 153 + }, + { + "epoch": 5.405542488748618e-05, + "grad_norm": 0.3736303150653839, + "learning_rate": 0.0001901168614357262, + "loss": 0.5822, + "step": 154 + }, + { + "epoch": 5.440643414000232e-05, + "grad_norm": 0.32680070400238037, + "learning_rate": 0.0001900500834724541, + "loss": 0.5715, + "step": 155 + }, + { + "epoch": 5.4757443392518466e-05, + "grad_norm": 0.34495192766189575, + "learning_rate": 0.00018998330550918199, + "loss": 0.6497, + "step": 156 + }, + { + "epoch": 5.510845264503461e-05, + "grad_norm": 0.4244193136692047, + "learning_rate": 0.00018991652754590986, + "loss": 0.5519, + "step": 157 + }, + { + "epoch": 5.545946189755075e-05, + "grad_norm": 0.4024031162261963, + "learning_rate": 0.00018984974958263776, + "loss": 0.5339, + "step": 158 + }, + { + "epoch": 5.58104711500669e-05, + "grad_norm": 0.46051299571990967, + "learning_rate": 0.00018978297161936563, + "loss": 0.5979, + "step": 159 + }, + { + "epoch": 5.616148040258304e-05, + "grad_norm": 0.49051615595817566, + "learning_rate": 0.0001897161936560935, + "loss": 0.5563, + "step": 160 + }, + { + "epoch": 5.6512489655099186e-05, + "grad_norm": 0.43045854568481445, + "learning_rate": 0.00018964941569282138, + "loss": 0.5984, + "step": 161 + }, + { + "epoch": 5.686349890761533e-05, + "grad_norm": 0.37778228521347046, + "learning_rate": 0.00018958263772954925, + "loss": 0.5955, + "step": 162 + }, + { + "epoch": 5.721450816013147e-05, + "grad_norm": 0.3736341893672943, + "learning_rate": 0.00018951585976627715, + "loss": 0.6438, + "step": 163 + }, + { + "epoch": 5.756551741264762e-05, + "grad_norm": 0.3940117061138153, + "learning_rate": 0.00018944908180300502, + "loss": 0.503, + "step": 164 + }, + { + "epoch": 5.7916526665163763e-05, + "grad_norm": 0.4193519055843353, + "learning_rate": 0.0001893823038397329, + "loss": 0.6324, + "step": 165 + }, + { + "epoch": 5.8267535917679906e-05, + "grad_norm": 0.34481996297836304, + "learning_rate": 0.00018931552587646077, + "loss": 0.5745, + "step": 166 + }, + { + "epoch": 5.861854517019605e-05, + "grad_norm": 0.38285771012306213, + "learning_rate": 0.00018924874791318864, + "loss": 0.639, + "step": 167 + }, + { + "epoch": 5.896955442271219e-05, + "grad_norm": 0.36933982372283936, + "learning_rate": 0.00018918196994991652, + "loss": 0.6681, + "step": 168 + }, + { + "epoch": 5.932056367522834e-05, + "grad_norm": 0.36970776319503784, + "learning_rate": 0.00018911519198664442, + "loss": 0.5626, + "step": 169 + }, + { + "epoch": 5.9671572927744484e-05, + "grad_norm": 0.38494783639907837, + "learning_rate": 0.0001890484140233723, + "loss": 0.6066, + "step": 170 + }, + { + "epoch": 6.0022582180260627e-05, + "grad_norm": 0.3446069061756134, + "learning_rate": 0.00018898163606010016, + "loss": 0.6354, + "step": 171 + }, + { + "epoch": 6.037359143277677e-05, + "grad_norm": 0.4466759264469147, + "learning_rate": 0.00018891485809682806, + "loss": 0.4737, + "step": 172 + }, + { + "epoch": 6.072460068529291e-05, + "grad_norm": 0.43630918860435486, + "learning_rate": 0.00018884808013355594, + "loss": 0.6839, + "step": 173 + }, + { + "epoch": 6.107560993780906e-05, + "grad_norm": 0.37083202600479126, + "learning_rate": 0.00018878130217028384, + "loss": 0.5372, + "step": 174 + }, + { + "epoch": 6.14266191903252e-05, + "grad_norm": 0.37066200375556946, + "learning_rate": 0.0001887145242070117, + "loss": 0.6653, + "step": 175 + }, + { + "epoch": 6.177762844284135e-05, + "grad_norm": 0.5191747546195984, + "learning_rate": 0.00018864774624373958, + "loss": 0.6677, + "step": 176 + }, + { + "epoch": 6.21286376953575e-05, + "grad_norm": 0.4235158860683441, + "learning_rate": 0.00018858096828046746, + "loss": 0.5971, + "step": 177 + }, + { + "epoch": 6.247964694787363e-05, + "grad_norm": 0.405074805021286, + "learning_rate": 0.00018851419031719533, + "loss": 0.5717, + "step": 178 + }, + { + "epoch": 6.283065620038978e-05, + "grad_norm": 0.45817336440086365, + "learning_rate": 0.00018844741235392323, + "loss": 0.5878, + "step": 179 + }, + { + "epoch": 6.318166545290592e-05, + "grad_norm": 0.6313037276268005, + "learning_rate": 0.0001883806343906511, + "loss": 0.62, + "step": 180 + }, + { + "epoch": 6.353267470542207e-05, + "grad_norm": 0.41896742582321167, + "learning_rate": 0.00018831385642737898, + "loss": 0.5565, + "step": 181 + }, + { + "epoch": 6.388368395793822e-05, + "grad_norm": 0.4143432676792145, + "learning_rate": 0.00018824707846410685, + "loss": 0.5552, + "step": 182 + }, + { + "epoch": 6.423469321045435e-05, + "grad_norm": 0.38745641708374023, + "learning_rate": 0.00018818030050083472, + "loss": 0.5949, + "step": 183 + }, + { + "epoch": 6.45857024629705e-05, + "grad_norm": 0.7472612261772156, + "learning_rate": 0.0001881135225375626, + "loss": 0.6708, + "step": 184 + }, + { + "epoch": 6.493671171548664e-05, + "grad_norm": 0.4416198432445526, + "learning_rate": 0.0001880467445742905, + "loss": 0.6069, + "step": 185 + }, + { + "epoch": 6.528772096800279e-05, + "grad_norm": 0.4312993884086609, + "learning_rate": 0.00018797996661101837, + "loss": 0.5778, + "step": 186 + }, + { + "epoch": 6.563873022051894e-05, + "grad_norm": 0.4524860978126526, + "learning_rate": 0.00018791318864774624, + "loss": 0.5091, + "step": 187 + }, + { + "epoch": 6.598973947303507e-05, + "grad_norm": 0.4320828914642334, + "learning_rate": 0.00018784641068447412, + "loss": 0.6557, + "step": 188 + }, + { + "epoch": 6.634074872555122e-05, + "grad_norm": 0.6967452168464661, + "learning_rate": 0.00018777963272120202, + "loss": 0.612, + "step": 189 + }, + { + "epoch": 6.669175797806736e-05, + "grad_norm": 0.4389924705028534, + "learning_rate": 0.0001877128547579299, + "loss": 0.6271, + "step": 190 + }, + { + "epoch": 6.704276723058351e-05, + "grad_norm": 0.3693922162055969, + "learning_rate": 0.0001876460767946578, + "loss": 0.6715, + "step": 191 + }, + { + "epoch": 6.739377648309964e-05, + "grad_norm": 0.32230404019355774, + "learning_rate": 0.00018757929883138566, + "loss": 0.6344, + "step": 192 + }, + { + "epoch": 6.774478573561579e-05, + "grad_norm": 0.4440002143383026, + "learning_rate": 0.00018751252086811354, + "loss": 0.6671, + "step": 193 + }, + { + "epoch": 6.809579498813194e-05, + "grad_norm": 0.5676587820053101, + "learning_rate": 0.0001874457429048414, + "loss": 0.6818, + "step": 194 + }, + { + "epoch": 6.844680424064808e-05, + "grad_norm": 0.36207348108291626, + "learning_rate": 0.0001873789649415693, + "loss": 0.5029, + "step": 195 + }, + { + "epoch": 6.879781349316423e-05, + "grad_norm": 0.35714131593704224, + "learning_rate": 0.00018731218697829718, + "loss": 0.6127, + "step": 196 + }, + { + "epoch": 6.914882274568036e-05, + "grad_norm": 0.4285273551940918, + "learning_rate": 0.00018724540901502506, + "loss": 0.6355, + "step": 197 + }, + { + "epoch": 6.949983199819651e-05, + "grad_norm": 0.42585939168930054, + "learning_rate": 0.00018717863105175293, + "loss": 0.6302, + "step": 198 + }, + { + "epoch": 6.985084125071266e-05, + "grad_norm": 0.524303138256073, + "learning_rate": 0.0001871118530884808, + "loss": 0.6683, + "step": 199 + }, + { + "epoch": 7.02018505032288e-05, + "grad_norm": 0.39635923504829407, + "learning_rate": 0.00018704507512520868, + "loss": 0.6694, + "step": 200 + }, + { + "epoch": 7.055285975574495e-05, + "grad_norm": 0.39712437987327576, + "learning_rate": 0.00018697829716193658, + "loss": 0.5794, + "step": 201 + }, + { + "epoch": 7.090386900826108e-05, + "grad_norm": 0.4115397334098816, + "learning_rate": 0.00018691151919866445, + "loss": 0.5579, + "step": 202 + }, + { + "epoch": 7.125487826077723e-05, + "grad_norm": 0.4776385724544525, + "learning_rate": 0.00018684474123539232, + "loss": 0.5589, + "step": 203 + }, + { + "epoch": 7.160588751329338e-05, + "grad_norm": 0.35574638843536377, + "learning_rate": 0.0001867779632721202, + "loss": 0.5311, + "step": 204 + }, + { + "epoch": 7.195689676580952e-05, + "grad_norm": 0.44872432947158813, + "learning_rate": 0.00018671118530884807, + "loss": 0.635, + "step": 205 + }, + { + "epoch": 7.230790601832567e-05, + "grad_norm": 0.3511079251766205, + "learning_rate": 0.00018664440734557597, + "loss": 0.5317, + "step": 206 + }, + { + "epoch": 7.26589152708418e-05, + "grad_norm": 0.39862194657325745, + "learning_rate": 0.00018657762938230384, + "loss": 0.6653, + "step": 207 + }, + { + "epoch": 7.300992452335795e-05, + "grad_norm": 0.4046575725078583, + "learning_rate": 0.00018651085141903174, + "loss": 0.6065, + "step": 208 + }, + { + "epoch": 7.33609337758741e-05, + "grad_norm": 0.4231868088245392, + "learning_rate": 0.00018644407345575962, + "loss": 0.7078, + "step": 209 + }, + { + "epoch": 7.371194302839024e-05, + "grad_norm": 0.364700049161911, + "learning_rate": 0.0001863772954924875, + "loss": 0.6309, + "step": 210 + }, + { + "epoch": 7.406295228090639e-05, + "grad_norm": 0.5385531187057495, + "learning_rate": 0.0001863105175292154, + "loss": 0.4233, + "step": 211 + }, + { + "epoch": 7.441396153342252e-05, + "grad_norm": 0.39415115118026733, + "learning_rate": 0.00018624373956594326, + "loss": 0.5928, + "step": 212 + }, + { + "epoch": 7.476497078593867e-05, + "grad_norm": 0.6021363735198975, + "learning_rate": 0.00018617696160267113, + "loss": 0.6611, + "step": 213 + }, + { + "epoch": 7.511598003845482e-05, + "grad_norm": 0.3709903061389923, + "learning_rate": 0.000186110183639399, + "loss": 0.6136, + "step": 214 + }, + { + "epoch": 7.546698929097096e-05, + "grad_norm": 0.36710435152053833, + "learning_rate": 0.00018604340567612688, + "loss": 0.5267, + "step": 215 + }, + { + "epoch": 7.581799854348711e-05, + "grad_norm": 0.4379352033138275, + "learning_rate": 0.00018597662771285475, + "loss": 0.6429, + "step": 216 + }, + { + "epoch": 7.616900779600325e-05, + "grad_norm": 0.3408482074737549, + "learning_rate": 0.00018590984974958265, + "loss": 0.5379, + "step": 217 + }, + { + "epoch": 7.65200170485194e-05, + "grad_norm": 0.4487043023109436, + "learning_rate": 0.00018584307178631053, + "loss": 0.6582, + "step": 218 + }, + { + "epoch": 7.687102630103554e-05, + "grad_norm": 0.42003679275512695, + "learning_rate": 0.0001857762938230384, + "loss": 0.5712, + "step": 219 + }, + { + "epoch": 7.722203555355168e-05, + "grad_norm": 0.4698665738105774, + "learning_rate": 0.00018570951585976627, + "loss": 0.5715, + "step": 220 + }, + { + "epoch": 7.757304480606783e-05, + "grad_norm": 0.3777780830860138, + "learning_rate": 0.00018564273789649415, + "loss": 0.4667, + "step": 221 + }, + { + "epoch": 7.792405405858397e-05, + "grad_norm": 0.36794212460517883, + "learning_rate": 0.00018557595993322205, + "loss": 0.5382, + "step": 222 + }, + { + "epoch": 7.827506331110012e-05, + "grad_norm": 0.4582989513874054, + "learning_rate": 0.00018550918196994992, + "loss": 0.6437, + "step": 223 + }, + { + "epoch": 7.862607256361626e-05, + "grad_norm": 0.4065852761268616, + "learning_rate": 0.0001854424040066778, + "loss": 0.6928, + "step": 224 + }, + { + "epoch": 7.89770818161324e-05, + "grad_norm": 0.3857649564743042, + "learning_rate": 0.0001853756260434057, + "loss": 0.5405, + "step": 225 + }, + { + "epoch": 7.932809106864855e-05, + "grad_norm": 0.40056589245796204, + "learning_rate": 0.00018530884808013357, + "loss": 0.6425, + "step": 226 + }, + { + "epoch": 7.967910032116469e-05, + "grad_norm": 0.43137016892433167, + "learning_rate": 0.00018524207011686147, + "loss": 0.5001, + "step": 227 + }, + { + "epoch": 8.003010957368084e-05, + "grad_norm": 0.3723987340927124, + "learning_rate": 0.00018517529215358934, + "loss": 0.5118, + "step": 228 + }, + { + "epoch": 8.038111882619698e-05, + "grad_norm": 0.34196361899375916, + "learning_rate": 0.00018510851419031721, + "loss": 0.5468, + "step": 229 + }, + { + "epoch": 8.073212807871312e-05, + "grad_norm": 0.4319117069244385, + "learning_rate": 0.0001850417362270451, + "loss": 0.5703, + "step": 230 + }, + { + "epoch": 8.108313733122927e-05, + "grad_norm": 0.4467247724533081, + "learning_rate": 0.00018497495826377296, + "loss": 0.6536, + "step": 231 + }, + { + "epoch": 8.14341465837454e-05, + "grad_norm": 0.3569909632205963, + "learning_rate": 0.00018490818030050083, + "loss": 0.5335, + "step": 232 + }, + { + "epoch": 8.178515583626156e-05, + "grad_norm": 0.33486437797546387, + "learning_rate": 0.00018484140233722873, + "loss": 0.6803, + "step": 233 + }, + { + "epoch": 8.21361650887777e-05, + "grad_norm": 0.3783140480518341, + "learning_rate": 0.0001847746243739566, + "loss": 0.6361, + "step": 234 + }, + { + "epoch": 8.248717434129384e-05, + "grad_norm": 0.4844662547111511, + "learning_rate": 0.00018470784641068448, + "loss": 0.5322, + "step": 235 + }, + { + "epoch": 8.283818359380999e-05, + "grad_norm": 0.508406400680542, + "learning_rate": 0.00018464106844741235, + "loss": 0.6676, + "step": 236 + }, + { + "epoch": 8.318919284632613e-05, + "grad_norm": 0.3710225820541382, + "learning_rate": 0.00018457429048414023, + "loss": 0.6656, + "step": 237 + }, + { + "epoch": 8.354020209884228e-05, + "grad_norm": 0.3757292628288269, + "learning_rate": 0.00018450751252086813, + "loss": 0.6095, + "step": 238 + }, + { + "epoch": 8.389121135135843e-05, + "grad_norm": 0.40651261806488037, + "learning_rate": 0.000184440734557596, + "loss": 0.6626, + "step": 239 + }, + { + "epoch": 8.424222060387456e-05, + "grad_norm": 0.40700778365135193, + "learning_rate": 0.00018437395659432387, + "loss": 0.5328, + "step": 240 + }, + { + "epoch": 8.459322985639071e-05, + "grad_norm": 0.5067440867424011, + "learning_rate": 0.00018430717863105175, + "loss": 0.4811, + "step": 241 + }, + { + "epoch": 8.494423910890685e-05, + "grad_norm": 0.3934602737426758, + "learning_rate": 0.00018424040066777965, + "loss": 0.5691, + "step": 242 + }, + { + "epoch": 8.5295248361423e-05, + "grad_norm": 0.3360019624233246, + "learning_rate": 0.00018417362270450752, + "loss": 0.5542, + "step": 243 + }, + { + "epoch": 8.564625761393915e-05, + "grad_norm": 0.4023631513118744, + "learning_rate": 0.00018410684474123542, + "loss": 0.5192, + "step": 244 + }, + { + "epoch": 8.599726686645528e-05, + "grad_norm": 0.41704171895980835, + "learning_rate": 0.0001840400667779633, + "loss": 0.5018, + "step": 245 + }, + { + "epoch": 8.634827611897143e-05, + "grad_norm": 0.361977756023407, + "learning_rate": 0.00018397328881469117, + "loss": 0.6193, + "step": 246 + }, + { + "epoch": 8.669928537148757e-05, + "grad_norm": 0.37774717807769775, + "learning_rate": 0.00018390651085141904, + "loss": 0.5552, + "step": 247 + }, + { + "epoch": 8.705029462400372e-05, + "grad_norm": 0.3408471941947937, + "learning_rate": 0.0001838397328881469, + "loss": 0.5876, + "step": 248 + }, + { + "epoch": 8.740130387651985e-05, + "grad_norm": 0.3892226815223694, + "learning_rate": 0.0001837729549248748, + "loss": 0.4227, + "step": 249 + }, + { + "epoch": 8.7752313129036e-05, + "grad_norm": 0.5315036177635193, + "learning_rate": 0.00018370617696160269, + "loss": 0.5826, + "step": 250 + }, + { + "epoch": 8.810332238155215e-05, + "grad_norm": 0.35433024168014526, + "learning_rate": 0.00018363939899833056, + "loss": 0.5992, + "step": 251 + }, + { + "epoch": 8.845433163406829e-05, + "grad_norm": 0.34777382016181946, + "learning_rate": 0.00018357262103505843, + "loss": 0.4973, + "step": 252 + }, + { + "epoch": 8.880534088658444e-05, + "grad_norm": 0.3936387002468109, + "learning_rate": 0.0001835058430717863, + "loss": 0.6254, + "step": 253 + }, + { + "epoch": 8.915635013910057e-05, + "grad_norm": 0.4009217917919159, + "learning_rate": 0.0001834390651085142, + "loss": 0.4843, + "step": 254 + }, + { + "epoch": 8.950735939161672e-05, + "grad_norm": 0.4863683879375458, + "learning_rate": 0.00018337228714524208, + "loss": 0.5204, + "step": 255 + }, + { + "epoch": 8.985836864413287e-05, + "grad_norm": 0.6100988984107971, + "learning_rate": 0.00018330550918196995, + "loss": 0.7296, + "step": 256 + }, + { + "epoch": 9.020937789664901e-05, + "grad_norm": 0.40949374437332153, + "learning_rate": 0.00018323873121869782, + "loss": 0.5707, + "step": 257 + }, + { + "epoch": 9.056038714916516e-05, + "grad_norm": 0.47316402196884155, + "learning_rate": 0.0001831719532554257, + "loss": 0.6655, + "step": 258 + }, + { + "epoch": 9.091139640168129e-05, + "grad_norm": 0.4053696393966675, + "learning_rate": 0.0001831051752921536, + "loss": 0.5822, + "step": 259 + }, + { + "epoch": 9.126240565419744e-05, + "grad_norm": 0.4582972228527069, + "learning_rate": 0.00018303839732888147, + "loss": 0.5475, + "step": 260 + }, + { + "epoch": 9.161341490671359e-05, + "grad_norm": 0.38666802644729614, + "learning_rate": 0.00018297161936560937, + "loss": 0.4744, + "step": 261 + }, + { + "epoch": 9.196442415922973e-05, + "grad_norm": 0.31954991817474365, + "learning_rate": 0.00018290484140233724, + "loss": 0.6337, + "step": 262 + }, + { + "epoch": 9.231543341174588e-05, + "grad_norm": 0.3590424358844757, + "learning_rate": 0.00018283806343906512, + "loss": 0.5683, + "step": 263 + }, + { + "epoch": 9.266644266426201e-05, + "grad_norm": 0.4042195975780487, + "learning_rate": 0.000182771285475793, + "loss": 0.6142, + "step": 264 + }, + { + "epoch": 9.301745191677816e-05, + "grad_norm": 0.3474234342575073, + "learning_rate": 0.0001827045075125209, + "loss": 0.6035, + "step": 265 + }, + { + "epoch": 9.336846116929431e-05, + "grad_norm": 0.337091326713562, + "learning_rate": 0.00018263772954924876, + "loss": 0.6107, + "step": 266 + }, + { + "epoch": 9.371947042181045e-05, + "grad_norm": 0.3313732445240021, + "learning_rate": 0.00018257095158597664, + "loss": 0.6491, + "step": 267 + }, + { + "epoch": 9.40704796743266e-05, + "grad_norm": 0.3931679129600525, + "learning_rate": 0.0001825041736227045, + "loss": 0.5492, + "step": 268 + }, + { + "epoch": 9.442148892684273e-05, + "grad_norm": 0.5848420262336731, + "learning_rate": 0.00018243739565943238, + "loss": 0.7091, + "step": 269 + }, + { + "epoch": 9.477249817935888e-05, + "grad_norm": 0.4851846992969513, + "learning_rate": 0.00018237061769616028, + "loss": 0.5856, + "step": 270 + }, + { + "epoch": 9.512350743187503e-05, + "grad_norm": 0.3434993326663971, + "learning_rate": 0.00018230383973288816, + "loss": 0.5085, + "step": 271 + }, + { + "epoch": 9.547451668439117e-05, + "grad_norm": 0.2978988587856293, + "learning_rate": 0.00018223706176961603, + "loss": 0.481, + "step": 272 + }, + { + "epoch": 9.582552593690732e-05, + "grad_norm": 0.34215858578681946, + "learning_rate": 0.0001821702838063439, + "loss": 0.5723, + "step": 273 + }, + { + "epoch": 9.617653518942345e-05, + "grad_norm": 0.43445509672164917, + "learning_rate": 0.00018210350584307178, + "loss": 0.5691, + "step": 274 + }, + { + "epoch": 9.65275444419396e-05, + "grad_norm": 0.36094945669174194, + "learning_rate": 0.00018203672787979968, + "loss": 0.5543, + "step": 275 + }, + { + "epoch": 9.687855369445575e-05, + "grad_norm": 0.386106014251709, + "learning_rate": 0.00018196994991652755, + "loss": 0.5561, + "step": 276 + }, + { + "epoch": 9.722956294697189e-05, + "grad_norm": 0.36676689982414246, + "learning_rate": 0.00018190317195325542, + "loss": 0.5479, + "step": 277 + }, + { + "epoch": 9.758057219948804e-05, + "grad_norm": 0.37988394498825073, + "learning_rate": 0.00018183639398998332, + "loss": 0.5772, + "step": 278 + }, + { + "epoch": 9.793158145200417e-05, + "grad_norm": 0.4024789035320282, + "learning_rate": 0.0001817696160267112, + "loss": 0.6065, + "step": 279 + }, + { + "epoch": 9.828259070452032e-05, + "grad_norm": 0.3697255551815033, + "learning_rate": 0.0001817028380634391, + "loss": 0.5021, + "step": 280 + }, + { + "epoch": 9.863359995703647e-05, + "grad_norm": 0.43579426407814026, + "learning_rate": 0.00018163606010016697, + "loss": 0.555, + "step": 281 + }, + { + "epoch": 9.898460920955261e-05, + "grad_norm": 0.4760832190513611, + "learning_rate": 0.00018156928213689484, + "loss": 0.6438, + "step": 282 + }, + { + "epoch": 9.933561846206876e-05, + "grad_norm": 0.45258408784866333, + "learning_rate": 0.00018150250417362272, + "loss": 0.4717, + "step": 283 + }, + { + "epoch": 9.96866277145849e-05, + "grad_norm": 0.428108274936676, + "learning_rate": 0.0001814357262103506, + "loss": 0.6029, + "step": 284 + }, + { + "epoch": 0.00010003763696710104, + "grad_norm": 0.3999852240085602, + "learning_rate": 0.00018136894824707846, + "loss": 0.4524, + "step": 285 + }, + { + "epoch": 0.0001003886462196172, + "grad_norm": 0.44319403171539307, + "learning_rate": 0.00018130217028380636, + "loss": 0.6619, + "step": 286 + }, + { + "epoch": 0.00010073965547213333, + "grad_norm": 0.43008357286453247, + "learning_rate": 0.00018123539232053424, + "loss": 0.6105, + "step": 287 + }, + { + "epoch": 0.00010109066472464948, + "grad_norm": 0.38037821650505066, + "learning_rate": 0.0001811686143572621, + "loss": 0.6649, + "step": 288 + }, + { + "epoch": 0.00010144167397716562, + "grad_norm": 0.3713517487049103, + "learning_rate": 0.00018110183639398998, + "loss": 0.6381, + "step": 289 + }, + { + "epoch": 0.00010179268322968176, + "grad_norm": 0.3437170386314392, + "learning_rate": 0.00018103505843071786, + "loss": 0.4563, + "step": 290 + }, + { + "epoch": 0.00010214369248219791, + "grad_norm": 0.3661468029022217, + "learning_rate": 0.00018096828046744576, + "loss": 0.606, + "step": 291 + }, + { + "epoch": 0.00010249470173471405, + "grad_norm": 0.36346200108528137, + "learning_rate": 0.00018090150250417363, + "loss": 0.5895, + "step": 292 + }, + { + "epoch": 0.0001028457109872302, + "grad_norm": 0.31052225828170776, + "learning_rate": 0.0001808347245409015, + "loss": 0.4409, + "step": 293 + }, + { + "epoch": 0.00010319672023974634, + "grad_norm": 0.37012970447540283, + "learning_rate": 0.00018076794657762938, + "loss": 0.505, + "step": 294 + }, + { + "epoch": 0.00010354772949226248, + "grad_norm": 0.3958667814731598, + "learning_rate": 0.00018070116861435728, + "loss": 0.5371, + "step": 295 + }, + { + "epoch": 0.00010389873874477863, + "grad_norm": 0.4892179071903229, + "learning_rate": 0.00018063439065108515, + "loss": 0.6737, + "step": 296 + }, + { + "epoch": 0.00010424974799729477, + "grad_norm": 0.41874751448631287, + "learning_rate": 0.00018056761268781305, + "loss": 0.651, + "step": 297 + }, + { + "epoch": 0.00010460075724981092, + "grad_norm": 0.4167911410331726, + "learning_rate": 0.00018050083472454092, + "loss": 0.5531, + "step": 298 + }, + { + "epoch": 0.00010495176650232706, + "grad_norm": 0.3758225440979004, + "learning_rate": 0.0001804340567612688, + "loss": 0.6285, + "step": 299 + }, + { + "epoch": 0.0001053027757548432, + "grad_norm": 0.3688598573207855, + "learning_rate": 0.00018036727879799667, + "loss": 0.5219, + "step": 300 + }, + { + "epoch": 0.00010565378500735934, + "grad_norm": 0.3501751124858856, + "learning_rate": 0.00018030050083472454, + "loss": 0.6351, + "step": 301 + }, + { + "epoch": 0.00010600479425987549, + "grad_norm": 0.42876511812210083, + "learning_rate": 0.00018023372287145244, + "loss": 0.544, + "step": 302 + }, + { + "epoch": 0.00010635580351239164, + "grad_norm": 0.47046172618865967, + "learning_rate": 0.00018016694490818031, + "loss": 0.6304, + "step": 303 + }, + { + "epoch": 0.00010670681276490778, + "grad_norm": 0.402271032333374, + "learning_rate": 0.0001801001669449082, + "loss": 0.5039, + "step": 304 + }, + { + "epoch": 0.00010705782201742393, + "grad_norm": 0.41232413053512573, + "learning_rate": 0.00018003338898163606, + "loss": 0.5892, + "step": 305 + }, + { + "epoch": 0.00010740883126994006, + "grad_norm": 0.3628154993057251, + "learning_rate": 0.00017996661101836393, + "loss": 0.5737, + "step": 306 + }, + { + "epoch": 0.00010775984052245621, + "grad_norm": 0.4291020631790161, + "learning_rate": 0.00017989983305509183, + "loss": 0.6597, + "step": 307 + }, + { + "epoch": 0.00010811084977497236, + "grad_norm": 0.33218181133270264, + "learning_rate": 0.0001798330550918197, + "loss": 0.5726, + "step": 308 + }, + { + "epoch": 0.0001084618590274885, + "grad_norm": 0.3439387381076813, + "learning_rate": 0.00017976627712854758, + "loss": 0.5615, + "step": 309 + }, + { + "epoch": 0.00010881286828000465, + "grad_norm": 0.3523644208908081, + "learning_rate": 0.00017969949916527545, + "loss": 0.4968, + "step": 310 + }, + { + "epoch": 0.00010916387753252078, + "grad_norm": 0.4045630991458893, + "learning_rate": 0.00017963272120200333, + "loss": 0.6425, + "step": 311 + }, + { + "epoch": 0.00010951488678503693, + "grad_norm": 0.3726767599582672, + "learning_rate": 0.00017956594323873123, + "loss": 0.6575, + "step": 312 + }, + { + "epoch": 0.00010986589603755308, + "grad_norm": 0.32131972908973694, + "learning_rate": 0.0001794991652754591, + "loss": 0.5146, + "step": 313 + }, + { + "epoch": 0.00011021690529006922, + "grad_norm": 0.5013764500617981, + "learning_rate": 0.000179432387312187, + "loss": 0.53, + "step": 314 + }, + { + "epoch": 0.00011056791454258537, + "grad_norm": 0.36830246448516846, + "learning_rate": 0.00017936560934891487, + "loss": 0.6291, + "step": 315 + }, + { + "epoch": 0.0001109189237951015, + "grad_norm": 0.3587378263473511, + "learning_rate": 0.00017929883138564275, + "loss": 0.4954, + "step": 316 + }, + { + "epoch": 0.00011126993304761765, + "grad_norm": 0.3480195105075836, + "learning_rate": 0.00017923205342237062, + "loss": 0.606, + "step": 317 + }, + { + "epoch": 0.0001116209423001338, + "grad_norm": 0.38415858149528503, + "learning_rate": 0.00017916527545909852, + "loss": 0.7281, + "step": 318 + }, + { + "epoch": 0.00011197195155264994, + "grad_norm": 0.35853826999664307, + "learning_rate": 0.0001790984974958264, + "loss": 0.5851, + "step": 319 + }, + { + "epoch": 0.00011232296080516609, + "grad_norm": 0.42092210054397583, + "learning_rate": 0.00017903171953255427, + "loss": 0.5324, + "step": 320 + }, + { + "epoch": 0.00011267397005768222, + "grad_norm": 0.34538987278938293, + "learning_rate": 0.00017896494156928214, + "loss": 0.6387, + "step": 321 + }, + { + "epoch": 0.00011302497931019837, + "grad_norm": 0.38299745321273804, + "learning_rate": 0.00017889816360601, + "loss": 0.6013, + "step": 322 + }, + { + "epoch": 0.00011337598856271452, + "grad_norm": 0.32100436091423035, + "learning_rate": 0.0001788313856427379, + "loss": 0.4627, + "step": 323 + }, + { + "epoch": 0.00011372699781523066, + "grad_norm": 0.3458426594734192, + "learning_rate": 0.0001787646076794658, + "loss": 0.5865, + "step": 324 + }, + { + "epoch": 0.0001140780070677468, + "grad_norm": 0.33228665590286255, + "learning_rate": 0.00017869782971619366, + "loss": 0.4611, + "step": 325 + }, + { + "epoch": 0.00011442901632026294, + "grad_norm": 0.38747021555900574, + "learning_rate": 0.00017863105175292153, + "loss": 0.5777, + "step": 326 + }, + { + "epoch": 0.00011478002557277909, + "grad_norm": 0.3888608515262604, + "learning_rate": 0.0001785642737896494, + "loss": 0.5664, + "step": 327 + }, + { + "epoch": 0.00011513103482529524, + "grad_norm": 0.4084737002849579, + "learning_rate": 0.0001784974958263773, + "loss": 0.5939, + "step": 328 + }, + { + "epoch": 0.00011548204407781138, + "grad_norm": 0.4964492917060852, + "learning_rate": 0.00017843071786310518, + "loss": 0.6256, + "step": 329 + }, + { + "epoch": 0.00011583305333032753, + "grad_norm": 0.37329745292663574, + "learning_rate": 0.00017836393989983305, + "loss": 0.5388, + "step": 330 + }, + { + "epoch": 0.00011618406258284366, + "grad_norm": 0.37680140137672424, + "learning_rate": 0.00017829716193656095, + "loss": 0.6203, + "step": 331 + }, + { + "epoch": 0.00011653507183535981, + "grad_norm": 0.4162957966327667, + "learning_rate": 0.00017823038397328883, + "loss": 0.6478, + "step": 332 + }, + { + "epoch": 0.00011688608108787596, + "grad_norm": 0.3473896086215973, + "learning_rate": 0.0001781636060100167, + "loss": 0.589, + "step": 333 + }, + { + "epoch": 0.0001172370903403921, + "grad_norm": 0.4039511978626251, + "learning_rate": 0.0001780968280467446, + "loss": 0.5681, + "step": 334 + }, + { + "epoch": 0.00011758809959290825, + "grad_norm": 0.3135715425014496, + "learning_rate": 0.00017803005008347247, + "loss": 0.5069, + "step": 335 + }, + { + "epoch": 0.00011793910884542438, + "grad_norm": 0.4296559989452362, + "learning_rate": 0.00017796327212020035, + "loss": 0.5413, + "step": 336 + }, + { + "epoch": 0.00011829011809794053, + "grad_norm": 0.4197536110877991, + "learning_rate": 0.00017789649415692822, + "loss": 0.694, + "step": 337 + }, + { + "epoch": 0.00011864112735045668, + "grad_norm": 0.3633468449115753, + "learning_rate": 0.0001778297161936561, + "loss": 0.5475, + "step": 338 + }, + { + "epoch": 0.00011899213660297282, + "grad_norm": 0.2867147922515869, + "learning_rate": 0.000177762938230384, + "loss": 0.485, + "step": 339 + }, + { + "epoch": 0.00011934314585548897, + "grad_norm": 0.3445490300655365, + "learning_rate": 0.00017769616026711187, + "loss": 0.6304, + "step": 340 + }, + { + "epoch": 0.0001196941551080051, + "grad_norm": 0.31692221760749817, + "learning_rate": 0.00017762938230383974, + "loss": 0.5804, + "step": 341 + }, + { + "epoch": 0.00012004516436052125, + "grad_norm": 0.31391167640686035, + "learning_rate": 0.0001775626043405676, + "loss": 0.5945, + "step": 342 + }, + { + "epoch": 0.0001203961736130374, + "grad_norm": 0.3484472632408142, + "learning_rate": 0.00017749582637729548, + "loss": 0.6577, + "step": 343 + }, + { + "epoch": 0.00012074718286555354, + "grad_norm": 0.37430596351623535, + "learning_rate": 0.00017742904841402339, + "loss": 0.6854, + "step": 344 + }, + { + "epoch": 0.00012109819211806969, + "grad_norm": 0.34305211901664734, + "learning_rate": 0.00017736227045075126, + "loss": 0.5123, + "step": 345 + }, + { + "epoch": 0.00012144920137058582, + "grad_norm": 0.3398534059524536, + "learning_rate": 0.00017729549248747913, + "loss": 0.5602, + "step": 346 + }, + { + "epoch": 0.00012180021062310197, + "grad_norm": 0.4278014600276947, + "learning_rate": 0.000177228714524207, + "loss": 0.5152, + "step": 347 + }, + { + "epoch": 0.00012215121987561812, + "grad_norm": 0.4011085629463196, + "learning_rate": 0.0001771619365609349, + "loss": 0.6217, + "step": 348 + }, + { + "epoch": 0.00012250222912813427, + "grad_norm": 0.3425695598125458, + "learning_rate": 0.00017709515859766278, + "loss": 0.5037, + "step": 349 + }, + { + "epoch": 0.0001228532383806504, + "grad_norm": 0.34036242961883545, + "learning_rate": 0.00017702838063439068, + "loss": 0.649, + "step": 350 + }, + { + "epoch": 0.00012320424763316654, + "grad_norm": 0.5631874203681946, + "learning_rate": 0.00017696160267111855, + "loss": 0.5656, + "step": 351 + }, + { + "epoch": 0.0001235552568856827, + "grad_norm": 0.4195176661014557, + "learning_rate": 0.00017689482470784642, + "loss": 0.6899, + "step": 352 + }, + { + "epoch": 0.00012390626613819884, + "grad_norm": 0.41814154386520386, + "learning_rate": 0.0001768280467445743, + "loss": 0.551, + "step": 353 + }, + { + "epoch": 0.000124257275390715, + "grad_norm": 0.3374340534210205, + "learning_rate": 0.00017676126878130217, + "loss": 0.7022, + "step": 354 + }, + { + "epoch": 0.00012460828464323112, + "grad_norm": 0.41464921832084656, + "learning_rate": 0.00017669449081803007, + "loss": 0.5301, + "step": 355 + }, + { + "epoch": 0.00012495929389574726, + "grad_norm": 0.4443178176879883, + "learning_rate": 0.00017662771285475794, + "loss": 0.5487, + "step": 356 + }, + { + "epoch": 0.00012531030314826341, + "grad_norm": 0.3389272093772888, + "learning_rate": 0.00017656093489148582, + "loss": 0.581, + "step": 357 + }, + { + "epoch": 0.00012566131240077956, + "grad_norm": 0.29650986194610596, + "learning_rate": 0.0001764941569282137, + "loss": 0.5801, + "step": 358 + }, + { + "epoch": 0.0001260123216532957, + "grad_norm": 0.40271905064582825, + "learning_rate": 0.00017642737896494156, + "loss": 0.6738, + "step": 359 + }, + { + "epoch": 0.00012636333090581184, + "grad_norm": 0.352225661277771, + "learning_rate": 0.00017636060100166946, + "loss": 0.5727, + "step": 360 + }, + { + "epoch": 0.00012671434015832798, + "grad_norm": 0.3469563126564026, + "learning_rate": 0.00017629382303839734, + "loss": 0.5188, + "step": 361 + }, + { + "epoch": 0.00012706534941084413, + "grad_norm": 0.30644670128822327, + "learning_rate": 0.0001762270450751252, + "loss": 0.497, + "step": 362 + }, + { + "epoch": 0.00012741635866336028, + "grad_norm": 0.3472917377948761, + "learning_rate": 0.00017616026711185308, + "loss": 0.6363, + "step": 363 + }, + { + "epoch": 0.00012776736791587643, + "grad_norm": 0.37184756994247437, + "learning_rate": 0.00017609348914858096, + "loss": 0.5223, + "step": 364 + }, + { + "epoch": 0.00012811837716839256, + "grad_norm": 0.3247138559818268, + "learning_rate": 0.00017602671118530886, + "loss": 0.5457, + "step": 365 + }, + { + "epoch": 0.0001284693864209087, + "grad_norm": 0.5236158967018127, + "learning_rate": 0.00017595993322203673, + "loss": 0.615, + "step": 366 + }, + { + "epoch": 0.00012882039567342485, + "grad_norm": 0.33708465099334717, + "learning_rate": 0.00017589315525876463, + "loss": 0.6163, + "step": 367 + }, + { + "epoch": 0.000129171404925941, + "grad_norm": 0.33848705887794495, + "learning_rate": 0.0001758263772954925, + "loss": 0.4229, + "step": 368 + }, + { + "epoch": 0.00012952241417845715, + "grad_norm": 0.5827682018280029, + "learning_rate": 0.00017575959933222038, + "loss": 0.5668, + "step": 369 + }, + { + "epoch": 0.00012987342343097328, + "grad_norm": 0.36217448115348816, + "learning_rate": 0.00017569282136894825, + "loss": 0.4983, + "step": 370 + }, + { + "epoch": 0.00013022443268348943, + "grad_norm": 0.329414963722229, + "learning_rate": 0.00017562604340567615, + "loss": 0.4281, + "step": 371 + }, + { + "epoch": 0.00013057544193600557, + "grad_norm": 0.36746612191200256, + "learning_rate": 0.00017555926544240402, + "loss": 0.6629, + "step": 372 + }, + { + "epoch": 0.00013092645118852172, + "grad_norm": 0.3954717516899109, + "learning_rate": 0.0001754924874791319, + "loss": 0.5784, + "step": 373 + }, + { + "epoch": 0.00013127746044103787, + "grad_norm": 0.41279932856559753, + "learning_rate": 0.00017542570951585977, + "loss": 0.5994, + "step": 374 + }, + { + "epoch": 0.000131628469693554, + "grad_norm": 0.3019951581954956, + "learning_rate": 0.00017535893155258764, + "loss": 0.5584, + "step": 375 + }, + { + "epoch": 0.00013197947894607015, + "grad_norm": 0.3079768121242523, + "learning_rate": 0.00017529215358931554, + "loss": 0.5904, + "step": 376 + }, + { + "epoch": 0.0001323304881985863, + "grad_norm": 0.5678027272224426, + "learning_rate": 0.00017522537562604342, + "loss": 0.6441, + "step": 377 + }, + { + "epoch": 0.00013268149745110244, + "grad_norm": 0.38624581694602966, + "learning_rate": 0.0001751585976627713, + "loss": 0.5582, + "step": 378 + }, + { + "epoch": 0.0001330325067036186, + "grad_norm": 0.4368002712726593, + "learning_rate": 0.00017509181969949916, + "loss": 0.686, + "step": 379 + }, + { + "epoch": 0.00013338351595613472, + "grad_norm": 0.3409269154071808, + "learning_rate": 0.00017502504173622704, + "loss": 0.582, + "step": 380 + }, + { + "epoch": 0.00013373452520865087, + "grad_norm": 0.3772698938846588, + "learning_rate": 0.0001749582637729549, + "loss": 0.5314, + "step": 381 + }, + { + "epoch": 0.00013408553446116702, + "grad_norm": 0.3791707158088684, + "learning_rate": 0.0001748914858096828, + "loss": 0.6143, + "step": 382 + }, + { + "epoch": 0.00013443654371368317, + "grad_norm": 0.4441101551055908, + "learning_rate": 0.0001748247078464107, + "loss": 0.5726, + "step": 383 + }, + { + "epoch": 0.0001347875529661993, + "grad_norm": 0.4160211980342865, + "learning_rate": 0.00017475792988313858, + "loss": 0.6003, + "step": 384 + }, + { + "epoch": 0.00013513856221871544, + "grad_norm": 0.41698628664016724, + "learning_rate": 0.00017469115191986646, + "loss": 0.4539, + "step": 385 + }, + { + "epoch": 0.00013548957147123159, + "grad_norm": 0.337007999420166, + "learning_rate": 0.00017462437395659433, + "loss": 0.5176, + "step": 386 + }, + { + "epoch": 0.00013584058072374774, + "grad_norm": 0.30926409363746643, + "learning_rate": 0.00017455759599332223, + "loss": 0.6072, + "step": 387 + }, + { + "epoch": 0.00013619158997626389, + "grad_norm": 0.3663052022457123, + "learning_rate": 0.0001744908180300501, + "loss": 0.538, + "step": 388 + }, + { + "epoch": 0.00013654259922878, + "grad_norm": 0.3410074710845947, + "learning_rate": 0.00017442404006677798, + "loss": 0.5687, + "step": 389 + }, + { + "epoch": 0.00013689360848129616, + "grad_norm": 0.5266095399856567, + "learning_rate": 0.00017435726210350585, + "loss": 0.6685, + "step": 390 + }, + { + "epoch": 0.0001372446177338123, + "grad_norm": 0.4020686149597168, + "learning_rate": 0.00017429048414023372, + "loss": 0.586, + "step": 391 + }, + { + "epoch": 0.00013759562698632846, + "grad_norm": 0.39995548129081726, + "learning_rate": 0.00017422370617696162, + "loss": 0.6958, + "step": 392 + }, + { + "epoch": 0.0001379466362388446, + "grad_norm": 0.4024721682071686, + "learning_rate": 0.0001741569282136895, + "loss": 0.6411, + "step": 393 + }, + { + "epoch": 0.00013829764549136073, + "grad_norm": 0.38193392753601074, + "learning_rate": 0.00017409015025041737, + "loss": 0.5857, + "step": 394 + }, + { + "epoch": 0.00013864865474387688, + "grad_norm": 0.39786526560783386, + "learning_rate": 0.00017402337228714524, + "loss": 0.5215, + "step": 395 + }, + { + "epoch": 0.00013899966399639303, + "grad_norm": 0.49223974347114563, + "learning_rate": 0.00017395659432387311, + "loss": 0.5881, + "step": 396 + }, + { + "epoch": 0.00013935067324890918, + "grad_norm": 0.3398894667625427, + "learning_rate": 0.00017388981636060101, + "loss": 0.5466, + "step": 397 + }, + { + "epoch": 0.00013970168250142533, + "grad_norm": 0.34891223907470703, + "learning_rate": 0.0001738230383973289, + "loss": 0.5901, + "step": 398 + }, + { + "epoch": 0.00014005269175394145, + "grad_norm": 0.47644108533859253, + "learning_rate": 0.00017375626043405676, + "loss": 0.5075, + "step": 399 + }, + { + "epoch": 0.0001404037010064576, + "grad_norm": 0.42530229687690735, + "learning_rate": 0.00017368948247078466, + "loss": 0.663, + "step": 400 + }, + { + "epoch": 0.00014075471025897375, + "grad_norm": 0.30858534574508667, + "learning_rate": 0.00017362270450751253, + "loss": 0.4724, + "step": 401 + }, + { + "epoch": 0.0001411057195114899, + "grad_norm": 0.42453449964523315, + "learning_rate": 0.0001735559265442404, + "loss": 0.6074, + "step": 402 + }, + { + "epoch": 0.00014145672876400605, + "grad_norm": 0.3964505195617676, + "learning_rate": 0.0001734891485809683, + "loss": 0.4913, + "step": 403 + }, + { + "epoch": 0.00014180773801652217, + "grad_norm": 0.3317703902721405, + "learning_rate": 0.00017342237061769618, + "loss": 0.5504, + "step": 404 + }, + { + "epoch": 0.00014215874726903832, + "grad_norm": 0.3912264108657837, + "learning_rate": 0.00017335559265442405, + "loss": 0.6301, + "step": 405 + }, + { + "epoch": 0.00014250975652155447, + "grad_norm": 0.3582877218723297, + "learning_rate": 0.00017328881469115193, + "loss": 0.6205, + "step": 406 + }, + { + "epoch": 0.00014286076577407062, + "grad_norm": 0.3691099286079407, + "learning_rate": 0.0001732220367278798, + "loss": 0.5348, + "step": 407 + }, + { + "epoch": 0.00014321177502658677, + "grad_norm": 0.35860803723335266, + "learning_rate": 0.0001731552587646077, + "loss": 0.6029, + "step": 408 + }, + { + "epoch": 0.0001435627842791029, + "grad_norm": 0.3640693426132202, + "learning_rate": 0.00017308848080133557, + "loss": 0.6673, + "step": 409 + }, + { + "epoch": 0.00014391379353161904, + "grad_norm": 0.3550623953342438, + "learning_rate": 0.00017302170283806345, + "loss": 0.4659, + "step": 410 + }, + { + "epoch": 0.0001442648027841352, + "grad_norm": 0.45885637402534485, + "learning_rate": 0.00017295492487479132, + "loss": 0.4781, + "step": 411 + }, + { + "epoch": 0.00014461581203665134, + "grad_norm": 0.3703556954860687, + "learning_rate": 0.0001728881469115192, + "loss": 0.4829, + "step": 412 + }, + { + "epoch": 0.0001449668212891675, + "grad_norm": 0.5436837077140808, + "learning_rate": 0.0001728213689482471, + "loss": 0.6056, + "step": 413 + }, + { + "epoch": 0.0001453178305416836, + "grad_norm": 0.3953244686126709, + "learning_rate": 0.00017275459098497497, + "loss": 0.4884, + "step": 414 + }, + { + "epoch": 0.00014566883979419976, + "grad_norm": 0.34003904461860657, + "learning_rate": 0.00017268781302170284, + "loss": 0.6014, + "step": 415 + }, + { + "epoch": 0.0001460198490467159, + "grad_norm": 0.3463648557662964, + "learning_rate": 0.0001726210350584307, + "loss": 0.603, + "step": 416 + }, + { + "epoch": 0.00014637085829923206, + "grad_norm": 0.4293590784072876, + "learning_rate": 0.0001725542570951586, + "loss": 0.6686, + "step": 417 + }, + { + "epoch": 0.0001467218675517482, + "grad_norm": 0.4243469834327698, + "learning_rate": 0.0001724874791318865, + "loss": 0.6422, + "step": 418 + }, + { + "epoch": 0.00014707287680426433, + "grad_norm": 0.38327839970588684, + "learning_rate": 0.0001724207011686144, + "loss": 0.5595, + "step": 419 + }, + { + "epoch": 0.00014742388605678048, + "grad_norm": 0.31334301829338074, + "learning_rate": 0.00017235392320534226, + "loss": 0.474, + "step": 420 + }, + { + "epoch": 0.00014777489530929663, + "grad_norm": 0.3335350453853607, + "learning_rate": 0.00017228714524207013, + "loss": 0.6172, + "step": 421 + }, + { + "epoch": 0.00014812590456181278, + "grad_norm": 0.373696506023407, + "learning_rate": 0.000172220367278798, + "loss": 0.6183, + "step": 422 + }, + { + "epoch": 0.00014847691381432893, + "grad_norm": 0.45814886689186096, + "learning_rate": 0.00017215358931552588, + "loss": 0.5059, + "step": 423 + }, + { + "epoch": 0.00014882792306684505, + "grad_norm": 0.3578277826309204, + "learning_rate": 0.00017208681135225378, + "loss": 0.5771, + "step": 424 + }, + { + "epoch": 0.0001491789323193612, + "grad_norm": 0.42081883549690247, + "learning_rate": 0.00017202003338898165, + "loss": 0.5604, + "step": 425 + }, + { + "epoch": 0.00014952994157187735, + "grad_norm": 0.3173503875732422, + "learning_rate": 0.00017195325542570953, + "loss": 0.5738, + "step": 426 + }, + { + "epoch": 0.0001498809508243935, + "grad_norm": 0.38292011618614197, + "learning_rate": 0.0001718864774624374, + "loss": 0.6067, + "step": 427 + }, + { + "epoch": 0.00015023196007690965, + "grad_norm": 0.3518977463245392, + "learning_rate": 0.00017181969949916527, + "loss": 0.5073, + "step": 428 + }, + { + "epoch": 0.00015058296932942577, + "grad_norm": 0.5157706141471863, + "learning_rate": 0.00017175292153589317, + "loss": 0.5496, + "step": 429 + }, + { + "epoch": 0.00015093397858194192, + "grad_norm": 0.32064110040664673, + "learning_rate": 0.00017168614357262105, + "loss": 0.4766, + "step": 430 + }, + { + "epoch": 0.00015128498783445807, + "grad_norm": 0.42229798436164856, + "learning_rate": 0.00017161936560934892, + "loss": 0.5953, + "step": 431 + }, + { + "epoch": 0.00015163599708697422, + "grad_norm": 0.4723895192146301, + "learning_rate": 0.0001715525876460768, + "loss": 0.4783, + "step": 432 + }, + { + "epoch": 0.00015198700633949037, + "grad_norm": 0.3841445744037628, + "learning_rate": 0.00017148580968280467, + "loss": 0.5003, + "step": 433 + }, + { + "epoch": 0.0001523380155920065, + "grad_norm": 0.38026461005210876, + "learning_rate": 0.00017141903171953257, + "loss": 0.5093, + "step": 434 + }, + { + "epoch": 0.00015268902484452264, + "grad_norm": 0.37034904956817627, + "learning_rate": 0.00017135225375626044, + "loss": 0.6158, + "step": 435 + }, + { + "epoch": 0.0001530400340970388, + "grad_norm": 0.3876091241836548, + "learning_rate": 0.00017128547579298834, + "loss": 0.5287, + "step": 436 + }, + { + "epoch": 0.00015339104334955494, + "grad_norm": 0.30055519938468933, + "learning_rate": 0.0001712186978297162, + "loss": 0.5018, + "step": 437 + }, + { + "epoch": 0.0001537420526020711, + "grad_norm": 0.36094966530799866, + "learning_rate": 0.00017115191986644409, + "loss": 0.4961, + "step": 438 + }, + { + "epoch": 0.0001540930618545872, + "grad_norm": 0.3300524055957794, + "learning_rate": 0.00017108514190317196, + "loss": 0.5246, + "step": 439 + }, + { + "epoch": 0.00015444407110710336, + "grad_norm": 0.40980783104896545, + "learning_rate": 0.00017101836393989986, + "loss": 0.5705, + "step": 440 + }, + { + "epoch": 0.0001547950803596195, + "grad_norm": 0.3442326784133911, + "learning_rate": 0.00017095158597662773, + "loss": 0.5595, + "step": 441 + }, + { + "epoch": 0.00015514608961213566, + "grad_norm": 0.48015034198760986, + "learning_rate": 0.0001708848080133556, + "loss": 0.5642, + "step": 442 + }, + { + "epoch": 0.0001554970988646518, + "grad_norm": 0.5570142269134521, + "learning_rate": 0.00017081803005008348, + "loss": 0.6111, + "step": 443 + }, + { + "epoch": 0.00015584810811716793, + "grad_norm": 0.30470094084739685, + "learning_rate": 0.00017075125208681135, + "loss": 0.5151, + "step": 444 + }, + { + "epoch": 0.00015619911736968408, + "grad_norm": 0.31946614384651184, + "learning_rate": 0.00017068447412353925, + "loss": 0.5265, + "step": 445 + }, + { + "epoch": 0.00015655012662220023, + "grad_norm": 0.38980719447135925, + "learning_rate": 0.00017061769616026712, + "loss": 0.575, + "step": 446 + }, + { + "epoch": 0.00015690113587471638, + "grad_norm": 0.4077732264995575, + "learning_rate": 0.000170550918196995, + "loss": 0.5729, + "step": 447 + }, + { + "epoch": 0.00015725214512723253, + "grad_norm": 0.38632732629776, + "learning_rate": 0.00017048414023372287, + "loss": 0.594, + "step": 448 + }, + { + "epoch": 0.00015760315437974865, + "grad_norm": 0.37193921208381653, + "learning_rate": 0.00017041736227045074, + "loss": 0.6062, + "step": 449 + }, + { + "epoch": 0.0001579541636322648, + "grad_norm": 0.399029016494751, + "learning_rate": 0.00017035058430717862, + "loss": 0.4538, + "step": 450 + }, + { + "epoch": 0.00015830517288478095, + "grad_norm": 0.37710487842559814, + "learning_rate": 0.00017028380634390652, + "loss": 0.5615, + "step": 451 + }, + { + "epoch": 0.0001586561821372971, + "grad_norm": 0.38591668009757996, + "learning_rate": 0.0001702170283806344, + "loss": 0.5316, + "step": 452 + }, + { + "epoch": 0.00015900719138981325, + "grad_norm": 0.3453538417816162, + "learning_rate": 0.0001701502504173623, + "loss": 0.4645, + "step": 453 + }, + { + "epoch": 0.00015935820064232937, + "grad_norm": 0.34171512722969055, + "learning_rate": 0.00017008347245409016, + "loss": 0.5856, + "step": 454 + }, + { + "epoch": 0.00015970920989484552, + "grad_norm": 0.39591720700263977, + "learning_rate": 0.00017001669449081804, + "loss": 0.573, + "step": 455 + }, + { + "epoch": 0.00016006021914736167, + "grad_norm": 0.4127822816371918, + "learning_rate": 0.00016994991652754594, + "loss": 0.5183, + "step": 456 + }, + { + "epoch": 0.00016041122839987782, + "grad_norm": 0.37893375754356384, + "learning_rate": 0.0001698831385642738, + "loss": 0.566, + "step": 457 + }, + { + "epoch": 0.00016076223765239397, + "grad_norm": 0.33429333567619324, + "learning_rate": 0.00016981636060100168, + "loss": 0.449, + "step": 458 + }, + { + "epoch": 0.0001611132469049101, + "grad_norm": 0.3333180546760559, + "learning_rate": 0.00016974958263772956, + "loss": 0.4441, + "step": 459 + }, + { + "epoch": 0.00016146425615742624, + "grad_norm": 0.3591359257698059, + "learning_rate": 0.00016968280467445743, + "loss": 0.55, + "step": 460 + }, + { + "epoch": 0.0001618152654099424, + "grad_norm": 0.35390427708625793, + "learning_rate": 0.00016961602671118533, + "loss": 0.6445, + "step": 461 + }, + { + "epoch": 0.00016216627466245854, + "grad_norm": 0.42036697268486023, + "learning_rate": 0.0001695492487479132, + "loss": 0.5411, + "step": 462 + }, + { + "epoch": 0.0001625172839149747, + "grad_norm": 0.42147770524024963, + "learning_rate": 0.00016948247078464108, + "loss": 0.6218, + "step": 463 + }, + { + "epoch": 0.0001628682931674908, + "grad_norm": 0.3960399329662323, + "learning_rate": 0.00016941569282136895, + "loss": 0.6608, + "step": 464 + }, + { + "epoch": 0.00016321930242000696, + "grad_norm": 0.39676985144615173, + "learning_rate": 0.00016934891485809682, + "loss": 0.5838, + "step": 465 + }, + { + "epoch": 0.0001635703116725231, + "grad_norm": 0.2839520573616028, + "learning_rate": 0.0001692821368948247, + "loss": 0.5334, + "step": 466 + }, + { + "epoch": 0.00016392132092503926, + "grad_norm": 0.3654347062110901, + "learning_rate": 0.0001692153589315526, + "loss": 0.6065, + "step": 467 + }, + { + "epoch": 0.0001642723301775554, + "grad_norm": 0.3709166646003723, + "learning_rate": 0.00016914858096828047, + "loss": 0.509, + "step": 468 + }, + { + "epoch": 0.00016462333943007153, + "grad_norm": 0.29224780201911926, + "learning_rate": 0.00016908180300500834, + "loss": 0.5372, + "step": 469 + }, + { + "epoch": 0.00016497434868258768, + "grad_norm": 0.34979283809661865, + "learning_rate": 0.00016901502504173624, + "loss": 0.3968, + "step": 470 + }, + { + "epoch": 0.00016532535793510383, + "grad_norm": 0.34580183029174805, + "learning_rate": 0.00016894824707846412, + "loss": 0.6032, + "step": 471 + }, + { + "epoch": 0.00016567636718761998, + "grad_norm": 0.39046213030815125, + "learning_rate": 0.00016888146911519202, + "loss": 0.5628, + "step": 472 + }, + { + "epoch": 0.00016602737644013613, + "grad_norm": 0.35301411151885986, + "learning_rate": 0.0001688146911519199, + "loss": 0.607, + "step": 473 + }, + { + "epoch": 0.00016637838569265225, + "grad_norm": 0.4572748839855194, + "learning_rate": 0.00016874791318864776, + "loss": 0.5018, + "step": 474 + }, + { + "epoch": 0.0001667293949451684, + "grad_norm": 0.38230374455451965, + "learning_rate": 0.00016868113522537564, + "loss": 0.5026, + "step": 475 + }, + { + "epoch": 0.00016708040419768455, + "grad_norm": 0.37066343426704407, + "learning_rate": 0.0001686143572621035, + "loss": 0.5819, + "step": 476 + }, + { + "epoch": 0.0001674314134502007, + "grad_norm": 0.3658660054206848, + "learning_rate": 0.0001685475792988314, + "loss": 0.6825, + "step": 477 + }, + { + "epoch": 0.00016778242270271685, + "grad_norm": 0.42174890637397766, + "learning_rate": 0.00016848080133555928, + "loss": 0.6065, + "step": 478 + }, + { + "epoch": 0.00016813343195523297, + "grad_norm": 0.3462882936000824, + "learning_rate": 0.00016841402337228716, + "loss": 0.5888, + "step": 479 + }, + { + "epoch": 0.00016848444120774912, + "grad_norm": 0.44681960344314575, + "learning_rate": 0.00016834724540901503, + "loss": 0.4987, + "step": 480 + }, + { + "epoch": 0.00016883545046026527, + "grad_norm": 0.3535650372505188, + "learning_rate": 0.0001682804674457429, + "loss": 0.6478, + "step": 481 + }, + { + "epoch": 0.00016918645971278142, + "grad_norm": 0.3357018232345581, + "learning_rate": 0.00016821368948247077, + "loss": 0.4949, + "step": 482 + }, + { + "epoch": 0.00016953746896529757, + "grad_norm": 0.42756739258766174, + "learning_rate": 0.00016814691151919868, + "loss": 0.6475, + "step": 483 + }, + { + "epoch": 0.0001698884782178137, + "grad_norm": 0.36174866557121277, + "learning_rate": 0.00016808013355592655, + "loss": 0.598, + "step": 484 + }, + { + "epoch": 0.00017023948747032984, + "grad_norm": 0.37115278840065, + "learning_rate": 0.00016801335559265442, + "loss": 0.6215, + "step": 485 + }, + { + "epoch": 0.000170590496722846, + "grad_norm": 0.340249627828598, + "learning_rate": 0.0001679465776293823, + "loss": 0.5702, + "step": 486 + }, + { + "epoch": 0.00017094150597536214, + "grad_norm": 0.31226348876953125, + "learning_rate": 0.0001678797996661102, + "loss": 0.6531, + "step": 487 + }, + { + "epoch": 0.0001712925152278783, + "grad_norm": 0.35571998357772827, + "learning_rate": 0.00016781302170283807, + "loss": 0.6406, + "step": 488 + }, + { + "epoch": 0.00017164352448039441, + "grad_norm": 0.4167378842830658, + "learning_rate": 0.00016774624373956597, + "loss": 0.5111, + "step": 489 + }, + { + "epoch": 0.00017199453373291056, + "grad_norm": 0.292304128408432, + "learning_rate": 0.00016767946577629384, + "loss": 0.6643, + "step": 490 + }, + { + "epoch": 0.0001723455429854267, + "grad_norm": 0.38789069652557373, + "learning_rate": 0.00016761268781302171, + "loss": 0.4542, + "step": 491 + }, + { + "epoch": 0.00017269655223794286, + "grad_norm": 0.33764714002609253, + "learning_rate": 0.0001675459098497496, + "loss": 0.4158, + "step": 492 + }, + { + "epoch": 0.00017304756149045898, + "grad_norm": 0.34849148988723755, + "learning_rate": 0.0001674791318864775, + "loss": 0.4737, + "step": 493 + }, + { + "epoch": 0.00017339857074297513, + "grad_norm": 0.2921352684497833, + "learning_rate": 0.00016741235392320536, + "loss": 0.679, + "step": 494 + }, + { + "epoch": 0.00017374957999549128, + "grad_norm": 0.33746641874313354, + "learning_rate": 0.00016734557595993323, + "loss": 0.4957, + "step": 495 + }, + { + "epoch": 0.00017410058924800743, + "grad_norm": 0.4029395878314972, + "learning_rate": 0.0001672787979966611, + "loss": 0.6708, + "step": 496 + }, + { + "epoch": 0.00017445159850052358, + "grad_norm": 0.440033882856369, + "learning_rate": 0.00016721202003338898, + "loss": 0.5889, + "step": 497 + }, + { + "epoch": 0.0001748026077530397, + "grad_norm": 0.330692857503891, + "learning_rate": 0.00016714524207011685, + "loss": 0.5942, + "step": 498 + }, + { + "epoch": 0.00017515361700555585, + "grad_norm": 0.3111809492111206, + "learning_rate": 0.00016707846410684475, + "loss": 0.5506, + "step": 499 + }, + { + "epoch": 0.000175504626258072, + "grad_norm": 0.38885676860809326, + "learning_rate": 0.00016701168614357263, + "loss": 0.4713, + "step": 500 + }, + { + "epoch": 0.00017585563551058815, + "grad_norm": 0.3697550296783447, + "learning_rate": 0.0001669449081803005, + "loss": 0.5955, + "step": 501 + }, + { + "epoch": 0.0001762066447631043, + "grad_norm": 0.35807061195373535, + "learning_rate": 0.00016687813021702837, + "loss": 0.555, + "step": 502 + }, + { + "epoch": 0.00017655765401562043, + "grad_norm": 0.44033464789390564, + "learning_rate": 0.00016681135225375625, + "loss": 0.5668, + "step": 503 + }, + { + "epoch": 0.00017690866326813657, + "grad_norm": 0.3363400399684906, + "learning_rate": 0.00016674457429048415, + "loss": 0.6176, + "step": 504 + }, + { + "epoch": 0.00017725967252065272, + "grad_norm": 0.31457507610321045, + "learning_rate": 0.00016667779632721202, + "loss": 0.6524, + "step": 505 + }, + { + "epoch": 0.00017761068177316887, + "grad_norm": 0.38115641474723816, + "learning_rate": 0.00016661101836393992, + "loss": 0.5848, + "step": 506 + }, + { + "epoch": 0.00017796169102568502, + "grad_norm": 0.3387603759765625, + "learning_rate": 0.0001665442404006678, + "loss": 0.6992, + "step": 507 + }, + { + "epoch": 0.00017831270027820115, + "grad_norm": 0.31671345233917236, + "learning_rate": 0.00016647746243739567, + "loss": 0.5744, + "step": 508 + }, + { + "epoch": 0.0001786637095307173, + "grad_norm": 0.3776471018791199, + "learning_rate": 0.00016641068447412357, + "loss": 0.622, + "step": 509 + }, + { + "epoch": 0.00017901471878323344, + "grad_norm": 0.37572941184043884, + "learning_rate": 0.00016634390651085144, + "loss": 0.5259, + "step": 510 + }, + { + "epoch": 0.0001793657280357496, + "grad_norm": 0.3335510194301605, + "learning_rate": 0.0001662771285475793, + "loss": 0.547, + "step": 511 + }, + { + "epoch": 0.00017971673728826574, + "grad_norm": 0.33241015672683716, + "learning_rate": 0.00016621035058430719, + "loss": 0.5827, + "step": 512 + }, + { + "epoch": 0.00018006774654078187, + "grad_norm": 0.3761122524738312, + "learning_rate": 0.00016614357262103506, + "loss": 0.6962, + "step": 513 + }, + { + "epoch": 0.00018041875579329802, + "grad_norm": 0.4172234833240509, + "learning_rate": 0.00016607679465776293, + "loss": 0.4922, + "step": 514 + }, + { + "epoch": 0.00018076976504581416, + "grad_norm": 0.45372599363327026, + "learning_rate": 0.00016601001669449083, + "loss": 0.5804, + "step": 515 + }, + { + "epoch": 0.00018112077429833031, + "grad_norm": 0.3854759931564331, + "learning_rate": 0.0001659432387312187, + "loss": 0.6026, + "step": 516 + }, + { + "epoch": 0.00018147178355084646, + "grad_norm": 0.3399171829223633, + "learning_rate": 0.00016587646076794658, + "loss": 0.4773, + "step": 517 + }, + { + "epoch": 0.00018182279280336259, + "grad_norm": 0.36649778485298157, + "learning_rate": 0.00016580968280467445, + "loss": 0.59, + "step": 518 + }, + { + "epoch": 0.00018217380205587874, + "grad_norm": 0.39988765120506287, + "learning_rate": 0.00016574290484140233, + "loss": 0.6094, + "step": 519 + }, + { + "epoch": 0.00018252481130839489, + "grad_norm": 0.34659436345100403, + "learning_rate": 0.00016567612687813023, + "loss": 0.4832, + "step": 520 + }, + { + "epoch": 0.00018287582056091103, + "grad_norm": 0.3742654025554657, + "learning_rate": 0.0001656093489148581, + "loss": 0.413, + "step": 521 + }, + { + "epoch": 0.00018322682981342718, + "grad_norm": 0.43068456649780273, + "learning_rate": 0.00016554257095158597, + "loss": 0.6576, + "step": 522 + }, + { + "epoch": 0.0001835778390659433, + "grad_norm": 0.42455193400382996, + "learning_rate": 0.00016547579298831387, + "loss": 0.5897, + "step": 523 + }, + { + "epoch": 0.00018392884831845946, + "grad_norm": 0.3290526568889618, + "learning_rate": 0.00016540901502504175, + "loss": 0.4022, + "step": 524 + }, + { + "epoch": 0.0001842798575709756, + "grad_norm": 0.3744141161441803, + "learning_rate": 0.00016534223706176965, + "loss": 0.5577, + "step": 525 + }, + { + "epoch": 0.00018463086682349176, + "grad_norm": 0.3516618609428406, + "learning_rate": 0.00016527545909849752, + "loss": 0.5481, + "step": 526 + }, + { + "epoch": 0.0001849818760760079, + "grad_norm": 0.3591526448726654, + "learning_rate": 0.0001652086811352254, + "loss": 0.6339, + "step": 527 + }, + { + "epoch": 0.00018533288532852403, + "grad_norm": 0.4024425745010376, + "learning_rate": 0.00016514190317195327, + "loss": 0.5268, + "step": 528 + }, + { + "epoch": 0.00018568389458104018, + "grad_norm": 0.3502136766910553, + "learning_rate": 0.00016507512520868114, + "loss": 0.5112, + "step": 529 + }, + { + "epoch": 0.00018603490383355633, + "grad_norm": 0.3338727056980133, + "learning_rate": 0.00016500834724540904, + "loss": 0.5623, + "step": 530 + }, + { + "epoch": 0.00018638591308607248, + "grad_norm": 0.43554845452308655, + "learning_rate": 0.0001649415692821369, + "loss": 0.5853, + "step": 531 + }, + { + "epoch": 0.00018673692233858862, + "grad_norm": 0.34424322843551636, + "learning_rate": 0.00016487479131886478, + "loss": 0.4951, + "step": 532 + }, + { + "epoch": 0.00018708793159110475, + "grad_norm": 0.4424237012863159, + "learning_rate": 0.00016480801335559266, + "loss": 0.4576, + "step": 533 + }, + { + "epoch": 0.0001874389408436209, + "grad_norm": 0.4616681933403015, + "learning_rate": 0.00016474123539232053, + "loss": 0.4974, + "step": 534 + }, + { + "epoch": 0.00018778995009613705, + "grad_norm": 0.3599206507205963, + "learning_rate": 0.0001646744574290484, + "loss": 0.5987, + "step": 535 + }, + { + "epoch": 0.0001881409593486532, + "grad_norm": 0.40468478202819824, + "learning_rate": 0.0001646076794657763, + "loss": 0.5914, + "step": 536 + }, + { + "epoch": 0.00018849196860116935, + "grad_norm": 0.5389227271080017, + "learning_rate": 0.00016454090150250418, + "loss": 0.6459, + "step": 537 + }, + { + "epoch": 0.00018884297785368547, + "grad_norm": 0.3493568003177643, + "learning_rate": 0.00016447412353923205, + "loss": 0.5191, + "step": 538 + }, + { + "epoch": 0.00018919398710620162, + "grad_norm": 0.31237804889678955, + "learning_rate": 0.00016440734557595992, + "loss": 0.4819, + "step": 539 + }, + { + "epoch": 0.00018954499635871777, + "grad_norm": 0.31142041087150574, + "learning_rate": 0.00016434056761268782, + "loss": 0.5659, + "step": 540 + }, + { + "epoch": 0.00018989600561123392, + "grad_norm": 0.3323245644569397, + "learning_rate": 0.0001642737896494157, + "loss": 0.5779, + "step": 541 + }, + { + "epoch": 0.00019024701486375007, + "grad_norm": 0.3679036498069763, + "learning_rate": 0.0001642070116861436, + "loss": 0.6919, + "step": 542 + }, + { + "epoch": 0.0001905980241162662, + "grad_norm": 0.3094903528690338, + "learning_rate": 0.00016414023372287147, + "loss": 0.4773, + "step": 543 + }, + { + "epoch": 0.00019094903336878234, + "grad_norm": 0.37995582818984985, + "learning_rate": 0.00016407345575959934, + "loss": 0.539, + "step": 544 + }, + { + "epoch": 0.0001913000426212985, + "grad_norm": 0.46415746212005615, + "learning_rate": 0.00016400667779632722, + "loss": 0.6708, + "step": 545 + }, + { + "epoch": 0.00019165105187381464, + "grad_norm": 0.3479398190975189, + "learning_rate": 0.00016393989983305512, + "loss": 0.5496, + "step": 546 + }, + { + "epoch": 0.00019200206112633079, + "grad_norm": 0.3740891218185425, + "learning_rate": 0.000163873121869783, + "loss": 0.6256, + "step": 547 + }, + { + "epoch": 0.0001923530703788469, + "grad_norm": 0.4934074878692627, + "learning_rate": 0.00016380634390651086, + "loss": 0.6788, + "step": 548 + }, + { + "epoch": 0.00019270407963136306, + "grad_norm": 0.42659157514572144, + "learning_rate": 0.00016373956594323874, + "loss": 0.5981, + "step": 549 + }, + { + "epoch": 0.0001930550888838792, + "grad_norm": 0.35727575421333313, + "learning_rate": 0.0001636727879799666, + "loss": 0.4095, + "step": 550 + }, + { + "epoch": 0.00019340609813639536, + "grad_norm": 0.4294300377368927, + "learning_rate": 0.00016360601001669448, + "loss": 0.5386, + "step": 551 + }, + { + "epoch": 0.0001937571073889115, + "grad_norm": 0.33482253551483154, + "learning_rate": 0.00016353923205342238, + "loss": 0.4901, + "step": 552 + }, + { + "epoch": 0.00019410811664142763, + "grad_norm": 0.3379746079444885, + "learning_rate": 0.00016347245409015026, + "loss": 0.5454, + "step": 553 + }, + { + "epoch": 0.00019445912589394378, + "grad_norm": 0.42393919825553894, + "learning_rate": 0.00016340567612687813, + "loss": 0.5959, + "step": 554 + }, + { + "epoch": 0.00019481013514645993, + "grad_norm": 0.31975501775741577, + "learning_rate": 0.000163338898163606, + "loss": 0.6048, + "step": 555 + }, + { + "epoch": 0.00019516114439897608, + "grad_norm": 0.43404972553253174, + "learning_rate": 0.00016327212020033388, + "loss": 0.6252, + "step": 556 + }, + { + "epoch": 0.00019551215365149223, + "grad_norm": 0.3559292256832123, + "learning_rate": 0.00016320534223706178, + "loss": 0.6036, + "step": 557 + }, + { + "epoch": 0.00019586316290400835, + "grad_norm": 0.3134891092777252, + "learning_rate": 0.00016313856427378965, + "loss": 0.5656, + "step": 558 + }, + { + "epoch": 0.0001962141721565245, + "grad_norm": 0.32056671380996704, + "learning_rate": 0.00016307178631051755, + "loss": 0.6509, + "step": 559 + }, + { + "epoch": 0.00019656518140904065, + "grad_norm": 0.46249130368232727, + "learning_rate": 0.00016300500834724542, + "loss": 0.6379, + "step": 560 + }, + { + "epoch": 0.0001969161906615568, + "grad_norm": 0.36366966366767883, + "learning_rate": 0.0001629382303839733, + "loss": 0.5334, + "step": 561 + }, + { + "epoch": 0.00019726719991407295, + "grad_norm": 0.4234124422073364, + "learning_rate": 0.0001628714524207012, + "loss": 0.4864, + "step": 562 + }, + { + "epoch": 0.00019761820916658907, + "grad_norm": 0.3687801659107208, + "learning_rate": 0.00016280467445742907, + "loss": 0.4855, + "step": 563 + }, + { + "epoch": 0.00019796921841910522, + "grad_norm": 0.37247028946876526, + "learning_rate": 0.00016273789649415694, + "loss": 0.6215, + "step": 564 + }, + { + "epoch": 0.00019832022767162137, + "grad_norm": 0.30445635318756104, + "learning_rate": 0.00016267111853088482, + "loss": 0.5741, + "step": 565 + }, + { + "epoch": 0.00019867123692413752, + "grad_norm": 0.3349187970161438, + "learning_rate": 0.0001626043405676127, + "loss": 0.4524, + "step": 566 + }, + { + "epoch": 0.00019902224617665367, + "grad_norm": 0.36938101053237915, + "learning_rate": 0.00016253756260434056, + "loss": 0.5046, + "step": 567 + }, + { + "epoch": 0.0001993732554291698, + "grad_norm": 0.37673529982566833, + "learning_rate": 0.00016247078464106846, + "loss": 0.5001, + "step": 568 + }, + { + "epoch": 0.00019972426468168594, + "grad_norm": 0.3571556508541107, + "learning_rate": 0.00016240400667779634, + "loss": 0.6419, + "step": 569 + }, + { + "epoch": 0.0002000752739342021, + "grad_norm": 0.35543423891067505, + "learning_rate": 0.0001623372287145242, + "loss": 0.6191, + "step": 570 + }, + { + "epoch": 0.00020042628318671824, + "grad_norm": 0.3096729516983032, + "learning_rate": 0.00016227045075125208, + "loss": 0.5373, + "step": 571 + }, + { + "epoch": 0.0002007772924392344, + "grad_norm": 0.30310383439064026, + "learning_rate": 0.00016220367278797996, + "loss": 0.558, + "step": 572 + }, + { + "epoch": 0.0002011283016917505, + "grad_norm": 0.3616211712360382, + "learning_rate": 0.00016213689482470786, + "loss": 0.6504, + "step": 573 + }, + { + "epoch": 0.00020147931094426666, + "grad_norm": 0.34818220138549805, + "learning_rate": 0.00016207011686143573, + "loss": 0.6136, + "step": 574 + }, + { + "epoch": 0.0002018303201967828, + "grad_norm": 0.36225444078445435, + "learning_rate": 0.0001620033388981636, + "loss": 0.4905, + "step": 575 + }, + { + "epoch": 0.00020218132944929896, + "grad_norm": 0.40039536356925964, + "learning_rate": 0.0001619365609348915, + "loss": 0.5997, + "step": 576 + }, + { + "epoch": 0.0002025323387018151, + "grad_norm": 0.33715930581092834, + "learning_rate": 0.00016186978297161938, + "loss": 0.5284, + "step": 577 + }, + { + "epoch": 0.00020288334795433123, + "grad_norm": 0.4137067198753357, + "learning_rate": 0.00016180300500834728, + "loss": 0.6873, + "step": 578 + }, + { + "epoch": 0.00020323435720684738, + "grad_norm": 0.41598305106163025, + "learning_rate": 0.00016173622704507515, + "loss": 0.491, + "step": 579 + }, + { + "epoch": 0.00020358536645936353, + "grad_norm": 0.5466423034667969, + "learning_rate": 0.00016166944908180302, + "loss": 0.6188, + "step": 580 + }, + { + "epoch": 0.00020393637571187968, + "grad_norm": 0.3718060851097107, + "learning_rate": 0.0001616026711185309, + "loss": 0.5573, + "step": 581 + }, + { + "epoch": 0.00020428738496439583, + "grad_norm": 0.33747225999832153, + "learning_rate": 0.00016153589315525877, + "loss": 0.4887, + "step": 582 + }, + { + "epoch": 0.00020463839421691195, + "grad_norm": 0.36478081345558167, + "learning_rate": 0.00016146911519198664, + "loss": 0.553, + "step": 583 + }, + { + "epoch": 0.0002049894034694281, + "grad_norm": 0.38441962003707886, + "learning_rate": 0.00016140233722871454, + "loss": 0.4833, + "step": 584 + }, + { + "epoch": 0.00020534041272194425, + "grad_norm": 0.45594358444213867, + "learning_rate": 0.00016133555926544241, + "loss": 0.5877, + "step": 585 + }, + { + "epoch": 0.0002056914219744604, + "grad_norm": 0.356517493724823, + "learning_rate": 0.0001612687813021703, + "loss": 0.5614, + "step": 586 + }, + { + "epoch": 0.00020604243122697655, + "grad_norm": 0.4051963686943054, + "learning_rate": 0.00016120200333889816, + "loss": 0.5208, + "step": 587 + }, + { + "epoch": 0.00020639344047949267, + "grad_norm": 0.36947959661483765, + "learning_rate": 0.00016113522537562603, + "loss": 0.4385, + "step": 588 + }, + { + "epoch": 0.00020674444973200882, + "grad_norm": 0.45947200059890747, + "learning_rate": 0.00016106844741235393, + "loss": 0.4972, + "step": 589 + }, + { + "epoch": 0.00020709545898452497, + "grad_norm": 0.40610602498054504, + "learning_rate": 0.0001610016694490818, + "loss": 0.4022, + "step": 590 + }, + { + "epoch": 0.00020744646823704112, + "grad_norm": 0.3529384732246399, + "learning_rate": 0.00016093489148580968, + "loss": 0.5222, + "step": 591 + }, + { + "epoch": 0.00020779747748955727, + "grad_norm": 0.35114821791648865, + "learning_rate": 0.00016086811352253755, + "loss": 0.6224, + "step": 592 + }, + { + "epoch": 0.0002081484867420734, + "grad_norm": 0.3596336841583252, + "learning_rate": 0.00016080133555926545, + "loss": 0.5081, + "step": 593 + }, + { + "epoch": 0.00020849949599458954, + "grad_norm": 0.4214174747467041, + "learning_rate": 0.00016073455759599333, + "loss": 0.5189, + "step": 594 + }, + { + "epoch": 0.0002088505052471057, + "grad_norm": 0.39635175466537476, + "learning_rate": 0.00016066777963272123, + "loss": 0.582, + "step": 595 + }, + { + "epoch": 0.00020920151449962184, + "grad_norm": 0.36160576343536377, + "learning_rate": 0.0001606010016694491, + "loss": 0.568, + "step": 596 + }, + { + "epoch": 0.000209552523752138, + "grad_norm": 0.4242927134037018, + "learning_rate": 0.00016053422370617697, + "loss": 0.6235, + "step": 597 + }, + { + "epoch": 0.0002099035330046541, + "grad_norm": 0.4257853925228119, + "learning_rate": 0.00016046744574290485, + "loss": 0.5294, + "step": 598 + }, + { + "epoch": 0.00021025454225717026, + "grad_norm": 0.3890500068664551, + "learning_rate": 0.00016040066777963272, + "loss": 0.6224, + "step": 599 + }, + { + "epoch": 0.0002106055515096864, + "grad_norm": 0.2971879541873932, + "learning_rate": 0.00016033388981636062, + "loss": 0.5951, + "step": 600 + }, + { + "epoch": 0.00021095656076220256, + "grad_norm": 0.29551970958709717, + "learning_rate": 0.0001602671118530885, + "loss": 0.6713, + "step": 601 + }, + { + "epoch": 0.00021130757001471868, + "grad_norm": 0.31588122248649597, + "learning_rate": 0.00016020033388981637, + "loss": 0.6384, + "step": 602 + }, + { + "epoch": 0.00021165857926723483, + "grad_norm": 0.3138657510280609, + "learning_rate": 0.00016013355592654424, + "loss": 0.5846, + "step": 603 + }, + { + "epoch": 0.00021200958851975098, + "grad_norm": 0.31286585330963135, + "learning_rate": 0.0001600667779632721, + "loss": 0.6236, + "step": 604 + }, + { + "epoch": 0.00021236059777226713, + "grad_norm": 0.32098105549812317, + "learning_rate": 0.00016, + "loss": 0.4926, + "step": 605 + }, + { + "epoch": 0.00021271160702478328, + "grad_norm": 0.371427446603775, + "learning_rate": 0.00015993322203672789, + "loss": 0.6205, + "step": 606 + }, + { + "epoch": 0.0002130626162772994, + "grad_norm": 0.28764042258262634, + "learning_rate": 0.00015986644407345576, + "loss": 0.449, + "step": 607 + }, + { + "epoch": 0.00021341362552981555, + "grad_norm": 0.35086238384246826, + "learning_rate": 0.00015979966611018363, + "loss": 0.549, + "step": 608 + }, + { + "epoch": 0.0002137646347823317, + "grad_norm": 0.3118048906326294, + "learning_rate": 0.0001597328881469115, + "loss": 0.6037, + "step": 609 + }, + { + "epoch": 0.00021411564403484785, + "grad_norm": 0.3894517123699188, + "learning_rate": 0.0001596661101836394, + "loss": 0.5989, + "step": 610 + }, + { + "epoch": 0.000214466653287364, + "grad_norm": 0.39642322063446045, + "learning_rate": 0.00015959933222036728, + "loss": 0.566, + "step": 611 + }, + { + "epoch": 0.00021481766253988012, + "grad_norm": 0.35333508253097534, + "learning_rate": 0.00015953255425709518, + "loss": 0.5055, + "step": 612 + }, + { + "epoch": 0.00021516867179239627, + "grad_norm": 0.39200490713119507, + "learning_rate": 0.00015946577629382305, + "loss": 0.5951, + "step": 613 + }, + { + "epoch": 0.00021551968104491242, + "grad_norm": 0.38436442613601685, + "learning_rate": 0.00015939899833055093, + "loss": 0.4876, + "step": 614 + }, + { + "epoch": 0.00021587069029742857, + "grad_norm": 0.3397504389286041, + "learning_rate": 0.0001593322203672788, + "loss": 0.6287, + "step": 615 + }, + { + "epoch": 0.00021622169954994472, + "grad_norm": 0.35870012640953064, + "learning_rate": 0.0001592654424040067, + "loss": 0.5857, + "step": 616 + }, + { + "epoch": 0.00021657270880246084, + "grad_norm": 0.31163597106933594, + "learning_rate": 0.00015919866444073457, + "loss": 0.4831, + "step": 617 + }, + { + "epoch": 0.000216923718054977, + "grad_norm": 0.35106539726257324, + "learning_rate": 0.00015913188647746245, + "loss": 0.5776, + "step": 618 + }, + { + "epoch": 0.00021727472730749314, + "grad_norm": 0.3639923334121704, + "learning_rate": 0.00015906510851419032, + "loss": 0.5039, + "step": 619 + }, + { + "epoch": 0.0002176257365600093, + "grad_norm": 0.3622918128967285, + "learning_rate": 0.0001589983305509182, + "loss": 0.6293, + "step": 620 + }, + { + "epoch": 0.00021797674581252544, + "grad_norm": 0.3899349868297577, + "learning_rate": 0.0001589315525876461, + "loss": 0.567, + "step": 621 + }, + { + "epoch": 0.00021832775506504156, + "grad_norm": 0.3834361732006073, + "learning_rate": 0.00015886477462437397, + "loss": 0.5106, + "step": 622 + }, + { + "epoch": 0.0002186787643175577, + "grad_norm": 0.34996962547302246, + "learning_rate": 0.00015879799666110184, + "loss": 0.5155, + "step": 623 + }, + { + "epoch": 0.00021902977357007386, + "grad_norm": 0.47908079624176025, + "learning_rate": 0.0001587312186978297, + "loss": 0.4529, + "step": 624 + }, + { + "epoch": 0.00021938078282259, + "grad_norm": 0.3167901635169983, + "learning_rate": 0.00015866444073455758, + "loss": 0.6075, + "step": 625 + }, + { + "epoch": 0.00021973179207510616, + "grad_norm": 0.4254927337169647, + "learning_rate": 0.00015859766277128548, + "loss": 0.6404, + "step": 626 + }, + { + "epoch": 0.00022008280132762228, + "grad_norm": 0.4317469000816345, + "learning_rate": 0.00015853088480801336, + "loss": 0.5881, + "step": 627 + }, + { + "epoch": 0.00022043381058013843, + "grad_norm": 0.4441644251346588, + "learning_rate": 0.00015846410684474123, + "loss": 0.5864, + "step": 628 + }, + { + "epoch": 0.00022078481983265458, + "grad_norm": 0.37883102893829346, + "learning_rate": 0.00015839732888146913, + "loss": 0.5664, + "step": 629 + }, + { + "epoch": 0.00022113582908517073, + "grad_norm": 0.35548868775367737, + "learning_rate": 0.000158330550918197, + "loss": 0.5712, + "step": 630 + }, + { + "epoch": 0.00022148683833768688, + "grad_norm": 0.31588616967201233, + "learning_rate": 0.00015826377295492488, + "loss": 0.4856, + "step": 631 + }, + { + "epoch": 0.000221837847590203, + "grad_norm": 0.3186424672603607, + "learning_rate": 0.00015819699499165278, + "loss": 0.542, + "step": 632 + }, + { + "epoch": 0.00022218885684271915, + "grad_norm": 0.41098466515541077, + "learning_rate": 0.00015813021702838065, + "loss": 0.6311, + "step": 633 + }, + { + "epoch": 0.0002225398660952353, + "grad_norm": 0.413401335477829, + "learning_rate": 0.00015806343906510852, + "loss": 0.5036, + "step": 634 + }, + { + "epoch": 0.00022289087534775145, + "grad_norm": 0.34203773736953735, + "learning_rate": 0.0001579966611018364, + "loss": 0.5508, + "step": 635 + }, + { + "epoch": 0.0002232418846002676, + "grad_norm": 0.34416648745536804, + "learning_rate": 0.00015792988313856427, + "loss": 0.5442, + "step": 636 + }, + { + "epoch": 0.00022359289385278372, + "grad_norm": 0.3439941704273224, + "learning_rate": 0.00015786310517529217, + "loss": 0.4969, + "step": 637 + }, + { + "epoch": 0.00022394390310529987, + "grad_norm": 0.3547762930393219, + "learning_rate": 0.00015779632721202004, + "loss": 0.5564, + "step": 638 + }, + { + "epoch": 0.00022429491235781602, + "grad_norm": 0.35666894912719727, + "learning_rate": 0.00015772954924874792, + "loss": 0.4759, + "step": 639 + }, + { + "epoch": 0.00022464592161033217, + "grad_norm": 0.3175058364868164, + "learning_rate": 0.0001576627712854758, + "loss": 0.5708, + "step": 640 + }, + { + "epoch": 0.00022499693086284832, + "grad_norm": 0.4329943358898163, + "learning_rate": 0.00015759599332220366, + "loss": 0.5293, + "step": 641 + }, + { + "epoch": 0.00022534794011536444, + "grad_norm": 0.5703821778297424, + "learning_rate": 0.00015752921535893156, + "loss": 0.6187, + "step": 642 + }, + { + "epoch": 0.0002256989493678806, + "grad_norm": 0.32244032621383667, + "learning_rate": 0.00015746243739565944, + "loss": 0.4847, + "step": 643 + }, + { + "epoch": 0.00022604995862039674, + "grad_norm": 0.36224085092544556, + "learning_rate": 0.0001573956594323873, + "loss": 0.6804, + "step": 644 + }, + { + "epoch": 0.0002264009678729129, + "grad_norm": 0.3316931426525116, + "learning_rate": 0.0001573288814691152, + "loss": 0.6413, + "step": 645 + }, + { + "epoch": 0.00022675197712542904, + "grad_norm": 0.38156425952911377, + "learning_rate": 0.00015726210350584308, + "loss": 0.5659, + "step": 646 + }, + { + "epoch": 0.00022710298637794516, + "grad_norm": 0.48353493213653564, + "learning_rate": 0.00015719532554257096, + "loss": 0.5788, + "step": 647 + }, + { + "epoch": 0.00022745399563046131, + "grad_norm": 0.3913673758506775, + "learning_rate": 0.00015712854757929886, + "loss": 0.6899, + "step": 648 + }, + { + "epoch": 0.00022780500488297746, + "grad_norm": 0.46836981177330017, + "learning_rate": 0.00015706176961602673, + "loss": 0.5712, + "step": 649 + }, + { + "epoch": 0.0002281560141354936, + "grad_norm": 0.34713172912597656, + "learning_rate": 0.0001569949916527546, + "loss": 0.381, + "step": 650 + }, + { + "epoch": 0.00022850702338800976, + "grad_norm": 0.3837398886680603, + "learning_rate": 0.00015692821368948248, + "loss": 0.5236, + "step": 651 + }, + { + "epoch": 0.00022885803264052589, + "grad_norm": 0.5181556940078735, + "learning_rate": 0.00015686143572621035, + "loss": 0.5889, + "step": 652 + }, + { + "epoch": 0.00022920904189304203, + "grad_norm": 0.42713961005210876, + "learning_rate": 0.00015679465776293825, + "loss": 0.5346, + "step": 653 + }, + { + "epoch": 0.00022956005114555818, + "grad_norm": 0.2868479788303375, + "learning_rate": 0.00015672787979966612, + "loss": 0.5546, + "step": 654 + }, + { + "epoch": 0.00022991106039807433, + "grad_norm": 0.31901800632476807, + "learning_rate": 0.000156661101836394, + "loss": 0.5014, + "step": 655 + }, + { + "epoch": 0.00023026206965059048, + "grad_norm": 0.41681963205337524, + "learning_rate": 0.00015659432387312187, + "loss": 0.5709, + "step": 656 + }, + { + "epoch": 0.0002306130789031066, + "grad_norm": 0.5942090749740601, + "learning_rate": 0.00015652754590984974, + "loss": 0.6022, + "step": 657 + }, + { + "epoch": 0.00023096408815562276, + "grad_norm": 0.405391126871109, + "learning_rate": 0.00015646076794657764, + "loss": 0.5363, + "step": 658 + }, + { + "epoch": 0.0002313150974081389, + "grad_norm": 0.3201390206813812, + "learning_rate": 0.00015639398998330552, + "loss": 0.6045, + "step": 659 + }, + { + "epoch": 0.00023166610666065505, + "grad_norm": 0.2989407479763031, + "learning_rate": 0.0001563272120200334, + "loss": 0.5604, + "step": 660 + }, + { + "epoch": 0.0002320171159131712, + "grad_norm": 0.3919268548488617, + "learning_rate": 0.00015626043405676126, + "loss": 0.5413, + "step": 661 + }, + { + "epoch": 0.00023236812516568733, + "grad_norm": 0.4080122709274292, + "learning_rate": 0.00015619365609348916, + "loss": 0.498, + "step": 662 + }, + { + "epoch": 0.00023271913441820348, + "grad_norm": 0.38974156975746155, + "learning_rate": 0.00015612687813021704, + "loss": 0.6149, + "step": 663 + }, + { + "epoch": 0.00023307014367071962, + "grad_norm": 0.3145015835762024, + "learning_rate": 0.00015606010016694494, + "loss": 0.4886, + "step": 664 + }, + { + "epoch": 0.00023342115292323577, + "grad_norm": 0.3009328246116638, + "learning_rate": 0.0001559933222036728, + "loss": 0.5534, + "step": 665 + }, + { + "epoch": 0.00023377216217575192, + "grad_norm": 0.4774717092514038, + "learning_rate": 0.00015592654424040068, + "loss": 0.6006, + "step": 666 + }, + { + "epoch": 0.00023412317142826805, + "grad_norm": 0.32965418696403503, + "learning_rate": 0.00015585976627712856, + "loss": 0.5463, + "step": 667 + }, + { + "epoch": 0.0002344741806807842, + "grad_norm": 0.3066554665565491, + "learning_rate": 0.00015579298831385643, + "loss": 0.5675, + "step": 668 + }, + { + "epoch": 0.00023482518993330035, + "grad_norm": 0.3879207372665405, + "learning_rate": 0.00015572621035058433, + "loss": 0.5825, + "step": 669 + }, + { + "epoch": 0.0002351761991858165, + "grad_norm": 0.3171943128108978, + "learning_rate": 0.0001556594323873122, + "loss": 0.5677, + "step": 670 + }, + { + "epoch": 0.00023552720843833264, + "grad_norm": 0.36982622742652893, + "learning_rate": 0.00015559265442404007, + "loss": 0.5885, + "step": 671 + }, + { + "epoch": 0.00023587821769084877, + "grad_norm": 0.30437183380126953, + "learning_rate": 0.00015552587646076795, + "loss": 0.6288, + "step": 672 + }, + { + "epoch": 0.00023622922694336492, + "grad_norm": 0.30654504895210266, + "learning_rate": 0.00015545909849749582, + "loss": 0.5924, + "step": 673 + }, + { + "epoch": 0.00023658023619588107, + "grad_norm": 0.3771214783191681, + "learning_rate": 0.00015539232053422372, + "loss": 0.4901, + "step": 674 + }, + { + "epoch": 0.00023693124544839721, + "grad_norm": 0.3018699884414673, + "learning_rate": 0.0001553255425709516, + "loss": 0.6159, + "step": 675 + }, + { + "epoch": 0.00023728225470091336, + "grad_norm": 0.32899734377861023, + "learning_rate": 0.00015525876460767947, + "loss": 0.6197, + "step": 676 + }, + { + "epoch": 0.0002376332639534295, + "grad_norm": 0.31837883591651917, + "learning_rate": 0.00015519198664440734, + "loss": 0.5449, + "step": 677 + }, + { + "epoch": 0.00023798427320594564, + "grad_norm": 0.35326528549194336, + "learning_rate": 0.00015512520868113521, + "loss": 0.6315, + "step": 678 + }, + { + "epoch": 0.00023833528245846179, + "grad_norm": 0.3714829385280609, + "learning_rate": 0.00015505843071786311, + "loss": 0.6352, + "step": 679 + }, + { + "epoch": 0.00023868629171097794, + "grad_norm": 0.4002094864845276, + "learning_rate": 0.000154991652754591, + "loss": 0.4235, + "step": 680 + }, + { + "epoch": 0.00023903730096349408, + "grad_norm": 0.3382783532142639, + "learning_rate": 0.0001549248747913189, + "loss": 0.5476, + "step": 681 + }, + { + "epoch": 0.0002393883102160102, + "grad_norm": 0.2985747158527374, + "learning_rate": 0.00015485809682804676, + "loss": 0.5684, + "step": 682 + }, + { + "epoch": 0.00023973931946852636, + "grad_norm": 0.3288929760456085, + "learning_rate": 0.00015479131886477463, + "loss": 0.5657, + "step": 683 + }, + { + "epoch": 0.0002400903287210425, + "grad_norm": 0.39641210436820984, + "learning_rate": 0.0001547245409015025, + "loss": 0.6283, + "step": 684 + }, + { + "epoch": 0.00024044133797355866, + "grad_norm": 0.37413230538368225, + "learning_rate": 0.0001546577629382304, + "loss": 0.5778, + "step": 685 + }, + { + "epoch": 0.0002407923472260748, + "grad_norm": 0.28837504982948303, + "learning_rate": 0.00015459098497495828, + "loss": 0.5079, + "step": 686 + }, + { + "epoch": 0.00024114335647859093, + "grad_norm": 0.32851526141166687, + "learning_rate": 0.00015452420701168615, + "loss": 0.649, + "step": 687 + }, + { + "epoch": 0.00024149436573110708, + "grad_norm": 0.3848758637905121, + "learning_rate": 0.00015445742904841403, + "loss": 0.6099, + "step": 688 + }, + { + "epoch": 0.00024184537498362323, + "grad_norm": 0.35494935512542725, + "learning_rate": 0.0001543906510851419, + "loss": 0.6498, + "step": 689 + }, + { + "epoch": 0.00024219638423613938, + "grad_norm": 0.3431280553340912, + "learning_rate": 0.0001543238731218698, + "loss": 0.4934, + "step": 690 + }, + { + "epoch": 0.00024254739348865553, + "grad_norm": 0.33980974555015564, + "learning_rate": 0.00015425709515859767, + "loss": 0.5556, + "step": 691 + }, + { + "epoch": 0.00024289840274117165, + "grad_norm": 0.3086068034172058, + "learning_rate": 0.00015419031719532555, + "loss": 0.5955, + "step": 692 + }, + { + "epoch": 0.0002432494119936878, + "grad_norm": 0.33093178272247314, + "learning_rate": 0.00015412353923205342, + "loss": 0.5926, + "step": 693 + }, + { + "epoch": 0.00024360042124620395, + "grad_norm": 0.3660534620285034, + "learning_rate": 0.0001540567612687813, + "loss": 0.5494, + "step": 694 + }, + { + "epoch": 0.0002439514304987201, + "grad_norm": 0.29803964495658875, + "learning_rate": 0.0001539899833055092, + "loss": 0.6074, + "step": 695 + }, + { + "epoch": 0.00024430243975123625, + "grad_norm": 0.36542224884033203, + "learning_rate": 0.00015392320534223707, + "loss": 0.59, + "step": 696 + }, + { + "epoch": 0.00024465344900375237, + "grad_norm": 0.34015166759490967, + "learning_rate": 0.00015385642737896494, + "loss": 0.6029, + "step": 697 + }, + { + "epoch": 0.00024500445825626854, + "grad_norm": 0.3211725950241089, + "learning_rate": 0.00015378964941569284, + "loss": 0.535, + "step": 698 + }, + { + "epoch": 0.00024535546750878467, + "grad_norm": 0.37027183175086975, + "learning_rate": 0.0001537228714524207, + "loss": 0.6265, + "step": 699 + }, + { + "epoch": 0.0002457064767613008, + "grad_norm": 0.3447396159172058, + "learning_rate": 0.00015365609348914859, + "loss": 0.6061, + "step": 700 + }, + { + "epoch": 0.00024605748601381697, + "grad_norm": 0.3344075679779053, + "learning_rate": 0.00015358931552587649, + "loss": 0.5412, + "step": 701 + }, + { + "epoch": 0.0002464084952663331, + "grad_norm": 0.29049620032310486, + "learning_rate": 0.00015352253756260436, + "loss": 0.5137, + "step": 702 + }, + { + "epoch": 0.00024675950451884926, + "grad_norm": 0.37048932909965515, + "learning_rate": 0.00015345575959933223, + "loss": 0.6118, + "step": 703 + }, + { + "epoch": 0.0002471105137713654, + "grad_norm": 0.38212522864341736, + "learning_rate": 0.0001533889816360601, + "loss": 0.466, + "step": 704 + }, + { + "epoch": 0.0002474615230238815, + "grad_norm": 0.3576483428478241, + "learning_rate": 0.00015332220367278798, + "loss": 0.561, + "step": 705 + }, + { + "epoch": 0.0002478125322763977, + "grad_norm": 0.3550293743610382, + "learning_rate": 0.00015325542570951588, + "loss": 0.5634, + "step": 706 + }, + { + "epoch": 0.0002481635415289138, + "grad_norm": 0.362474650144577, + "learning_rate": 0.00015318864774624375, + "loss": 0.5608, + "step": 707 + }, + { + "epoch": 0.00024851455078143, + "grad_norm": 0.39463603496551514, + "learning_rate": 0.00015312186978297163, + "loss": 0.64, + "step": 708 + }, + { + "epoch": 0.0002488655600339461, + "grad_norm": 0.3456307649612427, + "learning_rate": 0.0001530550918196995, + "loss": 0.4631, + "step": 709 + }, + { + "epoch": 0.00024921656928646223, + "grad_norm": 0.3300929367542267, + "learning_rate": 0.00015298831385642737, + "loss": 0.3984, + "step": 710 + }, + { + "epoch": 0.0002495675785389784, + "grad_norm": 0.35923343896865845, + "learning_rate": 0.00015292153589315527, + "loss": 0.6003, + "step": 711 + }, + { + "epoch": 0.00024991858779149453, + "grad_norm": 0.4047611653804779, + "learning_rate": 0.00015285475792988315, + "loss": 0.5715, + "step": 712 + }, + { + "epoch": 0.0002502695970440107, + "grad_norm": 0.43539851903915405, + "learning_rate": 0.00015278797996661102, + "loss": 0.571, + "step": 713 + }, + { + "epoch": 0.00025062060629652683, + "grad_norm": 0.34745046496391296, + "learning_rate": 0.0001527212020033389, + "loss": 0.622, + "step": 714 + }, + { + "epoch": 0.00025097161554904295, + "grad_norm": 0.3130028247833252, + "learning_rate": 0.0001526544240400668, + "loss": 0.507, + "step": 715 + }, + { + "epoch": 0.0002513226248015591, + "grad_norm": 0.3093617558479309, + "learning_rate": 0.00015258764607679466, + "loss": 0.4951, + "step": 716 + }, + { + "epoch": 0.00025167363405407525, + "grad_norm": 0.34299540519714355, + "learning_rate": 0.00015252086811352257, + "loss": 0.539, + "step": 717 + }, + { + "epoch": 0.0002520246433065914, + "grad_norm": 0.32698413729667664, + "learning_rate": 0.00015245409015025044, + "loss": 0.4588, + "step": 718 + }, + { + "epoch": 0.00025237565255910755, + "grad_norm": 0.37853989005088806, + "learning_rate": 0.0001523873121869783, + "loss": 0.6227, + "step": 719 + }, + { + "epoch": 0.00025272666181162367, + "grad_norm": 0.32887300848960876, + "learning_rate": 0.00015232053422370618, + "loss": 0.5893, + "step": 720 + }, + { + "epoch": 0.00025307767106413985, + "grad_norm": 0.43352028727531433, + "learning_rate": 0.00015225375626043406, + "loss": 0.5811, + "step": 721 + }, + { + "epoch": 0.00025342868031665597, + "grad_norm": 0.42844903469085693, + "learning_rate": 0.00015218697829716196, + "loss": 0.6196, + "step": 722 + }, + { + "epoch": 0.00025377968956917215, + "grad_norm": 0.39929670095443726, + "learning_rate": 0.00015212020033388983, + "loss": 0.6722, + "step": 723 + }, + { + "epoch": 0.00025413069882168827, + "grad_norm": 0.5063486695289612, + "learning_rate": 0.0001520534223706177, + "loss": 0.6086, + "step": 724 + }, + { + "epoch": 0.0002544817080742044, + "grad_norm": 0.3625267446041107, + "learning_rate": 0.00015198664440734558, + "loss": 0.6331, + "step": 725 + }, + { + "epoch": 0.00025483271732672057, + "grad_norm": 0.3452700078487396, + "learning_rate": 0.00015191986644407345, + "loss": 0.5812, + "step": 726 + }, + { + "epoch": 0.0002551837265792367, + "grad_norm": 0.31915003061294556, + "learning_rate": 0.00015185308848080135, + "loss": 0.5653, + "step": 727 + }, + { + "epoch": 0.00025553473583175287, + "grad_norm": 0.3085877299308777, + "learning_rate": 0.00015178631051752922, + "loss": 0.4702, + "step": 728 + }, + { + "epoch": 0.000255885745084269, + "grad_norm": 0.31519320607185364, + "learning_rate": 0.0001517195325542571, + "loss": 0.5096, + "step": 729 + }, + { + "epoch": 0.0002562367543367851, + "grad_norm": 0.3637699782848358, + "learning_rate": 0.00015165275459098497, + "loss": 0.6001, + "step": 730 + }, + { + "epoch": 0.0002565877635893013, + "grad_norm": 0.34056970477104187, + "learning_rate": 0.00015158597662771284, + "loss": 0.5546, + "step": 731 + }, + { + "epoch": 0.0002569387728418174, + "grad_norm": 0.37110257148742676, + "learning_rate": 0.00015151919866444074, + "loss": 0.5612, + "step": 732 + }, + { + "epoch": 0.0002572897820943336, + "grad_norm": 0.35854101181030273, + "learning_rate": 0.00015145242070116862, + "loss": 0.6364, + "step": 733 + }, + { + "epoch": 0.0002576407913468497, + "grad_norm": 0.4340030252933502, + "learning_rate": 0.00015138564273789652, + "loss": 0.5772, + "step": 734 + }, + { + "epoch": 0.00025799180059936583, + "grad_norm": 0.3807721436023712, + "learning_rate": 0.0001513188647746244, + "loss": 0.4986, + "step": 735 + }, + { + "epoch": 0.000258342809851882, + "grad_norm": 0.3522527813911438, + "learning_rate": 0.00015125208681135226, + "loss": 0.5982, + "step": 736 + }, + { + "epoch": 0.00025869381910439813, + "grad_norm": 0.31251296401023865, + "learning_rate": 0.00015118530884808014, + "loss": 0.5239, + "step": 737 + }, + { + "epoch": 0.0002590448283569143, + "grad_norm": 0.3460885286331177, + "learning_rate": 0.00015111853088480804, + "loss": 0.5881, + "step": 738 + }, + { + "epoch": 0.00025939583760943043, + "grad_norm": 0.33298879861831665, + "learning_rate": 0.0001510517529215359, + "loss": 0.5272, + "step": 739 + }, + { + "epoch": 0.00025974684686194655, + "grad_norm": 0.351468950510025, + "learning_rate": 0.00015098497495826378, + "loss": 0.6049, + "step": 740 + }, + { + "epoch": 0.00026009785611446273, + "grad_norm": 0.3449242413043976, + "learning_rate": 0.00015091819699499166, + "loss": 0.5983, + "step": 741 + }, + { + "epoch": 0.00026044886536697885, + "grad_norm": 0.34724265336990356, + "learning_rate": 0.00015085141903171953, + "loss": 0.5292, + "step": 742 + }, + { + "epoch": 0.00026079987461949503, + "grad_norm": 0.3525671660900116, + "learning_rate": 0.00015078464106844743, + "loss": 0.5391, + "step": 743 + }, + { + "epoch": 0.00026115088387201115, + "grad_norm": 0.33959653973579407, + "learning_rate": 0.0001507178631051753, + "loss": 0.5898, + "step": 744 + }, + { + "epoch": 0.00026150189312452727, + "grad_norm": 0.5051225423812866, + "learning_rate": 0.00015065108514190318, + "loss": 0.5408, + "step": 745 + }, + { + "epoch": 0.00026185290237704345, + "grad_norm": 0.3298085629940033, + "learning_rate": 0.00015058430717863105, + "loss": 0.557, + "step": 746 + }, + { + "epoch": 0.00026220391162955957, + "grad_norm": 0.3375703990459442, + "learning_rate": 0.00015051752921535892, + "loss": 0.5541, + "step": 747 + }, + { + "epoch": 0.00026255492088207575, + "grad_norm": 0.27896445989608765, + "learning_rate": 0.0001504507512520868, + "loss": 0.5273, + "step": 748 + }, + { + "epoch": 0.00026290593013459187, + "grad_norm": 0.30591917037963867, + "learning_rate": 0.0001503839732888147, + "loss": 0.5988, + "step": 749 + }, + { + "epoch": 0.000263256939387108, + "grad_norm": 0.41014084219932556, + "learning_rate": 0.00015031719532554257, + "loss": 0.555, + "step": 750 + }, + { + "epoch": 0.00026360794863962417, + "grad_norm": 0.2935464084148407, + "learning_rate": 0.00015025041736227047, + "loss": 0.625, + "step": 751 + }, + { + "epoch": 0.0002639589578921403, + "grad_norm": 0.46361032128334045, + "learning_rate": 0.00015018363939899834, + "loss": 0.4753, + "step": 752 + }, + { + "epoch": 0.00026430996714465647, + "grad_norm": 0.35808300971984863, + "learning_rate": 0.00015011686143572622, + "loss": 0.5531, + "step": 753 + }, + { + "epoch": 0.0002646609763971726, + "grad_norm": 0.3411274254322052, + "learning_rate": 0.00015005008347245412, + "loss": 0.5577, + "step": 754 + }, + { + "epoch": 0.0002650119856496887, + "grad_norm": 0.34169328212738037, + "learning_rate": 0.000149983305509182, + "loss": 0.4856, + "step": 755 + }, + { + "epoch": 0.0002653629949022049, + "grad_norm": 0.38024139404296875, + "learning_rate": 0.00014991652754590986, + "loss": 0.5203, + "step": 756 + }, + { + "epoch": 0.000265714004154721, + "grad_norm": 0.35004425048828125, + "learning_rate": 0.00014984974958263774, + "loss": 0.4999, + "step": 757 + }, + { + "epoch": 0.0002660650134072372, + "grad_norm": 0.47526153922080994, + "learning_rate": 0.0001497829716193656, + "loss": 0.5503, + "step": 758 + }, + { + "epoch": 0.0002664160226597533, + "grad_norm": 0.35096925497055054, + "learning_rate": 0.0001497161936560935, + "loss": 0.5812, + "step": 759 + }, + { + "epoch": 0.00026676703191226943, + "grad_norm": 0.4505446255207062, + "learning_rate": 0.00014964941569282138, + "loss": 0.6069, + "step": 760 + }, + { + "epoch": 0.0002671180411647856, + "grad_norm": 0.3261663019657135, + "learning_rate": 0.00014958263772954926, + "loss": 0.5601, + "step": 761 + }, + { + "epoch": 0.00026746905041730173, + "grad_norm": 0.3397548794746399, + "learning_rate": 0.00014951585976627713, + "loss": 0.5572, + "step": 762 + }, + { + "epoch": 0.00026782005966981785, + "grad_norm": 0.35547688603401184, + "learning_rate": 0.000149449081803005, + "loss": 0.5983, + "step": 763 + }, + { + "epoch": 0.00026817106892233403, + "grad_norm": 0.41515079140663147, + "learning_rate": 0.00014938230383973287, + "loss": 0.6106, + "step": 764 + }, + { + "epoch": 0.00026852207817485015, + "grad_norm": 0.3840051591396332, + "learning_rate": 0.00014931552587646077, + "loss": 0.5328, + "step": 765 + }, + { + "epoch": 0.00026887308742736633, + "grad_norm": 0.3401285707950592, + "learning_rate": 0.00014924874791318865, + "loss": 0.4666, + "step": 766 + }, + { + "epoch": 0.00026922409667988245, + "grad_norm": 0.32983794808387756, + "learning_rate": 0.00014918196994991652, + "loss": 0.5214, + "step": 767 + }, + { + "epoch": 0.0002695751059323986, + "grad_norm": 0.30202198028564453, + "learning_rate": 0.00014911519198664442, + "loss": 0.4969, + "step": 768 + }, + { + "epoch": 0.00026992611518491475, + "grad_norm": 0.3222092092037201, + "learning_rate": 0.0001490484140233723, + "loss": 0.5093, + "step": 769 + }, + { + "epoch": 0.0002702771244374309, + "grad_norm": 0.4211997091770172, + "learning_rate": 0.0001489816360601002, + "loss": 0.6295, + "step": 770 + }, + { + "epoch": 0.00027062813368994705, + "grad_norm": 0.32112184166908264, + "learning_rate": 0.00014891485809682807, + "loss": 0.5611, + "step": 771 + }, + { + "epoch": 0.00027097914294246317, + "grad_norm": 0.3272956609725952, + "learning_rate": 0.00014884808013355594, + "loss": 0.6438, + "step": 772 + }, + { + "epoch": 0.0002713301521949793, + "grad_norm": 0.39423295855522156, + "learning_rate": 0.00014878130217028381, + "loss": 0.6029, + "step": 773 + }, + { + "epoch": 0.00027168116144749547, + "grad_norm": 0.3053528070449829, + "learning_rate": 0.0001487145242070117, + "loss": 0.4978, + "step": 774 + }, + { + "epoch": 0.0002720321707000116, + "grad_norm": 0.312774658203125, + "learning_rate": 0.0001486477462437396, + "loss": 0.5753, + "step": 775 + }, + { + "epoch": 0.00027238317995252777, + "grad_norm": 0.343964546918869, + "learning_rate": 0.00014858096828046746, + "loss": 0.5173, + "step": 776 + }, + { + "epoch": 0.0002727341892050439, + "grad_norm": 0.39104631543159485, + "learning_rate": 0.00014851419031719533, + "loss": 0.6381, + "step": 777 + }, + { + "epoch": 0.00027308519845756, + "grad_norm": 0.3958207070827484, + "learning_rate": 0.0001484474123539232, + "loss": 0.6046, + "step": 778 + }, + { + "epoch": 0.0002734362077100762, + "grad_norm": 0.36198097467422485, + "learning_rate": 0.00014838063439065108, + "loss": 0.6066, + "step": 779 + }, + { + "epoch": 0.0002737872169625923, + "grad_norm": 0.29619571566581726, + "learning_rate": 0.00014831385642737895, + "loss": 0.5131, + "step": 780 + }, + { + "epoch": 0.0002741382262151085, + "grad_norm": 0.344784677028656, + "learning_rate": 0.00014824707846410685, + "loss": 0.5626, + "step": 781 + }, + { + "epoch": 0.0002744892354676246, + "grad_norm": 0.35641250014305115, + "learning_rate": 0.00014818030050083473, + "loss": 0.5451, + "step": 782 + }, + { + "epoch": 0.00027484024472014074, + "grad_norm": 0.3496847152709961, + "learning_rate": 0.0001481135225375626, + "loss": 0.4814, + "step": 783 + }, + { + "epoch": 0.0002751912539726569, + "grad_norm": 0.3726658821105957, + "learning_rate": 0.00014804674457429047, + "loss": 0.6244, + "step": 784 + }, + { + "epoch": 0.00027554226322517303, + "grad_norm": 0.3317565619945526, + "learning_rate": 0.00014797996661101837, + "loss": 0.562, + "step": 785 + }, + { + "epoch": 0.0002758932724776892, + "grad_norm": 0.3478979468345642, + "learning_rate": 0.00014791318864774625, + "loss": 0.613, + "step": 786 + }, + { + "epoch": 0.00027624428173020533, + "grad_norm": 0.3572550415992737, + "learning_rate": 0.00014784641068447415, + "loss": 0.4841, + "step": 787 + }, + { + "epoch": 0.00027659529098272146, + "grad_norm": 0.34030210971832275, + "learning_rate": 0.00014777963272120202, + "loss": 0.4879, + "step": 788 + }, + { + "epoch": 0.00027694630023523763, + "grad_norm": 0.378203421831131, + "learning_rate": 0.0001477128547579299, + "loss": 0.6086, + "step": 789 + }, + { + "epoch": 0.00027729730948775375, + "grad_norm": 0.3390562832355499, + "learning_rate": 0.00014764607679465777, + "loss": 0.586, + "step": 790 + }, + { + "epoch": 0.00027764831874026993, + "grad_norm": 0.4986645579338074, + "learning_rate": 0.00014757929883138567, + "loss": 0.5592, + "step": 791 + }, + { + "epoch": 0.00027799932799278605, + "grad_norm": 0.3361869156360626, + "learning_rate": 0.00014751252086811354, + "loss": 0.4632, + "step": 792 + }, + { + "epoch": 0.0002783503372453022, + "grad_norm": 0.3726123571395874, + "learning_rate": 0.0001474457429048414, + "loss": 0.4915, + "step": 793 + }, + { + "epoch": 0.00027870134649781835, + "grad_norm": 0.3358845114707947, + "learning_rate": 0.00014737896494156929, + "loss": 0.5593, + "step": 794 + }, + { + "epoch": 0.0002790523557503345, + "grad_norm": 0.30473607778549194, + "learning_rate": 0.00014731218697829716, + "loss": 0.3672, + "step": 795 + }, + { + "epoch": 0.00027940336500285065, + "grad_norm": 0.33929023146629333, + "learning_rate": 0.00014724540901502506, + "loss": 0.5404, + "step": 796 + }, + { + "epoch": 0.0002797543742553668, + "grad_norm": 0.30778205394744873, + "learning_rate": 0.00014717863105175293, + "loss": 0.4379, + "step": 797 + }, + { + "epoch": 0.0002801053835078829, + "grad_norm": 0.286443829536438, + "learning_rate": 0.0001471118530884808, + "loss": 0.5579, + "step": 798 + }, + { + "epoch": 0.0002804563927603991, + "grad_norm": 0.4246799051761627, + "learning_rate": 0.00014704507512520868, + "loss": 0.536, + "step": 799 + }, + { + "epoch": 0.0002808074020129152, + "grad_norm": 0.4085538983345032, + "learning_rate": 0.00014697829716193655, + "loss": 0.5309, + "step": 800 + }, + { + "epoch": 0.00028115841126543137, + "grad_norm": 0.35396453738212585, + "learning_rate": 0.00014691151919866443, + "loss": 0.5307, + "step": 801 + }, + { + "epoch": 0.0002815094205179475, + "grad_norm": 0.45588648319244385, + "learning_rate": 0.00014684474123539233, + "loss": 0.5905, + "step": 802 + }, + { + "epoch": 0.0002818604297704636, + "grad_norm": 0.3353815972805023, + "learning_rate": 0.0001467779632721202, + "loss": 0.612, + "step": 803 + }, + { + "epoch": 0.0002822114390229798, + "grad_norm": 0.4152653217315674, + "learning_rate": 0.0001467111853088481, + "loss": 0.592, + "step": 804 + }, + { + "epoch": 0.0002825624482754959, + "grad_norm": 0.3651511073112488, + "learning_rate": 0.00014664440734557597, + "loss": 0.5909, + "step": 805 + }, + { + "epoch": 0.0002829134575280121, + "grad_norm": 0.3518235385417938, + "learning_rate": 0.00014657762938230385, + "loss": 0.5684, + "step": 806 + }, + { + "epoch": 0.0002832644667805282, + "grad_norm": 0.33562156558036804, + "learning_rate": 0.00014651085141903175, + "loss": 0.5165, + "step": 807 + }, + { + "epoch": 0.00028361547603304434, + "grad_norm": 0.3648052513599396, + "learning_rate": 0.00014644407345575962, + "loss": 0.5451, + "step": 808 + }, + { + "epoch": 0.0002839664852855605, + "grad_norm": 0.44342300295829773, + "learning_rate": 0.0001463772954924875, + "loss": 0.5907, + "step": 809 + }, + { + "epoch": 0.00028431749453807664, + "grad_norm": 0.33331966400146484, + "learning_rate": 0.00014631051752921536, + "loss": 0.4254, + "step": 810 + }, + { + "epoch": 0.0002846685037905928, + "grad_norm": 0.3444873094558716, + "learning_rate": 0.00014624373956594324, + "loss": 0.5201, + "step": 811 + }, + { + "epoch": 0.00028501951304310894, + "grad_norm": 0.4239615201950073, + "learning_rate": 0.00014617696160267114, + "loss": 0.5098, + "step": 812 + }, + { + "epoch": 0.00028537052229562506, + "grad_norm": 0.47895997762680054, + "learning_rate": 0.000146110183639399, + "loss": 0.6243, + "step": 813 + }, + { + "epoch": 0.00028572153154814123, + "grad_norm": 0.47322046756744385, + "learning_rate": 0.00014604340567612688, + "loss": 0.6841, + "step": 814 + }, + { + "epoch": 0.00028607254080065736, + "grad_norm": 0.35017871856689453, + "learning_rate": 0.00014597662771285476, + "loss": 0.5313, + "step": 815 + }, + { + "epoch": 0.00028642355005317353, + "grad_norm": 0.4342300295829773, + "learning_rate": 0.00014590984974958263, + "loss": 0.4363, + "step": 816 + }, + { + "epoch": 0.00028677455930568966, + "grad_norm": 0.2966228723526001, + "learning_rate": 0.0001458430717863105, + "loss": 0.6428, + "step": 817 + }, + { + "epoch": 0.0002871255685582058, + "grad_norm": 0.3320361375808716, + "learning_rate": 0.0001457762938230384, + "loss": 0.5266, + "step": 818 + }, + { + "epoch": 0.00028747657781072195, + "grad_norm": 0.3318590223789215, + "learning_rate": 0.00014570951585976628, + "loss": 0.5676, + "step": 819 + }, + { + "epoch": 0.0002878275870632381, + "grad_norm": 0.38573157787323, + "learning_rate": 0.00014564273789649415, + "loss": 0.7083, + "step": 820 + }, + { + "epoch": 0.00028817859631575425, + "grad_norm": 0.3731164038181305, + "learning_rate": 0.00014557595993322205, + "loss": 0.578, + "step": 821 + }, + { + "epoch": 0.0002885296055682704, + "grad_norm": 0.33610039949417114, + "learning_rate": 0.00014550918196994992, + "loss": 0.5923, + "step": 822 + }, + { + "epoch": 0.0002888806148207865, + "grad_norm": 0.3393179476261139, + "learning_rate": 0.00014544240400667782, + "loss": 0.5162, + "step": 823 + }, + { + "epoch": 0.0002892316240733027, + "grad_norm": 0.35552918910980225, + "learning_rate": 0.0001453756260434057, + "loss": 0.556, + "step": 824 + }, + { + "epoch": 0.0002895826333258188, + "grad_norm": 0.32425832748413086, + "learning_rate": 0.00014530884808013357, + "loss": 0.5157, + "step": 825 + }, + { + "epoch": 0.000289933642578335, + "grad_norm": 0.3353455662727356, + "learning_rate": 0.00014524207011686144, + "loss": 0.483, + "step": 826 + }, + { + "epoch": 0.0002902846518308511, + "grad_norm": 0.46254628896713257, + "learning_rate": 0.00014517529215358932, + "loss": 0.633, + "step": 827 + }, + { + "epoch": 0.0002906356610833672, + "grad_norm": 0.3275732100009918, + "learning_rate": 0.00014510851419031722, + "loss": 0.5502, + "step": 828 + }, + { + "epoch": 0.0002909866703358834, + "grad_norm": 0.3495190441608429, + "learning_rate": 0.0001450417362270451, + "loss": 0.368, + "step": 829 + }, + { + "epoch": 0.0002913376795883995, + "grad_norm": 0.35350501537323, + "learning_rate": 0.00014497495826377296, + "loss": 0.5819, + "step": 830 + }, + { + "epoch": 0.0002916886888409157, + "grad_norm": 0.37886378169059753, + "learning_rate": 0.00014490818030050084, + "loss": 0.5418, + "step": 831 + }, + { + "epoch": 0.0002920396980934318, + "grad_norm": 0.4279928505420685, + "learning_rate": 0.0001448414023372287, + "loss": 0.5199, + "step": 832 + }, + { + "epoch": 0.00029239070734594794, + "grad_norm": 0.33105382323265076, + "learning_rate": 0.00014477462437395658, + "loss": 0.5952, + "step": 833 + }, + { + "epoch": 0.0002927417165984641, + "grad_norm": 0.40114086866378784, + "learning_rate": 0.00014470784641068448, + "loss": 0.4611, + "step": 834 + }, + { + "epoch": 0.00029309272585098024, + "grad_norm": 0.3294037878513336, + "learning_rate": 0.00014464106844741236, + "loss": 0.5562, + "step": 835 + }, + { + "epoch": 0.0002934437351034964, + "grad_norm": 0.3391546607017517, + "learning_rate": 0.00014457429048414023, + "loss": 0.5748, + "step": 836 + }, + { + "epoch": 0.00029379474435601254, + "grad_norm": 0.4093922972679138, + "learning_rate": 0.0001445075125208681, + "loss": 0.4607, + "step": 837 + }, + { + "epoch": 0.00029414575360852866, + "grad_norm": 0.3331819176673889, + "learning_rate": 0.000144440734557596, + "loss": 0.5874, + "step": 838 + }, + { + "epoch": 0.00029449676286104484, + "grad_norm": 0.43205946683883667, + "learning_rate": 0.00014437395659432388, + "loss": 0.6152, + "step": 839 + }, + { + "epoch": 0.00029484777211356096, + "grad_norm": 0.36046868562698364, + "learning_rate": 0.00014430717863105178, + "loss": 0.4781, + "step": 840 + }, + { + "epoch": 0.00029519878136607713, + "grad_norm": 0.35514524579048157, + "learning_rate": 0.00014424040066777965, + "loss": 0.568, + "step": 841 + }, + { + "epoch": 0.00029554979061859326, + "grad_norm": 0.40260326862335205, + "learning_rate": 0.00014417362270450752, + "loss": 0.6075, + "step": 842 + }, + { + "epoch": 0.0002959007998711094, + "grad_norm": 0.3102671205997467, + "learning_rate": 0.0001441068447412354, + "loss": 0.4927, + "step": 843 + }, + { + "epoch": 0.00029625180912362556, + "grad_norm": 0.30940982699394226, + "learning_rate": 0.0001440400667779633, + "loss": 0.5549, + "step": 844 + }, + { + "epoch": 0.0002966028183761417, + "grad_norm": 0.3652762174606323, + "learning_rate": 0.00014397328881469117, + "loss": 0.6085, + "step": 845 + }, + { + "epoch": 0.00029695382762865786, + "grad_norm": 0.43056777119636536, + "learning_rate": 0.00014390651085141904, + "loss": 0.494, + "step": 846 + }, + { + "epoch": 0.000297304836881174, + "grad_norm": 0.3112967014312744, + "learning_rate": 0.00014383973288814692, + "loss": 0.5141, + "step": 847 + }, + { + "epoch": 0.0002976558461336901, + "grad_norm": 0.36729326844215393, + "learning_rate": 0.0001437729549248748, + "loss": 0.5435, + "step": 848 + }, + { + "epoch": 0.0002980068553862063, + "grad_norm": 0.3128114938735962, + "learning_rate": 0.00014370617696160266, + "loss": 0.5419, + "step": 849 + }, + { + "epoch": 0.0002983578646387224, + "grad_norm": 0.4030589163303375, + "learning_rate": 0.00014363939899833056, + "loss": 0.5959, + "step": 850 + }, + { + "epoch": 0.0002987088738912386, + "grad_norm": 0.39571288228034973, + "learning_rate": 0.00014357262103505844, + "loss": 0.6798, + "step": 851 + }, + { + "epoch": 0.0002990598831437547, + "grad_norm": 0.3388408422470093, + "learning_rate": 0.0001435058430717863, + "loss": 0.4887, + "step": 852 + }, + { + "epoch": 0.0002994108923962708, + "grad_norm": 0.39615562558174133, + "learning_rate": 0.00014343906510851418, + "loss": 0.5654, + "step": 853 + }, + { + "epoch": 0.000299761901648787, + "grad_norm": 0.3967401683330536, + "learning_rate": 0.00014337228714524205, + "loss": 0.6192, + "step": 854 + }, + { + "epoch": 0.0003001129109013031, + "grad_norm": 0.5597772002220154, + "learning_rate": 0.00014330550918196995, + "loss": 0.5808, + "step": 855 + }, + { + "epoch": 0.0003004639201538193, + "grad_norm": 0.36231061816215515, + "learning_rate": 0.00014323873121869783, + "loss": 0.4936, + "step": 856 + }, + { + "epoch": 0.0003008149294063354, + "grad_norm": 0.3775942027568817, + "learning_rate": 0.00014317195325542573, + "loss": 0.5706, + "step": 857 + }, + { + "epoch": 0.00030116593865885154, + "grad_norm": 0.4139408767223358, + "learning_rate": 0.0001431051752921536, + "loss": 0.5784, + "step": 858 + }, + { + "epoch": 0.0003015169479113677, + "grad_norm": 0.4101429879665375, + "learning_rate": 0.00014303839732888147, + "loss": 0.5937, + "step": 859 + }, + { + "epoch": 0.00030186795716388384, + "grad_norm": 0.5272162556648254, + "learning_rate": 0.00014297161936560937, + "loss": 0.5244, + "step": 860 + }, + { + "epoch": 0.0003022189664164, + "grad_norm": 0.3587292730808258, + "learning_rate": 0.00014290484140233725, + "loss": 0.6333, + "step": 861 + }, + { + "epoch": 0.00030256997566891614, + "grad_norm": 0.3284890353679657, + "learning_rate": 0.00014283806343906512, + "loss": 0.5414, + "step": 862 + }, + { + "epoch": 0.00030292098492143226, + "grad_norm": 0.414974182844162, + "learning_rate": 0.000142771285475793, + "loss": 0.6116, + "step": 863 + }, + { + "epoch": 0.00030327199417394844, + "grad_norm": 0.33619245886802673, + "learning_rate": 0.00014270450751252087, + "loss": 0.5506, + "step": 864 + }, + { + "epoch": 0.00030362300342646456, + "grad_norm": 0.45475640892982483, + "learning_rate": 0.00014263772954924874, + "loss": 0.6347, + "step": 865 + }, + { + "epoch": 0.00030397401267898074, + "grad_norm": 0.2695920765399933, + "learning_rate": 0.00014257095158597664, + "loss": 0.4529, + "step": 866 + }, + { + "epoch": 0.00030432502193149686, + "grad_norm": 0.3314480781555176, + "learning_rate": 0.00014250417362270451, + "loss": 0.5812, + "step": 867 + }, + { + "epoch": 0.000304676031184013, + "grad_norm": 0.31949582695961, + "learning_rate": 0.0001424373956594324, + "loss": 0.5213, + "step": 868 + }, + { + "epoch": 0.00030502704043652916, + "grad_norm": 0.34049752354621887, + "learning_rate": 0.00014237061769616026, + "loss": 0.4645, + "step": 869 + }, + { + "epoch": 0.0003053780496890453, + "grad_norm": 0.4304719567298889, + "learning_rate": 0.00014230383973288813, + "loss": 0.5065, + "step": 870 + }, + { + "epoch": 0.00030572905894156146, + "grad_norm": 0.32379043102264404, + "learning_rate": 0.00014223706176961603, + "loss": 0.553, + "step": 871 + }, + { + "epoch": 0.0003060800681940776, + "grad_norm": 0.33285439014434814, + "learning_rate": 0.0001421702838063439, + "loss": 0.5092, + "step": 872 + }, + { + "epoch": 0.0003064310774465937, + "grad_norm": 0.336795449256897, + "learning_rate": 0.00014210350584307178, + "loss": 0.4967, + "step": 873 + }, + { + "epoch": 0.0003067820866991099, + "grad_norm": 0.34653040766716003, + "learning_rate": 0.00014203672787979968, + "loss": 0.5353, + "step": 874 + }, + { + "epoch": 0.000307133095951626, + "grad_norm": 0.3352467715740204, + "learning_rate": 0.00014196994991652755, + "loss": 0.5594, + "step": 875 + }, + { + "epoch": 0.0003074841052041422, + "grad_norm": 0.38723453879356384, + "learning_rate": 0.00014190317195325545, + "loss": 0.5897, + "step": 876 + }, + { + "epoch": 0.0003078351144566583, + "grad_norm": 0.3987238109111786, + "learning_rate": 0.00014183639398998333, + "loss": 0.4647, + "step": 877 + }, + { + "epoch": 0.0003081861237091744, + "grad_norm": 0.3452693223953247, + "learning_rate": 0.0001417696160267112, + "loss": 0.5687, + "step": 878 + }, + { + "epoch": 0.0003085371329616906, + "grad_norm": 0.3561328649520874, + "learning_rate": 0.00014170283806343907, + "loss": 0.5845, + "step": 879 + }, + { + "epoch": 0.0003088881422142067, + "grad_norm": 0.29658418893814087, + "learning_rate": 0.00014163606010016695, + "loss": 0.5202, + "step": 880 + }, + { + "epoch": 0.0003092391514667229, + "grad_norm": 0.3908213973045349, + "learning_rate": 0.00014156928213689482, + "loss": 0.4439, + "step": 881 + }, + { + "epoch": 0.000309590160719239, + "grad_norm": 0.35816919803619385, + "learning_rate": 0.00014150250417362272, + "loss": 0.5384, + "step": 882 + }, + { + "epoch": 0.00030994116997175514, + "grad_norm": 0.3681255877017975, + "learning_rate": 0.0001414357262103506, + "loss": 0.5999, + "step": 883 + }, + { + "epoch": 0.0003102921792242713, + "grad_norm": 0.31137388944625854, + "learning_rate": 0.00014136894824707847, + "loss": 0.4495, + "step": 884 + }, + { + "epoch": 0.00031064318847678744, + "grad_norm": 0.2831423878669739, + "learning_rate": 0.00014130217028380634, + "loss": 0.4576, + "step": 885 + }, + { + "epoch": 0.0003109941977293036, + "grad_norm": 0.25953516364097595, + "learning_rate": 0.0001412353923205342, + "loss": 0.5606, + "step": 886 + }, + { + "epoch": 0.00031134520698181974, + "grad_norm": 0.31105297803878784, + "learning_rate": 0.0001411686143572621, + "loss": 0.5986, + "step": 887 + }, + { + "epoch": 0.00031169621623433586, + "grad_norm": 0.35177484154701233, + "learning_rate": 0.00014110183639398999, + "loss": 0.3394, + "step": 888 + }, + { + "epoch": 0.00031204722548685204, + "grad_norm": 0.373470276594162, + "learning_rate": 0.00014103505843071786, + "loss": 0.5862, + "step": 889 + }, + { + "epoch": 0.00031239823473936816, + "grad_norm": 0.37227189540863037, + "learning_rate": 0.00014096828046744576, + "loss": 0.4677, + "step": 890 + }, + { + "epoch": 0.00031274924399188434, + "grad_norm": 0.3799666464328766, + "learning_rate": 0.00014090150250417363, + "loss": 0.5255, + "step": 891 + }, + { + "epoch": 0.00031310025324440046, + "grad_norm": 0.3630129098892212, + "learning_rate": 0.00014083472454090153, + "loss": 0.5111, + "step": 892 + }, + { + "epoch": 0.0003134512624969166, + "grad_norm": 0.5131457448005676, + "learning_rate": 0.0001407679465776294, + "loss": 0.5207, + "step": 893 + }, + { + "epoch": 0.00031380227174943276, + "grad_norm": 0.3759867548942566, + "learning_rate": 0.00014070116861435728, + "loss": 0.6678, + "step": 894 + }, + { + "epoch": 0.0003141532810019489, + "grad_norm": 0.5577414631843567, + "learning_rate": 0.00014063439065108515, + "loss": 0.62, + "step": 895 + }, + { + "epoch": 0.00031450429025446506, + "grad_norm": 0.2789120376110077, + "learning_rate": 0.00014056761268781303, + "loss": 0.4204, + "step": 896 + }, + { + "epoch": 0.0003148552995069812, + "grad_norm": 0.2897239327430725, + "learning_rate": 0.0001405008347245409, + "loss": 0.432, + "step": 897 + }, + { + "epoch": 0.0003152063087594973, + "grad_norm": 0.3552323579788208, + "learning_rate": 0.0001404340567612688, + "loss": 0.5512, + "step": 898 + }, + { + "epoch": 0.0003155573180120135, + "grad_norm": 0.49963894486427307, + "learning_rate": 0.00014036727879799667, + "loss": 0.5868, + "step": 899 + }, + { + "epoch": 0.0003159083272645296, + "grad_norm": 0.37479934096336365, + "learning_rate": 0.00014030050083472454, + "loss": 0.6682, + "step": 900 + }, + { + "epoch": 0.0003162593365170458, + "grad_norm": 0.3415648639202118, + "learning_rate": 0.00014023372287145242, + "loss": 0.5301, + "step": 901 + }, + { + "epoch": 0.0003166103457695619, + "grad_norm": 0.37530943751335144, + "learning_rate": 0.0001401669449081803, + "loss": 0.5409, + "step": 902 + }, + { + "epoch": 0.000316961355022078, + "grad_norm": 0.37487658858299255, + "learning_rate": 0.0001401001669449082, + "loss": 0.5976, + "step": 903 + }, + { + "epoch": 0.0003173123642745942, + "grad_norm": 0.37174728512763977, + "learning_rate": 0.00014003338898163606, + "loss": 0.5933, + "step": 904 + }, + { + "epoch": 0.0003176633735271103, + "grad_norm": 0.491584450006485, + "learning_rate": 0.00013996661101836394, + "loss": 0.5112, + "step": 905 + }, + { + "epoch": 0.0003180143827796265, + "grad_norm": 0.38381487131118774, + "learning_rate": 0.0001398998330550918, + "loss": 0.6486, + "step": 906 + }, + { + "epoch": 0.0003183653920321426, + "grad_norm": 0.2867659330368042, + "learning_rate": 0.0001398330550918197, + "loss": 0.5033, + "step": 907 + }, + { + "epoch": 0.00031871640128465874, + "grad_norm": 0.3146355450153351, + "learning_rate": 0.00013976627712854758, + "loss": 0.5878, + "step": 908 + }, + { + "epoch": 0.0003190674105371749, + "grad_norm": 0.3454856276512146, + "learning_rate": 0.00013969949916527548, + "loss": 0.4751, + "step": 909 + }, + { + "epoch": 0.00031941841978969104, + "grad_norm": 0.32241204380989075, + "learning_rate": 0.00013963272120200336, + "loss": 0.6378, + "step": 910 + }, + { + "epoch": 0.0003197694290422072, + "grad_norm": 0.33703315258026123, + "learning_rate": 0.00013956594323873123, + "loss": 0.4634, + "step": 911 + }, + { + "epoch": 0.00032012043829472334, + "grad_norm": 0.3781648576259613, + "learning_rate": 0.0001394991652754591, + "loss": 0.5218, + "step": 912 + }, + { + "epoch": 0.00032047144754723946, + "grad_norm": 0.4124391973018646, + "learning_rate": 0.00013943238731218698, + "loss": 0.4958, + "step": 913 + }, + { + "epoch": 0.00032082245679975564, + "grad_norm": 0.3970220685005188, + "learning_rate": 0.00013936560934891488, + "loss": 0.5624, + "step": 914 + }, + { + "epoch": 0.00032117346605227176, + "grad_norm": 0.43682703375816345, + "learning_rate": 0.00013929883138564275, + "loss": 0.544, + "step": 915 + }, + { + "epoch": 0.00032152447530478794, + "grad_norm": 0.3476586639881134, + "learning_rate": 0.00013923205342237062, + "loss": 0.4418, + "step": 916 + }, + { + "epoch": 0.00032187548455730406, + "grad_norm": 0.36963552236557007, + "learning_rate": 0.0001391652754590985, + "loss": 0.5946, + "step": 917 + }, + { + "epoch": 0.0003222264938098202, + "grad_norm": 0.3445582985877991, + "learning_rate": 0.00013909849749582637, + "loss": 0.5879, + "step": 918 + }, + { + "epoch": 0.00032257750306233636, + "grad_norm": 0.39813530445098877, + "learning_rate": 0.00013903171953255427, + "loss": 0.5759, + "step": 919 + }, + { + "epoch": 0.0003229285123148525, + "grad_norm": 0.3314265012741089, + "learning_rate": 0.00013896494156928214, + "loss": 0.6165, + "step": 920 + }, + { + "epoch": 0.00032327952156736866, + "grad_norm": 0.4094330072402954, + "learning_rate": 0.00013889816360601002, + "loss": 0.5787, + "step": 921 + }, + { + "epoch": 0.0003236305308198848, + "grad_norm": 0.36821484565734863, + "learning_rate": 0.0001388313856427379, + "loss": 0.5303, + "step": 922 + }, + { + "epoch": 0.0003239815400724009, + "grad_norm": 0.3517453968524933, + "learning_rate": 0.00013876460767946576, + "loss": 0.4586, + "step": 923 + }, + { + "epoch": 0.0003243325493249171, + "grad_norm": 0.2959018647670746, + "learning_rate": 0.00013869782971619366, + "loss": 0.5225, + "step": 924 + }, + { + "epoch": 0.0003246835585774332, + "grad_norm": 0.3286895751953125, + "learning_rate": 0.00013863105175292154, + "loss": 0.5353, + "step": 925 + }, + { + "epoch": 0.0003250345678299494, + "grad_norm": 0.3328275680541992, + "learning_rate": 0.00013856427378964944, + "loss": 0.5915, + "step": 926 + }, + { + "epoch": 0.0003253855770824655, + "grad_norm": 0.3400813937187195, + "learning_rate": 0.0001384974958263773, + "loss": 0.4598, + "step": 927 + }, + { + "epoch": 0.0003257365863349816, + "grad_norm": 0.2876541018486023, + "learning_rate": 0.00013843071786310518, + "loss": 0.4835, + "step": 928 + }, + { + "epoch": 0.0003260875955874978, + "grad_norm": 0.3401765525341034, + "learning_rate": 0.00013836393989983308, + "loss": 0.56, + "step": 929 + }, + { + "epoch": 0.0003264386048400139, + "grad_norm": 0.34506598114967346, + "learning_rate": 0.00013829716193656096, + "loss": 0.6234, + "step": 930 + }, + { + "epoch": 0.0003267896140925301, + "grad_norm": 0.33732855319976807, + "learning_rate": 0.00013823038397328883, + "loss": 0.5686, + "step": 931 + }, + { + "epoch": 0.0003271406233450462, + "grad_norm": 0.34300100803375244, + "learning_rate": 0.0001381636060100167, + "loss": 0.6091, + "step": 932 + }, + { + "epoch": 0.00032749163259756235, + "grad_norm": 0.30349200963974, + "learning_rate": 0.00013809682804674458, + "loss": 0.4836, + "step": 933 + }, + { + "epoch": 0.0003278426418500785, + "grad_norm": 0.35742175579071045, + "learning_rate": 0.00013803005008347245, + "loss": 0.6443, + "step": 934 + }, + { + "epoch": 0.00032819365110259464, + "grad_norm": 0.33582496643066406, + "learning_rate": 0.00013796327212020035, + "loss": 0.6361, + "step": 935 + }, + { + "epoch": 0.0003285446603551108, + "grad_norm": 0.33403804898262024, + "learning_rate": 0.00013789649415692822, + "loss": 0.5911, + "step": 936 + }, + { + "epoch": 0.00032889566960762694, + "grad_norm": 0.4263191521167755, + "learning_rate": 0.0001378297161936561, + "loss": 0.5243, + "step": 937 + }, + { + "epoch": 0.00032924667886014307, + "grad_norm": 0.31543296575546265, + "learning_rate": 0.00013776293823038397, + "loss": 0.554, + "step": 938 + }, + { + "epoch": 0.00032959768811265924, + "grad_norm": 0.38975203037261963, + "learning_rate": 0.00013769616026711184, + "loss": 0.5358, + "step": 939 + }, + { + "epoch": 0.00032994869736517536, + "grad_norm": 0.3175157904624939, + "learning_rate": 0.00013762938230383974, + "loss": 0.5385, + "step": 940 + }, + { + "epoch": 0.00033029970661769154, + "grad_norm": 0.32753151655197144, + "learning_rate": 0.00013756260434056762, + "loss": 0.5191, + "step": 941 + }, + { + "epoch": 0.00033065071587020766, + "grad_norm": 0.2516227066516876, + "learning_rate": 0.0001374958263772955, + "loss": 0.3496, + "step": 942 + }, + { + "epoch": 0.0003310017251227238, + "grad_norm": 0.275806188583374, + "learning_rate": 0.0001374290484140234, + "loss": 0.4197, + "step": 943 + }, + { + "epoch": 0.00033135273437523996, + "grad_norm": 0.30234864354133606, + "learning_rate": 0.00013736227045075126, + "loss": 0.4909, + "step": 944 + }, + { + "epoch": 0.0003317037436277561, + "grad_norm": 0.32561683654785156, + "learning_rate": 0.00013729549248747916, + "loss": 0.5865, + "step": 945 + }, + { + "epoch": 0.00033205475288027226, + "grad_norm": 0.32075145840644836, + "learning_rate": 0.00013722871452420704, + "loss": 0.5957, + "step": 946 + }, + { + "epoch": 0.0003324057621327884, + "grad_norm": 0.3077705204486847, + "learning_rate": 0.0001371619365609349, + "loss": 0.6026, + "step": 947 + }, + { + "epoch": 0.0003327567713853045, + "grad_norm": 0.3092177212238312, + "learning_rate": 0.00013709515859766278, + "loss": 0.553, + "step": 948 + }, + { + "epoch": 0.0003331077806378207, + "grad_norm": 0.3611501157283783, + "learning_rate": 0.00013702838063439065, + "loss": 0.5707, + "step": 949 + }, + { + "epoch": 0.0003334587898903368, + "grad_norm": 0.3343827724456787, + "learning_rate": 0.00013696160267111853, + "loss": 0.5626, + "step": 950 + }, + { + "epoch": 0.000333809799142853, + "grad_norm": 0.3330281376838684, + "learning_rate": 0.00013689482470784643, + "loss": 0.6353, + "step": 951 + }, + { + "epoch": 0.0003341608083953691, + "grad_norm": 0.4045816957950592, + "learning_rate": 0.0001368280467445743, + "loss": 0.5781, + "step": 952 + }, + { + "epoch": 0.0003345118176478852, + "grad_norm": 0.3618166446685791, + "learning_rate": 0.00013676126878130217, + "loss": 0.6702, + "step": 953 + }, + { + "epoch": 0.0003348628269004014, + "grad_norm": 0.2836553752422333, + "learning_rate": 0.00013669449081803005, + "loss": 0.4371, + "step": 954 + }, + { + "epoch": 0.0003352138361529175, + "grad_norm": 0.3100498914718628, + "learning_rate": 0.00013662771285475792, + "loss": 0.5184, + "step": 955 + }, + { + "epoch": 0.0003355648454054337, + "grad_norm": 0.34877723455429077, + "learning_rate": 0.00013656093489148582, + "loss": 0.4778, + "step": 956 + }, + { + "epoch": 0.0003359158546579498, + "grad_norm": 0.27756938338279724, + "learning_rate": 0.0001364941569282137, + "loss": 0.4314, + "step": 957 + }, + { + "epoch": 0.00033626686391046595, + "grad_norm": 0.36129051446914673, + "learning_rate": 0.00013642737896494157, + "loss": 0.5837, + "step": 958 + }, + { + "epoch": 0.0003366178731629821, + "grad_norm": 0.35625776648521423, + "learning_rate": 0.00013636060100166944, + "loss": 0.5579, + "step": 959 + }, + { + "epoch": 0.00033696888241549825, + "grad_norm": 0.3735104501247406, + "learning_rate": 0.00013629382303839734, + "loss": 0.5283, + "step": 960 + }, + { + "epoch": 0.0003373198916680144, + "grad_norm": 0.34185606241226196, + "learning_rate": 0.00013622704507512521, + "loss": 0.5669, + "step": 961 + }, + { + "epoch": 0.00033767090092053054, + "grad_norm": 0.29324260354042053, + "learning_rate": 0.00013616026711185311, + "loss": 0.4468, + "step": 962 + }, + { + "epoch": 0.00033802191017304667, + "grad_norm": 0.3439052700996399, + "learning_rate": 0.000136093489148581, + "loss": 0.5196, + "step": 963 + }, + { + "epoch": 0.00033837291942556284, + "grad_norm": 0.3536570370197296, + "learning_rate": 0.00013602671118530886, + "loss": 0.5251, + "step": 964 + }, + { + "epoch": 0.00033872392867807897, + "grad_norm": 0.4759911298751831, + "learning_rate": 0.00013595993322203673, + "loss": 0.7017, + "step": 965 + }, + { + "epoch": 0.00033907493793059514, + "grad_norm": 0.2958674728870392, + "learning_rate": 0.0001358931552587646, + "loss": 0.4936, + "step": 966 + }, + { + "epoch": 0.00033942594718311126, + "grad_norm": 0.32770562171936035, + "learning_rate": 0.0001358263772954925, + "loss": 0.5741, + "step": 967 + }, + { + "epoch": 0.0003397769564356274, + "grad_norm": 0.35697153210639954, + "learning_rate": 0.00013575959933222038, + "loss": 0.428, + "step": 968 + }, + { + "epoch": 0.00034012796568814356, + "grad_norm": 0.3409043252468109, + "learning_rate": 0.00013569282136894825, + "loss": 0.6142, + "step": 969 + }, + { + "epoch": 0.0003404789749406597, + "grad_norm": 0.47055551409721375, + "learning_rate": 0.00013562604340567613, + "loss": 0.463, + "step": 970 + }, + { + "epoch": 0.00034082998419317586, + "grad_norm": 0.38270413875579834, + "learning_rate": 0.000135559265442404, + "loss": 0.462, + "step": 971 + }, + { + "epoch": 0.000341180993445692, + "grad_norm": 0.26209867000579834, + "learning_rate": 0.0001354924874791319, + "loss": 0.5341, + "step": 972 + }, + { + "epoch": 0.0003415320026982081, + "grad_norm": 0.37498748302459717, + "learning_rate": 0.00013542570951585977, + "loss": 0.5196, + "step": 973 + }, + { + "epoch": 0.0003418830119507243, + "grad_norm": 0.36789608001708984, + "learning_rate": 0.00013535893155258765, + "loss": 0.4723, + "step": 974 + }, + { + "epoch": 0.0003422340212032404, + "grad_norm": 0.33915975689888, + "learning_rate": 0.00013529215358931552, + "loss": 0.5511, + "step": 975 + }, + { + "epoch": 0.0003425850304557566, + "grad_norm": 0.43045058846473694, + "learning_rate": 0.0001352253756260434, + "loss": 0.5667, + "step": 976 + }, + { + "epoch": 0.0003429360397082727, + "grad_norm": 0.2948949933052063, + "learning_rate": 0.0001351585976627713, + "loss": 0.4804, + "step": 977 + }, + { + "epoch": 0.00034328704896078883, + "grad_norm": 0.3249470889568329, + "learning_rate": 0.00013509181969949917, + "loss": 0.6041, + "step": 978 + }, + { + "epoch": 0.000343638058213305, + "grad_norm": 0.2865908741950989, + "learning_rate": 0.00013502504173622707, + "loss": 0.5617, + "step": 979 + }, + { + "epoch": 0.0003439890674658211, + "grad_norm": 0.3190818428993225, + "learning_rate": 0.00013495826377295494, + "loss": 0.4902, + "step": 980 + }, + { + "epoch": 0.00034434007671833725, + "grad_norm": 0.3111664950847626, + "learning_rate": 0.0001348914858096828, + "loss": 0.5504, + "step": 981 + }, + { + "epoch": 0.0003446910859708534, + "grad_norm": 0.3255857229232788, + "learning_rate": 0.00013482470784641069, + "loss": 0.5592, + "step": 982 + }, + { + "epoch": 0.00034504209522336955, + "grad_norm": 0.30806589126586914, + "learning_rate": 0.00013475792988313859, + "loss": 0.5567, + "step": 983 + }, + { + "epoch": 0.0003453931044758857, + "grad_norm": 0.33785945177078247, + "learning_rate": 0.00013469115191986646, + "loss": 0.5881, + "step": 984 + }, + { + "epoch": 0.00034574411372840185, + "grad_norm": 0.34626781940460205, + "learning_rate": 0.00013462437395659433, + "loss": 0.578, + "step": 985 + }, + { + "epoch": 0.00034609512298091797, + "grad_norm": 0.367034912109375, + "learning_rate": 0.0001345575959933222, + "loss": 0.5893, + "step": 986 + }, + { + "epoch": 0.00034644613223343415, + "grad_norm": 0.37824952602386475, + "learning_rate": 0.00013449081803005008, + "loss": 0.5681, + "step": 987 + }, + { + "epoch": 0.00034679714148595027, + "grad_norm": 0.4054035544395447, + "learning_rate": 0.00013442404006677798, + "loss": 0.6108, + "step": 988 + }, + { + "epoch": 0.00034714815073846645, + "grad_norm": 0.4374067485332489, + "learning_rate": 0.00013435726210350585, + "loss": 0.6002, + "step": 989 + }, + { + "epoch": 0.00034749915999098257, + "grad_norm": 0.3554278016090393, + "learning_rate": 0.00013429048414023373, + "loss": 0.6444, + "step": 990 + }, + { + "epoch": 0.0003478501692434987, + "grad_norm": 0.3428646922111511, + "learning_rate": 0.0001342237061769616, + "loss": 0.6527, + "step": 991 + }, + { + "epoch": 0.00034820117849601487, + "grad_norm": 0.25603657960891724, + "learning_rate": 0.00013415692821368947, + "loss": 0.5244, + "step": 992 + }, + { + "epoch": 0.000348552187748531, + "grad_norm": 0.35237595438957214, + "learning_rate": 0.00013409015025041737, + "loss": 0.557, + "step": 993 + }, + { + "epoch": 0.00034890319700104717, + "grad_norm": 0.33666110038757324, + "learning_rate": 0.00013402337228714524, + "loss": 0.5674, + "step": 994 + }, + { + "epoch": 0.0003492542062535633, + "grad_norm": 0.30283182859420776, + "learning_rate": 0.00013395659432387312, + "loss": 0.6081, + "step": 995 + }, + { + "epoch": 0.0003496052155060794, + "grad_norm": 0.30893146991729736, + "learning_rate": 0.00013388981636060102, + "loss": 0.6089, + "step": 996 + }, + { + "epoch": 0.0003499562247585956, + "grad_norm": 0.2617473304271698, + "learning_rate": 0.0001338230383973289, + "loss": 0.6104, + "step": 997 + }, + { + "epoch": 0.0003503072340111117, + "grad_norm": 0.29493093490600586, + "learning_rate": 0.00013375626043405676, + "loss": 0.5047, + "step": 998 + }, + { + "epoch": 0.0003506582432636279, + "grad_norm": 0.3991663157939911, + "learning_rate": 0.00013368948247078466, + "loss": 0.5137, + "step": 999 + }, + { + "epoch": 0.000351009252516144, + "grad_norm": 0.31760329008102417, + "learning_rate": 0.00013362270450751254, + "loss": 0.4371, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 3000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.846828653872742e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}