diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1000/trainer_state.json" @@ -0,0 +1,7050 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6916825177243645, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006916825177243646, + "grad_norm": 0.21880632638931274, + "learning_rate": 0.0, + "loss": 2.5104, + "step": 1 + }, + { + "epoch": 0.0013833650354487291, + "grad_norm": 0.2225637435913086, + "learning_rate": 1.36986301369863e-05, + "loss": 2.7879, + "step": 2 + }, + { + "epoch": 0.0020750475531730937, + "grad_norm": 0.21454782783985138, + "learning_rate": 2.73972602739726e-05, + "loss": 2.8515, + "step": 3 + }, + { + "epoch": 0.0027667300708974583, + "grad_norm": 0.285408616065979, + "learning_rate": 4.1095890410958905e-05, + "loss": 2.8123, + "step": 4 + }, + { + "epoch": 0.0034584125886218224, + "grad_norm": 0.231473907828331, + "learning_rate": 5.47945205479452e-05, + "loss": 2.5181, + "step": 5 + }, + { + "epoch": 0.004150095106346187, + "grad_norm": 0.20668023824691772, + "learning_rate": 6.84931506849315e-05, + "loss": 2.066, + "step": 6 + }, + { + "epoch": 0.0048417776240705515, + "grad_norm": 0.3103766441345215, + "learning_rate": 8.219178082191781e-05, + "loss": 2.0647, + "step": 7 + }, + { + "epoch": 0.0055334601417949165, + "grad_norm": 0.3536206781864166, + "learning_rate": 9.58904109589041e-05, + "loss": 2.1267, + "step": 8 + }, + { + "epoch": 0.006225142659519281, + "grad_norm": 0.2078174203634262, + "learning_rate": 0.0001095890410958904, + "loss": 2.5859, + "step": 9 + }, + { + "epoch": 0.006916825177243645, + "grad_norm": 0.2702614367008209, + "learning_rate": 0.0001232876712328767, + "loss": 2.7732, + "step": 10 + }, + { + "epoch": 0.00760850769496801, + "grad_norm": 0.348145067691803, + "learning_rate": 0.000136986301369863, + "loss": 2.6776, + "step": 11 + }, + { + "epoch": 0.008300190212692375, + "grad_norm": 0.32872459292411804, + "learning_rate": 0.0001506849315068493, + "loss": 1.5802, + "step": 12 + }, + { + "epoch": 0.008991872730416739, + "grad_norm": 0.7062669992446899, + "learning_rate": 0.00016438356164383562, + "loss": 1.9955, + "step": 13 + }, + { + "epoch": 0.009683555248141103, + "grad_norm": 0.252165287733078, + "learning_rate": 0.00017808219178082192, + "loss": 2.9662, + "step": 14 + }, + { + "epoch": 0.010375237765865467, + "grad_norm": 0.29550454020500183, + "learning_rate": 0.0001917808219178082, + "loss": 2.3048, + "step": 15 + }, + { + "epoch": 0.011066920283589833, + "grad_norm": 0.3947546184062958, + "learning_rate": 0.0002054794520547945, + "loss": 2.8061, + "step": 16 + }, + { + "epoch": 0.011758602801314197, + "grad_norm": 0.26285308599472046, + "learning_rate": 0.0002191780821917808, + "loss": 2.7137, + "step": 17 + }, + { + "epoch": 0.012450285319038561, + "grad_norm": 0.3446462154388428, + "learning_rate": 0.00023287671232876712, + "loss": 2.3531, + "step": 18 + }, + { + "epoch": 0.013141967836762926, + "grad_norm": 0.2948848307132721, + "learning_rate": 0.0002465753424657534, + "loss": 2.2846, + "step": 19 + }, + { + "epoch": 0.01383365035448729, + "grad_norm": 0.3657473027706146, + "learning_rate": 0.0002602739726027397, + "loss": 2.4455, + "step": 20 + }, + { + "epoch": 0.014525332872211656, + "grad_norm": 0.2797200381755829, + "learning_rate": 0.000273972602739726, + "loss": 2.6308, + "step": 21 + }, + { + "epoch": 0.01521701538993602, + "grad_norm": 0.32914993166923523, + "learning_rate": 0.0002876712328767123, + "loss": 2.3918, + "step": 22 + }, + { + "epoch": 0.015908697907660384, + "grad_norm": 0.3232629895210266, + "learning_rate": 0.0003013698630136986, + "loss": 1.714, + "step": 23 + }, + { + "epoch": 0.01660038042538475, + "grad_norm": 0.3496573269367218, + "learning_rate": 0.00031506849315068495, + "loss": 2.634, + "step": 24 + }, + { + "epoch": 0.017292062943109112, + "grad_norm": 0.34902870655059814, + "learning_rate": 0.00032876712328767124, + "loss": 2.6269, + "step": 25 + }, + { + "epoch": 0.017983745460833478, + "grad_norm": 0.3312171399593353, + "learning_rate": 0.00034246575342465754, + "loss": 2.5056, + "step": 26 + }, + { + "epoch": 0.01867542797855784, + "grad_norm": 0.35412952303886414, + "learning_rate": 0.00035616438356164383, + "loss": 2.0546, + "step": 27 + }, + { + "epoch": 0.019367110496282206, + "grad_norm": 0.3497133255004883, + "learning_rate": 0.0003698630136986301, + "loss": 2.5338, + "step": 28 + }, + { + "epoch": 0.020058793014006572, + "grad_norm": 0.4878860116004944, + "learning_rate": 0.0003835616438356164, + "loss": 2.4302, + "step": 29 + }, + { + "epoch": 0.020750475531730934, + "grad_norm": 0.6170843243598938, + "learning_rate": 0.0003972602739726027, + "loss": 2.4316, + "step": 30 + }, + { + "epoch": 0.0214421580494553, + "grad_norm": 0.5822828412055969, + "learning_rate": 0.000410958904109589, + "loss": 2.2286, + "step": 31 + }, + { + "epoch": 0.022133840567179666, + "grad_norm": 0.3742135167121887, + "learning_rate": 0.0004246575342465753, + "loss": 2.145, + "step": 32 + }, + { + "epoch": 0.02282552308490403, + "grad_norm": 0.9055424332618713, + "learning_rate": 0.0004383561643835616, + "loss": 1.6547, + "step": 33 + }, + { + "epoch": 0.023517205602628394, + "grad_norm": 0.5974112153053284, + "learning_rate": 0.00045205479452054795, + "loss": 1.3416, + "step": 34 + }, + { + "epoch": 0.024208888120352757, + "grad_norm": 0.5048322677612305, + "learning_rate": 0.00046575342465753425, + "loss": 1.9358, + "step": 35 + }, + { + "epoch": 0.024900570638077123, + "grad_norm": 0.6585083603858948, + "learning_rate": 0.00047945205479452054, + "loss": 2.5502, + "step": 36 + }, + { + "epoch": 0.02559225315580149, + "grad_norm": 0.5043433904647827, + "learning_rate": 0.0004931506849315068, + "loss": 1.9789, + "step": 37 + }, + { + "epoch": 0.02628393567352585, + "grad_norm": 0.7396597266197205, + "learning_rate": 0.0005068493150684932, + "loss": 1.228, + "step": 38 + }, + { + "epoch": 0.026975618191250217, + "grad_norm": 0.6552051901817322, + "learning_rate": 0.0005205479452054794, + "loss": 2.2174, + "step": 39 + }, + { + "epoch": 0.02766730070897458, + "grad_norm": 0.5566487908363342, + "learning_rate": 0.0005342465753424658, + "loss": 1.4923, + "step": 40 + }, + { + "epoch": 0.028358983226698945, + "grad_norm": 0.8825723528862, + "learning_rate": 0.000547945205479452, + "loss": 1.1997, + "step": 41 + }, + { + "epoch": 0.02905066574442331, + "grad_norm": 0.6939167380332947, + "learning_rate": 0.0005616438356164384, + "loss": 0.9761, + "step": 42 + }, + { + "epoch": 0.029742348262147673, + "grad_norm": 0.6930441856384277, + "learning_rate": 0.0005753424657534246, + "loss": 1.859, + "step": 43 + }, + { + "epoch": 0.03043403077987204, + "grad_norm": 0.5989459753036499, + "learning_rate": 0.000589041095890411, + "loss": 2.1215, + "step": 44 + }, + { + "epoch": 0.0311257132975964, + "grad_norm": 0.5096371173858643, + "learning_rate": 0.0006027397260273972, + "loss": 1.6978, + "step": 45 + }, + { + "epoch": 0.03181739581532077, + "grad_norm": 0.505210816860199, + "learning_rate": 0.0006164383561643835, + "loss": 2.2416, + "step": 46 + }, + { + "epoch": 0.03250907833304513, + "grad_norm": 0.4391973316669464, + "learning_rate": 0.0006301369863013699, + "loss": 2.0748, + "step": 47 + }, + { + "epoch": 0.0332007608507695, + "grad_norm": 0.5823452472686768, + "learning_rate": 0.0006438356164383562, + "loss": 1.9801, + "step": 48 + }, + { + "epoch": 0.03389244336849386, + "grad_norm": 0.4575086236000061, + "learning_rate": 0.0006575342465753425, + "loss": 1.7658, + "step": 49 + }, + { + "epoch": 0.034584125886218224, + "grad_norm": 1.1978178024291992, + "learning_rate": 0.0006712328767123288, + "loss": 2.0922, + "step": 50 + }, + { + "epoch": 0.03527580840394259, + "grad_norm": 0.8242403864860535, + "learning_rate": 0.0006849315068493151, + "loss": 1.4001, + "step": 51 + }, + { + "epoch": 0.035967490921666956, + "grad_norm": 0.7775748372077942, + "learning_rate": 0.0006986301369863014, + "loss": 2.2013, + "step": 52 + }, + { + "epoch": 0.03665917343939132, + "grad_norm": 0.6356838941574097, + "learning_rate": 0.0007123287671232877, + "loss": 2.1857, + "step": 53 + }, + { + "epoch": 0.03735085595711568, + "grad_norm": 0.5681482553482056, + "learning_rate": 0.000726027397260274, + "loss": 1.8066, + "step": 54 + }, + { + "epoch": 0.03804253847484005, + "grad_norm": 0.5019308924674988, + "learning_rate": 0.0007397260273972603, + "loss": 2.2206, + "step": 55 + }, + { + "epoch": 0.03873422099256441, + "grad_norm": 0.5633329749107361, + "learning_rate": 0.0007534246575342466, + "loss": 1.8777, + "step": 56 + }, + { + "epoch": 0.039425903510288775, + "grad_norm": 0.7095340490341187, + "learning_rate": 0.0007671232876712328, + "loss": 1.547, + "step": 57 + }, + { + "epoch": 0.040117586028013144, + "grad_norm": 0.5671369433403015, + "learning_rate": 0.0007808219178082192, + "loss": 1.621, + "step": 58 + }, + { + "epoch": 0.040809268545737507, + "grad_norm": 0.6320775747299194, + "learning_rate": 0.0007945205479452054, + "loss": 1.6643, + "step": 59 + }, + { + "epoch": 0.04150095106346187, + "grad_norm": 0.5812399387359619, + "learning_rate": 0.0008082191780821918, + "loss": 2.1796, + "step": 60 + }, + { + "epoch": 0.04219263358118624, + "grad_norm": 0.6970055103302002, + "learning_rate": 0.000821917808219178, + "loss": 1.3168, + "step": 61 + }, + { + "epoch": 0.0428843160989106, + "grad_norm": 0.5469310879707336, + "learning_rate": 0.0008356164383561644, + "loss": 1.9548, + "step": 62 + }, + { + "epoch": 0.04357599861663496, + "grad_norm": 0.6035028100013733, + "learning_rate": 0.0008493150684931506, + "loss": 2.0096, + "step": 63 + }, + { + "epoch": 0.04426768113435933, + "grad_norm": 0.5294916033744812, + "learning_rate": 0.000863013698630137, + "loss": 1.9096, + "step": 64 + }, + { + "epoch": 0.044959363652083695, + "grad_norm": 0.5908515453338623, + "learning_rate": 0.0008767123287671232, + "loss": 1.7588, + "step": 65 + }, + { + "epoch": 0.04565104616980806, + "grad_norm": 0.6100364327430725, + "learning_rate": 0.0008904109589041097, + "loss": 2.354, + "step": 66 + }, + { + "epoch": 0.04634272868753242, + "grad_norm": 0.4925366938114166, + "learning_rate": 0.0009041095890410959, + "loss": 1.9282, + "step": 67 + }, + { + "epoch": 0.04703441120525679, + "grad_norm": 0.6262894868850708, + "learning_rate": 0.0009178082191780823, + "loss": 1.2392, + "step": 68 + }, + { + "epoch": 0.04772609372298115, + "grad_norm": 0.55129474401474, + "learning_rate": 0.0009315068493150685, + "loss": 2.4297, + "step": 69 + }, + { + "epoch": 0.048417776240705514, + "grad_norm": 0.5773240327835083, + "learning_rate": 0.0009452054794520548, + "loss": 1.4238, + "step": 70 + }, + { + "epoch": 0.04910945875842988, + "grad_norm": 0.3298526704311371, + "learning_rate": 0.0009589041095890411, + "loss": 1.5375, + "step": 71 + }, + { + "epoch": 0.049801141276154245, + "grad_norm": 0.9887644648551941, + "learning_rate": 0.0009726027397260274, + "loss": 2.037, + "step": 72 + }, + { + "epoch": 0.05049282379387861, + "grad_norm": 0.7895487546920776, + "learning_rate": 0.0009863013698630137, + "loss": 1.8359, + "step": 73 + }, + { + "epoch": 0.05118450631160298, + "grad_norm": 0.5635783076286316, + "learning_rate": 0.001, + "loss": 1.2403, + "step": 74 + }, + { + "epoch": 0.05187618882932734, + "grad_norm": 0.5721316933631897, + "learning_rate": 0.0009992716678805535, + "loss": 1.9278, + "step": 75 + }, + { + "epoch": 0.0525678713470517, + "grad_norm": 0.4850369095802307, + "learning_rate": 0.000998543335761107, + "loss": 1.9441, + "step": 76 + }, + { + "epoch": 0.053259553864776064, + "grad_norm": 0.5316908955574036, + "learning_rate": 0.0009978150036416607, + "loss": 1.593, + "step": 77 + }, + { + "epoch": 0.053951236382500434, + "grad_norm": 0.4999512732028961, + "learning_rate": 0.000997086671522214, + "loss": 1.8264, + "step": 78 + }, + { + "epoch": 0.054642918900224796, + "grad_norm": 0.4769350588321686, + "learning_rate": 0.0009963583394027677, + "loss": 1.6991, + "step": 79 + }, + { + "epoch": 0.05533460141794916, + "grad_norm": 0.4839954078197479, + "learning_rate": 0.0009956300072833213, + "loss": 1.0442, + "step": 80 + }, + { + "epoch": 0.05602628393567353, + "grad_norm": 0.7724981307983398, + "learning_rate": 0.0009949016751638748, + "loss": 1.5422, + "step": 81 + }, + { + "epoch": 0.05671796645339789, + "grad_norm": 0.7546667456626892, + "learning_rate": 0.0009941733430444283, + "loss": 1.7832, + "step": 82 + }, + { + "epoch": 0.05740964897112225, + "grad_norm": 0.5036157369613647, + "learning_rate": 0.0009934450109249818, + "loss": 1.9083, + "step": 83 + }, + { + "epoch": 0.05810133148884662, + "grad_norm": 0.5091835260391235, + "learning_rate": 0.0009927166788055353, + "loss": 1.5611, + "step": 84 + }, + { + "epoch": 0.058793014006570984, + "grad_norm": 0.5591360926628113, + "learning_rate": 0.0009919883466860888, + "loss": 1.517, + "step": 85 + }, + { + "epoch": 0.05948469652429535, + "grad_norm": 0.5279435515403748, + "learning_rate": 0.0009912600145666425, + "loss": 1.541, + "step": 86 + }, + { + "epoch": 0.060176379042019716, + "grad_norm": 0.8345214128494263, + "learning_rate": 0.000990531682447196, + "loss": 0.8178, + "step": 87 + }, + { + "epoch": 0.06086806155974408, + "grad_norm": 0.7917139530181885, + "learning_rate": 0.0009898033503277495, + "loss": 2.0548, + "step": 88 + }, + { + "epoch": 0.06155974407746844, + "grad_norm": 0.46465063095092773, + "learning_rate": 0.000989075018208303, + "loss": 2.045, + "step": 89 + }, + { + "epoch": 0.0622514265951928, + "grad_norm": 0.6480844020843506, + "learning_rate": 0.0009883466860888565, + "loss": 2.1368, + "step": 90 + }, + { + "epoch": 0.06294310911291717, + "grad_norm": 0.5167698264122009, + "learning_rate": 0.00098761835396941, + "loss": 2.2606, + "step": 91 + }, + { + "epoch": 0.06363479163064154, + "grad_norm": 0.6514157652854919, + "learning_rate": 0.0009868900218499635, + "loss": 2.1443, + "step": 92 + }, + { + "epoch": 0.0643264741483659, + "grad_norm": 0.9793212413787842, + "learning_rate": 0.0009861616897305172, + "loss": 1.7448, + "step": 93 + }, + { + "epoch": 0.06501815666609026, + "grad_norm": 0.5196186900138855, + "learning_rate": 0.0009854333576110707, + "loss": 1.6717, + "step": 94 + }, + { + "epoch": 0.06570983918381462, + "grad_norm": 0.4875952899456024, + "learning_rate": 0.0009847050254916242, + "loss": 1.8078, + "step": 95 + }, + { + "epoch": 0.066401521701539, + "grad_norm": 0.5111953020095825, + "learning_rate": 0.0009839766933721777, + "loss": 2.263, + "step": 96 + }, + { + "epoch": 0.06709320421926336, + "grad_norm": 0.6604788303375244, + "learning_rate": 0.0009832483612527312, + "loss": 1.6966, + "step": 97 + }, + { + "epoch": 0.06778488673698772, + "grad_norm": 0.5474271774291992, + "learning_rate": 0.0009825200291332847, + "loss": 2.3985, + "step": 98 + }, + { + "epoch": 0.06847656925471209, + "grad_norm": 0.47275879979133606, + "learning_rate": 0.0009817916970138382, + "loss": 1.7751, + "step": 99 + }, + { + "epoch": 0.06916825177243645, + "grad_norm": 0.5738961696624756, + "learning_rate": 0.000981063364894392, + "loss": 1.5479, + "step": 100 + }, + { + "epoch": 0.06985993429016081, + "grad_norm": 0.5046308636665344, + "learning_rate": 0.0009803350327749454, + "loss": 1.4889, + "step": 101 + }, + { + "epoch": 0.07055161680788519, + "grad_norm": 0.45390692353248596, + "learning_rate": 0.000979606700655499, + "loss": 1.3446, + "step": 102 + }, + { + "epoch": 0.07124329932560955, + "grad_norm": 0.4701155126094818, + "learning_rate": 0.0009788783685360525, + "loss": 1.4525, + "step": 103 + }, + { + "epoch": 0.07193498184333391, + "grad_norm": 0.6199256181716919, + "learning_rate": 0.000978150036416606, + "loss": 1.5779, + "step": 104 + }, + { + "epoch": 0.07262666436105827, + "grad_norm": 0.6306092143058777, + "learning_rate": 0.0009774217042971595, + "loss": 2.0143, + "step": 105 + }, + { + "epoch": 0.07331834687878264, + "grad_norm": 0.5837789177894592, + "learning_rate": 0.000976693372177713, + "loss": 1.2003, + "step": 106 + }, + { + "epoch": 0.074010029396507, + "grad_norm": 0.9713156223297119, + "learning_rate": 0.0009759650400582666, + "loss": 1.5721, + "step": 107 + }, + { + "epoch": 0.07470171191423136, + "grad_norm": 0.694187343120575, + "learning_rate": 0.0009752367079388202, + "loss": 2.2187, + "step": 108 + }, + { + "epoch": 0.07539339443195574, + "grad_norm": 0.465781033039093, + "learning_rate": 0.0009745083758193737, + "loss": 1.768, + "step": 109 + }, + { + "epoch": 0.0760850769496801, + "grad_norm": 0.5198079347610474, + "learning_rate": 0.0009737800436999272, + "loss": 2.1921, + "step": 110 + }, + { + "epoch": 0.07677675946740446, + "grad_norm": 0.7641897201538086, + "learning_rate": 0.0009730517115804807, + "loss": 1.7574, + "step": 111 + }, + { + "epoch": 0.07746844198512882, + "grad_norm": 0.4864037334918976, + "learning_rate": 0.0009723233794610342, + "loss": 1.9409, + "step": 112 + }, + { + "epoch": 0.07816012450285319, + "grad_norm": 1.0721259117126465, + "learning_rate": 0.0009715950473415878, + "loss": 1.2796, + "step": 113 + }, + { + "epoch": 0.07885180702057755, + "grad_norm": 0.6161507964134216, + "learning_rate": 0.0009708667152221413, + "loss": 1.646, + "step": 114 + }, + { + "epoch": 0.07954348953830193, + "grad_norm": 0.6296889185905457, + "learning_rate": 0.0009701383831026949, + "loss": 1.2842, + "step": 115 + }, + { + "epoch": 0.08023517205602629, + "grad_norm": 0.6511496901512146, + "learning_rate": 0.0009694100509832484, + "loss": 2.0559, + "step": 116 + }, + { + "epoch": 0.08092685457375065, + "grad_norm": 0.5697126984596252, + "learning_rate": 0.0009686817188638019, + "loss": 1.5121, + "step": 117 + }, + { + "epoch": 0.08161853709147501, + "grad_norm": 0.506841242313385, + "learning_rate": 0.0009679533867443554, + "loss": 1.6908, + "step": 118 + }, + { + "epoch": 0.08231021960919938, + "grad_norm": 1.1525691747665405, + "learning_rate": 0.0009672250546249089, + "loss": 2.0817, + "step": 119 + }, + { + "epoch": 0.08300190212692374, + "grad_norm": 0.6273766756057739, + "learning_rate": 0.0009664967225054625, + "loss": 1.783, + "step": 120 + }, + { + "epoch": 0.0836935846446481, + "grad_norm": 0.8089930415153503, + "learning_rate": 0.000965768390386016, + "loss": 1.9208, + "step": 121 + }, + { + "epoch": 0.08438526716237248, + "grad_norm": 0.6257967948913574, + "learning_rate": 0.0009650400582665696, + "loss": 1.8797, + "step": 122 + }, + { + "epoch": 0.08507694968009684, + "grad_norm": 0.6704832911491394, + "learning_rate": 0.0009643117261471231, + "loss": 1.6969, + "step": 123 + }, + { + "epoch": 0.0857686321978212, + "grad_norm": 0.8226727843284607, + "learning_rate": 0.0009635833940276765, + "loss": 1.291, + "step": 124 + }, + { + "epoch": 0.08646031471554556, + "grad_norm": 0.45218008756637573, + "learning_rate": 0.0009628550619082302, + "loss": 1.6168, + "step": 125 + }, + { + "epoch": 0.08715199723326993, + "grad_norm": 0.9265746474266052, + "learning_rate": 0.0009621267297887837, + "loss": 1.5465, + "step": 126 + }, + { + "epoch": 0.08784367975099429, + "grad_norm": 0.7594870924949646, + "learning_rate": 0.0009613983976693373, + "loss": 1.2877, + "step": 127 + }, + { + "epoch": 0.08853536226871866, + "grad_norm": 0.5055251121520996, + "learning_rate": 0.0009606700655498908, + "loss": 1.8451, + "step": 128 + }, + { + "epoch": 0.08922704478644303, + "grad_norm": 0.5842559337615967, + "learning_rate": 0.0009599417334304444, + "loss": 2.4324, + "step": 129 + }, + { + "epoch": 0.08991872730416739, + "grad_norm": 0.42893463373184204, + "learning_rate": 0.0009592134013109979, + "loss": 1.8858, + "step": 130 + }, + { + "epoch": 0.09061040982189175, + "grad_norm": 0.5879374146461487, + "learning_rate": 0.0009584850691915513, + "loss": 2.1098, + "step": 131 + }, + { + "epoch": 0.09130209233961611, + "grad_norm": 1.0884597301483154, + "learning_rate": 0.0009577567370721049, + "loss": 1.1548, + "step": 132 + }, + { + "epoch": 0.09199377485734048, + "grad_norm": 0.4452207684516907, + "learning_rate": 0.0009570284049526584, + "loss": 1.5784, + "step": 133 + }, + { + "epoch": 0.09268545737506484, + "grad_norm": 0.5032292604446411, + "learning_rate": 0.000956300072833212, + "loss": 1.8767, + "step": 134 + }, + { + "epoch": 0.09337713989278922, + "grad_norm": 0.6190866827964783, + "learning_rate": 0.0009555717407137655, + "loss": 1.7698, + "step": 135 + }, + { + "epoch": 0.09406882241051358, + "grad_norm": 0.559252142906189, + "learning_rate": 0.0009548434085943191, + "loss": 2.063, + "step": 136 + }, + { + "epoch": 0.09476050492823794, + "grad_norm": 0.7464174032211304, + "learning_rate": 0.0009541150764748726, + "loss": 1.334, + "step": 137 + }, + { + "epoch": 0.0954521874459623, + "grad_norm": 0.5302634835243225, + "learning_rate": 0.000953386744355426, + "loss": 1.838, + "step": 138 + }, + { + "epoch": 0.09614386996368667, + "grad_norm": 0.5212066173553467, + "learning_rate": 0.0009526584122359796, + "loss": 1.9793, + "step": 139 + }, + { + "epoch": 0.09683555248141103, + "grad_norm": 0.7148857116699219, + "learning_rate": 0.0009519300801165331, + "loss": 1.6045, + "step": 140 + }, + { + "epoch": 0.0975272349991354, + "grad_norm": 1.0729445219039917, + "learning_rate": 0.0009512017479970867, + "loss": 1.5728, + "step": 141 + }, + { + "epoch": 0.09821891751685977, + "grad_norm": 0.438503235578537, + "learning_rate": 0.0009504734158776402, + "loss": 1.5064, + "step": 142 + }, + { + "epoch": 0.09891060003458413, + "grad_norm": 0.6026888489723206, + "learning_rate": 0.0009497450837581938, + "loss": 1.5297, + "step": 143 + }, + { + "epoch": 0.09960228255230849, + "grad_norm": 0.4339958727359772, + "learning_rate": 0.0009490167516387472, + "loss": 1.2461, + "step": 144 + }, + { + "epoch": 0.10029396507003285, + "grad_norm": 0.8123407363891602, + "learning_rate": 0.0009482884195193007, + "loss": 1.7749, + "step": 145 + }, + { + "epoch": 0.10098564758775722, + "grad_norm": 0.938025951385498, + "learning_rate": 0.0009475600873998543, + "loss": 1.2157, + "step": 146 + }, + { + "epoch": 0.10167733010548158, + "grad_norm": 0.8118213415145874, + "learning_rate": 0.0009468317552804079, + "loss": 1.3722, + "step": 147 + }, + { + "epoch": 0.10236901262320595, + "grad_norm": 0.6156368851661682, + "learning_rate": 0.0009461034231609615, + "loss": 1.9435, + "step": 148 + }, + { + "epoch": 0.10306069514093032, + "grad_norm": 0.43706831336021423, + "learning_rate": 0.000945375091041515, + "loss": 1.7467, + "step": 149 + }, + { + "epoch": 0.10375237765865468, + "grad_norm": 0.5463519096374512, + "learning_rate": 0.0009446467589220686, + "loss": 1.8991, + "step": 150 + }, + { + "epoch": 0.10444406017637904, + "grad_norm": 0.4798230826854706, + "learning_rate": 0.000943918426802622, + "loss": 2.1618, + "step": 151 + }, + { + "epoch": 0.1051357426941034, + "grad_norm": 0.4733302891254425, + "learning_rate": 0.0009431900946831755, + "loss": 1.8547, + "step": 152 + }, + { + "epoch": 0.10582742521182777, + "grad_norm": 0.558428168296814, + "learning_rate": 0.0009424617625637291, + "loss": 2.314, + "step": 153 + }, + { + "epoch": 0.10651910772955213, + "grad_norm": 0.5310361385345459, + "learning_rate": 0.0009417334304442826, + "loss": 1.9073, + "step": 154 + }, + { + "epoch": 0.1072107902472765, + "grad_norm": 0.4204038679599762, + "learning_rate": 0.0009410050983248362, + "loss": 1.9635, + "step": 155 + }, + { + "epoch": 0.10790247276500087, + "grad_norm": 0.5052216649055481, + "learning_rate": 0.0009402767662053897, + "loss": 1.0511, + "step": 156 + }, + { + "epoch": 0.10859415528272523, + "grad_norm": 0.5589479804039001, + "learning_rate": 0.0009395484340859433, + "loss": 1.4608, + "step": 157 + }, + { + "epoch": 0.10928583780044959, + "grad_norm": 0.7388360500335693, + "learning_rate": 0.0009388201019664967, + "loss": 1.9785, + "step": 158 + }, + { + "epoch": 0.10997752031817395, + "grad_norm": 0.5995668172836304, + "learning_rate": 0.0009380917698470502, + "loss": 1.4633, + "step": 159 + }, + { + "epoch": 0.11066920283589832, + "grad_norm": 0.8107509613037109, + "learning_rate": 0.0009373634377276038, + "loss": 1.6685, + "step": 160 + }, + { + "epoch": 0.1113608853536227, + "grad_norm": 0.6110396981239319, + "learning_rate": 0.0009366351056081573, + "loss": 1.8113, + "step": 161 + }, + { + "epoch": 0.11205256787134706, + "grad_norm": 0.5032293796539307, + "learning_rate": 0.0009359067734887109, + "loss": 1.7344, + "step": 162 + }, + { + "epoch": 0.11274425038907142, + "grad_norm": 1.456254243850708, + "learning_rate": 0.0009351784413692644, + "loss": 1.2546, + "step": 163 + }, + { + "epoch": 0.11343593290679578, + "grad_norm": 0.8283969163894653, + "learning_rate": 0.0009344501092498179, + "loss": 1.8466, + "step": 164 + }, + { + "epoch": 0.11412761542452014, + "grad_norm": 0.8178532123565674, + "learning_rate": 0.0009337217771303714, + "loss": 1.7566, + "step": 165 + }, + { + "epoch": 0.1148192979422445, + "grad_norm": 0.5897772908210754, + "learning_rate": 0.0009329934450109249, + "loss": 1.9611, + "step": 166 + }, + { + "epoch": 0.11551098045996887, + "grad_norm": 0.4763628840446472, + "learning_rate": 0.0009322651128914785, + "loss": 1.6854, + "step": 167 + }, + { + "epoch": 0.11620266297769324, + "grad_norm": 0.5219669938087463, + "learning_rate": 0.000931536780772032, + "loss": 1.5156, + "step": 168 + }, + { + "epoch": 0.1168943454954176, + "grad_norm": 0.7750780582427979, + "learning_rate": 0.0009308084486525857, + "loss": 1.2845, + "step": 169 + }, + { + "epoch": 0.11758602801314197, + "grad_norm": 0.5357050895690918, + "learning_rate": 0.0009300801165331392, + "loss": 1.1729, + "step": 170 + }, + { + "epoch": 0.11827771053086633, + "grad_norm": 0.5962219834327698, + "learning_rate": 0.0009293517844136927, + "loss": 1.4915, + "step": 171 + }, + { + "epoch": 0.1189693930485907, + "grad_norm": 0.4935504198074341, + "learning_rate": 0.0009286234522942462, + "loss": 1.7863, + "step": 172 + }, + { + "epoch": 0.11966107556631506, + "grad_norm": 0.5719547867774963, + "learning_rate": 0.0009278951201747997, + "loss": 1.7086, + "step": 173 + }, + { + "epoch": 0.12035275808403943, + "grad_norm": 0.614291787147522, + "learning_rate": 0.0009271667880553533, + "loss": 2.0141, + "step": 174 + }, + { + "epoch": 0.1210444406017638, + "grad_norm": 0.4415907859802246, + "learning_rate": 0.0009264384559359068, + "loss": 1.4772, + "step": 175 + }, + { + "epoch": 0.12173612311948816, + "grad_norm": 0.518036961555481, + "learning_rate": 0.0009257101238164604, + "loss": 1.1856, + "step": 176 + }, + { + "epoch": 0.12242780563721252, + "grad_norm": 0.39714357256889343, + "learning_rate": 0.0009249817916970139, + "loss": 1.1254, + "step": 177 + }, + { + "epoch": 0.12311948815493688, + "grad_norm": 0.5234679579734802, + "learning_rate": 0.0009242534595775674, + "loss": 2.0139, + "step": 178 + }, + { + "epoch": 0.12381117067266124, + "grad_norm": 0.548357367515564, + "learning_rate": 0.0009235251274581209, + "loss": 1.5552, + "step": 179 + }, + { + "epoch": 0.1245028531903856, + "grad_norm": 0.6111085414886475, + "learning_rate": 0.0009227967953386744, + "loss": 2.1376, + "step": 180 + }, + { + "epoch": 0.12519453570810998, + "grad_norm": 11.656793594360352, + "learning_rate": 0.000922068463219228, + "loss": 2.0174, + "step": 181 + }, + { + "epoch": 0.12588621822583435, + "grad_norm": 0.7396730184555054, + "learning_rate": 0.0009213401310997815, + "loss": 2.2195, + "step": 182 + }, + { + "epoch": 0.1265779007435587, + "grad_norm": 0.7623037099838257, + "learning_rate": 0.0009206117989803351, + "loss": 1.8883, + "step": 183 + }, + { + "epoch": 0.12726958326128307, + "grad_norm": 4.827798366546631, + "learning_rate": 0.0009198834668608885, + "loss": 1.4435, + "step": 184 + }, + { + "epoch": 0.12796126577900743, + "grad_norm": 6.8200836181640625, + "learning_rate": 0.0009191551347414421, + "loss": 2.4153, + "step": 185 + }, + { + "epoch": 0.1286529482967318, + "grad_norm": 10.740931510925293, + "learning_rate": 0.0009184268026219956, + "loss": 2.1309, + "step": 186 + }, + { + "epoch": 0.12934463081445616, + "grad_norm": 31.872066497802734, + "learning_rate": 0.0009176984705025491, + "loss": 2.2398, + "step": 187 + }, + { + "epoch": 0.13003631333218052, + "grad_norm": 31.492610931396484, + "learning_rate": 0.0009169701383831027, + "loss": 1.7102, + "step": 188 + }, + { + "epoch": 0.13072799584990488, + "grad_norm": 14.984453201293945, + "learning_rate": 0.0009162418062636562, + "loss": 1.4899, + "step": 189 + }, + { + "epoch": 0.13141967836762924, + "grad_norm": 60.037567138671875, + "learning_rate": 0.0009155134741442099, + "loss": 1.4512, + "step": 190 + }, + { + "epoch": 0.13211136088535363, + "grad_norm": 13.009904861450195, + "learning_rate": 0.0009147851420247633, + "loss": 1.7607, + "step": 191 + }, + { + "epoch": 0.132803043403078, + "grad_norm": 37.90861511230469, + "learning_rate": 0.0009140568099053169, + "loss": 1.9484, + "step": 192 + }, + { + "epoch": 0.13349472592080236, + "grad_norm": 25.40981674194336, + "learning_rate": 0.0009133284777858704, + "loss": 2.4415, + "step": 193 + }, + { + "epoch": 0.13418640843852672, + "grad_norm": 6.186267375946045, + "learning_rate": 0.0009126001456664239, + "loss": 1.1091, + "step": 194 + }, + { + "epoch": 0.13487809095625108, + "grad_norm": 0.7662860155105591, + "learning_rate": 0.0009118718135469775, + "loss": 1.8422, + "step": 195 + }, + { + "epoch": 0.13556977347397545, + "grad_norm": 0.6533941626548767, + "learning_rate": 0.000911143481427531, + "loss": 1.9951, + "step": 196 + }, + { + "epoch": 0.1362614559916998, + "grad_norm": 0.6851759552955627, + "learning_rate": 0.0009104151493080846, + "loss": 2.2665, + "step": 197 + }, + { + "epoch": 0.13695313850942417, + "grad_norm": 0.49062949419021606, + "learning_rate": 0.000909686817188638, + "loss": 1.1709, + "step": 198 + }, + { + "epoch": 0.13764482102714853, + "grad_norm": 0.5005449056625366, + "learning_rate": 0.0009089584850691916, + "loss": 1.6217, + "step": 199 + }, + { + "epoch": 0.1383365035448729, + "grad_norm": 0.5429890751838684, + "learning_rate": 0.0009082301529497451, + "loss": 2.0511, + "step": 200 + }, + { + "epoch": 0.13902818606259726, + "grad_norm": 0.652536153793335, + "learning_rate": 0.0009075018208302986, + "loss": 1.2122, + "step": 201 + }, + { + "epoch": 0.13971986858032162, + "grad_norm": 0.4541880488395691, + "learning_rate": 0.0009067734887108522, + "loss": 1.1111, + "step": 202 + }, + { + "epoch": 0.14041155109804598, + "grad_norm": 0.5066574811935425, + "learning_rate": 0.0009060451565914057, + "loss": 1.0966, + "step": 203 + }, + { + "epoch": 0.14110323361577037, + "grad_norm": 0.5900403261184692, + "learning_rate": 0.0009053168244719592, + "loss": 1.9502, + "step": 204 + }, + { + "epoch": 0.14179491613349474, + "grad_norm": 0.5873029828071594, + "learning_rate": 0.0009045884923525127, + "loss": 2.1726, + "step": 205 + }, + { + "epoch": 0.1424865986512191, + "grad_norm": 0.46297940611839294, + "learning_rate": 0.0009038601602330663, + "loss": 1.5529, + "step": 206 + }, + { + "epoch": 0.14317828116894346, + "grad_norm": 0.6434882283210754, + "learning_rate": 0.0009031318281136198, + "loss": 1.1717, + "step": 207 + }, + { + "epoch": 0.14386996368666782, + "grad_norm": 0.5225998163223267, + "learning_rate": 0.0009024034959941733, + "loss": 1.5854, + "step": 208 + }, + { + "epoch": 0.14456164620439219, + "grad_norm": 0.5846410989761353, + "learning_rate": 0.0009016751638747269, + "loss": 1.5399, + "step": 209 + }, + { + "epoch": 0.14525332872211655, + "grad_norm": 0.6395654082298279, + "learning_rate": 0.0009009468317552804, + "loss": 1.7706, + "step": 210 + }, + { + "epoch": 0.1459450112398409, + "grad_norm": 4.408266067504883, + "learning_rate": 0.000900218499635834, + "loss": 2.841, + "step": 211 + }, + { + "epoch": 0.14663669375756527, + "grad_norm": 0.5043503642082214, + "learning_rate": 0.0008994901675163874, + "loss": 1.7936, + "step": 212 + }, + { + "epoch": 0.14732837627528964, + "grad_norm": 0.4562769830226898, + "learning_rate": 0.0008987618353969411, + "loss": 1.8362, + "step": 213 + }, + { + "epoch": 0.148020058793014, + "grad_norm": 0.7404221296310425, + "learning_rate": 0.0008980335032774946, + "loss": 1.6168, + "step": 214 + }, + { + "epoch": 0.14871174131073836, + "grad_norm": 0.7720257043838501, + "learning_rate": 0.0008973051711580481, + "loss": 1.2365, + "step": 215 + }, + { + "epoch": 0.14940342382846272, + "grad_norm": 0.9425879716873169, + "learning_rate": 0.0008965768390386017, + "loss": 1.5786, + "step": 216 + }, + { + "epoch": 0.1500951063461871, + "grad_norm": 0.5764768719673157, + "learning_rate": 0.0008958485069191552, + "loss": 2.0833, + "step": 217 + }, + { + "epoch": 0.15078678886391148, + "grad_norm": 0.49806153774261475, + "learning_rate": 0.0008951201747997087, + "loss": 1.1311, + "step": 218 + }, + { + "epoch": 0.15147847138163584, + "grad_norm": 0.5747334361076355, + "learning_rate": 0.0008943918426802622, + "loss": 2.2558, + "step": 219 + }, + { + "epoch": 0.1521701538993602, + "grad_norm": 1.0881627798080444, + "learning_rate": 0.0008936635105608158, + "loss": 1.7196, + "step": 220 + }, + { + "epoch": 0.15286183641708456, + "grad_norm": 0.6077120900154114, + "learning_rate": 0.0008929351784413693, + "loss": 1.8436, + "step": 221 + }, + { + "epoch": 0.15355351893480892, + "grad_norm": 1.4011138677597046, + "learning_rate": 0.0008922068463219228, + "loss": 1.4092, + "step": 222 + }, + { + "epoch": 0.1542452014525333, + "grad_norm": 0.6316831707954407, + "learning_rate": 0.0008914785142024764, + "loss": 1.6801, + "step": 223 + }, + { + "epoch": 0.15493688397025765, + "grad_norm": 0.6225351691246033, + "learning_rate": 0.0008907501820830298, + "loss": 1.575, + "step": 224 + }, + { + "epoch": 0.155628566487982, + "grad_norm": 0.45079120993614197, + "learning_rate": 0.0008900218499635834, + "loss": 1.7441, + "step": 225 + }, + { + "epoch": 0.15632024900570637, + "grad_norm": 0.5602415204048157, + "learning_rate": 0.0008892935178441369, + "loss": 1.8509, + "step": 226 + }, + { + "epoch": 0.15701193152343074, + "grad_norm": 0.43019142746925354, + "learning_rate": 0.0008885651857246905, + "loss": 2.0136, + "step": 227 + }, + { + "epoch": 0.1577036140411551, + "grad_norm": 0.48303139209747314, + "learning_rate": 0.000887836853605244, + "loss": 1.7679, + "step": 228 + }, + { + "epoch": 0.15839529655887946, + "grad_norm": 0.5987271666526794, + "learning_rate": 0.0008871085214857975, + "loss": 1.3894, + "step": 229 + }, + { + "epoch": 0.15908697907660385, + "grad_norm": 0.6672357320785522, + "learning_rate": 0.0008863801893663511, + "loss": 2.0173, + "step": 230 + }, + { + "epoch": 0.15977866159432821, + "grad_norm": 0.5140132904052734, + "learning_rate": 0.0008856518572469045, + "loss": 1.9006, + "step": 231 + }, + { + "epoch": 0.16047034411205258, + "grad_norm": 0.7984848022460938, + "learning_rate": 0.0008849235251274581, + "loss": 1.7669, + "step": 232 + }, + { + "epoch": 0.16116202662977694, + "grad_norm": 1.279133677482605, + "learning_rate": 0.0008841951930080116, + "loss": 1.3613, + "step": 233 + }, + { + "epoch": 0.1618537091475013, + "grad_norm": 0.37104475498199463, + "learning_rate": 0.0008834668608885653, + "loss": 1.118, + "step": 234 + }, + { + "epoch": 0.16254539166522566, + "grad_norm": 0.5247305631637573, + "learning_rate": 0.0008827385287691188, + "loss": 2.0458, + "step": 235 + }, + { + "epoch": 0.16323707418295003, + "grad_norm": 0.837685227394104, + "learning_rate": 0.0008820101966496723, + "loss": 2.1954, + "step": 236 + }, + { + "epoch": 0.1639287567006744, + "grad_norm": 0.5766549706459045, + "learning_rate": 0.0008812818645302259, + "loss": 1.4839, + "step": 237 + }, + { + "epoch": 0.16462043921839875, + "grad_norm": 0.9044421315193176, + "learning_rate": 0.0008805535324107793, + "loss": 1.6802, + "step": 238 + }, + { + "epoch": 0.1653121217361231, + "grad_norm": 0.6272666454315186, + "learning_rate": 0.0008798252002913329, + "loss": 1.9889, + "step": 239 + }, + { + "epoch": 0.16600380425384748, + "grad_norm": 0.5650503039360046, + "learning_rate": 0.0008790968681718864, + "loss": 1.9305, + "step": 240 + }, + { + "epoch": 0.16669548677157184, + "grad_norm": 0.605739176273346, + "learning_rate": 0.00087836853605244, + "loss": 1.4619, + "step": 241 + }, + { + "epoch": 0.1673871692892962, + "grad_norm": 0.654289186000824, + "learning_rate": 0.0008776402039329935, + "loss": 2.1283, + "step": 242 + }, + { + "epoch": 0.1680788518070206, + "grad_norm": 0.5998426079750061, + "learning_rate": 0.000876911871813547, + "loss": 2.0628, + "step": 243 + }, + { + "epoch": 0.16877053432474495, + "grad_norm": 0.5341598391532898, + "learning_rate": 0.0008761835396941005, + "loss": 1.8415, + "step": 244 + }, + { + "epoch": 0.16946221684246932, + "grad_norm": 0.9030768275260925, + "learning_rate": 0.000875455207574654, + "loss": 1.3423, + "step": 245 + }, + { + "epoch": 0.17015389936019368, + "grad_norm": 0.7384636998176575, + "learning_rate": 0.0008747268754552076, + "loss": 1.6916, + "step": 246 + }, + { + "epoch": 0.17084558187791804, + "grad_norm": 0.9748024940490723, + "learning_rate": 0.0008739985433357611, + "loss": 1.1592, + "step": 247 + }, + { + "epoch": 0.1715372643956424, + "grad_norm": 0.49209123849868774, + "learning_rate": 0.0008732702112163147, + "loss": 1.5281, + "step": 248 + }, + { + "epoch": 0.17222894691336676, + "grad_norm": 0.6235657930374146, + "learning_rate": 0.0008725418790968682, + "loss": 1.6423, + "step": 249 + }, + { + "epoch": 0.17292062943109113, + "grad_norm": 0.8116986751556396, + "learning_rate": 0.0008718135469774217, + "loss": 1.6386, + "step": 250 + }, + { + "epoch": 0.1736123119488155, + "grad_norm": 0.643518328666687, + "learning_rate": 0.0008710852148579752, + "loss": 1.1341, + "step": 251 + }, + { + "epoch": 0.17430399446653985, + "grad_norm": 0.826726496219635, + "learning_rate": 0.0008703568827385287, + "loss": 1.4637, + "step": 252 + }, + { + "epoch": 0.17499567698426421, + "grad_norm": 0.6371028423309326, + "learning_rate": 0.0008696285506190823, + "loss": 1.8159, + "step": 253 + }, + { + "epoch": 0.17568735950198858, + "grad_norm": 0.7354971766471863, + "learning_rate": 0.0008689002184996358, + "loss": 1.5482, + "step": 254 + }, + { + "epoch": 0.17637904201971294, + "grad_norm": 0.5614224672317505, + "learning_rate": 0.0008681718863801895, + "loss": 1.6824, + "step": 255 + }, + { + "epoch": 0.17707072453743733, + "grad_norm": 0.7730950117111206, + "learning_rate": 0.000867443554260743, + "loss": 1.6996, + "step": 256 + }, + { + "epoch": 0.1777624070551617, + "grad_norm": 0.5419211983680725, + "learning_rate": 0.0008667152221412965, + "loss": 1.8659, + "step": 257 + }, + { + "epoch": 0.17845408957288605, + "grad_norm": 0.5566856861114502, + "learning_rate": 0.00086598689002185, + "loss": 1.1103, + "step": 258 + }, + { + "epoch": 0.17914577209061042, + "grad_norm": 0.773952841758728, + "learning_rate": 0.0008652585579024035, + "loss": 1.2759, + "step": 259 + }, + { + "epoch": 0.17983745460833478, + "grad_norm": 0.49450692534446716, + "learning_rate": 0.0008645302257829571, + "loss": 1.913, + "step": 260 + }, + { + "epoch": 0.18052913712605914, + "grad_norm": 0.565629243850708, + "learning_rate": 0.0008638018936635106, + "loss": 1.0691, + "step": 261 + }, + { + "epoch": 0.1812208196437835, + "grad_norm": 0.5907365679740906, + "learning_rate": 0.0008630735615440642, + "loss": 2.1622, + "step": 262 + }, + { + "epoch": 0.18191250216150787, + "grad_norm": 0.6517736911773682, + "learning_rate": 0.0008623452294246177, + "loss": 1.9742, + "step": 263 + }, + { + "epoch": 0.18260418467923223, + "grad_norm": 0.7100114822387695, + "learning_rate": 0.0008616168973051711, + "loss": 1.6081, + "step": 264 + }, + { + "epoch": 0.1832958671969566, + "grad_norm": 0.5431230068206787, + "learning_rate": 0.0008608885651857247, + "loss": 2.0853, + "step": 265 + }, + { + "epoch": 0.18398754971468095, + "grad_norm": 0.4722400903701782, + "learning_rate": 0.0008601602330662782, + "loss": 1.1004, + "step": 266 + }, + { + "epoch": 0.18467923223240532, + "grad_norm": 0.6258965730667114, + "learning_rate": 0.0008594319009468318, + "loss": 1.9442, + "step": 267 + }, + { + "epoch": 0.18537091475012968, + "grad_norm": 0.6985493898391724, + "learning_rate": 0.0008587035688273853, + "loss": 1.5243, + "step": 268 + }, + { + "epoch": 0.18606259726785407, + "grad_norm": 0.6129814386367798, + "learning_rate": 0.0008579752367079389, + "loss": 1.5075, + "step": 269 + }, + { + "epoch": 0.18675427978557843, + "grad_norm": 0.49683645367622375, + "learning_rate": 0.0008572469045884924, + "loss": 2.0752, + "step": 270 + }, + { + "epoch": 0.1874459623033028, + "grad_norm": 0.48728471994400024, + "learning_rate": 0.0008565185724690458, + "loss": 2.2337, + "step": 271 + }, + { + "epoch": 0.18813764482102716, + "grad_norm": 0.8094476461410522, + "learning_rate": 0.0008557902403495994, + "loss": 2.007, + "step": 272 + }, + { + "epoch": 0.18882932733875152, + "grad_norm": 0.558074951171875, + "learning_rate": 0.0008550619082301529, + "loss": 1.4369, + "step": 273 + }, + { + "epoch": 0.18952100985647588, + "grad_norm": 0.6702684760093689, + "learning_rate": 0.0008543335761107065, + "loss": 1.8943, + "step": 274 + }, + { + "epoch": 0.19021269237420024, + "grad_norm": 0.7045763731002808, + "learning_rate": 0.00085360524399126, + "loss": 2.0621, + "step": 275 + }, + { + "epoch": 0.1909043748919246, + "grad_norm": 0.5553760528564453, + "learning_rate": 0.0008528769118718137, + "loss": 1.9012, + "step": 276 + }, + { + "epoch": 0.19159605740964897, + "grad_norm": 0.651685893535614, + "learning_rate": 0.0008521485797523672, + "loss": 1.826, + "step": 277 + }, + { + "epoch": 0.19228773992737333, + "grad_norm": 0.46926578879356384, + "learning_rate": 0.0008514202476329205, + "loss": 1.8451, + "step": 278 + }, + { + "epoch": 0.1929794224450977, + "grad_norm": 0.5306689739227295, + "learning_rate": 0.0008506919155134742, + "loss": 1.9559, + "step": 279 + }, + { + "epoch": 0.19367110496282205, + "grad_norm": 0.437308669090271, + "learning_rate": 0.0008499635833940277, + "loss": 1.1798, + "step": 280 + }, + { + "epoch": 0.19436278748054642, + "grad_norm": 0.5720314383506775, + "learning_rate": 0.0008492352512745813, + "loss": 1.8443, + "step": 281 + }, + { + "epoch": 0.1950544699982708, + "grad_norm": 0.6609981060028076, + "learning_rate": 0.0008485069191551348, + "loss": 2.1044, + "step": 282 + }, + { + "epoch": 0.19574615251599517, + "grad_norm": 0.7185072302818298, + "learning_rate": 0.0008477785870356884, + "loss": 1.309, + "step": 283 + }, + { + "epoch": 0.19643783503371953, + "grad_norm": 0.9821947813034058, + "learning_rate": 0.0008470502549162418, + "loss": 1.5823, + "step": 284 + }, + { + "epoch": 0.1971295175514439, + "grad_norm": 0.6811301112174988, + "learning_rate": 0.0008463219227967953, + "loss": 1.4655, + "step": 285 + }, + { + "epoch": 0.19782120006916826, + "grad_norm": 0.5955311059951782, + "learning_rate": 0.0008455935906773489, + "loss": 1.5187, + "step": 286 + }, + { + "epoch": 0.19851288258689262, + "grad_norm": 0.568804919719696, + "learning_rate": 0.0008448652585579024, + "loss": 1.3988, + "step": 287 + }, + { + "epoch": 0.19920456510461698, + "grad_norm": 0.7858214974403381, + "learning_rate": 0.000844136926438456, + "loss": 1.5478, + "step": 288 + }, + { + "epoch": 0.19989624762234134, + "grad_norm": 0.5844207406044006, + "learning_rate": 0.0008434085943190095, + "loss": 1.904, + "step": 289 + }, + { + "epoch": 0.2005879301400657, + "grad_norm": 0.7172948122024536, + "learning_rate": 0.0008426802621995631, + "loss": 1.4772, + "step": 290 + }, + { + "epoch": 0.20127961265779007, + "grad_norm": 0.6408190727233887, + "learning_rate": 0.0008419519300801165, + "loss": 1.4348, + "step": 291 + }, + { + "epoch": 0.20197129517551443, + "grad_norm": 0.9460310339927673, + "learning_rate": 0.00084122359796067, + "loss": 1.7674, + "step": 292 + }, + { + "epoch": 0.2026629776932388, + "grad_norm": 0.6002872586250305, + "learning_rate": 0.0008404952658412236, + "loss": 1.4278, + "step": 293 + }, + { + "epoch": 0.20335466021096316, + "grad_norm": 1.0076587200164795, + "learning_rate": 0.0008397669337217771, + "loss": 1.5417, + "step": 294 + }, + { + "epoch": 0.20404634272868752, + "grad_norm": 1.3005017042160034, + "learning_rate": 0.0008390386016023307, + "loss": 1.6232, + "step": 295 + }, + { + "epoch": 0.2047380252464119, + "grad_norm": 0.751641035079956, + "learning_rate": 0.0008383102694828842, + "loss": 1.9296, + "step": 296 + }, + { + "epoch": 0.20542970776413627, + "grad_norm": 0.6361163258552551, + "learning_rate": 0.0008375819373634378, + "loss": 2.0047, + "step": 297 + }, + { + "epoch": 0.20612139028186063, + "grad_norm": 0.9554282426834106, + "learning_rate": 0.0008368536052439912, + "loss": 1.7218, + "step": 298 + }, + { + "epoch": 0.206813072799585, + "grad_norm": 0.7240822911262512, + "learning_rate": 0.0008361252731245447, + "loss": 1.6413, + "step": 299 + }, + { + "epoch": 0.20750475531730936, + "grad_norm": 0.46834996342658997, + "learning_rate": 0.0008353969410050984, + "loss": 1.2228, + "step": 300 + }, + { + "epoch": 0.20819643783503372, + "grad_norm": 0.7188776731491089, + "learning_rate": 0.0008346686088856519, + "loss": 1.4258, + "step": 301 + }, + { + "epoch": 0.20888812035275808, + "grad_norm": 0.588649332523346, + "learning_rate": 0.0008339402767662055, + "loss": 1.7551, + "step": 302 + }, + { + "epoch": 0.20957980287048245, + "grad_norm": 0.6962491273880005, + "learning_rate": 0.000833211944646759, + "loss": 2.1571, + "step": 303 + }, + { + "epoch": 0.2102714853882068, + "grad_norm": 0.6146702170372009, + "learning_rate": 0.0008324836125273124, + "loss": 2.0106, + "step": 304 + }, + { + "epoch": 0.21096316790593117, + "grad_norm": 0.6004481315612793, + "learning_rate": 0.000831755280407866, + "loss": 1.3429, + "step": 305 + }, + { + "epoch": 0.21165485042365553, + "grad_norm": 0.6162500381469727, + "learning_rate": 0.0008310269482884195, + "loss": 1.8476, + "step": 306 + }, + { + "epoch": 0.2123465329413799, + "grad_norm": 0.5027235746383667, + "learning_rate": 0.0008302986161689731, + "loss": 1.7516, + "step": 307 + }, + { + "epoch": 0.21303821545910426, + "grad_norm": 0.5416428446769714, + "learning_rate": 0.0008295702840495266, + "loss": 1.8363, + "step": 308 + }, + { + "epoch": 0.21372989797682865, + "grad_norm": 0.6236619353294373, + "learning_rate": 0.0008288419519300802, + "loss": 1.6738, + "step": 309 + }, + { + "epoch": 0.214421580494553, + "grad_norm": 0.5952901244163513, + "learning_rate": 0.0008281136198106337, + "loss": 1.4978, + "step": 310 + }, + { + "epoch": 0.21511326301227737, + "grad_norm": 0.7139809131622314, + "learning_rate": 0.0008273852876911871, + "loss": 1.593, + "step": 311 + }, + { + "epoch": 0.21580494553000173, + "grad_norm": 0.6548435091972351, + "learning_rate": 0.0008266569555717407, + "loss": 1.7464, + "step": 312 + }, + { + "epoch": 0.2164966280477261, + "grad_norm": 0.6812461018562317, + "learning_rate": 0.0008259286234522942, + "loss": 1.9677, + "step": 313 + }, + { + "epoch": 0.21718831056545046, + "grad_norm": 0.7574117183685303, + "learning_rate": 0.0008252002913328478, + "loss": 1.6102, + "step": 314 + }, + { + "epoch": 0.21787999308317482, + "grad_norm": 0.5767763257026672, + "learning_rate": 0.0008244719592134013, + "loss": 2.0047, + "step": 315 + }, + { + "epoch": 0.21857167560089918, + "grad_norm": 0.864742636680603, + "learning_rate": 0.0008237436270939549, + "loss": 1.6079, + "step": 316 + }, + { + "epoch": 0.21926335811862355, + "grad_norm": 1.1354854106903076, + "learning_rate": 0.0008230152949745084, + "loss": 1.3049, + "step": 317 + }, + { + "epoch": 0.2199550406363479, + "grad_norm": 0.8098461031913757, + "learning_rate": 0.0008222869628550618, + "loss": 1.1503, + "step": 318 + }, + { + "epoch": 0.22064672315407227, + "grad_norm": 0.7209709286689758, + "learning_rate": 0.0008215586307356154, + "loss": 1.9659, + "step": 319 + }, + { + "epoch": 0.22133840567179663, + "grad_norm": 0.6464136838912964, + "learning_rate": 0.0008208302986161689, + "loss": 1.8093, + "step": 320 + }, + { + "epoch": 0.222030088189521, + "grad_norm": 2.2832796573638916, + "learning_rate": 0.0008201019664967226, + "loss": 1.2252, + "step": 321 + }, + { + "epoch": 0.2227217707072454, + "grad_norm": 0.6651481986045837, + "learning_rate": 0.0008193736343772761, + "loss": 1.8967, + "step": 322 + }, + { + "epoch": 0.22341345322496975, + "grad_norm": 0.5639248490333557, + "learning_rate": 0.0008186453022578297, + "loss": 1.1423, + "step": 323 + }, + { + "epoch": 0.2241051357426941, + "grad_norm": 0.6734063029289246, + "learning_rate": 0.0008179169701383831, + "loss": 1.7355, + "step": 324 + }, + { + "epoch": 0.22479681826041847, + "grad_norm": 0.8061289191246033, + "learning_rate": 0.0008171886380189366, + "loss": 0.8081, + "step": 325 + }, + { + "epoch": 0.22548850077814284, + "grad_norm": 0.584674060344696, + "learning_rate": 0.0008164603058994902, + "loss": 1.1589, + "step": 326 + }, + { + "epoch": 0.2261801832958672, + "grad_norm": 0.6683285236358643, + "learning_rate": 0.0008157319737800437, + "loss": 1.7779, + "step": 327 + }, + { + "epoch": 0.22687186581359156, + "grad_norm": 0.7037453055381775, + "learning_rate": 0.0008150036416605973, + "loss": 2.0579, + "step": 328 + }, + { + "epoch": 0.22756354833131592, + "grad_norm": 0.6727277636528015, + "learning_rate": 0.0008142753095411508, + "loss": 1.7619, + "step": 329 + }, + { + "epoch": 0.22825523084904029, + "grad_norm": 0.7048072218894958, + "learning_rate": 0.0008135469774217044, + "loss": 1.7445, + "step": 330 + }, + { + "epoch": 0.22894691336676465, + "grad_norm": 0.5675456523895264, + "learning_rate": 0.0008128186453022578, + "loss": 1.8732, + "step": 331 + }, + { + "epoch": 0.229638595884489, + "grad_norm": 0.5742422342300415, + "learning_rate": 0.0008120903131828113, + "loss": 1.9362, + "step": 332 + }, + { + "epoch": 0.23033027840221337, + "grad_norm": 0.612397313117981, + "learning_rate": 0.0008113619810633649, + "loss": 1.3922, + "step": 333 + }, + { + "epoch": 0.23102196091993774, + "grad_norm": 0.5459281802177429, + "learning_rate": 0.0008106336489439184, + "loss": 1.823, + "step": 334 + }, + { + "epoch": 0.23171364343766213, + "grad_norm": 0.6739487051963806, + "learning_rate": 0.000809905316824472, + "loss": 1.9681, + "step": 335 + }, + { + "epoch": 0.2324053259553865, + "grad_norm": 0.550207257270813, + "learning_rate": 0.0008091769847050255, + "loss": 1.8516, + "step": 336 + }, + { + "epoch": 0.23309700847311085, + "grad_norm": 0.45742911100387573, + "learning_rate": 0.0008084486525855791, + "loss": 1.6626, + "step": 337 + }, + { + "epoch": 0.2337886909908352, + "grad_norm": 0.764325737953186, + "learning_rate": 0.0008077203204661325, + "loss": 1.7487, + "step": 338 + }, + { + "epoch": 0.23448037350855958, + "grad_norm": 0.5911192297935486, + "learning_rate": 0.000806991988346686, + "loss": 1.2527, + "step": 339 + }, + { + "epoch": 0.23517205602628394, + "grad_norm": 0.5554788708686829, + "learning_rate": 0.0008062636562272396, + "loss": 1.2681, + "step": 340 + }, + { + "epoch": 0.2358637385440083, + "grad_norm": 0.5522503852844238, + "learning_rate": 0.0008055353241077931, + "loss": 1.4677, + "step": 341 + }, + { + "epoch": 0.23655542106173266, + "grad_norm": 0.5872853994369507, + "learning_rate": 0.0008048069919883468, + "loss": 1.7502, + "step": 342 + }, + { + "epoch": 0.23724710357945702, + "grad_norm": 0.6910445690155029, + "learning_rate": 0.0008040786598689003, + "loss": 2.1711, + "step": 343 + }, + { + "epoch": 0.2379387860971814, + "grad_norm": 0.7674363255500793, + "learning_rate": 0.0008033503277494538, + "loss": 1.0802, + "step": 344 + }, + { + "epoch": 0.23863046861490575, + "grad_norm": 0.8880471587181091, + "learning_rate": 0.0008026219956300073, + "loss": 1.3592, + "step": 345 + }, + { + "epoch": 0.2393221511326301, + "grad_norm": 0.6581029295921326, + "learning_rate": 0.0008018936635105608, + "loss": 1.6392, + "step": 346 + }, + { + "epoch": 0.24001383365035447, + "grad_norm": 1.1078494787216187, + "learning_rate": 0.0008011653313911144, + "loss": 1.5557, + "step": 347 + }, + { + "epoch": 0.24070551616807886, + "grad_norm": 0.5563998818397522, + "learning_rate": 0.0008004369992716679, + "loss": 1.3359, + "step": 348 + }, + { + "epoch": 0.24139719868580323, + "grad_norm": 0.6263977885246277, + "learning_rate": 0.0007997086671522215, + "loss": 2.1702, + "step": 349 + }, + { + "epoch": 0.2420888812035276, + "grad_norm": 0.5776513814926147, + "learning_rate": 0.000798980335032775, + "loss": 1.9447, + "step": 350 + }, + { + "epoch": 0.24278056372125195, + "grad_norm": 0.748920202255249, + "learning_rate": 0.0007982520029133285, + "loss": 1.4751, + "step": 351 + }, + { + "epoch": 0.24347224623897631, + "grad_norm": 1.3247708082199097, + "learning_rate": 0.000797523670793882, + "loss": 1.7557, + "step": 352 + }, + { + "epoch": 0.24416392875670068, + "grad_norm": 0.7095309495925903, + "learning_rate": 0.0007967953386744355, + "loss": 2.2349, + "step": 353 + }, + { + "epoch": 0.24485561127442504, + "grad_norm": 0.532289981842041, + "learning_rate": 0.0007960670065549891, + "loss": 1.1916, + "step": 354 + }, + { + "epoch": 0.2455472937921494, + "grad_norm": 0.6105953454971313, + "learning_rate": 0.0007953386744355426, + "loss": 1.6084, + "step": 355 + }, + { + "epoch": 0.24623897630987376, + "grad_norm": 0.6233397126197815, + "learning_rate": 0.0007946103423160962, + "loss": 1.7177, + "step": 356 + }, + { + "epoch": 0.24693065882759813, + "grad_norm": 10.080041885375977, + "learning_rate": 0.0007938820101966497, + "loss": 2.1116, + "step": 357 + }, + { + "epoch": 0.2476223413453225, + "grad_norm": 0.5390161275863647, + "learning_rate": 0.0007931536780772032, + "loss": 1.7296, + "step": 358 + }, + { + "epoch": 0.24831402386304685, + "grad_norm": 1.2583034038543701, + "learning_rate": 0.0007924253459577567, + "loss": 1.4504, + "step": 359 + }, + { + "epoch": 0.2490057063807712, + "grad_norm": 0.6620193719863892, + "learning_rate": 0.0007916970138383102, + "loss": 1.9403, + "step": 360 + }, + { + "epoch": 0.2496973888984956, + "grad_norm": 0.8169893622398376, + "learning_rate": 0.0007909686817188638, + "loss": 1.2534, + "step": 361 + }, + { + "epoch": 0.25038907141621997, + "grad_norm": 0.693074643611908, + "learning_rate": 0.0007902403495994173, + "loss": 2.112, + "step": 362 + }, + { + "epoch": 0.2510807539339443, + "grad_norm": 0.628724217414856, + "learning_rate": 0.000789512017479971, + "loss": 2.0005, + "step": 363 + }, + { + "epoch": 0.2517724364516687, + "grad_norm": 0.6025403141975403, + "learning_rate": 0.0007887836853605243, + "loss": 1.0489, + "step": 364 + }, + { + "epoch": 0.252464118969393, + "grad_norm": 0.6881316900253296, + "learning_rate": 0.000788055353241078, + "loss": 1.1542, + "step": 365 + }, + { + "epoch": 0.2531558014871174, + "grad_norm": 1.035561442375183, + "learning_rate": 0.0007873270211216315, + "loss": 1.5017, + "step": 366 + }, + { + "epoch": 0.25384748400484175, + "grad_norm": 0.5408887267112732, + "learning_rate": 0.000786598689002185, + "loss": 2.0277, + "step": 367 + }, + { + "epoch": 0.25453916652256614, + "grad_norm": 0.5508919358253479, + "learning_rate": 0.0007858703568827386, + "loss": 1.5144, + "step": 368 + }, + { + "epoch": 0.25523084904029053, + "grad_norm": 0.9890360236167908, + "learning_rate": 0.0007851420247632921, + "loss": 1.0548, + "step": 369 + }, + { + "epoch": 0.25592253155801487, + "grad_norm": 0.6218384504318237, + "learning_rate": 0.0007844136926438457, + "loss": 1.4775, + "step": 370 + }, + { + "epoch": 0.25661421407573926, + "grad_norm": 0.5427407622337341, + "learning_rate": 0.0007836853605243991, + "loss": 1.6648, + "step": 371 + }, + { + "epoch": 0.2573058965934636, + "grad_norm": 0.6376339793205261, + "learning_rate": 0.0007829570284049527, + "loss": 1.7319, + "step": 372 + }, + { + "epoch": 0.257997579111188, + "grad_norm": 0.5155366063117981, + "learning_rate": 0.0007822286962855062, + "loss": 1.7143, + "step": 373 + }, + { + "epoch": 0.2586892616289123, + "grad_norm": 1.0346859693527222, + "learning_rate": 0.0007815003641660597, + "loss": 1.96, + "step": 374 + }, + { + "epoch": 0.2593809441466367, + "grad_norm": 0.5473276376724243, + "learning_rate": 0.0007807720320466133, + "loss": 1.8269, + "step": 375 + }, + { + "epoch": 0.26007262666436104, + "grad_norm": 0.9501216411590576, + "learning_rate": 0.0007800436999271668, + "loss": 1.5286, + "step": 376 + }, + { + "epoch": 0.26076430918208543, + "grad_norm": 0.4338766634464264, + "learning_rate": 0.0007793153678077204, + "loss": 0.8967, + "step": 377 + }, + { + "epoch": 0.26145599169980976, + "grad_norm": 12.023887634277344, + "learning_rate": 0.0007785870356882738, + "loss": 1.8385, + "step": 378 + }, + { + "epoch": 0.26214767421753415, + "grad_norm": 0.5424131155014038, + "learning_rate": 0.0007778587035688274, + "loss": 1.7216, + "step": 379 + }, + { + "epoch": 0.2628393567352585, + "grad_norm": 0.6199079751968384, + "learning_rate": 0.0007771303714493809, + "loss": 1.9606, + "step": 380 + }, + { + "epoch": 0.2635310392529829, + "grad_norm": 0.6037024259567261, + "learning_rate": 0.0007764020393299344, + "loss": 1.4306, + "step": 381 + }, + { + "epoch": 0.26422272177070727, + "grad_norm": 0.6312823295593262, + "learning_rate": 0.000775673707210488, + "loss": 1.0085, + "step": 382 + }, + { + "epoch": 0.2649144042884316, + "grad_norm": 0.5497464537620544, + "learning_rate": 0.0007749453750910415, + "loss": 1.4443, + "step": 383 + }, + { + "epoch": 0.265606086806156, + "grad_norm": 0.9736106395721436, + "learning_rate": 0.000774217042971595, + "loss": 1.7324, + "step": 384 + }, + { + "epoch": 0.26629776932388033, + "grad_norm": 0.6415931582450867, + "learning_rate": 0.0007734887108521485, + "loss": 0.9575, + "step": 385 + }, + { + "epoch": 0.2669894518416047, + "grad_norm": 0.570580244064331, + "learning_rate": 0.0007727603787327021, + "loss": 1.7195, + "step": 386 + }, + { + "epoch": 0.26768113435932905, + "grad_norm": 0.7033479809761047, + "learning_rate": 0.0007720320466132557, + "loss": 2.0542, + "step": 387 + }, + { + "epoch": 0.26837281687705344, + "grad_norm": 0.7575972676277161, + "learning_rate": 0.0007713037144938092, + "loss": 1.7199, + "step": 388 + }, + { + "epoch": 0.2690644993947778, + "grad_norm": 0.5389835238456726, + "learning_rate": 0.0007705753823743628, + "loss": 1.5259, + "step": 389 + }, + { + "epoch": 0.26975618191250217, + "grad_norm": 0.574540913105011, + "learning_rate": 0.0007698470502549163, + "loss": 1.5391, + "step": 390 + }, + { + "epoch": 0.2704478644302265, + "grad_norm": 0.5298869013786316, + "learning_rate": 0.0007691187181354698, + "loss": 1.9361, + "step": 391 + }, + { + "epoch": 0.2711395469479509, + "grad_norm": 0.5654643177986145, + "learning_rate": 0.0007683903860160233, + "loss": 1.8952, + "step": 392 + }, + { + "epoch": 0.27183122946567523, + "grad_norm": 0.7499473094940186, + "learning_rate": 0.0007676620538965769, + "loss": 1.0416, + "step": 393 + }, + { + "epoch": 0.2725229119833996, + "grad_norm": 0.6296089887619019, + "learning_rate": 0.0007669337217771304, + "loss": 1.4852, + "step": 394 + }, + { + "epoch": 0.273214594501124, + "grad_norm": 0.5401056408882141, + "learning_rate": 0.0007662053896576839, + "loss": 1.8153, + "step": 395 + }, + { + "epoch": 0.27390627701884834, + "grad_norm": 0.5954565405845642, + "learning_rate": 0.0007654770575382375, + "loss": 1.7855, + "step": 396 + }, + { + "epoch": 0.27459795953657273, + "grad_norm": 0.9156423211097717, + "learning_rate": 0.000764748725418791, + "loss": 1.9173, + "step": 397 + }, + { + "epoch": 0.27528964205429707, + "grad_norm": 0.6210983395576477, + "learning_rate": 0.0007640203932993445, + "loss": 1.9079, + "step": 398 + }, + { + "epoch": 0.27598132457202146, + "grad_norm": 0.529227077960968, + "learning_rate": 0.000763292061179898, + "loss": 1.8594, + "step": 399 + }, + { + "epoch": 0.2766730070897458, + "grad_norm": 0.80283522605896, + "learning_rate": 0.0007625637290604516, + "loss": 1.5913, + "step": 400 + }, + { + "epoch": 0.2773646896074702, + "grad_norm": 0.5629950761795044, + "learning_rate": 0.0007618353969410051, + "loss": 1.5373, + "step": 401 + }, + { + "epoch": 0.2780563721251945, + "grad_norm": 0.6493797898292542, + "learning_rate": 0.0007611070648215586, + "loss": 2.2277, + "step": 402 + }, + { + "epoch": 0.2787480546429189, + "grad_norm": 0.5912362933158875, + "learning_rate": 0.0007603787327021122, + "loss": 1.4388, + "step": 403 + }, + { + "epoch": 0.27943973716064324, + "grad_norm": 0.7361041307449341, + "learning_rate": 0.0007596504005826656, + "loss": 0.9273, + "step": 404 + }, + { + "epoch": 0.28013141967836763, + "grad_norm": 0.8257749676704407, + "learning_rate": 0.0007589220684632192, + "loss": 0.7073, + "step": 405 + }, + { + "epoch": 0.28082310219609197, + "grad_norm": 0.8185616731643677, + "learning_rate": 0.0007581937363437727, + "loss": 1.9928, + "step": 406 + }, + { + "epoch": 0.28151478471381636, + "grad_norm": 0.5865523219108582, + "learning_rate": 0.0007574654042243263, + "loss": 1.1638, + "step": 407 + }, + { + "epoch": 0.28220646723154075, + "grad_norm": 0.5210615396499634, + "learning_rate": 0.0007567370721048798, + "loss": 1.1934, + "step": 408 + }, + { + "epoch": 0.2828981497492651, + "grad_norm": 0.6531309485435486, + "learning_rate": 0.0007560087399854334, + "loss": 1.8601, + "step": 409 + }, + { + "epoch": 0.28358983226698947, + "grad_norm": 0.7874279022216797, + "learning_rate": 0.000755280407865987, + "loss": 1.4043, + "step": 410 + }, + { + "epoch": 0.2842815147847138, + "grad_norm": 1.121983289718628, + "learning_rate": 0.0007545520757465404, + "loss": 1.7112, + "step": 411 + }, + { + "epoch": 0.2849731973024382, + "grad_norm": 0.5046870708465576, + "learning_rate": 0.000753823743627094, + "loss": 1.6255, + "step": 412 + }, + { + "epoch": 0.28566487982016253, + "grad_norm": 0.4254264831542969, + "learning_rate": 0.0007530954115076475, + "loss": 1.3181, + "step": 413 + }, + { + "epoch": 0.2863565623378869, + "grad_norm": 0.8146479725837708, + "learning_rate": 0.0007523670793882011, + "loss": 1.7557, + "step": 414 + }, + { + "epoch": 0.28704824485561126, + "grad_norm": 0.47856444120407104, + "learning_rate": 0.0007516387472687546, + "loss": 1.6217, + "step": 415 + }, + { + "epoch": 0.28773992737333565, + "grad_norm": 0.5287722945213318, + "learning_rate": 0.0007509104151493081, + "loss": 1.8146, + "step": 416 + }, + { + "epoch": 0.28843160989106, + "grad_norm": 0.8364676833152771, + "learning_rate": 0.0007501820830298617, + "loss": 1.1081, + "step": 417 + }, + { + "epoch": 0.28912329240878437, + "grad_norm": 0.6923417448997498, + "learning_rate": 0.0007494537509104151, + "loss": 1.0822, + "step": 418 + }, + { + "epoch": 0.2898149749265087, + "grad_norm": 0.7535339593887329, + "learning_rate": 0.0007487254187909687, + "loss": 1.8013, + "step": 419 + }, + { + "epoch": 0.2905066574442331, + "grad_norm": 1.226645588874817, + "learning_rate": 0.0007479970866715222, + "loss": 1.1891, + "step": 420 + }, + { + "epoch": 0.2911983399619575, + "grad_norm": 0.7388406991958618, + "learning_rate": 0.0007472687545520758, + "loss": 1.0448, + "step": 421 + }, + { + "epoch": 0.2918900224796818, + "grad_norm": 0.6585919260978699, + "learning_rate": 0.0007465404224326293, + "loss": 1.0951, + "step": 422 + }, + { + "epoch": 0.2925817049974062, + "grad_norm": 0.7637200355529785, + "learning_rate": 0.0007458120903131828, + "loss": 1.3508, + "step": 423 + }, + { + "epoch": 0.29327338751513055, + "grad_norm": 0.5754939913749695, + "learning_rate": 0.0007450837581937363, + "loss": 1.4937, + "step": 424 + }, + { + "epoch": 0.29396507003285494, + "grad_norm": 0.6434321999549866, + "learning_rate": 0.0007443554260742898, + "loss": 2.0318, + "step": 425 + }, + { + "epoch": 0.29465675255057927, + "grad_norm": 0.7063912749290466, + "learning_rate": 0.0007436270939548434, + "loss": 2.0036, + "step": 426 + }, + { + "epoch": 0.29534843506830366, + "grad_norm": 0.5120965242385864, + "learning_rate": 0.0007428987618353969, + "loss": 1.7687, + "step": 427 + }, + { + "epoch": 0.296040117586028, + "grad_norm": 0.7403333187103271, + "learning_rate": 0.0007421704297159505, + "loss": 1.5747, + "step": 428 + }, + { + "epoch": 0.2967318001037524, + "grad_norm": 0.5760396122932434, + "learning_rate": 0.000741442097596504, + "loss": 1.7351, + "step": 429 + }, + { + "epoch": 0.2974234826214767, + "grad_norm": 0.6725696325302124, + "learning_rate": 0.0007407137654770575, + "loss": 1.6643, + "step": 430 + }, + { + "epoch": 0.2981151651392011, + "grad_norm": 0.5612234473228455, + "learning_rate": 0.000739985433357611, + "loss": 1.47, + "step": 431 + }, + { + "epoch": 0.29880684765692545, + "grad_norm": 0.48072177171707153, + "learning_rate": 0.0007392571012381646, + "loss": 1.2806, + "step": 432 + }, + { + "epoch": 0.29949853017464984, + "grad_norm": 0.6465651988983154, + "learning_rate": 0.0007385287691187182, + "loss": 1.7875, + "step": 433 + }, + { + "epoch": 0.3001902126923742, + "grad_norm": 0.7127341628074646, + "learning_rate": 0.0007378004369992717, + "loss": 1.9259, + "step": 434 + }, + { + "epoch": 0.30088189521009856, + "grad_norm": 0.5775954127311707, + "learning_rate": 0.0007370721048798253, + "loss": 1.6652, + "step": 435 + }, + { + "epoch": 0.30157357772782295, + "grad_norm": 0.6212239861488342, + "learning_rate": 0.0007363437727603788, + "loss": 1.7045, + "step": 436 + }, + { + "epoch": 0.3022652602455473, + "grad_norm": 0.6443663835525513, + "learning_rate": 0.0007356154406409323, + "loss": 1.9416, + "step": 437 + }, + { + "epoch": 0.3029569427632717, + "grad_norm": 0.6374452710151672, + "learning_rate": 0.0007348871085214858, + "loss": 1.7767, + "step": 438 + }, + { + "epoch": 0.303648625280996, + "grad_norm": 0.7170910835266113, + "learning_rate": 0.0007341587764020393, + "loss": 1.1563, + "step": 439 + }, + { + "epoch": 0.3043403077987204, + "grad_norm": 1.249282717704773, + "learning_rate": 0.0007334304442825929, + "loss": 1.7667, + "step": 440 + }, + { + "epoch": 0.30503199031644473, + "grad_norm": 0.6763089895248413, + "learning_rate": 0.0007327021121631464, + "loss": 1.9292, + "step": 441 + }, + { + "epoch": 0.3057236728341691, + "grad_norm": 0.7364800572395325, + "learning_rate": 0.0007319737800437, + "loss": 1.6102, + "step": 442 + }, + { + "epoch": 0.30641535535189346, + "grad_norm": 0.8224323987960815, + "learning_rate": 0.0007312454479242535, + "loss": 1.0648, + "step": 443 + }, + { + "epoch": 0.30710703786961785, + "grad_norm": 0.752155065536499, + "learning_rate": 0.0007305171158048069, + "loss": 1.788, + "step": 444 + }, + { + "epoch": 0.3077987203873422, + "grad_norm": 0.5755220651626587, + "learning_rate": 0.0007297887836853605, + "loss": 1.2096, + "step": 445 + }, + { + "epoch": 0.3084904029050666, + "grad_norm": 0.8400484323501587, + "learning_rate": 0.000729060451565914, + "loss": 1.3676, + "step": 446 + }, + { + "epoch": 0.30918208542279096, + "grad_norm": 0.5796182155609131, + "learning_rate": 0.0007283321194464676, + "loss": 1.5668, + "step": 447 + }, + { + "epoch": 0.3098737679405153, + "grad_norm": 0.985273003578186, + "learning_rate": 0.0007276037873270211, + "loss": 1.4033, + "step": 448 + }, + { + "epoch": 0.3105654504582397, + "grad_norm": 0.6488866209983826, + "learning_rate": 0.0007268754552075747, + "loss": 1.5978, + "step": 449 + }, + { + "epoch": 0.311257132975964, + "grad_norm": 0.5811178088188171, + "learning_rate": 0.0007261471230881282, + "loss": 2.1722, + "step": 450 + }, + { + "epoch": 0.3119488154936884, + "grad_norm": 0.5769645571708679, + "learning_rate": 0.0007254187909686816, + "loss": 1.9041, + "step": 451 + }, + { + "epoch": 0.31264049801141275, + "grad_norm": 0.769631028175354, + "learning_rate": 0.0007246904588492352, + "loss": 1.3963, + "step": 452 + }, + { + "epoch": 0.31333218052913714, + "grad_norm": 0.8301665186882019, + "learning_rate": 0.0007239621267297888, + "loss": 1.623, + "step": 453 + }, + { + "epoch": 0.3140238630468615, + "grad_norm": 0.6046326756477356, + "learning_rate": 0.0007232337946103424, + "loss": 1.8703, + "step": 454 + }, + { + "epoch": 0.31471554556458586, + "grad_norm": 0.5623071193695068, + "learning_rate": 0.0007225054624908959, + "loss": 1.6112, + "step": 455 + }, + { + "epoch": 0.3154072280823102, + "grad_norm": 0.7813363671302795, + "learning_rate": 0.0007217771303714495, + "loss": 1.681, + "step": 456 + }, + { + "epoch": 0.3160989106000346, + "grad_norm": 0.6935021877288818, + "learning_rate": 0.000721048798252003, + "loss": 1.5276, + "step": 457 + }, + { + "epoch": 0.3167905931177589, + "grad_norm": 1.0678547620773315, + "learning_rate": 0.0007203204661325564, + "loss": 1.2066, + "step": 458 + }, + { + "epoch": 0.3174822756354833, + "grad_norm": 0.985817551612854, + "learning_rate": 0.00071959213401311, + "loss": 1.5923, + "step": 459 + }, + { + "epoch": 0.3181739581532077, + "grad_norm": 0.6185691356658936, + "learning_rate": 0.0007188638018936635, + "loss": 1.1597, + "step": 460 + }, + { + "epoch": 0.31886564067093204, + "grad_norm": 0.6517722010612488, + "learning_rate": 0.0007181354697742171, + "loss": 1.0909, + "step": 461 + }, + { + "epoch": 0.31955732318865643, + "grad_norm": 0.6660693883895874, + "learning_rate": 0.0007174071376547706, + "loss": 1.8785, + "step": 462 + }, + { + "epoch": 0.32024900570638076, + "grad_norm": 0.8916088938713074, + "learning_rate": 0.0007166788055353242, + "loss": 1.0246, + "step": 463 + }, + { + "epoch": 0.32094068822410515, + "grad_norm": 0.6262109875679016, + "learning_rate": 0.0007159504734158776, + "loss": 1.8117, + "step": 464 + }, + { + "epoch": 0.3216323707418295, + "grad_norm": 0.5265359878540039, + "learning_rate": 0.0007152221412964311, + "loss": 1.8935, + "step": 465 + }, + { + "epoch": 0.3223240532595539, + "grad_norm": 0.584057629108429, + "learning_rate": 0.0007144938091769847, + "loss": 2.1743, + "step": 466 + }, + { + "epoch": 0.3230157357772782, + "grad_norm": 0.6198194026947021, + "learning_rate": 0.0007137654770575382, + "loss": 1.0213, + "step": 467 + }, + { + "epoch": 0.3237074182950026, + "grad_norm": 0.8011792898178101, + "learning_rate": 0.0007130371449380918, + "loss": 1.3635, + "step": 468 + }, + { + "epoch": 0.32439910081272694, + "grad_norm": 0.6226928234100342, + "learning_rate": 0.0007123088128186453, + "loss": 1.5389, + "step": 469 + }, + { + "epoch": 0.3250907833304513, + "grad_norm": 0.6563382744789124, + "learning_rate": 0.0007115804806991989, + "loss": 1.6626, + "step": 470 + }, + { + "epoch": 0.32578246584817566, + "grad_norm": 0.6689289808273315, + "learning_rate": 0.0007108521485797523, + "loss": 0.2715, + "step": 471 + }, + { + "epoch": 0.32647414836590005, + "grad_norm": 0.7439026832580566, + "learning_rate": 0.0007101238164603058, + "loss": 1.9645, + "step": 472 + }, + { + "epoch": 0.32716583088362444, + "grad_norm": 0.6306619048118591, + "learning_rate": 0.0007093954843408594, + "loss": 2.026, + "step": 473 + }, + { + "epoch": 0.3278575134013488, + "grad_norm": 0.5557575225830078, + "learning_rate": 0.000708667152221413, + "loss": 1.7433, + "step": 474 + }, + { + "epoch": 0.32854919591907317, + "grad_norm": 0.6485971212387085, + "learning_rate": 0.0007079388201019666, + "loss": 1.8697, + "step": 475 + }, + { + "epoch": 0.3292408784367975, + "grad_norm": 0.6541025042533875, + "learning_rate": 0.0007072104879825201, + "loss": 1.8979, + "step": 476 + }, + { + "epoch": 0.3299325609545219, + "grad_norm": 0.617359459400177, + "learning_rate": 0.0007064821558630737, + "loss": 2.1617, + "step": 477 + }, + { + "epoch": 0.3306242434722462, + "grad_norm": 0.5855705142021179, + "learning_rate": 0.0007057538237436271, + "loss": 1.8079, + "step": 478 + }, + { + "epoch": 0.3313159259899706, + "grad_norm": 0.5983846187591553, + "learning_rate": 0.0007050254916241806, + "loss": 1.8796, + "step": 479 + }, + { + "epoch": 0.33200760850769495, + "grad_norm": 1.1877782344818115, + "learning_rate": 0.0007042971595047342, + "loss": 1.5439, + "step": 480 + }, + { + "epoch": 0.33269929102541934, + "grad_norm": 0.48594361543655396, + "learning_rate": 0.0007035688273852877, + "loss": 1.3418, + "step": 481 + }, + { + "epoch": 0.3333909735431437, + "grad_norm": 1.506446123123169, + "learning_rate": 0.0007028404952658413, + "loss": 1.2715, + "step": 482 + }, + { + "epoch": 0.33408265606086807, + "grad_norm": 1.1047416925430298, + "learning_rate": 0.0007021121631463948, + "loss": 1.6349, + "step": 483 + }, + { + "epoch": 0.3347743385785924, + "grad_norm": 0.6083149909973145, + "learning_rate": 0.0007013838310269483, + "loss": 1.9314, + "step": 484 + }, + { + "epoch": 0.3354660210963168, + "grad_norm": 1.35393226146698, + "learning_rate": 0.0007006554989075018, + "loss": 1.8499, + "step": 485 + }, + { + "epoch": 0.3361577036140412, + "grad_norm": 0.9483690857887268, + "learning_rate": 0.0006999271667880553, + "loss": 1.51, + "step": 486 + }, + { + "epoch": 0.3368493861317655, + "grad_norm": 0.5228135585784912, + "learning_rate": 0.0006991988346686089, + "loss": 1.9898, + "step": 487 + }, + { + "epoch": 0.3375410686494899, + "grad_norm": 0.5586177706718445, + "learning_rate": 0.0006984705025491624, + "loss": 1.3041, + "step": 488 + }, + { + "epoch": 0.33823275116721424, + "grad_norm": 0.5928153991699219, + "learning_rate": 0.000697742170429716, + "loss": 2.0132, + "step": 489 + }, + { + "epoch": 0.33892443368493863, + "grad_norm": 1.3419654369354248, + "learning_rate": 0.0006970138383102695, + "loss": 1.0039, + "step": 490 + }, + { + "epoch": 0.33961611620266297, + "grad_norm": 0.6874161958694458, + "learning_rate": 0.000696285506190823, + "loss": 1.2347, + "step": 491 + }, + { + "epoch": 0.34030779872038736, + "grad_norm": 0.6180602312088013, + "learning_rate": 0.0006955571740713765, + "loss": 1.1032, + "step": 492 + }, + { + "epoch": 0.3409994812381117, + "grad_norm": 0.6274152994155884, + "learning_rate": 0.00069482884195193, + "loss": 1.8178, + "step": 493 + }, + { + "epoch": 0.3416911637558361, + "grad_norm": 0.6605129837989807, + "learning_rate": 0.0006941005098324836, + "loss": 1.4469, + "step": 494 + }, + { + "epoch": 0.3423828462735604, + "grad_norm": 0.5513582229614258, + "learning_rate": 0.0006933721777130371, + "loss": 1.6028, + "step": 495 + }, + { + "epoch": 0.3430745287912848, + "grad_norm": 0.4937622845172882, + "learning_rate": 0.0006926438455935908, + "loss": 1.676, + "step": 496 + }, + { + "epoch": 0.34376621130900914, + "grad_norm": 0.6980156302452087, + "learning_rate": 0.0006919155134741443, + "loss": 1.8214, + "step": 497 + }, + { + "epoch": 0.34445789382673353, + "grad_norm": 0.589282214641571, + "learning_rate": 0.0006911871813546978, + "loss": 1.5903, + "step": 498 + }, + { + "epoch": 0.3451495763444579, + "grad_norm": 0.5733899474143982, + "learning_rate": 0.0006904588492352513, + "loss": 2.0085, + "step": 499 + }, + { + "epoch": 0.34584125886218225, + "grad_norm": 13.85383129119873, + "learning_rate": 0.0006897305171158048, + "loss": 1.3935, + "step": 500 + }, + { + "epoch": 0.34584125886218225, + "eval_loss": 1.4428730010986328, + "eval_runtime": 586.3776, + "eval_samples_per_second": 2.191, + "eval_steps_per_second": 1.097, + "step": 500 + }, + { + "epoch": 0.34653294137990664, + "grad_norm": 0.7078066468238831, + "learning_rate": 0.0006890021849963584, + "loss": 2.1292, + "step": 501 + }, + { + "epoch": 0.347224623897631, + "grad_norm": 0.5394365787506104, + "learning_rate": 0.0006882738528769119, + "loss": 1.754, + "step": 502 + }, + { + "epoch": 0.34791630641535537, + "grad_norm": 0.5652272701263428, + "learning_rate": 0.0006875455207574655, + "loss": 1.7321, + "step": 503 + }, + { + "epoch": 0.3486079889330797, + "grad_norm": 0.7590258717536926, + "learning_rate": 0.0006868171886380189, + "loss": 1.1914, + "step": 504 + }, + { + "epoch": 0.3492996714508041, + "grad_norm": 0.6021602153778076, + "learning_rate": 0.0006860888565185725, + "loss": 1.4885, + "step": 505 + }, + { + "epoch": 0.34999135396852843, + "grad_norm": 0.5702530741691589, + "learning_rate": 0.000685360524399126, + "loss": 1.7321, + "step": 506 + }, + { + "epoch": 0.3506830364862528, + "grad_norm": 0.6997826099395752, + "learning_rate": 0.0006846321922796795, + "loss": 1.2399, + "step": 507 + }, + { + "epoch": 0.35137471900397715, + "grad_norm": 0.5168879628181458, + "learning_rate": 0.0006839038601602331, + "loss": 1.1561, + "step": 508 + }, + { + "epoch": 0.35206640152170154, + "grad_norm": 0.7245921492576599, + "learning_rate": 0.0006831755280407866, + "loss": 1.3214, + "step": 509 + }, + { + "epoch": 0.3527580840394259, + "grad_norm": 0.5683633685112, + "learning_rate": 0.0006824471959213402, + "loss": 0.8284, + "step": 510 + }, + { + "epoch": 0.35344976655715027, + "grad_norm": 0.6745566725730896, + "learning_rate": 0.0006817188638018936, + "loss": 1.3734, + "step": 511 + }, + { + "epoch": 0.35414144907487466, + "grad_norm": 1.4038891792297363, + "learning_rate": 0.0006809905316824472, + "loss": 1.5278, + "step": 512 + }, + { + "epoch": 0.354833131592599, + "grad_norm": 0.633068323135376, + "learning_rate": 0.0006802621995630007, + "loss": 1.9671, + "step": 513 + }, + { + "epoch": 0.3555248141103234, + "grad_norm": 0.6043351888656616, + "learning_rate": 0.0006795338674435542, + "loss": 2.0149, + "step": 514 + }, + { + "epoch": 0.3562164966280477, + "grad_norm": 0.68370121717453, + "learning_rate": 0.0006788055353241078, + "loss": 2.1092, + "step": 515 + }, + { + "epoch": 0.3569081791457721, + "grad_norm": 0.8545238375663757, + "learning_rate": 0.0006780772032046613, + "loss": 1.8421, + "step": 516 + }, + { + "epoch": 0.35759986166349644, + "grad_norm": 0.5773653388023376, + "learning_rate": 0.000677348871085215, + "loss": 1.3504, + "step": 517 + }, + { + "epoch": 0.35829154418122083, + "grad_norm": 0.5424834489822388, + "learning_rate": 0.0006766205389657683, + "loss": 1.5214, + "step": 518 + }, + { + "epoch": 0.35898322669894517, + "grad_norm": 0.6430636644363403, + "learning_rate": 0.000675892206846322, + "loss": 1.8938, + "step": 519 + }, + { + "epoch": 0.35967490921666956, + "grad_norm": 0.7093544602394104, + "learning_rate": 0.0006751638747268755, + "loss": 1.2654, + "step": 520 + }, + { + "epoch": 0.3603665917343939, + "grad_norm": 0.6688354015350342, + "learning_rate": 0.000674435542607429, + "loss": 1.457, + "step": 521 + }, + { + "epoch": 0.3610582742521183, + "grad_norm": 0.6786370873451233, + "learning_rate": 0.0006737072104879826, + "loss": 1.3456, + "step": 522 + }, + { + "epoch": 0.3617499567698426, + "grad_norm": 0.5341997742652893, + "learning_rate": 0.0006729788783685361, + "loss": 1.9376, + "step": 523 + }, + { + "epoch": 0.362441639287567, + "grad_norm": 0.7012555003166199, + "learning_rate": 0.0006722505462490896, + "loss": 1.2591, + "step": 524 + }, + { + "epoch": 0.3631333218052914, + "grad_norm": 0.7682555317878723, + "learning_rate": 0.0006715222141296431, + "loss": 1.4887, + "step": 525 + }, + { + "epoch": 0.36382500432301573, + "grad_norm": 0.8778069615364075, + "learning_rate": 0.0006707938820101967, + "loss": 1.8948, + "step": 526 + }, + { + "epoch": 0.3645166868407401, + "grad_norm": 0.7078661322593689, + "learning_rate": 0.0006700655498907502, + "loss": 1.8194, + "step": 527 + }, + { + "epoch": 0.36520836935846446, + "grad_norm": 0.7950670719146729, + "learning_rate": 0.0006693372177713037, + "loss": 1.4127, + "step": 528 + }, + { + "epoch": 0.36590005187618885, + "grad_norm": 0.5888793468475342, + "learning_rate": 0.0006686088856518573, + "loss": 1.7251, + "step": 529 + }, + { + "epoch": 0.3665917343939132, + "grad_norm": 0.9127269387245178, + "learning_rate": 0.0006678805535324108, + "loss": 1.3327, + "step": 530 + }, + { + "epoch": 0.3672834169116376, + "grad_norm": 0.5952627062797546, + "learning_rate": 0.0006671522214129643, + "loss": 1.4203, + "step": 531 + }, + { + "epoch": 0.3679750994293619, + "grad_norm": 0.4815312922000885, + "learning_rate": 0.0006664238892935178, + "loss": 0.9024, + "step": 532 + }, + { + "epoch": 0.3686667819470863, + "grad_norm": 0.652438759803772, + "learning_rate": 0.0006656955571740714, + "loss": 1.146, + "step": 533 + }, + { + "epoch": 0.36935846446481063, + "grad_norm": 0.7467165589332581, + "learning_rate": 0.0006649672250546249, + "loss": 1.2516, + "step": 534 + }, + { + "epoch": 0.370050146982535, + "grad_norm": 0.5682463049888611, + "learning_rate": 0.0006642388929351784, + "loss": 1.9983, + "step": 535 + }, + { + "epoch": 0.37074182950025936, + "grad_norm": 1.3564231395721436, + "learning_rate": 0.000663510560815732, + "loss": 1.6168, + "step": 536 + }, + { + "epoch": 0.37143351201798375, + "grad_norm": 0.5406346321105957, + "learning_rate": 0.0006627822286962855, + "loss": 1.8084, + "step": 537 + }, + { + "epoch": 0.37212519453570814, + "grad_norm": 0.7050182223320007, + "learning_rate": 0.000662053896576839, + "loss": 1.8881, + "step": 538 + }, + { + "epoch": 0.37281687705343247, + "grad_norm": 0.645821213722229, + "learning_rate": 0.0006613255644573925, + "loss": 1.9676, + "step": 539 + }, + { + "epoch": 0.37350855957115686, + "grad_norm": 0.5666909217834473, + "learning_rate": 0.0006605972323379462, + "loss": 1.4184, + "step": 540 + }, + { + "epoch": 0.3742002420888812, + "grad_norm": 0.8641100525856018, + "learning_rate": 0.0006598689002184997, + "loss": 2.3462, + "step": 541 + }, + { + "epoch": 0.3748919246066056, + "grad_norm": 1.3686286211013794, + "learning_rate": 0.0006591405680990532, + "loss": 2.1734, + "step": 542 + }, + { + "epoch": 0.3755836071243299, + "grad_norm": 0.946014404296875, + "learning_rate": 0.0006584122359796068, + "loss": 1.5707, + "step": 543 + }, + { + "epoch": 0.3762752896420543, + "grad_norm": 0.7506546378135681, + "learning_rate": 0.0006576839038601602, + "loss": 1.9043, + "step": 544 + }, + { + "epoch": 0.37696697215977865, + "grad_norm": 0.5678505301475525, + "learning_rate": 0.0006569555717407138, + "loss": 1.8435, + "step": 545 + }, + { + "epoch": 0.37765865467750304, + "grad_norm": 0.616298496723175, + "learning_rate": 0.0006562272396212673, + "loss": 1.6757, + "step": 546 + }, + { + "epoch": 0.37835033719522737, + "grad_norm": 0.6611154675483704, + "learning_rate": 0.0006554989075018209, + "loss": 1.997, + "step": 547 + }, + { + "epoch": 0.37904201971295176, + "grad_norm": 0.6710254549980164, + "learning_rate": 0.0006547705753823744, + "loss": 2.0473, + "step": 548 + }, + { + "epoch": 0.3797337022306761, + "grad_norm": 0.6638564467430115, + "learning_rate": 0.0006540422432629279, + "loss": 1.5527, + "step": 549 + }, + { + "epoch": 0.3804253847484005, + "grad_norm": 0.5560479164123535, + "learning_rate": 0.0006533139111434815, + "loss": 1.1808, + "step": 550 + }, + { + "epoch": 0.3811170672661249, + "grad_norm": 0.8038564920425415, + "learning_rate": 0.0006525855790240349, + "loss": 2.1025, + "step": 551 + }, + { + "epoch": 0.3818087497838492, + "grad_norm": 0.5988295078277588, + "learning_rate": 0.0006518572469045885, + "loss": 1.5031, + "step": 552 + }, + { + "epoch": 0.3825004323015736, + "grad_norm": 0.6957617402076721, + "learning_rate": 0.000651128914785142, + "loss": 1.8805, + "step": 553 + }, + { + "epoch": 0.38319211481929794, + "grad_norm": 0.5128351449966431, + "learning_rate": 0.0006504005826656956, + "loss": 1.3545, + "step": 554 + }, + { + "epoch": 0.3838837973370223, + "grad_norm": 0.6512929201126099, + "learning_rate": 0.0006496722505462491, + "loss": 1.4282, + "step": 555 + }, + { + "epoch": 0.38457547985474666, + "grad_norm": 0.7529391646385193, + "learning_rate": 0.0006489439184268026, + "loss": 1.6247, + "step": 556 + }, + { + "epoch": 0.38526716237247105, + "grad_norm": 0.5604774951934814, + "learning_rate": 0.0006482155863073562, + "loss": 2.0437, + "step": 557 + }, + { + "epoch": 0.3859588448901954, + "grad_norm": 0.9159753322601318, + "learning_rate": 0.0006474872541879096, + "loss": 1.2743, + "step": 558 + }, + { + "epoch": 0.3866505274079198, + "grad_norm": 0.6029691100120544, + "learning_rate": 0.0006467589220684632, + "loss": 1.709, + "step": 559 + }, + { + "epoch": 0.3873422099256441, + "grad_norm": 0.545449435710907, + "learning_rate": 0.0006460305899490167, + "loss": 1.7701, + "step": 560 + }, + { + "epoch": 0.3880338924433685, + "grad_norm": 0.8501741886138916, + "learning_rate": 0.0006453022578295704, + "loss": 0.98, + "step": 561 + }, + { + "epoch": 0.38872557496109283, + "grad_norm": 0.8401899337768555, + "learning_rate": 0.0006445739257101239, + "loss": 0.8133, + "step": 562 + }, + { + "epoch": 0.3894172574788172, + "grad_norm": 0.5995264053344727, + "learning_rate": 0.0006438455935906774, + "loss": 2.0898, + "step": 563 + }, + { + "epoch": 0.3901089399965416, + "grad_norm": 0.5908014178276062, + "learning_rate": 0.0006431172614712309, + "loss": 1.2815, + "step": 564 + }, + { + "epoch": 0.39080062251426595, + "grad_norm": 0.6674137711524963, + "learning_rate": 0.0006423889293517844, + "loss": 0.9943, + "step": 565 + }, + { + "epoch": 0.39149230503199034, + "grad_norm": 0.5658942461013794, + "learning_rate": 0.000641660597232338, + "loss": 1.8295, + "step": 566 + }, + { + "epoch": 0.3921839875497147, + "grad_norm": 0.994026780128479, + "learning_rate": 0.0006409322651128915, + "loss": 1.3895, + "step": 567 + }, + { + "epoch": 0.39287567006743906, + "grad_norm": 0.8961228132247925, + "learning_rate": 0.0006402039329934451, + "loss": 1.3684, + "step": 568 + }, + { + "epoch": 0.3935673525851634, + "grad_norm": 0.7889755368232727, + "learning_rate": 0.0006394756008739986, + "loss": 0.6388, + "step": 569 + }, + { + "epoch": 0.3942590351028878, + "grad_norm": 0.5751606822013855, + "learning_rate": 0.0006387472687545521, + "loss": 1.4578, + "step": 570 + }, + { + "epoch": 0.3949507176206121, + "grad_norm": 0.6397355794906616, + "learning_rate": 0.0006380189366351056, + "loss": 1.3388, + "step": 571 + }, + { + "epoch": 0.3956424001383365, + "grad_norm": 0.5430677533149719, + "learning_rate": 0.0006372906045156591, + "loss": 1.6792, + "step": 572 + }, + { + "epoch": 0.39633408265606085, + "grad_norm": 0.5750475525856018, + "learning_rate": 0.0006365622723962127, + "loss": 1.7402, + "step": 573 + }, + { + "epoch": 0.39702576517378524, + "grad_norm": 0.8769389986991882, + "learning_rate": 0.0006358339402767662, + "loss": 1.1827, + "step": 574 + }, + { + "epoch": 0.3977174476915096, + "grad_norm": 0.6424825191497803, + "learning_rate": 0.0006351056081573198, + "loss": 1.9263, + "step": 575 + }, + { + "epoch": 0.39840913020923396, + "grad_norm": 0.7621489763259888, + "learning_rate": 0.0006343772760378733, + "loss": 1.0903, + "step": 576 + }, + { + "epoch": 0.39910081272695835, + "grad_norm": 1.0157557725906372, + "learning_rate": 0.0006336489439184268, + "loss": 1.0674, + "step": 577 + }, + { + "epoch": 0.3997924952446827, + "grad_norm": 0.9108319878578186, + "learning_rate": 0.0006329206117989803, + "loss": 1.371, + "step": 578 + }, + { + "epoch": 0.4004841777624071, + "grad_norm": 0.9133428931236267, + "learning_rate": 0.0006321922796795338, + "loss": 1.6273, + "step": 579 + }, + { + "epoch": 0.4011758602801314, + "grad_norm": 1.2508081197738647, + "learning_rate": 0.0006314639475600874, + "loss": 1.3252, + "step": 580 + }, + { + "epoch": 0.4018675427978558, + "grad_norm": 0.5828914642333984, + "learning_rate": 0.0006307356154406409, + "loss": 1.8814, + "step": 581 + }, + { + "epoch": 0.40255922531558014, + "grad_norm": 0.6611084938049316, + "learning_rate": 0.0006300072833211945, + "loss": 1.7483, + "step": 582 + }, + { + "epoch": 0.40325090783330453, + "grad_norm": 0.5295059680938721, + "learning_rate": 0.000629278951201748, + "loss": 1.0284, + "step": 583 + }, + { + "epoch": 0.40394259035102886, + "grad_norm": 0.6011462211608887, + "learning_rate": 0.0006285506190823014, + "loss": 1.4974, + "step": 584 + }, + { + "epoch": 0.40463427286875325, + "grad_norm": 0.9036271572113037, + "learning_rate": 0.0006278222869628551, + "loss": 0.9939, + "step": 585 + }, + { + "epoch": 0.4053259553864776, + "grad_norm": 0.7162883877754211, + "learning_rate": 0.0006270939548434086, + "loss": 1.8186, + "step": 586 + }, + { + "epoch": 0.406017637904202, + "grad_norm": 0.8202586770057678, + "learning_rate": 0.0006263656227239622, + "loss": 1.3202, + "step": 587 + }, + { + "epoch": 0.4067093204219263, + "grad_norm": 0.6272046566009521, + "learning_rate": 0.0006256372906045157, + "loss": 1.3552, + "step": 588 + }, + { + "epoch": 0.4074010029396507, + "grad_norm": 0.5360514521598816, + "learning_rate": 0.0006249089584850693, + "loss": 1.3703, + "step": 589 + }, + { + "epoch": 0.40809268545737504, + "grad_norm": 0.5342544913291931, + "learning_rate": 0.0006241806263656228, + "loss": 1.4543, + "step": 590 + }, + { + "epoch": 0.4087843679750994, + "grad_norm": 0.678282618522644, + "learning_rate": 0.0006234522942461762, + "loss": 1.9462, + "step": 591 + }, + { + "epoch": 0.4094760504928238, + "grad_norm": 0.6571401357650757, + "learning_rate": 0.0006227239621267298, + "loss": 2.0453, + "step": 592 + }, + { + "epoch": 0.41016773301054815, + "grad_norm": 0.6967712044715881, + "learning_rate": 0.0006219956300072833, + "loss": 1.3844, + "step": 593 + }, + { + "epoch": 0.41085941552827254, + "grad_norm": 1.1035219430923462, + "learning_rate": 0.0006212672978878369, + "loss": 1.6901, + "step": 594 + }, + { + "epoch": 0.4115510980459969, + "grad_norm": 1.2840924263000488, + "learning_rate": 0.0006205389657683904, + "loss": 1.8151, + "step": 595 + }, + { + "epoch": 0.41224278056372127, + "grad_norm": 0.594129204750061, + "learning_rate": 0.000619810633648944, + "loss": 1.6399, + "step": 596 + }, + { + "epoch": 0.4129344630814456, + "grad_norm": 0.5824661254882812, + "learning_rate": 0.0006190823015294975, + "loss": 1.7004, + "step": 597 + }, + { + "epoch": 0.41362614559917, + "grad_norm": 0.7485136389732361, + "learning_rate": 0.0006183539694100509, + "loss": 1.3451, + "step": 598 + }, + { + "epoch": 0.4143178281168943, + "grad_norm": 0.5770803689956665, + "learning_rate": 0.0006176256372906045, + "loss": 1.571, + "step": 599 + }, + { + "epoch": 0.4150095106346187, + "grad_norm": 0.8474501371383667, + "learning_rate": 0.000616897305171158, + "loss": 1.6703, + "step": 600 + }, + { + "epoch": 0.41570119315234305, + "grad_norm": 0.5007557272911072, + "learning_rate": 0.0006161689730517116, + "loss": 1.3968, + "step": 601 + }, + { + "epoch": 0.41639287567006744, + "grad_norm": 0.598360538482666, + "learning_rate": 0.0006154406409322651, + "loss": 1.0098, + "step": 602 + }, + { + "epoch": 0.4170845581877918, + "grad_norm": 0.49432629346847534, + "learning_rate": 0.0006147123088128187, + "loss": 1.0843, + "step": 603 + }, + { + "epoch": 0.41777624070551617, + "grad_norm": 0.6179078817367554, + "learning_rate": 0.0006139839766933721, + "loss": 1.781, + "step": 604 + }, + { + "epoch": 0.41846792322324056, + "grad_norm": 0.719780445098877, + "learning_rate": 0.0006132556445739256, + "loss": 1.8742, + "step": 605 + }, + { + "epoch": 0.4191596057409649, + "grad_norm": 0.768247127532959, + "learning_rate": 0.0006125273124544793, + "loss": 1.8538, + "step": 606 + }, + { + "epoch": 0.4198512882586893, + "grad_norm": 0.5836595892906189, + "learning_rate": 0.0006117989803350328, + "loss": 1.2134, + "step": 607 + }, + { + "epoch": 0.4205429707764136, + "grad_norm": 0.8769506216049194, + "learning_rate": 0.0006110706482155864, + "loss": 1.1559, + "step": 608 + }, + { + "epoch": 0.421234653294138, + "grad_norm": 0.5141684412956238, + "learning_rate": 0.0006103423160961399, + "loss": 1.6293, + "step": 609 + }, + { + "epoch": 0.42192633581186234, + "grad_norm": 0.6994926333427429, + "learning_rate": 0.0006096139839766935, + "loss": 1.4453, + "step": 610 + }, + { + "epoch": 0.42261801832958673, + "grad_norm": 0.6264182329177856, + "learning_rate": 0.0006088856518572469, + "loss": 1.9851, + "step": 611 + }, + { + "epoch": 0.42330970084731107, + "grad_norm": 0.6712834239006042, + "learning_rate": 0.0006081573197378004, + "loss": 1.5792, + "step": 612 + }, + { + "epoch": 0.42400138336503546, + "grad_norm": 0.6829782724380493, + "learning_rate": 0.000607428987618354, + "loss": 1.2137, + "step": 613 + }, + { + "epoch": 0.4246930658827598, + "grad_norm": 0.6076679825782776, + "learning_rate": 0.0006067006554989075, + "loss": 1.7787, + "step": 614 + }, + { + "epoch": 0.4253847484004842, + "grad_norm": 0.6156198382377625, + "learning_rate": 0.0006059723233794611, + "loss": 1.7189, + "step": 615 + }, + { + "epoch": 0.4260764309182085, + "grad_norm": 0.7797242403030396, + "learning_rate": 0.0006052439912600146, + "loss": 1.58, + "step": 616 + }, + { + "epoch": 0.4267681134359329, + "grad_norm": 0.6205226182937622, + "learning_rate": 0.0006045156591405682, + "loss": 1.0808, + "step": 617 + }, + { + "epoch": 0.4274597959536573, + "grad_norm": 0.7405165433883667, + "learning_rate": 0.0006037873270211216, + "loss": 1.7764, + "step": 618 + }, + { + "epoch": 0.42815147847138163, + "grad_norm": 0.5714502334594727, + "learning_rate": 0.0006030589949016751, + "loss": 0.8059, + "step": 619 + }, + { + "epoch": 0.428843160989106, + "grad_norm": 0.6483557224273682, + "learning_rate": 0.0006023306627822287, + "loss": 1.7951, + "step": 620 + }, + { + "epoch": 0.42953484350683035, + "grad_norm": 0.8051972389221191, + "learning_rate": 0.0006016023306627822, + "loss": 1.4164, + "step": 621 + }, + { + "epoch": 0.43022652602455475, + "grad_norm": 0.6258521676063538, + "learning_rate": 0.0006008739985433358, + "loss": 1.9752, + "step": 622 + }, + { + "epoch": 0.4309182085422791, + "grad_norm": 0.7436981201171875, + "learning_rate": 0.0006001456664238893, + "loss": 1.6743, + "step": 623 + }, + { + "epoch": 0.43160989106000347, + "grad_norm": 0.43516382575035095, + "learning_rate": 0.0005994173343044428, + "loss": 1.6526, + "step": 624 + }, + { + "epoch": 0.4323015735777278, + "grad_norm": 0.8342098593711853, + "learning_rate": 0.0005986890021849963, + "loss": 1.612, + "step": 625 + }, + { + "epoch": 0.4329932560954522, + "grad_norm": 1.1247819662094116, + "learning_rate": 0.0005979606700655498, + "loss": 1.736, + "step": 626 + }, + { + "epoch": 0.43368493861317653, + "grad_norm": 0.6731348633766174, + "learning_rate": 0.0005972323379461035, + "loss": 1.4284, + "step": 627 + }, + { + "epoch": 0.4343766211309009, + "grad_norm": 0.8929141163825989, + "learning_rate": 0.000596504005826657, + "loss": 1.5504, + "step": 628 + }, + { + "epoch": 0.43506830364862525, + "grad_norm": 0.5387009382247925, + "learning_rate": 0.0005957756737072106, + "loss": 1.4214, + "step": 629 + }, + { + "epoch": 0.43575998616634964, + "grad_norm": 0.6756249666213989, + "learning_rate": 0.0005950473415877641, + "loss": 1.863, + "step": 630 + }, + { + "epoch": 0.43645166868407403, + "grad_norm": 0.7305259704589844, + "learning_rate": 0.0005943190094683176, + "loss": 2.0542, + "step": 631 + }, + { + "epoch": 0.43714335120179837, + "grad_norm": 0.6207787394523621, + "learning_rate": 0.0005935906773488711, + "loss": 1.4846, + "step": 632 + }, + { + "epoch": 0.43783503371952276, + "grad_norm": 0.5971491932868958, + "learning_rate": 0.0005928623452294246, + "loss": 1.5098, + "step": 633 + }, + { + "epoch": 0.4385267162372471, + "grad_norm": 0.6698988080024719, + "learning_rate": 0.0005921340131099782, + "loss": 1.9203, + "step": 634 + }, + { + "epoch": 0.4392183987549715, + "grad_norm": 0.6068461537361145, + "learning_rate": 0.0005914056809905317, + "loss": 1.0954, + "step": 635 + }, + { + "epoch": 0.4399100812726958, + "grad_norm": 0.5868760347366333, + "learning_rate": 0.0005906773488710853, + "loss": 1.3239, + "step": 636 + }, + { + "epoch": 0.4406017637904202, + "grad_norm": 0.6060192584991455, + "learning_rate": 0.0005899490167516388, + "loss": 1.4716, + "step": 637 + }, + { + "epoch": 0.44129344630814454, + "grad_norm": 0.7964019775390625, + "learning_rate": 0.0005892206846321923, + "loss": 1.6277, + "step": 638 + }, + { + "epoch": 0.44198512882586893, + "grad_norm": 0.7350580096244812, + "learning_rate": 0.0005884923525127458, + "loss": 1.7172, + "step": 639 + }, + { + "epoch": 0.44267681134359327, + "grad_norm": 0.6525869369506836, + "learning_rate": 0.0005877640203932993, + "loss": 1.5932, + "step": 640 + }, + { + "epoch": 0.44336849386131766, + "grad_norm": 0.5807157754898071, + "learning_rate": 0.0005870356882738529, + "loss": 1.8092, + "step": 641 + }, + { + "epoch": 0.444060176379042, + "grad_norm": 0.5111778378486633, + "learning_rate": 0.0005863073561544064, + "loss": 1.3868, + "step": 642 + }, + { + "epoch": 0.4447518588967664, + "grad_norm": 0.5414280295372009, + "learning_rate": 0.00058557902403496, + "loss": 1.2737, + "step": 643 + }, + { + "epoch": 0.4454435414144908, + "grad_norm": 0.5459370613098145, + "learning_rate": 0.0005848506919155134, + "loss": 1.1543, + "step": 644 + }, + { + "epoch": 0.4461352239322151, + "grad_norm": 0.8171971440315247, + "learning_rate": 0.000584122359796067, + "loss": 1.0578, + "step": 645 + }, + { + "epoch": 0.4468269064499395, + "grad_norm": 0.7383177280426025, + "learning_rate": 0.0005833940276766205, + "loss": 1.592, + "step": 646 + }, + { + "epoch": 0.44751858896766383, + "grad_norm": 1.1485675573349, + "learning_rate": 0.000582665695557174, + "loss": 0.9909, + "step": 647 + }, + { + "epoch": 0.4482102714853882, + "grad_norm": 0.9529873132705688, + "learning_rate": 0.0005819373634377276, + "loss": 1.3978, + "step": 648 + }, + { + "epoch": 0.44890195400311256, + "grad_norm": 0.6915671229362488, + "learning_rate": 0.0005812090313182812, + "loss": 1.4762, + "step": 649 + }, + { + "epoch": 0.44959363652083695, + "grad_norm": 0.5856941938400269, + "learning_rate": 0.0005804806991988348, + "loss": 1.5474, + "step": 650 + }, + { + "epoch": 0.4502853190385613, + "grad_norm": 1.4381829500198364, + "learning_rate": 0.0005797523670793882, + "loss": 1.3106, + "step": 651 + }, + { + "epoch": 0.4509770015562857, + "grad_norm": 0.9599109292030334, + "learning_rate": 0.0005790240349599418, + "loss": 1.6511, + "step": 652 + }, + { + "epoch": 0.45166868407401, + "grad_norm": 0.779615581035614, + "learning_rate": 0.0005782957028404953, + "loss": 1.3878, + "step": 653 + }, + { + "epoch": 0.4523603665917344, + "grad_norm": 0.9594855308532715, + "learning_rate": 0.0005775673707210488, + "loss": 1.4339, + "step": 654 + }, + { + "epoch": 0.45305204910945873, + "grad_norm": 0.7078779935836792, + "learning_rate": 0.0005768390386016024, + "loss": 1.1043, + "step": 655 + }, + { + "epoch": 0.4537437316271831, + "grad_norm": 1.176413893699646, + "learning_rate": 0.0005761107064821559, + "loss": 1.5211, + "step": 656 + }, + { + "epoch": 0.4544354141449075, + "grad_norm": 0.6396954655647278, + "learning_rate": 0.0005753823743627095, + "loss": 1.8841, + "step": 657 + }, + { + "epoch": 0.45512709666263185, + "grad_norm": 0.8579792380332947, + "learning_rate": 0.0005746540422432629, + "loss": 1.7458, + "step": 658 + }, + { + "epoch": 0.45581877918035624, + "grad_norm": 0.6195595264434814, + "learning_rate": 0.0005739257101238165, + "loss": 1.6583, + "step": 659 + }, + { + "epoch": 0.45651046169808057, + "grad_norm": 0.6468372941017151, + "learning_rate": 0.00057319737800437, + "loss": 1.9984, + "step": 660 + }, + { + "epoch": 0.45720214421580496, + "grad_norm": 1.8570871353149414, + "learning_rate": 0.0005724690458849235, + "loss": 1.3586, + "step": 661 + }, + { + "epoch": 0.4578938267335293, + "grad_norm": 7.004371166229248, + "learning_rate": 0.0005717407137654771, + "loss": 1.9842, + "step": 662 + }, + { + "epoch": 0.4585855092512537, + "grad_norm": 0.7576525807380676, + "learning_rate": 0.0005710123816460306, + "loss": 1.8268, + "step": 663 + }, + { + "epoch": 0.459277191768978, + "grad_norm": 1.155989646911621, + "learning_rate": 0.0005702840495265841, + "loss": 0.7441, + "step": 664 + }, + { + "epoch": 0.4599688742867024, + "grad_norm": 0.5623228549957275, + "learning_rate": 0.0005695557174071376, + "loss": 1.5069, + "step": 665 + }, + { + "epoch": 0.46066055680442675, + "grad_norm": 0.8218792676925659, + "learning_rate": 0.0005688273852876912, + "loss": 1.4099, + "step": 666 + }, + { + "epoch": 0.46135223932215114, + "grad_norm": 0.5278311371803284, + "learning_rate": 0.0005680990531682447, + "loss": 1.6798, + "step": 667 + }, + { + "epoch": 0.46204392183987547, + "grad_norm": 0.5412008762359619, + "learning_rate": 0.0005673707210487982, + "loss": 1.1857, + "step": 668 + }, + { + "epoch": 0.46273560435759986, + "grad_norm": 0.5376167893409729, + "learning_rate": 0.0005666423889293518, + "loss": 1.7932, + "step": 669 + }, + { + "epoch": 0.46342728687532425, + "grad_norm": 0.7957973480224609, + "learning_rate": 0.0005659140568099053, + "loss": 1.0163, + "step": 670 + }, + { + "epoch": 0.4641189693930486, + "grad_norm": 0.933850109577179, + "learning_rate": 0.0005651857246904589, + "loss": 1.2543, + "step": 671 + }, + { + "epoch": 0.464810651910773, + "grad_norm": 0.7921749949455261, + "learning_rate": 0.0005644573925710124, + "loss": 1.3725, + "step": 672 + }, + { + "epoch": 0.4655023344284973, + "grad_norm": 0.5913270115852356, + "learning_rate": 0.000563729060451566, + "loss": 1.7408, + "step": 673 + }, + { + "epoch": 0.4661940169462217, + "grad_norm": 0.5727415680885315, + "learning_rate": 0.0005630007283321195, + "loss": 1.0868, + "step": 674 + }, + { + "epoch": 0.46688569946394604, + "grad_norm": 0.6014571785926819, + "learning_rate": 0.000562272396212673, + "loss": 1.2138, + "step": 675 + }, + { + "epoch": 0.4675773819816704, + "grad_norm": 0.6142331957817078, + "learning_rate": 0.0005615440640932266, + "loss": 1.348, + "step": 676 + }, + { + "epoch": 0.46826906449939476, + "grad_norm": 0.6304256319999695, + "learning_rate": 0.0005608157319737801, + "loss": 2.1715, + "step": 677 + }, + { + "epoch": 0.46896074701711915, + "grad_norm": 0.6730361580848694, + "learning_rate": 0.0005600873998543336, + "loss": 1.7929, + "step": 678 + }, + { + "epoch": 0.4696524295348435, + "grad_norm": 1.0285260677337646, + "learning_rate": 0.0005593590677348871, + "loss": 1.0785, + "step": 679 + }, + { + "epoch": 0.4703441120525679, + "grad_norm": 0.6518314480781555, + "learning_rate": 0.0005586307356154407, + "loss": 1.9217, + "step": 680 + }, + { + "epoch": 0.4710357945702922, + "grad_norm": 0.813822329044342, + "learning_rate": 0.0005579024034959942, + "loss": 1.4779, + "step": 681 + }, + { + "epoch": 0.4717274770880166, + "grad_norm": 0.87235426902771, + "learning_rate": 0.0005571740713765477, + "loss": 1.4881, + "step": 682 + }, + { + "epoch": 0.472419159605741, + "grad_norm": 0.44555163383483887, + "learning_rate": 0.0005564457392571013, + "loss": 1.345, + "step": 683 + }, + { + "epoch": 0.4731108421234653, + "grad_norm": 0.6434228420257568, + "learning_rate": 0.0005557174071376547, + "loss": 1.7488, + "step": 684 + }, + { + "epoch": 0.4738025246411897, + "grad_norm": 1.8087595701217651, + "learning_rate": 0.0005549890750182083, + "loss": 1.4157, + "step": 685 + }, + { + "epoch": 0.47449420715891405, + "grad_norm": 0.6187067031860352, + "learning_rate": 0.0005542607428987618, + "loss": 1.9694, + "step": 686 + }, + { + "epoch": 0.47518588967663844, + "grad_norm": 0.5754404664039612, + "learning_rate": 0.0005535324107793154, + "loss": 1.6113, + "step": 687 + }, + { + "epoch": 0.4758775721943628, + "grad_norm": 1.1567997932434082, + "learning_rate": 0.0005528040786598689, + "loss": 1.492, + "step": 688 + }, + { + "epoch": 0.47656925471208716, + "grad_norm": 7.947727680206299, + "learning_rate": 0.0005520757465404224, + "loss": 1.1358, + "step": 689 + }, + { + "epoch": 0.4772609372298115, + "grad_norm": 0.7208907008171082, + "learning_rate": 0.000551347414420976, + "loss": 1.4342, + "step": 690 + }, + { + "epoch": 0.4779526197475359, + "grad_norm": 0.594211995601654, + "learning_rate": 0.0005506190823015294, + "loss": 2.0316, + "step": 691 + }, + { + "epoch": 0.4786443022652602, + "grad_norm": 0.65560382604599, + "learning_rate": 0.000549890750182083, + "loss": 1.1252, + "step": 692 + }, + { + "epoch": 0.4793359847829846, + "grad_norm": 0.6759006381034851, + "learning_rate": 0.0005491624180626365, + "loss": 1.8818, + "step": 693 + }, + { + "epoch": 0.48002766730070895, + "grad_norm": 0.5605379343032837, + "learning_rate": 0.0005484340859431902, + "loss": 1.8104, + "step": 694 + }, + { + "epoch": 0.48071934981843334, + "grad_norm": 1.8264721632003784, + "learning_rate": 0.0005477057538237437, + "loss": 1.032, + "step": 695 + }, + { + "epoch": 0.48141103233615773, + "grad_norm": 0.6420240998268127, + "learning_rate": 0.0005469774217042972, + "loss": 0.9941, + "step": 696 + }, + { + "epoch": 0.48210271485388206, + "grad_norm": 0.8541857600212097, + "learning_rate": 0.0005462490895848508, + "loss": 1.9238, + "step": 697 + }, + { + "epoch": 0.48279439737160645, + "grad_norm": 0.5706299543380737, + "learning_rate": 0.0005455207574654042, + "loss": 0.894, + "step": 698 + }, + { + "epoch": 0.4834860798893308, + "grad_norm": 0.7758136987686157, + "learning_rate": 0.0005447924253459578, + "loss": 1.5292, + "step": 699 + }, + { + "epoch": 0.4841777624070552, + "grad_norm": 0.5781087875366211, + "learning_rate": 0.0005440640932265113, + "loss": 0.8735, + "step": 700 + }, + { + "epoch": 0.4848694449247795, + "grad_norm": 0.5916205048561096, + "learning_rate": 0.0005433357611070649, + "loss": 1.329, + "step": 701 + }, + { + "epoch": 0.4855611274425039, + "grad_norm": 1.0263584852218628, + "learning_rate": 0.0005426074289876184, + "loss": 1.7103, + "step": 702 + }, + { + "epoch": 0.48625280996022824, + "grad_norm": 0.6922283172607422, + "learning_rate": 0.0005418790968681719, + "loss": 0.9118, + "step": 703 + }, + { + "epoch": 0.48694449247795263, + "grad_norm": 1.4904029369354248, + "learning_rate": 0.0005411507647487254, + "loss": 1.3195, + "step": 704 + }, + { + "epoch": 0.48763617499567696, + "grad_norm": 0.7213814854621887, + "learning_rate": 0.0005404224326292789, + "loss": 2.0028, + "step": 705 + }, + { + "epoch": 0.48832785751340135, + "grad_norm": 0.6512391567230225, + "learning_rate": 0.0005396941005098325, + "loss": 1.8618, + "step": 706 + }, + { + "epoch": 0.4890195400311257, + "grad_norm": 0.8064336180686951, + "learning_rate": 0.000538965768390386, + "loss": 1.3046, + "step": 707 + }, + { + "epoch": 0.4897112225488501, + "grad_norm": 0.7077018022537231, + "learning_rate": 0.0005382374362709396, + "loss": 1.9682, + "step": 708 + }, + { + "epoch": 0.49040290506657447, + "grad_norm": 0.8143154382705688, + "learning_rate": 0.0005375091041514931, + "loss": 1.5645, + "step": 709 + }, + { + "epoch": 0.4910945875842988, + "grad_norm": 0.7311553359031677, + "learning_rate": 0.0005367807720320466, + "loss": 1.6889, + "step": 710 + }, + { + "epoch": 0.4917862701020232, + "grad_norm": 0.5971880555152893, + "learning_rate": 0.0005360524399126001, + "loss": 1.4776, + "step": 711 + }, + { + "epoch": 0.4924779526197475, + "grad_norm": 0.6928828954696655, + "learning_rate": 0.0005353241077931536, + "loss": 2.232, + "step": 712 + }, + { + "epoch": 0.4931696351374719, + "grad_norm": 0.6291587352752686, + "learning_rate": 0.0005345957756737072, + "loss": 1.3976, + "step": 713 + }, + { + "epoch": 0.49386131765519625, + "grad_norm": 0.48338043689727783, + "learning_rate": 0.0005338674435542607, + "loss": 0.9061, + "step": 714 + }, + { + "epoch": 0.49455300017292064, + "grad_norm": 0.6431358456611633, + "learning_rate": 0.0005331391114348144, + "loss": 1.6624, + "step": 715 + }, + { + "epoch": 0.495244682690645, + "grad_norm": 0.6543510556221008, + "learning_rate": 0.0005324107793153679, + "loss": 1.905, + "step": 716 + }, + { + "epoch": 0.49593636520836937, + "grad_norm": 0.5716462731361389, + "learning_rate": 0.0005316824471959214, + "loss": 1.7145, + "step": 717 + }, + { + "epoch": 0.4966280477260937, + "grad_norm": 0.5821312665939331, + "learning_rate": 0.0005309541150764749, + "loss": 1.8807, + "step": 718 + }, + { + "epoch": 0.4973197302438181, + "grad_norm": 0.7142646908760071, + "learning_rate": 0.0005302257829570284, + "loss": 1.9264, + "step": 719 + }, + { + "epoch": 0.4980114127615424, + "grad_norm": 0.5939432382583618, + "learning_rate": 0.000529497450837582, + "loss": 1.934, + "step": 720 + }, + { + "epoch": 0.4987030952792668, + "grad_norm": 1.000845193862915, + "learning_rate": 0.0005287691187181355, + "loss": 1.4068, + "step": 721 + }, + { + "epoch": 0.4993947777969912, + "grad_norm": 0.7502846717834473, + "learning_rate": 0.0005280407865986891, + "loss": 1.6726, + "step": 722 + }, + { + "epoch": 0.5000864603147156, + "grad_norm": 0.7099502682685852, + "learning_rate": 0.0005273124544792426, + "loss": 0.9208, + "step": 723 + }, + { + "epoch": 0.5007781428324399, + "grad_norm": 0.5614446997642517, + "learning_rate": 0.000526584122359796, + "loss": 1.8832, + "step": 724 + }, + { + "epoch": 0.5014698253501643, + "grad_norm": 0.6386409401893616, + "learning_rate": 0.0005258557902403496, + "loss": 0.9929, + "step": 725 + }, + { + "epoch": 0.5021615078678886, + "grad_norm": 0.7122677564620972, + "learning_rate": 0.0005251274581209031, + "loss": 1.6074, + "step": 726 + }, + { + "epoch": 0.502853190385613, + "grad_norm": 0.7774210572242737, + "learning_rate": 0.0005243991260014567, + "loss": 1.7793, + "step": 727 + }, + { + "epoch": 0.5035448729033374, + "grad_norm": 0.6662734150886536, + "learning_rate": 0.0005236707938820102, + "loss": 1.8851, + "step": 728 + }, + { + "epoch": 0.5042365554210617, + "grad_norm": 0.6568670868873596, + "learning_rate": 0.0005229424617625638, + "loss": 1.369, + "step": 729 + }, + { + "epoch": 0.504928237938786, + "grad_norm": 3.8021135330200195, + "learning_rate": 0.0005222141296431173, + "loss": 1.4424, + "step": 730 + }, + { + "epoch": 0.5056199204565105, + "grad_norm": 1.1102453470230103, + "learning_rate": 0.0005214857975236707, + "loss": 1.5926, + "step": 731 + }, + { + "epoch": 0.5063116029742348, + "grad_norm": 0.6229285001754761, + "learning_rate": 0.0005207574654042243, + "loss": 1.7467, + "step": 732 + }, + { + "epoch": 0.5070032854919592, + "grad_norm": 0.6418752074241638, + "learning_rate": 0.0005200291332847778, + "loss": 1.7413, + "step": 733 + }, + { + "epoch": 0.5076949680096835, + "grad_norm": 1.1830925941467285, + "learning_rate": 0.0005193008011653314, + "loss": 1.6488, + "step": 734 + }, + { + "epoch": 0.508386650527408, + "grad_norm": 0.7698209285736084, + "learning_rate": 0.0005185724690458849, + "loss": 1.5476, + "step": 735 + }, + { + "epoch": 0.5090783330451323, + "grad_norm": 0.781249463558197, + "learning_rate": 0.0005178441369264386, + "loss": 1.8956, + "step": 736 + }, + { + "epoch": 0.5097700155628566, + "grad_norm": 0.5504831075668335, + "learning_rate": 0.0005171158048069921, + "loss": 1.7983, + "step": 737 + }, + { + "epoch": 0.5104616980805811, + "grad_norm": 1.5704818964004517, + "learning_rate": 0.0005163874726875455, + "loss": 2.4774, + "step": 738 + }, + { + "epoch": 0.5111533805983054, + "grad_norm": 0.581377387046814, + "learning_rate": 0.0005156591405680991, + "loss": 1.5272, + "step": 739 + }, + { + "epoch": 0.5118450631160297, + "grad_norm": 0.7596077919006348, + "learning_rate": 0.0005149308084486526, + "loss": 0.9089, + "step": 740 + }, + { + "epoch": 0.5125367456337541, + "grad_norm": 0.6792967915534973, + "learning_rate": 0.0005142024763292062, + "loss": 2.0292, + "step": 741 + }, + { + "epoch": 0.5132284281514785, + "grad_norm": 0.6570396423339844, + "learning_rate": 0.0005134741442097597, + "loss": 1.9329, + "step": 742 + }, + { + "epoch": 0.5139201106692028, + "grad_norm": 0.7024231553077698, + "learning_rate": 0.0005127458120903133, + "loss": 1.6629, + "step": 743 + }, + { + "epoch": 0.5146117931869272, + "grad_norm": 0.8019945621490479, + "learning_rate": 0.0005120174799708667, + "loss": 1.0594, + "step": 744 + }, + { + "epoch": 0.5153034757046515, + "grad_norm": 7.380868911743164, + "learning_rate": 0.0005112891478514202, + "loss": 1.2738, + "step": 745 + }, + { + "epoch": 0.515995158222376, + "grad_norm": 0.6265088319778442, + "learning_rate": 0.0005105608157319738, + "loss": 2.0701, + "step": 746 + }, + { + "epoch": 0.5166868407401003, + "grad_norm": 0.5740177631378174, + "learning_rate": 0.0005098324836125273, + "loss": 1.8416, + "step": 747 + }, + { + "epoch": 0.5173785232578246, + "grad_norm": 0.6777141690254211, + "learning_rate": 0.0005091041514930809, + "loss": 1.333, + "step": 748 + }, + { + "epoch": 0.5180702057755491, + "grad_norm": 0.4951770007610321, + "learning_rate": 0.0005083758193736344, + "loss": 1.7605, + "step": 749 + }, + { + "epoch": 0.5187618882932734, + "grad_norm": 0.6544963717460632, + "learning_rate": 0.000507647487254188, + "loss": 1.5054, + "step": 750 + }, + { + "epoch": 0.5194535708109977, + "grad_norm": 68.59622955322266, + "learning_rate": 0.0005069191551347414, + "loss": 1.911, + "step": 751 + }, + { + "epoch": 0.5201452533287221, + "grad_norm": 0.5751796960830688, + "learning_rate": 0.0005061908230152949, + "loss": 1.3003, + "step": 752 + }, + { + "epoch": 0.5208369358464465, + "grad_norm": 0.7600306868553162, + "learning_rate": 0.0005054624908958485, + "loss": 1.2258, + "step": 753 + }, + { + "epoch": 0.5215286183641709, + "grad_norm": 0.9036048650741577, + "learning_rate": 0.000504734158776402, + "loss": 1.0586, + "step": 754 + }, + { + "epoch": 0.5222203008818952, + "grad_norm": 0.6078492403030396, + "learning_rate": 0.0005040058266569556, + "loss": 1.3874, + "step": 755 + }, + { + "epoch": 0.5229119833996195, + "grad_norm": 0.714954137802124, + "learning_rate": 0.0005032774945375091, + "loss": 1.6183, + "step": 756 + }, + { + "epoch": 0.523603665917344, + "grad_norm": 0.5517666935920715, + "learning_rate": 0.0005025491624180628, + "loss": 1.5844, + "step": 757 + }, + { + "epoch": 0.5242953484350683, + "grad_norm": 0.7139641046524048, + "learning_rate": 0.0005018208302986161, + "loss": 1.724, + "step": 758 + }, + { + "epoch": 0.5249870309527926, + "grad_norm": 0.7252593040466309, + "learning_rate": 0.0005010924981791696, + "loss": 1.3367, + "step": 759 + }, + { + "epoch": 0.525678713470517, + "grad_norm": 0.7002785205841064, + "learning_rate": 0.0005003641660597233, + "loss": 2.0537, + "step": 760 + }, + { + "epoch": 0.5263703959882414, + "grad_norm": 0.6444349884986877, + "learning_rate": 0.0004996358339402768, + "loss": 1.9185, + "step": 761 + }, + { + "epoch": 0.5270620785059658, + "grad_norm": 0.6805012822151184, + "learning_rate": 0.0004989075018208304, + "loss": 1.2298, + "step": 762 + }, + { + "epoch": 0.5277537610236901, + "grad_norm": 0.5559502243995667, + "learning_rate": 0.0004981791697013839, + "loss": 1.1846, + "step": 763 + }, + { + "epoch": 0.5284454435414145, + "grad_norm": 0.6215760111808777, + "learning_rate": 0.0004974508375819374, + "loss": 1.9352, + "step": 764 + }, + { + "epoch": 0.5291371260591389, + "grad_norm": 0.8805095553398132, + "learning_rate": 0.0004967225054624909, + "loss": 1.4648, + "step": 765 + }, + { + "epoch": 0.5298288085768632, + "grad_norm": 0.8312969207763672, + "learning_rate": 0.0004959941733430444, + "loss": 1.5746, + "step": 766 + }, + { + "epoch": 0.5305204910945875, + "grad_norm": 0.5943465828895569, + "learning_rate": 0.000495265841223598, + "loss": 1.6898, + "step": 767 + }, + { + "epoch": 0.531212173612312, + "grad_norm": 0.6693414449691772, + "learning_rate": 0.0004945375091041515, + "loss": 1.3878, + "step": 768 + }, + { + "epoch": 0.5319038561300363, + "grad_norm": 1.313461422920227, + "learning_rate": 0.000493809176984705, + "loss": 1.551, + "step": 769 + }, + { + "epoch": 0.5325955386477607, + "grad_norm": 0.6303570866584778, + "learning_rate": 0.0004930808448652586, + "loss": 1.2923, + "step": 770 + }, + { + "epoch": 0.533287221165485, + "grad_norm": 1.8511700630187988, + "learning_rate": 0.0004923525127458121, + "loss": 1.2441, + "step": 771 + }, + { + "epoch": 0.5339789036832094, + "grad_norm": 0.4831252992153168, + "learning_rate": 0.0004916241806263656, + "loss": 1.5108, + "step": 772 + }, + { + "epoch": 0.5346705862009338, + "grad_norm": 0.6066388487815857, + "learning_rate": 0.0004908958485069191, + "loss": 1.41, + "step": 773 + }, + { + "epoch": 0.5353622687186581, + "grad_norm": 0.6585482358932495, + "learning_rate": 0.0004901675163874727, + "loss": 1.7785, + "step": 774 + }, + { + "epoch": 0.5360539512363826, + "grad_norm": 1.0999970436096191, + "learning_rate": 0.0004894391842680262, + "loss": 1.686, + "step": 775 + }, + { + "epoch": 0.5367456337541069, + "grad_norm": 0.5038065314292908, + "learning_rate": 0.0004887108521485797, + "loss": 1.2632, + "step": 776 + }, + { + "epoch": 0.5374373162718312, + "grad_norm": 0.9723607897758484, + "learning_rate": 0.0004879825200291333, + "loss": 1.5182, + "step": 777 + }, + { + "epoch": 0.5381289987895556, + "grad_norm": 1.2071069478988647, + "learning_rate": 0.00048725418790968684, + "loss": 1.352, + "step": 778 + }, + { + "epoch": 0.53882068130728, + "grad_norm": 0.5600361227989197, + "learning_rate": 0.00048652585579024034, + "loss": 1.0904, + "step": 779 + }, + { + "epoch": 0.5395123638250043, + "grad_norm": 0.5586333870887756, + "learning_rate": 0.0004857975236707939, + "loss": 1.7258, + "step": 780 + }, + { + "epoch": 0.5402040463427287, + "grad_norm": 0.47964948415756226, + "learning_rate": 0.00048506919155134746, + "loss": 1.2326, + "step": 781 + }, + { + "epoch": 0.540895728860453, + "grad_norm": 0.6363099813461304, + "learning_rate": 0.00048434085943190096, + "loss": 1.6935, + "step": 782 + }, + { + "epoch": 0.5415874113781775, + "grad_norm": 1.4082186222076416, + "learning_rate": 0.00048361252731245446, + "loss": 1.8553, + "step": 783 + }, + { + "epoch": 0.5422790938959018, + "grad_norm": 0.5664854049682617, + "learning_rate": 0.000482884195193008, + "loss": 1.7903, + "step": 784 + }, + { + "epoch": 0.5429707764136261, + "grad_norm": 0.7249478101730347, + "learning_rate": 0.0004821558630735616, + "loss": 1.9806, + "step": 785 + }, + { + "epoch": 0.5436624589313505, + "grad_norm": 0.7467180490493774, + "learning_rate": 0.0004814275309541151, + "loss": 1.9561, + "step": 786 + }, + { + "epoch": 0.5443541414490749, + "grad_norm": 0.6132490634918213, + "learning_rate": 0.00048069919883466863, + "loss": 1.4799, + "step": 787 + }, + { + "epoch": 0.5450458239667992, + "grad_norm": 0.8408911824226379, + "learning_rate": 0.0004799708667152222, + "loss": 1.4826, + "step": 788 + }, + { + "epoch": 0.5457375064845236, + "grad_norm": 0.867709219455719, + "learning_rate": 0.00047924253459577564, + "loss": 1.1398, + "step": 789 + }, + { + "epoch": 0.546429189002248, + "grad_norm": 1.0073575973510742, + "learning_rate": 0.0004785142024763292, + "loss": 1.5904, + "step": 790 + }, + { + "epoch": 0.5471208715199724, + "grad_norm": 0.7767248153686523, + "learning_rate": 0.00047778587035688275, + "loss": 1.8212, + "step": 791 + }, + { + "epoch": 0.5478125540376967, + "grad_norm": 0.45061439275741577, + "learning_rate": 0.0004770575382374363, + "loss": 1.3728, + "step": 792 + }, + { + "epoch": 0.548504236555421, + "grad_norm": 0.6767532229423523, + "learning_rate": 0.0004763292061179898, + "loss": 1.2439, + "step": 793 + }, + { + "epoch": 0.5491959190731455, + "grad_norm": 0.987127423286438, + "learning_rate": 0.00047560087399854337, + "loss": 1.1955, + "step": 794 + }, + { + "epoch": 0.5498876015908698, + "grad_norm": 0.5513572096824646, + "learning_rate": 0.0004748725418790969, + "loss": 0.935, + "step": 795 + }, + { + "epoch": 0.5505792841085941, + "grad_norm": 0.5820390582084656, + "learning_rate": 0.00047414420975965037, + "loss": 1.5176, + "step": 796 + }, + { + "epoch": 0.5512709666263185, + "grad_norm": 0.5624158382415771, + "learning_rate": 0.0004734158776402039, + "loss": 1.8107, + "step": 797 + }, + { + "epoch": 0.5519626491440429, + "grad_norm": 0.9582436680793762, + "learning_rate": 0.0004726875455207575, + "loss": 1.4249, + "step": 798 + }, + { + "epoch": 0.5526543316617673, + "grad_norm": 0.8588325381278992, + "learning_rate": 0.000471959213401311, + "loss": 1.6912, + "step": 799 + }, + { + "epoch": 0.5533460141794916, + "grad_norm": 0.5387138724327087, + "learning_rate": 0.00047123088128186454, + "loss": 1.9325, + "step": 800 + }, + { + "epoch": 0.554037696697216, + "grad_norm": 0.5876449346542358, + "learning_rate": 0.0004705025491624181, + "loss": 1.8334, + "step": 801 + }, + { + "epoch": 0.5547293792149404, + "grad_norm": 0.6836307048797607, + "learning_rate": 0.00046977421704297165, + "loss": 1.6645, + "step": 802 + }, + { + "epoch": 0.5554210617326647, + "grad_norm": 0.4884951412677765, + "learning_rate": 0.0004690458849235251, + "loss": 1.8236, + "step": 803 + }, + { + "epoch": 0.556112744250389, + "grad_norm": 0.6170971989631653, + "learning_rate": 0.00046831755280407866, + "loss": 1.5683, + "step": 804 + }, + { + "epoch": 0.5568044267681135, + "grad_norm": 0.5362435579299927, + "learning_rate": 0.0004675892206846322, + "loss": 1.3946, + "step": 805 + }, + { + "epoch": 0.5574961092858378, + "grad_norm": 0.590861976146698, + "learning_rate": 0.0004668608885651857, + "loss": 1.8858, + "step": 806 + }, + { + "epoch": 0.5581877918035621, + "grad_norm": 0.8209269046783447, + "learning_rate": 0.0004661325564457393, + "loss": 1.1731, + "step": 807 + }, + { + "epoch": 0.5588794743212865, + "grad_norm": 0.5863669514656067, + "learning_rate": 0.00046540422432629283, + "loss": 1.3344, + "step": 808 + }, + { + "epoch": 0.5595711568390109, + "grad_norm": 0.6586650013923645, + "learning_rate": 0.00046467589220684633, + "loss": 1.5557, + "step": 809 + }, + { + "epoch": 0.5602628393567353, + "grad_norm": 0.5491942763328552, + "learning_rate": 0.00046394756008739983, + "loss": 1.8428, + "step": 810 + }, + { + "epoch": 0.5609545218744596, + "grad_norm": 0.5121621489524841, + "learning_rate": 0.0004632192279679534, + "loss": 1.2454, + "step": 811 + }, + { + "epoch": 0.5616462043921839, + "grad_norm": 0.5642603039741516, + "learning_rate": 0.00046249089584850695, + "loss": 1.8521, + "step": 812 + }, + { + "epoch": 0.5623378869099084, + "grad_norm": 0.8125218749046326, + "learning_rate": 0.00046176256372906045, + "loss": 1.983, + "step": 813 + }, + { + "epoch": 0.5630295694276327, + "grad_norm": 0.5608864426612854, + "learning_rate": 0.000461034231609614, + "loss": 1.4064, + "step": 814 + }, + { + "epoch": 0.563721251945357, + "grad_norm": 0.555400550365448, + "learning_rate": 0.00046030589949016756, + "loss": 1.3027, + "step": 815 + }, + { + "epoch": 0.5644129344630815, + "grad_norm": 0.8211348056793213, + "learning_rate": 0.00045957756737072106, + "loss": 1.4144, + "step": 816 + }, + { + "epoch": 0.5651046169808058, + "grad_norm": 0.6786699891090393, + "learning_rate": 0.00045884923525127457, + "loss": 1.7804, + "step": 817 + }, + { + "epoch": 0.5657962994985302, + "grad_norm": 18.696903228759766, + "learning_rate": 0.0004581209031318281, + "loss": 1.9006, + "step": 818 + }, + { + "epoch": 0.5664879820162545, + "grad_norm": 0.5174362063407898, + "learning_rate": 0.0004573925710123816, + "loss": 1.4575, + "step": 819 + }, + { + "epoch": 0.5671796645339789, + "grad_norm": 0.946522057056427, + "learning_rate": 0.0004566642388929352, + "loss": 1.6379, + "step": 820 + }, + { + "epoch": 0.5678713470517033, + "grad_norm": 0.7363066077232361, + "learning_rate": 0.00045593590677348874, + "loss": 1.7782, + "step": 821 + }, + { + "epoch": 0.5685630295694276, + "grad_norm": 0.6305325031280518, + "learning_rate": 0.0004552075746540423, + "loss": 1.1396, + "step": 822 + }, + { + "epoch": 0.569254712087152, + "grad_norm": 0.7217493057250977, + "learning_rate": 0.0004544792425345958, + "loss": 1.8529, + "step": 823 + }, + { + "epoch": 0.5699463946048764, + "grad_norm": 0.7322853207588196, + "learning_rate": 0.0004537509104151493, + "loss": 1.9815, + "step": 824 + }, + { + "epoch": 0.5706380771226007, + "grad_norm": 0.48964637517929077, + "learning_rate": 0.00045302257829570286, + "loss": 1.0413, + "step": 825 + }, + { + "epoch": 0.5713297596403251, + "grad_norm": 0.7855163216590881, + "learning_rate": 0.00045229424617625636, + "loss": 1.7931, + "step": 826 + }, + { + "epoch": 0.5720214421580495, + "grad_norm": 0.8431006073951721, + "learning_rate": 0.0004515659140568099, + "loss": 1.5186, + "step": 827 + }, + { + "epoch": 0.5727131246757738, + "grad_norm": 0.7614803314208984, + "learning_rate": 0.00045083758193736347, + "loss": 1.6783, + "step": 828 + }, + { + "epoch": 0.5734048071934982, + "grad_norm": 0.670314371585846, + "learning_rate": 0.000450109249817917, + "loss": 1.5495, + "step": 829 + }, + { + "epoch": 0.5740964897112225, + "grad_norm": 0.7063092589378357, + "learning_rate": 0.00044938091769847053, + "loss": 1.5258, + "step": 830 + }, + { + "epoch": 0.574788172228947, + "grad_norm": 0.5529667139053345, + "learning_rate": 0.00044865258557902403, + "loss": 1.3823, + "step": 831 + }, + { + "epoch": 0.5754798547466713, + "grad_norm": 0.7476693391799927, + "learning_rate": 0.0004479242534595776, + "loss": 1.3917, + "step": 832 + }, + { + "epoch": 0.5761715372643956, + "grad_norm": 3.093163013458252, + "learning_rate": 0.0004471959213401311, + "loss": 1.6397, + "step": 833 + }, + { + "epoch": 0.57686321978212, + "grad_norm": 3.8266420364379883, + "learning_rate": 0.00044646758922068465, + "loss": 1.5871, + "step": 834 + }, + { + "epoch": 0.5775549022998444, + "grad_norm": 0.958208441734314, + "learning_rate": 0.0004457392571012382, + "loss": 1.5749, + "step": 835 + }, + { + "epoch": 0.5782465848175687, + "grad_norm": 0.5491811633110046, + "learning_rate": 0.0004450109249817917, + "loss": 1.6318, + "step": 836 + }, + { + "epoch": 0.5789382673352931, + "grad_norm": 1.3938939571380615, + "learning_rate": 0.00044428259286234526, + "loss": 1.6772, + "step": 837 + }, + { + "epoch": 0.5796299498530174, + "grad_norm": 0.9199579954147339, + "learning_rate": 0.00044355426074289876, + "loss": 1.5163, + "step": 838 + }, + { + "epoch": 0.5803216323707419, + "grad_norm": 0.5929269194602966, + "learning_rate": 0.00044282592862345227, + "loss": 1.6575, + "step": 839 + }, + { + "epoch": 0.5810133148884662, + "grad_norm": 0.6414217948913574, + "learning_rate": 0.0004420975965040058, + "loss": 1.5223, + "step": 840 + }, + { + "epoch": 0.5817049974061905, + "grad_norm": 0.48738619685173035, + "learning_rate": 0.0004413692643845594, + "loss": 1.5968, + "step": 841 + }, + { + "epoch": 0.582396679923915, + "grad_norm": 0.56129390001297, + "learning_rate": 0.00044064093226511294, + "loss": 1.707, + "step": 842 + }, + { + "epoch": 0.5830883624416393, + "grad_norm": 0.596315860748291, + "learning_rate": 0.00043991260014566644, + "loss": 1.8077, + "step": 843 + }, + { + "epoch": 0.5837800449593636, + "grad_norm": 0.7291851043701172, + "learning_rate": 0.00043918426802622, + "loss": 1.3719, + "step": 844 + }, + { + "epoch": 0.584471727477088, + "grad_norm": 1.4549719095230103, + "learning_rate": 0.0004384559359067735, + "loss": 1.402, + "step": 845 + }, + { + "epoch": 0.5851634099948124, + "grad_norm": 0.5116413831710815, + "learning_rate": 0.000437727603787327, + "loss": 1.1901, + "step": 846 + }, + { + "epoch": 0.5858550925125368, + "grad_norm": 1.1522141695022583, + "learning_rate": 0.00043699927166788056, + "loss": 1.6226, + "step": 847 + }, + { + "epoch": 0.5865467750302611, + "grad_norm": 0.5651256442070007, + "learning_rate": 0.0004362709395484341, + "loss": 0.9863, + "step": 848 + }, + { + "epoch": 0.5872384575479854, + "grad_norm": 0.9690898656845093, + "learning_rate": 0.0004355426074289876, + "loss": 1.0906, + "step": 849 + }, + { + "epoch": 0.5879301400657099, + "grad_norm": 0.603584349155426, + "learning_rate": 0.00043481427530954117, + "loss": 1.4331, + "step": 850 + }, + { + "epoch": 0.5886218225834342, + "grad_norm": 10.612972259521484, + "learning_rate": 0.0004340859431900947, + "loss": 1.1645, + "step": 851 + }, + { + "epoch": 0.5893135051011585, + "grad_norm": 0.6220032572746277, + "learning_rate": 0.00043335761107064823, + "loss": 1.7473, + "step": 852 + }, + { + "epoch": 0.590005187618883, + "grad_norm": 0.546869158744812, + "learning_rate": 0.00043262927895120173, + "loss": 1.8727, + "step": 853 + }, + { + "epoch": 0.5906968701366073, + "grad_norm": 0.5782111883163452, + "learning_rate": 0.0004319009468317553, + "loss": 1.5827, + "step": 854 + }, + { + "epoch": 0.5913885526543317, + "grad_norm": 0.7139537930488586, + "learning_rate": 0.00043117261471230884, + "loss": 1.8199, + "step": 855 + }, + { + "epoch": 0.592080235172056, + "grad_norm": 0.8115746378898621, + "learning_rate": 0.00043044428259286235, + "loss": 2.009, + "step": 856 + }, + { + "epoch": 0.5927719176897804, + "grad_norm": 0.5989879369735718, + "learning_rate": 0.0004297159504734159, + "loss": 1.6683, + "step": 857 + }, + { + "epoch": 0.5934636002075048, + "grad_norm": 0.6566680669784546, + "learning_rate": 0.00042898761835396946, + "loss": 1.7271, + "step": 858 + }, + { + "epoch": 0.5941552827252291, + "grad_norm": 0.9507800936698914, + "learning_rate": 0.0004282592862345229, + "loss": 1.0203, + "step": 859 + }, + { + "epoch": 0.5948469652429534, + "grad_norm": 0.6159283518791199, + "learning_rate": 0.00042753095411507646, + "loss": 1.6809, + "step": 860 + }, + { + "epoch": 0.5955386477606779, + "grad_norm": 0.6028535962104797, + "learning_rate": 0.00042680262199563, + "loss": 1.4339, + "step": 861 + }, + { + "epoch": 0.5962303302784022, + "grad_norm": 0.6777454614639282, + "learning_rate": 0.0004260742898761836, + "loss": 1.5344, + "step": 862 + }, + { + "epoch": 0.5969220127961266, + "grad_norm": 1.078660488128662, + "learning_rate": 0.0004253459577567371, + "loss": 1.1091, + "step": 863 + }, + { + "epoch": 0.5976136953138509, + "grad_norm": 0.7826130390167236, + "learning_rate": 0.00042461762563729064, + "loss": 1.8942, + "step": 864 + }, + { + "epoch": 0.5983053778315753, + "grad_norm": 0.623664140701294, + "learning_rate": 0.0004238892935178442, + "loss": 1.862, + "step": 865 + }, + { + "epoch": 0.5989970603492997, + "grad_norm": 1.0740686655044556, + "learning_rate": 0.00042316096139839764, + "loss": 1.5176, + "step": 866 + }, + { + "epoch": 0.599688742867024, + "grad_norm": 0.5756235122680664, + "learning_rate": 0.0004224326292789512, + "loss": 1.8423, + "step": 867 + }, + { + "epoch": 0.6003804253847485, + "grad_norm": 0.7312625646591187, + "learning_rate": 0.00042170429715950475, + "loss": 1.404, + "step": 868 + }, + { + "epoch": 0.6010721079024728, + "grad_norm": 0.5717254281044006, + "learning_rate": 0.00042097596504005826, + "loss": 1.1473, + "step": 869 + }, + { + "epoch": 0.6017637904201971, + "grad_norm": 0.6751309633255005, + "learning_rate": 0.0004202476329206118, + "loss": 1.982, + "step": 870 + }, + { + "epoch": 0.6024554729379215, + "grad_norm": 0.6197341680526733, + "learning_rate": 0.00041951930080116537, + "loss": 1.7876, + "step": 871 + }, + { + "epoch": 0.6031471554556459, + "grad_norm": 0.6944209337234497, + "learning_rate": 0.0004187909686817189, + "loss": 1.0763, + "step": 872 + }, + { + "epoch": 0.6038388379733702, + "grad_norm": 0.7229098677635193, + "learning_rate": 0.00041806263656227237, + "loss": 1.6516, + "step": 873 + }, + { + "epoch": 0.6045305204910946, + "grad_norm": 1.5600318908691406, + "learning_rate": 0.00041733430444282593, + "loss": 1.4807, + "step": 874 + }, + { + "epoch": 0.6052222030088189, + "grad_norm": 0.9896885752677917, + "learning_rate": 0.0004166059723233795, + "loss": 1.398, + "step": 875 + }, + { + "epoch": 0.6059138855265433, + "grad_norm": 0.5078806281089783, + "learning_rate": 0.000415877640203933, + "loss": 1.3289, + "step": 876 + }, + { + "epoch": 0.6066055680442677, + "grad_norm": 0.7672819495201111, + "learning_rate": 0.00041514930808448654, + "loss": 1.2339, + "step": 877 + }, + { + "epoch": 0.607297250561992, + "grad_norm": 0.5641161799430847, + "learning_rate": 0.0004144209759650401, + "loss": 1.4188, + "step": 878 + }, + { + "epoch": 0.6079889330797165, + "grad_norm": 0.5341874361038208, + "learning_rate": 0.00041369264384559355, + "loss": 1.5544, + "step": 879 + }, + { + "epoch": 0.6086806155974408, + "grad_norm": 0.6755079627037048, + "learning_rate": 0.0004129643117261471, + "loss": 1.8125, + "step": 880 + }, + { + "epoch": 0.6093722981151651, + "grad_norm": 0.6716341972351074, + "learning_rate": 0.00041223597960670066, + "loss": 1.469, + "step": 881 + }, + { + "epoch": 0.6100639806328895, + "grad_norm": 0.8121787905693054, + "learning_rate": 0.0004115076474872542, + "loss": 1.7901, + "step": 882 + }, + { + "epoch": 0.6107556631506139, + "grad_norm": 0.608357846736908, + "learning_rate": 0.0004107793153678077, + "loss": 1.3145, + "step": 883 + }, + { + "epoch": 0.6114473456683382, + "grad_norm": 0.7797583341598511, + "learning_rate": 0.0004100509832483613, + "loss": 1.0858, + "step": 884 + }, + { + "epoch": 0.6121390281860626, + "grad_norm": 0.6277884840965271, + "learning_rate": 0.00040932265112891483, + "loss": 1.4323, + "step": 885 + }, + { + "epoch": 0.6128307107037869, + "grad_norm": 0.7140945196151733, + "learning_rate": 0.0004085943190094683, + "loss": 1.3862, + "step": 886 + }, + { + "epoch": 0.6135223932215114, + "grad_norm": 0.7441515326499939, + "learning_rate": 0.00040786598689002184, + "loss": 1.5699, + "step": 887 + }, + { + "epoch": 0.6142140757392357, + "grad_norm": 0.514007031917572, + "learning_rate": 0.0004071376547705754, + "loss": 1.4761, + "step": 888 + }, + { + "epoch": 0.61490575825696, + "grad_norm": 0.6850712895393372, + "learning_rate": 0.0004064093226511289, + "loss": 1.1819, + "step": 889 + }, + { + "epoch": 0.6155974407746844, + "grad_norm": 0.6241645216941833, + "learning_rate": 0.00040568099053168245, + "loss": 1.3215, + "step": 890 + }, + { + "epoch": 0.6162891232924088, + "grad_norm": 1.1027741432189941, + "learning_rate": 0.000404952658412236, + "loss": 1.6761, + "step": 891 + }, + { + "epoch": 0.6169808058101331, + "grad_norm": 0.6459061503410339, + "learning_rate": 0.00040422432629278957, + "loss": 1.6112, + "step": 892 + }, + { + "epoch": 0.6176724883278575, + "grad_norm": 0.5237783789634705, + "learning_rate": 0.000403495994173343, + "loss": 0.878, + "step": 893 + }, + { + "epoch": 0.6183641708455819, + "grad_norm": 0.6251216530799866, + "learning_rate": 0.00040276766205389657, + "loss": 1.7803, + "step": 894 + }, + { + "epoch": 0.6190558533633063, + "grad_norm": 1.613736629486084, + "learning_rate": 0.0004020393299344501, + "loss": 1.3126, + "step": 895 + }, + { + "epoch": 0.6197475358810306, + "grad_norm": 0.7189272046089172, + "learning_rate": 0.00040131099781500363, + "loss": 1.3213, + "step": 896 + }, + { + "epoch": 0.6204392183987549, + "grad_norm": 0.6003819704055786, + "learning_rate": 0.0004005826656955572, + "loss": 2.0014, + "step": 897 + }, + { + "epoch": 0.6211309009164794, + "grad_norm": 0.6268942356109619, + "learning_rate": 0.00039985433357611074, + "loss": 0.9653, + "step": 898 + }, + { + "epoch": 0.6218225834342037, + "grad_norm": 0.5853712558746338, + "learning_rate": 0.00039912600145666424, + "loss": 1.9615, + "step": 899 + }, + { + "epoch": 0.622514265951928, + "grad_norm": 0.6711516380310059, + "learning_rate": 0.00039839766933721775, + "loss": 1.4365, + "step": 900 + }, + { + "epoch": 0.6232059484696524, + "grad_norm": 0.7002463936805725, + "learning_rate": 0.0003976693372177713, + "loss": 1.3427, + "step": 901 + }, + { + "epoch": 0.6238976309873768, + "grad_norm": 0.6239266991615295, + "learning_rate": 0.00039694100509832486, + "loss": 1.874, + "step": 902 + }, + { + "epoch": 0.6245893135051012, + "grad_norm": 0.7198671698570251, + "learning_rate": 0.00039621267297887836, + "loss": 1.6782, + "step": 903 + }, + { + "epoch": 0.6252809960228255, + "grad_norm": 1.8313370943069458, + "learning_rate": 0.0003954843408594319, + "loss": 1.7617, + "step": 904 + }, + { + "epoch": 0.6259726785405499, + "grad_norm": 0.6570086479187012, + "learning_rate": 0.0003947560087399855, + "loss": 1.6372, + "step": 905 + }, + { + "epoch": 0.6266643610582743, + "grad_norm": 0.5317332148551941, + "learning_rate": 0.000394027676620539, + "loss": 1.1686, + "step": 906 + }, + { + "epoch": 0.6273560435759986, + "grad_norm": 1.1987481117248535, + "learning_rate": 0.0003932993445010925, + "loss": 1.3278, + "step": 907 + }, + { + "epoch": 0.628047726093723, + "grad_norm": 0.5247228741645813, + "learning_rate": 0.00039257101238164603, + "loss": 1.3715, + "step": 908 + }, + { + "epoch": 0.6287394086114474, + "grad_norm": 0.6109928488731384, + "learning_rate": 0.00039184268026219954, + "loss": 1.7888, + "step": 909 + }, + { + "epoch": 0.6294310911291717, + "grad_norm": 0.6539821028709412, + "learning_rate": 0.0003911143481427531, + "loss": 1.3144, + "step": 910 + }, + { + "epoch": 0.6301227736468961, + "grad_norm": 0.870820164680481, + "learning_rate": 0.00039038601602330665, + "loss": 1.3851, + "step": 911 + }, + { + "epoch": 0.6308144561646204, + "grad_norm": 0.8201245069503784, + "learning_rate": 0.0003896576839038602, + "loss": 0.8613, + "step": 912 + }, + { + "epoch": 0.6315061386823448, + "grad_norm": 0.707269549369812, + "learning_rate": 0.0003889293517844137, + "loss": 1.0491, + "step": 913 + }, + { + "epoch": 0.6321978212000692, + "grad_norm": 0.7596359848976135, + "learning_rate": 0.0003882010196649672, + "loss": 1.5361, + "step": 914 + }, + { + "epoch": 0.6328895037177935, + "grad_norm": 0.5276856422424316, + "learning_rate": 0.00038747268754552077, + "loss": 1.9265, + "step": 915 + }, + { + "epoch": 0.6335811862355178, + "grad_norm": 0.6771373152732849, + "learning_rate": 0.00038674435542607427, + "loss": 1.2081, + "step": 916 + }, + { + "epoch": 0.6342728687532423, + "grad_norm": 0.788817822933197, + "learning_rate": 0.0003860160233066278, + "loss": 0.9206, + "step": 917 + }, + { + "epoch": 0.6349645512709666, + "grad_norm": 0.5923412442207336, + "learning_rate": 0.0003852876911871814, + "loss": 1.4172, + "step": 918 + }, + { + "epoch": 0.635656233788691, + "grad_norm": 0.6840768456459045, + "learning_rate": 0.0003845593590677349, + "loss": 1.7511, + "step": 919 + }, + { + "epoch": 0.6363479163064154, + "grad_norm": 0.6866530776023865, + "learning_rate": 0.00038383102694828844, + "loss": 1.5415, + "step": 920 + }, + { + "epoch": 0.6370395988241397, + "grad_norm": 0.852659285068512, + "learning_rate": 0.00038310269482884194, + "loss": 1.329, + "step": 921 + }, + { + "epoch": 0.6377312813418641, + "grad_norm": 0.8291088938713074, + "learning_rate": 0.0003823743627093955, + "loss": 1.4088, + "step": 922 + }, + { + "epoch": 0.6384229638595884, + "grad_norm": 1.7244031429290771, + "learning_rate": 0.000381646030589949, + "loss": 1.6084, + "step": 923 + }, + { + "epoch": 0.6391146463773129, + "grad_norm": 0.54539555311203, + "learning_rate": 0.00038091769847050256, + "loss": 0.745, + "step": 924 + }, + { + "epoch": 0.6398063288950372, + "grad_norm": 0.7444538474082947, + "learning_rate": 0.0003801893663510561, + "loss": 1.7405, + "step": 925 + }, + { + "epoch": 0.6404980114127615, + "grad_norm": 0.8226865530014038, + "learning_rate": 0.0003794610342316096, + "loss": 1.5868, + "step": 926 + }, + { + "epoch": 0.6411896939304859, + "grad_norm": 0.7856529355049133, + "learning_rate": 0.0003787327021121632, + "loss": 0.9057, + "step": 927 + }, + { + "epoch": 0.6418813764482103, + "grad_norm": 0.6824025511741638, + "learning_rate": 0.0003780043699927167, + "loss": 1.9523, + "step": 928 + }, + { + "epoch": 0.6425730589659346, + "grad_norm": 3.9347681999206543, + "learning_rate": 0.0003772760378732702, + "loss": 1.7031, + "step": 929 + }, + { + "epoch": 0.643264741483659, + "grad_norm": 0.7192727327346802, + "learning_rate": 0.00037654770575382373, + "loss": 1.7645, + "step": 930 + }, + { + "epoch": 0.6439564240013833, + "grad_norm": 0.8705196380615234, + "learning_rate": 0.0003758193736343773, + "loss": 1.0606, + "step": 931 + }, + { + "epoch": 0.6446481065191078, + "grad_norm": 0.5852888226509094, + "learning_rate": 0.00037509104151493085, + "loss": 1.6836, + "step": 932 + }, + { + "epoch": 0.6453397890368321, + "grad_norm": 0.5580787658691406, + "learning_rate": 0.00037436270939548435, + "loss": 1.6459, + "step": 933 + }, + { + "epoch": 0.6460314715545564, + "grad_norm": 0.8001941442489624, + "learning_rate": 0.0003736343772760379, + "loss": 1.0433, + "step": 934 + }, + { + "epoch": 0.6467231540722809, + "grad_norm": 0.5320255160331726, + "learning_rate": 0.0003729060451565914, + "loss": 1.2846, + "step": 935 + }, + { + "epoch": 0.6474148365900052, + "grad_norm": 0.6623178124427795, + "learning_rate": 0.0003721777130371449, + "loss": 1.689, + "step": 936 + }, + { + "epoch": 0.6481065191077295, + "grad_norm": 0.5291332602500916, + "learning_rate": 0.00037144938091769847, + "loss": 1.8375, + "step": 937 + }, + { + "epoch": 0.6487982016254539, + "grad_norm": 0.6939443945884705, + "learning_rate": 0.000370721048798252, + "loss": 1.9226, + "step": 938 + }, + { + "epoch": 0.6494898841431783, + "grad_norm": 0.5086541771888733, + "learning_rate": 0.0003699927166788055, + "loss": 1.4844, + "step": 939 + }, + { + "epoch": 0.6501815666609027, + "grad_norm": 0.6537139415740967, + "learning_rate": 0.0003692643845593591, + "loss": 1.512, + "step": 940 + }, + { + "epoch": 0.650873249178627, + "grad_norm": 1.0320565700531006, + "learning_rate": 0.00036853605243991264, + "loss": 1.4956, + "step": 941 + }, + { + "epoch": 0.6515649316963513, + "grad_norm": 0.7342500686645508, + "learning_rate": 0.00036780772032046614, + "loss": 1.5453, + "step": 942 + }, + { + "epoch": 0.6522566142140758, + "grad_norm": 0.8326630592346191, + "learning_rate": 0.00036707938820101964, + "loss": 1.392, + "step": 943 + }, + { + "epoch": 0.6529482967318001, + "grad_norm": 0.6338616013526917, + "learning_rate": 0.0003663510560815732, + "loss": 1.6057, + "step": 944 + }, + { + "epoch": 0.6536399792495244, + "grad_norm": 0.6175053119659424, + "learning_rate": 0.00036562272396212676, + "loss": 1.7356, + "step": 945 + }, + { + "epoch": 0.6543316617672489, + "grad_norm": 0.5108622908592224, + "learning_rate": 0.00036489439184268026, + "loss": 1.182, + "step": 946 + }, + { + "epoch": 0.6550233442849732, + "grad_norm": 0.7233152389526367, + "learning_rate": 0.0003641660597232338, + "loss": 1.3504, + "step": 947 + }, + { + "epoch": 0.6557150268026976, + "grad_norm": 0.6574891209602356, + "learning_rate": 0.00036343772760378737, + "loss": 1.9128, + "step": 948 + }, + { + "epoch": 0.6564067093204219, + "grad_norm": 0.676141083240509, + "learning_rate": 0.0003627093954843408, + "loss": 1.724, + "step": 949 + }, + { + "epoch": 0.6570983918381463, + "grad_norm": 0.8102545738220215, + "learning_rate": 0.0003619810633648944, + "loss": 1.1221, + "step": 950 + }, + { + "epoch": 0.6577900743558707, + "grad_norm": 0.7310335636138916, + "learning_rate": 0.00036125273124544793, + "loss": 1.8765, + "step": 951 + }, + { + "epoch": 0.658481756873595, + "grad_norm": 0.622969388961792, + "learning_rate": 0.0003605243991260015, + "loss": 0.7725, + "step": 952 + }, + { + "epoch": 0.6591734393913193, + "grad_norm": 0.7207367420196533, + "learning_rate": 0.000359796067006555, + "loss": 1.8594, + "step": 953 + }, + { + "epoch": 0.6598651219090438, + "grad_norm": 0.612112820148468, + "learning_rate": 0.00035906773488710855, + "loss": 1.3199, + "step": 954 + }, + { + "epoch": 0.6605568044267681, + "grad_norm": 0.6712756752967834, + "learning_rate": 0.0003583394027676621, + "loss": 1.8219, + "step": 955 + }, + { + "epoch": 0.6612484869444925, + "grad_norm": 0.5637266039848328, + "learning_rate": 0.00035761107064821555, + "loss": 1.5599, + "step": 956 + }, + { + "epoch": 0.6619401694622168, + "grad_norm": 0.714928150177002, + "learning_rate": 0.0003568827385287691, + "loss": 1.9542, + "step": 957 + }, + { + "epoch": 0.6626318519799412, + "grad_norm": 1.0304123163223267, + "learning_rate": 0.00035615440640932266, + "loss": 1.5718, + "step": 958 + }, + { + "epoch": 0.6633235344976656, + "grad_norm": 0.5427642464637756, + "learning_rate": 0.00035542607428987617, + "loss": 1.361, + "step": 959 + }, + { + "epoch": 0.6640152170153899, + "grad_norm": 0.640608012676239, + "learning_rate": 0.0003546977421704297, + "loss": 1.3447, + "step": 960 + }, + { + "epoch": 0.6647068995331143, + "grad_norm": 2.1725761890411377, + "learning_rate": 0.0003539694100509833, + "loss": 1.9901, + "step": 961 + }, + { + "epoch": 0.6653985820508387, + "grad_norm": 1.3823773860931396, + "learning_rate": 0.00035324107793153684, + "loss": 1.794, + "step": 962 + }, + { + "epoch": 0.666090264568563, + "grad_norm": 0.6191059947013855, + "learning_rate": 0.0003525127458120903, + "loss": 1.1826, + "step": 963 + }, + { + "epoch": 0.6667819470862874, + "grad_norm": 0.9410331845283508, + "learning_rate": 0.00035178441369264384, + "loss": 1.0219, + "step": 964 + }, + { + "epoch": 0.6674736296040118, + "grad_norm": 0.7751091718673706, + "learning_rate": 0.0003510560815731974, + "loss": 1.3428, + "step": 965 + }, + { + "epoch": 0.6681653121217361, + "grad_norm": 0.6246415376663208, + "learning_rate": 0.0003503277494537509, + "loss": 1.7892, + "step": 966 + }, + { + "epoch": 0.6688569946394605, + "grad_norm": 0.47676700353622437, + "learning_rate": 0.00034959941733430446, + "loss": 1.0902, + "step": 967 + }, + { + "epoch": 0.6695486771571848, + "grad_norm": 0.6154366731643677, + "learning_rate": 0.000348871085214858, + "loss": 1.1601, + "step": 968 + }, + { + "epoch": 0.6702403596749092, + "grad_norm": 0.6108272671699524, + "learning_rate": 0.0003481427530954115, + "loss": 1.9166, + "step": 969 + }, + { + "epoch": 0.6709320421926336, + "grad_norm": 0.6346696019172668, + "learning_rate": 0.000347414420975965, + "loss": 1.8435, + "step": 970 + }, + { + "epoch": 0.6716237247103579, + "grad_norm": 0.675031304359436, + "learning_rate": 0.00034668608885651857, + "loss": 1.5768, + "step": 971 + }, + { + "epoch": 0.6723154072280824, + "grad_norm": 0.7120993137359619, + "learning_rate": 0.00034595775673707213, + "loss": 1.5565, + "step": 972 + }, + { + "epoch": 0.6730070897458067, + "grad_norm": 0.643236517906189, + "learning_rate": 0.00034522942461762563, + "loss": 1.7646, + "step": 973 + }, + { + "epoch": 0.673698772263531, + "grad_norm": 0.6599898934364319, + "learning_rate": 0.0003445010924981792, + "loss": 1.704, + "step": 974 + }, + { + "epoch": 0.6743904547812554, + "grad_norm": 0.7331796884536743, + "learning_rate": 0.00034377276037873274, + "loss": 1.3173, + "step": 975 + }, + { + "epoch": 0.6750821372989798, + "grad_norm": 0.8388747572898865, + "learning_rate": 0.00034304442825928625, + "loss": 1.4021, + "step": 976 + }, + { + "epoch": 0.6757738198167041, + "grad_norm": 0.820971667766571, + "learning_rate": 0.00034231609613983975, + "loss": 2.0084, + "step": 977 + }, + { + "epoch": 0.6764655023344285, + "grad_norm": 0.64729243516922, + "learning_rate": 0.0003415877640203933, + "loss": 1.848, + "step": 978 + }, + { + "epoch": 0.6771571848521528, + "grad_norm": 18.543529510498047, + "learning_rate": 0.0003408594319009468, + "loss": 1.104, + "step": 979 + }, + { + "epoch": 0.6778488673698773, + "grad_norm": 0.7021201848983765, + "learning_rate": 0.00034013109978150036, + "loss": 1.2265, + "step": 980 + }, + { + "epoch": 0.6785405498876016, + "grad_norm": 0.9745551943778992, + "learning_rate": 0.0003394027676620539, + "loss": 1.4934, + "step": 981 + }, + { + "epoch": 0.6792322324053259, + "grad_norm": 0.6969733834266663, + "learning_rate": 0.0003386744355426075, + "loss": 1.1012, + "step": 982 + }, + { + "epoch": 0.6799239149230503, + "grad_norm": 0.9625135064125061, + "learning_rate": 0.000337946103423161, + "loss": 1.3152, + "step": 983 + }, + { + "epoch": 0.6806155974407747, + "grad_norm": 0.5473238825798035, + "learning_rate": 0.0003372177713037145, + "loss": 0.8124, + "step": 984 + }, + { + "epoch": 0.681307279958499, + "grad_norm": 0.5325528383255005, + "learning_rate": 0.00033648943918426804, + "loss": 0.9812, + "step": 985 + }, + { + "epoch": 0.6819989624762234, + "grad_norm": 0.745832622051239, + "learning_rate": 0.00033576110706482154, + "loss": 1.9812, + "step": 986 + }, + { + "epoch": 0.6826906449939478, + "grad_norm": 0.5990468263626099, + "learning_rate": 0.0003350327749453751, + "loss": 1.753, + "step": 987 + }, + { + "epoch": 0.6833823275116722, + "grad_norm": 1.1015442609786987, + "learning_rate": 0.00033430444282592865, + "loss": 1.3853, + "step": 988 + }, + { + "epoch": 0.6840740100293965, + "grad_norm": 0.5846887230873108, + "learning_rate": 0.00033357611070648215, + "loss": 1.1549, + "step": 989 + }, + { + "epoch": 0.6847656925471208, + "grad_norm": 0.6844741702079773, + "learning_rate": 0.0003328477785870357, + "loss": 1.6326, + "step": 990 + }, + { + "epoch": 0.6854573750648453, + "grad_norm": 0.6528374552726746, + "learning_rate": 0.0003321194464675892, + "loss": 1.5923, + "step": 991 + }, + { + "epoch": 0.6861490575825696, + "grad_norm": 7.470677375793457, + "learning_rate": 0.00033139111434814277, + "loss": 1.3873, + "step": 992 + }, + { + "epoch": 0.686840740100294, + "grad_norm": 0.6172521710395813, + "learning_rate": 0.00033066278222869627, + "loss": 1.6455, + "step": 993 + }, + { + "epoch": 0.6875324226180183, + "grad_norm": 0.6366342306137085, + "learning_rate": 0.00032993445010924983, + "loss": 1.2664, + "step": 994 + }, + { + "epoch": 0.6882241051357427, + "grad_norm": 0.6257708668708801, + "learning_rate": 0.0003292061179898034, + "loss": 1.8094, + "step": 995 + }, + { + "epoch": 0.6889157876534671, + "grad_norm": 0.5113406181335449, + "learning_rate": 0.0003284777858703569, + "loss": 0.8943, + "step": 996 + }, + { + "epoch": 0.6896074701711914, + "grad_norm": 0.8148225545883179, + "learning_rate": 0.00032774945375091044, + "loss": 1.3974, + "step": 997 + }, + { + "epoch": 0.6902991526889158, + "grad_norm": 0.7986158728599548, + "learning_rate": 0.00032702112163146395, + "loss": 0.6925, + "step": 998 + }, + { + "epoch": 0.6909908352066402, + "grad_norm": 0.983278751373291, + "learning_rate": 0.00032629278951201745, + "loss": 1.0282, + "step": 999 + }, + { + "epoch": 0.6916825177243645, + "grad_norm": 0.5543628931045532, + "learning_rate": 0.000325564457392571, + "loss": 1.8791, + "step": 1000 + }, + { + "epoch": 0.6916825177243645, + "eval_loss": 1.3676680326461792, + "eval_runtime": 586.3584, + "eval_samples_per_second": 2.191, + "eval_steps_per_second": 1.097, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 1446, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.178081149272064e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}