diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6784 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 30.0, + "eval_steps": 500, + "global_step": 23130, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0311284046692607, + "grad_norm": 11.856830596923828, + "learning_rate": 3.314121037463977e-06, + "loss": 2.0132, + "step": 24 + }, + { + "epoch": 0.0622568093385214, + "grad_norm": 1.1055241823196411, + "learning_rate": 6.7723342939481265e-06, + "loss": 1.0751, + "step": 48 + }, + { + "epoch": 0.0933852140077821, + "grad_norm": 1.4284316301345825, + "learning_rate": 1.0230547550432277e-05, + "loss": 1.038, + "step": 72 + }, + { + "epoch": 0.1245136186770428, + "grad_norm": 0.6491210460662842, + "learning_rate": 1.3688760806916426e-05, + "loss": 0.9988, + "step": 96 + }, + { + "epoch": 0.1556420233463035, + "grad_norm": 0.39960888028144836, + "learning_rate": 1.7146974063400578e-05, + "loss": 0.9849, + "step": 120 + }, + { + "epoch": 0.1867704280155642, + "grad_norm": 0.678686797618866, + "learning_rate": 2.060518731988473e-05, + "loss": 0.9767, + "step": 144 + }, + { + "epoch": 0.2178988326848249, + "grad_norm": 0.5720846056938171, + "learning_rate": 2.406340057636888e-05, + "loss": 0.9684, + "step": 168 + }, + { + "epoch": 0.2490272373540856, + "grad_norm": 0.6077919602394104, + "learning_rate": 2.7521613832853026e-05, + "loss": 0.967, + "step": 192 + }, + { + "epoch": 0.2801556420233463, + "grad_norm": 0.7459629774093628, + "learning_rate": 3.097982708933718e-05, + "loss": 0.9553, + "step": 216 + }, + { + "epoch": 0.311284046692607, + "grad_norm": 0.8133582472801208, + "learning_rate": 3.443804034582133e-05, + "loss": 0.9474, + "step": 240 + }, + { + "epoch": 0.3424124513618677, + "grad_norm": 0.7748175263404846, + "learning_rate": 3.7896253602305474e-05, + "loss": 0.9404, + "step": 264 + }, + { + "epoch": 0.3735408560311284, + "grad_norm": 0.9244363903999329, + "learning_rate": 4.135446685878963e-05, + "loss": 0.9326, + "step": 288 + }, + { + "epoch": 0.4046692607003891, + "grad_norm": 1.0879474878311157, + "learning_rate": 4.4812680115273775e-05, + "loss": 0.9112, + "step": 312 + }, + { + "epoch": 0.4357976653696498, + "grad_norm": 1.3521575927734375, + "learning_rate": 4.827089337175792e-05, + "loss": 0.9033, + "step": 336 + }, + { + "epoch": 0.4669260700389105, + "grad_norm": 1.4220598936080933, + "learning_rate": 5.1729106628242076e-05, + "loss": 0.8877, + "step": 360 + }, + { + "epoch": 0.4980544747081712, + "grad_norm": 1.345544457435608, + "learning_rate": 5.518731988472623e-05, + "loss": 0.8726, + "step": 384 + }, + { + "epoch": 0.5291828793774319, + "grad_norm": 1.7283053398132324, + "learning_rate": 5.864553314121038e-05, + "loss": 0.8553, + "step": 408 + }, + { + "epoch": 0.5603112840466926, + "grad_norm": 1.2822779417037964, + "learning_rate": 6.210374639769453e-05, + "loss": 0.841, + "step": 432 + }, + { + "epoch": 0.5914396887159533, + "grad_norm": 2.8578877449035645, + "learning_rate": 6.556195965417868e-05, + "loss": 0.8262, + "step": 456 + }, + { + "epoch": 0.622568093385214, + "grad_norm": 1.603874683380127, + "learning_rate": 6.902017291066282e-05, + "loss": 0.7989, + "step": 480 + }, + { + "epoch": 0.6536964980544747, + "grad_norm": 2.2428903579711914, + "learning_rate": 7.247838616714697e-05, + "loss": 0.7958, + "step": 504 + }, + { + "epoch": 0.6848249027237354, + "grad_norm": 1.6760625839233398, + "learning_rate": 7.593659942363113e-05, + "loss": 0.7799, + "step": 528 + }, + { + "epoch": 0.7159533073929961, + "grad_norm": 2.1145055294036865, + "learning_rate": 7.939481268011528e-05, + "loss": 0.7671, + "step": 552 + }, + { + "epoch": 0.7470817120622568, + "grad_norm": 1.3805097341537476, + "learning_rate": 8.285302593659943e-05, + "loss": 0.7563, + "step": 576 + }, + { + "epoch": 0.7782101167315175, + "grad_norm": 2.1005349159240723, + "learning_rate": 8.631123919308359e-05, + "loss": 0.7396, + "step": 600 + }, + { + "epoch": 0.8093385214007782, + "grad_norm": 1.6995466947555542, + "learning_rate": 8.976945244956772e-05, + "loss": 0.738, + "step": 624 + }, + { + "epoch": 0.8404669260700389, + "grad_norm": 1.5165631771087646, + "learning_rate": 9.322766570605188e-05, + "loss": 0.7208, + "step": 648 + }, + { + "epoch": 0.8715953307392996, + "grad_norm": 1.4923312664031982, + "learning_rate": 9.668587896253603e-05, + "loss": 0.7126, + "step": 672 + }, + { + "epoch": 0.9027237354085603, + "grad_norm": 1.7333067655563354, + "learning_rate": 9.999999950982757e-05, + "loss": 0.6998, + "step": 696 + }, + { + "epoch": 0.933852140077821, + "grad_norm": 1.5790561437606812, + "learning_rate": 9.999969364253642e-05, + "loss": 0.6943, + "step": 720 + }, + { + "epoch": 0.9649805447470817, + "grad_norm": 2.4895715713500977, + "learning_rate": 9.999882310058304e-05, + "loss": 0.6887, + "step": 744 + }, + { + "epoch": 0.9961089494163424, + "grad_norm": 1.2938724756240845, + "learning_rate": 9.999738789379896e-05, + "loss": 0.6728, + "step": 768 + }, + { + "epoch": 1.027237354085603, + "grad_norm": 2.313992738723755, + "learning_rate": 9.999538803839277e-05, + "loss": 0.6704, + "step": 792 + }, + { + "epoch": 1.0583657587548638, + "grad_norm": 2.3841969966888428, + "learning_rate": 9.999282355694997e-05, + "loss": 0.6683, + "step": 816 + }, + { + "epoch": 1.0894941634241244, + "grad_norm": 1.7238163948059082, + "learning_rate": 9.998969447843267e-05, + "loss": 0.6598, + "step": 840 + }, + { + "epoch": 1.1206225680933852, + "grad_norm": 1.889561653137207, + "learning_rate": 9.998600083817934e-05, + "loss": 0.6469, + "step": 864 + }, + { + "epoch": 1.1517509727626458, + "grad_norm": 2.078350305557251, + "learning_rate": 9.998174267790433e-05, + "loss": 0.6394, + "step": 888 + }, + { + "epoch": 1.1828793774319066, + "grad_norm": 3.088223934173584, + "learning_rate": 9.99769200456974e-05, + "loss": 0.642, + "step": 912 + }, + { + "epoch": 1.2140077821011672, + "grad_norm": 1.8867013454437256, + "learning_rate": 9.997153299602332e-05, + "loss": 0.6365, + "step": 936 + }, + { + "epoch": 1.245136186770428, + "grad_norm": 2.187405586242676, + "learning_rate": 9.9965581589721e-05, + "loss": 0.6216, + "step": 960 + }, + { + "epoch": 1.2762645914396886, + "grad_norm": 1.5248736143112183, + "learning_rate": 9.995906589400307e-05, + "loss": 0.6208, + "step": 984 + }, + { + "epoch": 1.3073929961089494, + "grad_norm": 1.3533403873443604, + "learning_rate": 9.995198598245492e-05, + "loss": 0.6143, + "step": 1008 + }, + { + "epoch": 1.3385214007782102, + "grad_norm": 1.9436872005462646, + "learning_rate": 9.994434193503399e-05, + "loss": 0.6101, + "step": 1032 + }, + { + "epoch": 1.3696498054474708, + "grad_norm": 1.5890527963638306, + "learning_rate": 9.993613383806879e-05, + "loss": 0.6011, + "step": 1056 + }, + { + "epoch": 1.4007782101167314, + "grad_norm": 1.6845647096633911, + "learning_rate": 9.9927361784258e-05, + "loss": 0.6022, + "step": 1080 + }, + { + "epoch": 1.4319066147859922, + "grad_norm": 1.5048511028289795, + "learning_rate": 9.991802587266932e-05, + "loss": 0.6078, + "step": 1104 + }, + { + "epoch": 1.463035019455253, + "grad_norm": 1.8788032531738281, + "learning_rate": 9.990812620873848e-05, + "loss": 0.6014, + "step": 1128 + }, + { + "epoch": 1.4941634241245136, + "grad_norm": 2.0226938724517822, + "learning_rate": 9.989766290426795e-05, + "loss": 0.5912, + "step": 1152 + }, + { + "epoch": 1.5252918287937742, + "grad_norm": 1.9385308027267456, + "learning_rate": 9.98866360774257e-05, + "loss": 0.5812, + "step": 1176 + }, + { + "epoch": 1.556420233463035, + "grad_norm": 1.2753961086273193, + "learning_rate": 9.98750458527439e-05, + "loss": 0.5825, + "step": 1200 + }, + { + "epoch": 1.5875486381322959, + "grad_norm": 1.6889104843139648, + "learning_rate": 9.986289236111747e-05, + "loss": 0.58, + "step": 1224 + }, + { + "epoch": 1.6186770428015564, + "grad_norm": 2.130415916442871, + "learning_rate": 9.985017573980262e-05, + "loss": 0.5853, + "step": 1248 + }, + { + "epoch": 1.649805447470817, + "grad_norm": 1.8879446983337402, + "learning_rate": 9.983689613241531e-05, + "loss": 0.5806, + "step": 1272 + }, + { + "epoch": 1.6809338521400778, + "grad_norm": 1.2330986261367798, + "learning_rate": 9.982305368892964e-05, + "loss": 0.574, + "step": 1296 + }, + { + "epoch": 1.7120622568093387, + "grad_norm": 1.389142632484436, + "learning_rate": 9.980864856567606e-05, + "loss": 0.5743, + "step": 1320 + }, + { + "epoch": 1.7431906614785992, + "grad_norm": 1.184691309928894, + "learning_rate": 9.979368092533978e-05, + "loss": 0.5691, + "step": 1344 + }, + { + "epoch": 1.7743190661478598, + "grad_norm": 1.3246943950653076, + "learning_rate": 9.977815093695875e-05, + "loss": 0.5669, + "step": 1368 + }, + { + "epoch": 1.8054474708171206, + "grad_norm": 1.5033084154129028, + "learning_rate": 9.976205877592189e-05, + "loss": 0.5636, + "step": 1392 + }, + { + "epoch": 1.8365758754863815, + "grad_norm": 2.675381660461426, + "learning_rate": 9.974540462396697e-05, + "loss": 0.5554, + "step": 1416 + }, + { + "epoch": 1.867704280155642, + "grad_norm": 1.4676384925842285, + "learning_rate": 9.972818866917877e-05, + "loss": 0.5526, + "step": 1440 + }, + { + "epoch": 1.8988326848249026, + "grad_norm": 2.269249200820923, + "learning_rate": 9.971041110598669e-05, + "loss": 0.556, + "step": 1464 + }, + { + "epoch": 1.9299610894941635, + "grad_norm": 1.7739601135253906, + "learning_rate": 9.969207213516279e-05, + "loss": 0.5546, + "step": 1488 + }, + { + "epoch": 1.9610894941634243, + "grad_norm": 1.2574249505996704, + "learning_rate": 9.967317196381936e-05, + "loss": 0.549, + "step": 1512 + }, + { + "epoch": 1.9922178988326849, + "grad_norm": 1.65413236618042, + "learning_rate": 9.965371080540666e-05, + "loss": 0.5537, + "step": 1536 + }, + { + "epoch": 2.0233463035019454, + "grad_norm": 1.6155718564987183, + "learning_rate": 9.96336888797105e-05, + "loss": 0.5424, + "step": 1560 + }, + { + "epoch": 2.054474708171206, + "grad_norm": 1.556755542755127, + "learning_rate": 9.961310641284977e-05, + "loss": 0.5396, + "step": 1584 + }, + { + "epoch": 2.085603112840467, + "grad_norm": 1.5641894340515137, + "learning_rate": 9.959196363727383e-05, + "loss": 0.5465, + "step": 1608 + }, + { + "epoch": 2.1167315175097277, + "grad_norm": 1.4483375549316406, + "learning_rate": 9.957026079175996e-05, + "loss": 0.5401, + "step": 1632 + }, + { + "epoch": 2.1478599221789882, + "grad_norm": 1.8051731586456299, + "learning_rate": 9.954799812141054e-05, + "loss": 0.541, + "step": 1656 + }, + { + "epoch": 2.178988326848249, + "grad_norm": 2.337942361831665, + "learning_rate": 9.952517587765049e-05, + "loss": 0.5359, + "step": 1680 + }, + { + "epoch": 2.21011673151751, + "grad_norm": 1.5796310901641846, + "learning_rate": 9.950179431822421e-05, + "loss": 0.5361, + "step": 1704 + }, + { + "epoch": 2.2412451361867705, + "grad_norm": 1.3433961868286133, + "learning_rate": 9.947785370719281e-05, + "loss": 0.5254, + "step": 1728 + }, + { + "epoch": 2.272373540856031, + "grad_norm": 1.8424466848373413, + "learning_rate": 9.945335431493108e-05, + "loss": 0.5278, + "step": 1752 + }, + { + "epoch": 2.3035019455252916, + "grad_norm": 1.280912160873413, + "learning_rate": 9.942829641812445e-05, + "loss": 0.5314, + "step": 1776 + }, + { + "epoch": 2.3346303501945527, + "grad_norm": 2.389176368713379, + "learning_rate": 9.94026802997658e-05, + "loss": 0.5272, + "step": 1800 + }, + { + "epoch": 2.3657587548638133, + "grad_norm": 1.804115653038025, + "learning_rate": 9.93765062491524e-05, + "loss": 0.5214, + "step": 1824 + }, + { + "epoch": 2.396887159533074, + "grad_norm": 2.4799587726593018, + "learning_rate": 9.934977456188253e-05, + "loss": 0.5228, + "step": 1848 + }, + { + "epoch": 2.4280155642023344, + "grad_norm": 1.3502540588378906, + "learning_rate": 9.932248553985213e-05, + "loss": 0.5269, + "step": 1872 + }, + { + "epoch": 2.4591439688715955, + "grad_norm": 1.9639521837234497, + "learning_rate": 9.929463949125151e-05, + "loss": 0.5244, + "step": 1896 + }, + { + "epoch": 2.490272373540856, + "grad_norm": 1.5300196409225464, + "learning_rate": 9.926623673056173e-05, + "loss": 0.5163, + "step": 1920 + }, + { + "epoch": 2.5214007782101167, + "grad_norm": 1.3195667266845703, + "learning_rate": 9.923727757855117e-05, + "loss": 0.5155, + "step": 1944 + }, + { + "epoch": 2.5525291828793772, + "grad_norm": 1.3704023361206055, + "learning_rate": 9.920776236227181e-05, + "loss": 0.5164, + "step": 1968 + }, + { + "epoch": 2.5836575875486383, + "grad_norm": 1.2443211078643799, + "learning_rate": 9.917769141505557e-05, + "loss": 0.5119, + "step": 1992 + }, + { + "epoch": 2.614785992217899, + "grad_norm": 1.7005102634429932, + "learning_rate": 9.91470650765106e-05, + "loss": 0.5191, + "step": 2016 + }, + { + "epoch": 2.6459143968871595, + "grad_norm": 1.742263674736023, + "learning_rate": 9.911588369251736e-05, + "loss": 0.5207, + "step": 2040 + }, + { + "epoch": 2.6770428015564205, + "grad_norm": 2.342224597930908, + "learning_rate": 9.908414761522473e-05, + "loss": 0.5116, + "step": 2064 + }, + { + "epoch": 2.708171206225681, + "grad_norm": 1.481919765472412, + "learning_rate": 9.905185720304612e-05, + "loss": 0.5169, + "step": 2088 + }, + { + "epoch": 2.7392996108949417, + "grad_norm": 2.477743148803711, + "learning_rate": 9.901901282065529e-05, + "loss": 0.5125, + "step": 2112 + }, + { + "epoch": 2.7704280155642023, + "grad_norm": 1.231108546257019, + "learning_rate": 9.898561483898233e-05, + "loss": 0.5119, + "step": 2136 + }, + { + "epoch": 2.801556420233463, + "grad_norm": 1.6876734495162964, + "learning_rate": 9.895166363520943e-05, + "loss": 0.5098, + "step": 2160 + }, + { + "epoch": 2.832684824902724, + "grad_norm": 1.886053442955017, + "learning_rate": 9.891715959276664e-05, + "loss": 0.509, + "step": 2184 + }, + { + "epoch": 2.8638132295719845, + "grad_norm": 2.044147253036499, + "learning_rate": 9.88821031013275e-05, + "loss": 0.5081, + "step": 2208 + }, + { + "epoch": 2.894941634241245, + "grad_norm": 1.8339983224868774, + "learning_rate": 9.88464945568047e-05, + "loss": 0.5031, + "step": 2232 + }, + { + "epoch": 2.926070038910506, + "grad_norm": 2.0237860679626465, + "learning_rate": 9.881033436134555e-05, + "loss": 0.5026, + "step": 2256 + }, + { + "epoch": 2.9571984435797667, + "grad_norm": 1.222092866897583, + "learning_rate": 9.877362292332749e-05, + "loss": 0.4922, + "step": 2280 + }, + { + "epoch": 2.9883268482490273, + "grad_norm": 2.8668859004974365, + "learning_rate": 9.873636065735343e-05, + "loss": 0.4978, + "step": 2304 + }, + { + "epoch": 3.019455252918288, + "grad_norm": 1.8704198598861694, + "learning_rate": 9.869854798424709e-05, + "loss": 0.4999, + "step": 2328 + }, + { + "epoch": 3.0505836575875485, + "grad_norm": 1.3280694484710693, + "learning_rate": 9.866018533104826e-05, + "loss": 0.4979, + "step": 2352 + }, + { + "epoch": 3.0817120622568095, + "grad_norm": 1.6099941730499268, + "learning_rate": 9.862127313100795e-05, + "loss": 0.4966, + "step": 2376 + }, + { + "epoch": 3.11284046692607, + "grad_norm": 1.797253131866455, + "learning_rate": 9.858181182358355e-05, + "loss": 0.4913, + "step": 2400 + }, + { + "epoch": 3.1439688715953307, + "grad_norm": 1.4523372650146484, + "learning_rate": 9.854180185443378e-05, + "loss": 0.494, + "step": 2424 + }, + { + "epoch": 3.1750972762645913, + "grad_norm": 1.665285587310791, + "learning_rate": 9.850124367541371e-05, + "loss": 0.495, + "step": 2448 + }, + { + "epoch": 3.2062256809338523, + "grad_norm": 1.2931227684020996, + "learning_rate": 9.84601377445697e-05, + "loss": 0.4949, + "step": 2472 + }, + { + "epoch": 3.237354085603113, + "grad_norm": 2.0045413970947266, + "learning_rate": 9.841848452613412e-05, + "loss": 0.4901, + "step": 2496 + }, + { + "epoch": 3.2684824902723735, + "grad_norm": 1.2784613370895386, + "learning_rate": 9.83762844905202e-05, + "loss": 0.4967, + "step": 2520 + }, + { + "epoch": 3.299610894941634, + "grad_norm": 1.485795497894287, + "learning_rate": 9.833353811431669e-05, + "loss": 0.4921, + "step": 2544 + }, + { + "epoch": 3.330739299610895, + "grad_norm": 2.1288626194000244, + "learning_rate": 9.829024588028244e-05, + "loss": 0.4912, + "step": 2568 + }, + { + "epoch": 3.3618677042801557, + "grad_norm": 1.5102566480636597, + "learning_rate": 9.824640827734102e-05, + "loss": 0.4938, + "step": 2592 + }, + { + "epoch": 3.3929961089494163, + "grad_norm": 2.126574993133545, + "learning_rate": 9.820202580057512e-05, + "loss": 0.4881, + "step": 2616 + }, + { + "epoch": 3.424124513618677, + "grad_norm": 1.1427215337753296, + "learning_rate": 9.8157098951221e-05, + "loss": 0.4956, + "step": 2640 + }, + { + "epoch": 3.455252918287938, + "grad_norm": 1.847524881362915, + "learning_rate": 9.811162823666287e-05, + "loss": 0.4883, + "step": 2664 + }, + { + "epoch": 3.4863813229571985, + "grad_norm": 1.3941086530685425, + "learning_rate": 9.806561417042706e-05, + "loss": 0.488, + "step": 2688 + }, + { + "epoch": 3.517509727626459, + "grad_norm": 1.7835474014282227, + "learning_rate": 9.801905727217631e-05, + "loss": 0.4796, + "step": 2712 + }, + { + "epoch": 3.5486381322957197, + "grad_norm": 2.4145917892456055, + "learning_rate": 9.797195806770387e-05, + "loss": 0.4856, + "step": 2736 + }, + { + "epoch": 3.5797665369649807, + "grad_norm": 1.6567249298095703, + "learning_rate": 9.792431708892752e-05, + "loss": 0.4799, + "step": 2760 + }, + { + "epoch": 3.6108949416342413, + "grad_norm": 1.7985295057296753, + "learning_rate": 9.787613487388365e-05, + "loss": 0.4886, + "step": 2784 + }, + { + "epoch": 3.642023346303502, + "grad_norm": 1.7581013441085815, + "learning_rate": 9.78274119667211e-05, + "loss": 0.4835, + "step": 2808 + }, + { + "epoch": 3.673151750972763, + "grad_norm": 1.6254545450210571, + "learning_rate": 9.777814891769507e-05, + "loss": 0.4841, + "step": 2832 + }, + { + "epoch": 3.7042801556420235, + "grad_norm": 1.745969295501709, + "learning_rate": 9.772834628316087e-05, + "loss": 0.4848, + "step": 2856 + }, + { + "epoch": 3.735408560311284, + "grad_norm": 1.762830138206482, + "learning_rate": 9.767800462556769e-05, + "loss": 0.476, + "step": 2880 + }, + { + "epoch": 3.7665369649805447, + "grad_norm": 1.6283063888549805, + "learning_rate": 9.762712451345217e-05, + "loss": 0.48, + "step": 2904 + }, + { + "epoch": 3.7976653696498053, + "grad_norm": 1.7204512357711792, + "learning_rate": 9.757570652143202e-05, + "loss": 0.4746, + "step": 2928 + }, + { + "epoch": 3.8287937743190663, + "grad_norm": 2.6043598651885986, + "learning_rate": 9.752375123019956e-05, + "loss": 0.4805, + "step": 2952 + }, + { + "epoch": 3.859922178988327, + "grad_norm": 2.134938955307007, + "learning_rate": 9.74712592265151e-05, + "loss": 0.4776, + "step": 2976 + }, + { + "epoch": 3.8910505836575875, + "grad_norm": 1.4748331308364868, + "learning_rate": 9.741823110320037e-05, + "loss": 0.4725, + "step": 3000 + }, + { + "epoch": 3.9221789883268485, + "grad_norm": 2.2188987731933594, + "learning_rate": 9.73646674591318e-05, + "loss": 0.4781, + "step": 3024 + }, + { + "epoch": 3.953307392996109, + "grad_norm": 1.2936460971832275, + "learning_rate": 9.731056889923374e-05, + "loss": 0.4808, + "step": 3048 + }, + { + "epoch": 3.9844357976653697, + "grad_norm": 2.5133862495422363, + "learning_rate": 9.725593603447166e-05, + "loss": 0.4839, + "step": 3072 + }, + { + "epoch": 4.01556420233463, + "grad_norm": 2.2660224437713623, + "learning_rate": 9.720076948184522e-05, + "loss": 0.4709, + "step": 3096 + }, + { + "epoch": 4.046692607003891, + "grad_norm": 1.573203444480896, + "learning_rate": 9.714506986438134e-05, + "loss": 0.4762, + "step": 3120 + }, + { + "epoch": 4.0778210116731515, + "grad_norm": 1.9054023027420044, + "learning_rate": 9.70888378111271e-05, + "loss": 0.4796, + "step": 3144 + }, + { + "epoch": 4.108949416342412, + "grad_norm": 2.2776753902435303, + "learning_rate": 9.703207395714274e-05, + "loss": 0.4705, + "step": 3168 + }, + { + "epoch": 4.1400778210116735, + "grad_norm": 1.614623785018921, + "learning_rate": 9.697477894349438e-05, + "loss": 0.4713, + "step": 3192 + }, + { + "epoch": 4.171206225680934, + "grad_norm": 2.478569269180298, + "learning_rate": 9.691695341724681e-05, + "loss": 0.4719, + "step": 3216 + }, + { + "epoch": 4.202334630350195, + "grad_norm": 1.3797364234924316, + "learning_rate": 9.685859803145625e-05, + "loss": 0.4663, + "step": 3240 + }, + { + "epoch": 4.233463035019455, + "grad_norm": 2.49601674079895, + "learning_rate": 9.679971344516288e-05, + "loss": 0.4827, + "step": 3264 + }, + { + "epoch": 4.264591439688716, + "grad_norm": 1.5913656949996948, + "learning_rate": 9.674030032338346e-05, + "loss": 0.4869, + "step": 3288 + }, + { + "epoch": 4.2957198443579765, + "grad_norm": 1.5114320516586304, + "learning_rate": 9.668035933710378e-05, + "loss": 0.4794, + "step": 3312 + }, + { + "epoch": 4.326848249027237, + "grad_norm": 1.905714750289917, + "learning_rate": 9.661989116327112e-05, + "loss": 0.4702, + "step": 3336 + }, + { + "epoch": 4.357976653696498, + "grad_norm": 1.6932348012924194, + "learning_rate": 9.655889648478657e-05, + "loss": 0.4693, + "step": 3360 + }, + { + "epoch": 4.389105058365759, + "grad_norm": 1.9976513385772705, + "learning_rate": 9.649737599049736e-05, + "loss": 0.4705, + "step": 3384 + }, + { + "epoch": 4.42023346303502, + "grad_norm": 1.4826905727386475, + "learning_rate": 9.643533037518899e-05, + "loss": 0.4697, + "step": 3408 + }, + { + "epoch": 4.45136186770428, + "grad_norm": 1.617922306060791, + "learning_rate": 9.637276033957755e-05, + "loss": 0.4684, + "step": 3432 + }, + { + "epoch": 4.482490272373541, + "grad_norm": 2.4124162197113037, + "learning_rate": 9.630966659030158e-05, + "loss": 0.462, + "step": 3456 + }, + { + "epoch": 4.5136186770428015, + "grad_norm": 1.8999947309494019, + "learning_rate": 9.624604983991434e-05, + "loss": 0.4614, + "step": 3480 + }, + { + "epoch": 4.544747081712062, + "grad_norm": 2.2038631439208984, + "learning_rate": 9.618191080687552e-05, + "loss": 0.473, + "step": 3504 + }, + { + "epoch": 4.575875486381323, + "grad_norm": 1.5659903287887573, + "learning_rate": 9.611725021554333e-05, + "loss": 0.4632, + "step": 3528 + }, + { + "epoch": 4.607003891050583, + "grad_norm": 2.38783597946167, + "learning_rate": 9.605206879616617e-05, + "loss": 0.4547, + "step": 3552 + }, + { + "epoch": 4.638132295719844, + "grad_norm": 1.5512051582336426, + "learning_rate": 9.59863672848745e-05, + "loss": 0.4623, + "step": 3576 + }, + { + "epoch": 4.669260700389105, + "grad_norm": 3.2371737957000732, + "learning_rate": 9.592014642367243e-05, + "loss": 0.4635, + "step": 3600 + }, + { + "epoch": 4.700389105058366, + "grad_norm": 1.7594435214996338, + "learning_rate": 9.585340696042935e-05, + "loss": 0.4674, + "step": 3624 + }, + { + "epoch": 4.7315175097276265, + "grad_norm": 1.3836287260055542, + "learning_rate": 9.57861496488716e-05, + "loss": 0.4611, + "step": 3648 + }, + { + "epoch": 4.762645914396887, + "grad_norm": 1.7907147407531738, + "learning_rate": 9.571837524857384e-05, + "loss": 0.4609, + "step": 3672 + }, + { + "epoch": 4.793774319066148, + "grad_norm": 1.7246521711349487, + "learning_rate": 9.565008452495046e-05, + "loss": 0.4588, + "step": 3696 + }, + { + "epoch": 4.824902723735408, + "grad_norm": 2.1095101833343506, + "learning_rate": 9.558127824924701e-05, + "loss": 0.4623, + "step": 3720 + }, + { + "epoch": 4.856031128404669, + "grad_norm": 1.1277464628219604, + "learning_rate": 9.551195719853147e-05, + "loss": 0.4568, + "step": 3744 + }, + { + "epoch": 4.88715953307393, + "grad_norm": 1.2232158184051514, + "learning_rate": 9.544212215568547e-05, + "loss": 0.459, + "step": 3768 + }, + { + "epoch": 4.918287937743191, + "grad_norm": 1.9220589399337769, + "learning_rate": 9.53717739093954e-05, + "loss": 0.4539, + "step": 3792 + }, + { + "epoch": 4.9494163424124515, + "grad_norm": 1.6076886653900146, + "learning_rate": 9.530091325414359e-05, + "loss": 0.4583, + "step": 3816 + }, + { + "epoch": 4.980544747081712, + "grad_norm": 1.2246028184890747, + "learning_rate": 9.522954099019927e-05, + "loss": 0.4567, + "step": 3840 + }, + { + "epoch": 5.011673151750973, + "grad_norm": 1.4004205465316772, + "learning_rate": 9.515765792360955e-05, + "loss": 0.4535, + "step": 3864 + }, + { + "epoch": 5.042801556420233, + "grad_norm": 1.30203378200531, + "learning_rate": 9.508526486619036e-05, + "loss": 0.452, + "step": 3888 + }, + { + "epoch": 5.073929961089494, + "grad_norm": 1.538682222366333, + "learning_rate": 9.501236263551719e-05, + "loss": 0.4511, + "step": 3912 + }, + { + "epoch": 5.1050583657587545, + "grad_norm": 1.3054372072219849, + "learning_rate": 9.493895205491595e-05, + "loss": 0.4489, + "step": 3936 + }, + { + "epoch": 5.136186770428016, + "grad_norm": 1.4001922607421875, + "learning_rate": 9.486503395345358e-05, + "loss": 0.4577, + "step": 3960 + }, + { + "epoch": 5.167315175097277, + "grad_norm": 1.3580487966537476, + "learning_rate": 9.47906091659288e-05, + "loss": 0.4519, + "step": 3984 + }, + { + "epoch": 5.198443579766537, + "grad_norm": 1.929853081703186, + "learning_rate": 9.47156785328626e-05, + "loss": 0.4562, + "step": 4008 + }, + { + "epoch": 5.229571984435798, + "grad_norm": 1.9883568286895752, + "learning_rate": 9.464024290048879e-05, + "loss": 0.4573, + "step": 4032 + }, + { + "epoch": 5.260700389105058, + "grad_norm": 1.7795013189315796, + "learning_rate": 9.456430312074432e-05, + "loss": 0.4513, + "step": 4056 + }, + { + "epoch": 5.291828793774319, + "grad_norm": 1.1019718647003174, + "learning_rate": 9.44878600512599e-05, + "loss": 0.4475, + "step": 4080 + }, + { + "epoch": 5.3229571984435795, + "grad_norm": 1.604556918144226, + "learning_rate": 9.441091455535007e-05, + "loss": 0.4466, + "step": 4104 + }, + { + "epoch": 5.35408560311284, + "grad_norm": 1.8707294464111328, + "learning_rate": 9.433346750200363e-05, + "loss": 0.4501, + "step": 4128 + }, + { + "epoch": 5.385214007782102, + "grad_norm": 1.6021867990493774, + "learning_rate": 9.425551976587366e-05, + "loss": 0.4443, + "step": 4152 + }, + { + "epoch": 5.416342412451362, + "grad_norm": 1.7186486721038818, + "learning_rate": 9.417707222726784e-05, + "loss": 0.4374, + "step": 4176 + }, + { + "epoch": 5.447470817120623, + "grad_norm": 2.0640745162963867, + "learning_rate": 9.409812577213833e-05, + "loss": 0.4468, + "step": 4200 + }, + { + "epoch": 5.478599221789883, + "grad_norm": 2.1669087409973145, + "learning_rate": 9.401868129207181e-05, + "loss": 0.4501, + "step": 4224 + }, + { + "epoch": 5.509727626459144, + "grad_norm": 2.237527847290039, + "learning_rate": 9.393873968427953e-05, + "loss": 0.4469, + "step": 4248 + }, + { + "epoch": 5.5408560311284045, + "grad_norm": 1.5120989084243774, + "learning_rate": 9.385830185158701e-05, + "loss": 0.4425, + "step": 4272 + }, + { + "epoch": 5.571984435797665, + "grad_norm": 2.029425621032715, + "learning_rate": 9.377736870242393e-05, + "loss": 0.4509, + "step": 4296 + }, + { + "epoch": 5.603112840466926, + "grad_norm": 1.609480857849121, + "learning_rate": 9.369594115081386e-05, + "loss": 0.4528, + "step": 4320 + }, + { + "epoch": 5.634241245136186, + "grad_norm": 1.126060128211975, + "learning_rate": 9.361402011636395e-05, + "loss": 0.4435, + "step": 4344 + }, + { + "epoch": 5.665369649805448, + "grad_norm": 3.637361526489258, + "learning_rate": 9.353160652425452e-05, + "loss": 0.4466, + "step": 4368 + }, + { + "epoch": 5.696498054474708, + "grad_norm": 3.3293521404266357, + "learning_rate": 9.344870130522863e-05, + "loss": 0.4495, + "step": 4392 + }, + { + "epoch": 5.727626459143969, + "grad_norm": 1.1623419523239136, + "learning_rate": 9.33653053955815e-05, + "loss": 0.4362, + "step": 4416 + }, + { + "epoch": 5.7587548638132295, + "grad_norm": 1.3908815383911133, + "learning_rate": 9.328141973715008e-05, + "loss": 0.445, + "step": 4440 + }, + { + "epoch": 5.78988326848249, + "grad_norm": 1.1905103921890259, + "learning_rate": 9.31970452773023e-05, + "loss": 0.4399, + "step": 4464 + }, + { + "epoch": 5.821011673151751, + "grad_norm": 1.7141236066818237, + "learning_rate": 9.311218296892636e-05, + "loss": 0.4396, + "step": 4488 + }, + { + "epoch": 5.852140077821011, + "grad_norm": 1.5528429746627808, + "learning_rate": 9.302683377042007e-05, + "loss": 0.4369, + "step": 4512 + }, + { + "epoch": 5.883268482490273, + "grad_norm": 1.206060528755188, + "learning_rate": 9.29409986456799e-05, + "loss": 0.443, + "step": 4536 + }, + { + "epoch": 5.914396887159533, + "grad_norm": 1.2948627471923828, + "learning_rate": 9.285467856409023e-05, + "loss": 0.4421, + "step": 4560 + }, + { + "epoch": 5.945525291828794, + "grad_norm": 1.6690411567687988, + "learning_rate": 9.276787450051225e-05, + "loss": 0.4393, + "step": 4584 + }, + { + "epoch": 5.976653696498055, + "grad_norm": 1.4727965593338013, + "learning_rate": 9.26805874352731e-05, + "loss": 0.443, + "step": 4608 + }, + { + "epoch": 6.007782101167315, + "grad_norm": 2.1878299713134766, + "learning_rate": 9.25928183541547e-05, + "loss": 0.4359, + "step": 4632 + }, + { + "epoch": 6.038910505836576, + "grad_norm": 1.5079774856567383, + "learning_rate": 9.250456824838263e-05, + "loss": 0.438, + "step": 4656 + }, + { + "epoch": 6.070038910505836, + "grad_norm": 1.5700092315673828, + "learning_rate": 9.241583811461498e-05, + "loss": 0.4355, + "step": 4680 + }, + { + "epoch": 6.101167315175097, + "grad_norm": 1.0717116594314575, + "learning_rate": 9.232662895493107e-05, + "loss": 0.4337, + "step": 4704 + }, + { + "epoch": 6.132295719844358, + "grad_norm": 1.775414228439331, + "learning_rate": 9.223694177682009e-05, + "loss": 0.4398, + "step": 4728 + }, + { + "epoch": 6.163424124513619, + "grad_norm": 3.057781457901001, + "learning_rate": 9.214677759316982e-05, + "loss": 0.4367, + "step": 4752 + }, + { + "epoch": 6.19455252918288, + "grad_norm": 1.2848880290985107, + "learning_rate": 9.205613742225507e-05, + "loss": 0.433, + "step": 4776 + }, + { + "epoch": 6.22568093385214, + "grad_norm": 1.5465294122695923, + "learning_rate": 9.196502228772626e-05, + "loss": 0.442, + "step": 4800 + }, + { + "epoch": 6.256809338521401, + "grad_norm": 1.1864486932754517, + "learning_rate": 9.18734332185979e-05, + "loss": 0.4356, + "step": 4824 + }, + { + "epoch": 6.287937743190661, + "grad_norm": 1.6817840337753296, + "learning_rate": 9.17813712492368e-05, + "loss": 0.4386, + "step": 4848 + }, + { + "epoch": 6.319066147859922, + "grad_norm": 1.285474181175232, + "learning_rate": 9.16888374193506e-05, + "loss": 0.4306, + "step": 4872 + }, + { + "epoch": 6.3501945525291825, + "grad_norm": 1.5364230871200562, + "learning_rate": 9.159583277397587e-05, + "loss": 0.4333, + "step": 4896 + }, + { + "epoch": 6.381322957198444, + "grad_norm": 1.8164541721343994, + "learning_rate": 9.150235836346639e-05, + "loss": 0.4285, + "step": 4920 + }, + { + "epoch": 6.412451361867705, + "grad_norm": 1.5146026611328125, + "learning_rate": 9.140841524348125e-05, + "loss": 0.4354, + "step": 4944 + }, + { + "epoch": 6.443579766536965, + "grad_norm": 1.238393783569336, + "learning_rate": 9.131400447497294e-05, + "loss": 0.4257, + "step": 4968 + }, + { + "epoch": 6.474708171206226, + "grad_norm": 1.4109466075897217, + "learning_rate": 9.121912712417536e-05, + "loss": 0.43, + "step": 4992 + }, + { + "epoch": 6.505836575875486, + "grad_norm": 1.8265984058380127, + "learning_rate": 9.11237842625918e-05, + "loss": 0.4373, + "step": 5016 + }, + { + "epoch": 6.536964980544747, + "grad_norm": 1.5519527196884155, + "learning_rate": 9.102797696698284e-05, + "loss": 0.4347, + "step": 5040 + }, + { + "epoch": 6.5680933852140075, + "grad_norm": 1.314172387123108, + "learning_rate": 9.093170631935412e-05, + "loss": 0.4348, + "step": 5064 + }, + { + "epoch": 6.599221789883268, + "grad_norm": 1.7968671321868896, + "learning_rate": 9.083497340694425e-05, + "loss": 0.4379, + "step": 5088 + }, + { + "epoch": 6.630350194552529, + "grad_norm": 1.166242003440857, + "learning_rate": 9.073777932221239e-05, + "loss": 0.4313, + "step": 5112 + }, + { + "epoch": 6.66147859922179, + "grad_norm": 1.9698489904403687, + "learning_rate": 9.064012516282601e-05, + "loss": 0.441, + "step": 5136 + }, + { + "epoch": 6.692607003891051, + "grad_norm": 1.2938389778137207, + "learning_rate": 9.054201203164845e-05, + "loss": 0.4301, + "step": 5160 + }, + { + "epoch": 6.723735408560311, + "grad_norm": 5.220723628997803, + "learning_rate": 9.044344103672651e-05, + "loss": 0.4232, + "step": 5184 + }, + { + "epoch": 6.754863813229572, + "grad_norm": 1.7442070245742798, + "learning_rate": 9.034441329127783e-05, + "loss": 0.4343, + "step": 5208 + }, + { + "epoch": 6.785992217898833, + "grad_norm": 4.927098274230957, + "learning_rate": 9.024492991367848e-05, + "loss": 0.4279, + "step": 5232 + }, + { + "epoch": 6.817120622568093, + "grad_norm": 1.1979647874832153, + "learning_rate": 9.014499202745019e-05, + "loss": 0.4312, + "step": 5256 + }, + { + "epoch": 6.848249027237354, + "grad_norm": 1.6905076503753662, + "learning_rate": 9.004460076124768e-05, + "loss": 0.432, + "step": 5280 + }, + { + "epoch": 6.879377431906615, + "grad_norm": 1.388134241104126, + "learning_rate": 8.994375724884604e-05, + "loss": 0.4314, + "step": 5304 + }, + { + "epoch": 6.910505836575876, + "grad_norm": 2.4431025981903076, + "learning_rate": 8.984246262912774e-05, + "loss": 0.4341, + "step": 5328 + }, + { + "epoch": 6.941634241245136, + "grad_norm": 2.5521421432495117, + "learning_rate": 8.974071804606989e-05, + "loss": 0.4251, + "step": 5352 + }, + { + "epoch": 6.972762645914397, + "grad_norm": 1.6180981397628784, + "learning_rate": 8.96385246487313e-05, + "loss": 0.4332, + "step": 5376 + }, + { + "epoch": 7.003891050583658, + "grad_norm": 1.673168659210205, + "learning_rate": 8.95358835912395e-05, + "loss": 0.4258, + "step": 5400 + }, + { + "epoch": 7.035019455252918, + "grad_norm": 2.032773733139038, + "learning_rate": 8.943279603277767e-05, + "loss": 0.4337, + "step": 5424 + }, + { + "epoch": 7.066147859922179, + "grad_norm": 1.7290483713150024, + "learning_rate": 8.932926313757157e-05, + "loss": 0.4312, + "step": 5448 + }, + { + "epoch": 7.097276264591439, + "grad_norm": 4.685028076171875, + "learning_rate": 8.922528607487645e-05, + "loss": 0.4416, + "step": 5472 + }, + { + "epoch": 7.1284046692607, + "grad_norm": 1.5580335855484009, + "learning_rate": 8.912086601896372e-05, + "loss": 0.4358, + "step": 5496 + }, + { + "epoch": 7.159533073929961, + "grad_norm": 1.332607388496399, + "learning_rate": 8.901600414910785e-05, + "loss": 0.4288, + "step": 5520 + }, + { + "epoch": 7.190661478599222, + "grad_norm": 1.2149999141693115, + "learning_rate": 8.891070164957288e-05, + "loss": 0.4238, + "step": 5544 + }, + { + "epoch": 7.221789883268483, + "grad_norm": 1.4633874893188477, + "learning_rate": 8.880495970959917e-05, + "loss": 0.4278, + "step": 5568 + }, + { + "epoch": 7.252918287937743, + "grad_norm": 1.4801607131958008, + "learning_rate": 8.869877952338991e-05, + "loss": 0.4227, + "step": 5592 + }, + { + "epoch": 7.284046692607004, + "grad_norm": 1.8194708824157715, + "learning_rate": 8.85921622900977e-05, + "loss": 0.4192, + "step": 5616 + }, + { + "epoch": 7.315175097276264, + "grad_norm": 1.111076831817627, + "learning_rate": 8.848510921381089e-05, + "loss": 0.4231, + "step": 5640 + }, + { + "epoch": 7.346303501945525, + "grad_norm": 1.4320513010025024, + "learning_rate": 8.83776215035401e-05, + "loss": 0.4224, + "step": 5664 + }, + { + "epoch": 7.377431906614786, + "grad_norm": 1.80966317653656, + "learning_rate": 8.826970037320448e-05, + "loss": 0.4183, + "step": 5688 + }, + { + "epoch": 7.408560311284047, + "grad_norm": 1.843509554862976, + "learning_rate": 8.816134704161807e-05, + "loss": 0.417, + "step": 5712 + }, + { + "epoch": 7.439688715953308, + "grad_norm": 1.2015341520309448, + "learning_rate": 8.805256273247598e-05, + "loss": 0.4177, + "step": 5736 + }, + { + "epoch": 7.470817120622568, + "grad_norm": 1.6432462930679321, + "learning_rate": 8.794334867434059e-05, + "loss": 0.4236, + "step": 5760 + }, + { + "epoch": 7.501945525291829, + "grad_norm": 1.354224443435669, + "learning_rate": 8.783370610062769e-05, + "loss": 0.4142, + "step": 5784 + }, + { + "epoch": 7.533073929961089, + "grad_norm": 1.6838608980178833, + "learning_rate": 8.772363624959255e-05, + "loss": 0.4173, + "step": 5808 + }, + { + "epoch": 7.56420233463035, + "grad_norm": 1.8743314743041992, + "learning_rate": 8.761314036431588e-05, + "loss": 0.4248, + "step": 5832 + }, + { + "epoch": 7.595330739299611, + "grad_norm": 1.4311802387237549, + "learning_rate": 8.750221969268985e-05, + "loss": 0.4204, + "step": 5856 + }, + { + "epoch": 7.626459143968871, + "grad_norm": 1.4219359159469604, + "learning_rate": 8.739087548740404e-05, + "loss": 0.4201, + "step": 5880 + }, + { + "epoch": 7.657587548638133, + "grad_norm": 2.070533275604248, + "learning_rate": 8.727910900593114e-05, + "loss": 0.4229, + "step": 5904 + }, + { + "epoch": 7.688715953307393, + "grad_norm": 1.4531338214874268, + "learning_rate": 8.716692151051293e-05, + "loss": 0.42, + "step": 5928 + }, + { + "epoch": 7.719844357976654, + "grad_norm": 2.2621729373931885, + "learning_rate": 8.705431426814585e-05, + "loss": 0.4171, + "step": 5952 + }, + { + "epoch": 7.750972762645914, + "grad_norm": 1.242394208908081, + "learning_rate": 8.694128855056683e-05, + "loss": 0.4133, + "step": 5976 + }, + { + "epoch": 7.782101167315175, + "grad_norm": 1.2939616441726685, + "learning_rate": 8.68278456342389e-05, + "loss": 0.4185, + "step": 6000 + }, + { + "epoch": 7.813229571984436, + "grad_norm": 2.0788450241088867, + "learning_rate": 8.671398680033668e-05, + "loss": 0.4183, + "step": 6024 + }, + { + "epoch": 7.844357976653696, + "grad_norm": 2.538680076599121, + "learning_rate": 8.659971333473206e-05, + "loss": 0.4246, + "step": 6048 + }, + { + "epoch": 7.875486381322958, + "grad_norm": 2.1128950119018555, + "learning_rate": 8.648502652797954e-05, + "loss": 0.4156, + "step": 6072 + }, + { + "epoch": 7.906614785992218, + "grad_norm": 2.2612478733062744, + "learning_rate": 8.636992767530171e-05, + "loss": 0.409, + "step": 6096 + }, + { + "epoch": 7.937743190661479, + "grad_norm": 2.0751936435699463, + "learning_rate": 8.625441807657471e-05, + "loss": 0.4264, + "step": 6120 + }, + { + "epoch": 7.968871595330739, + "grad_norm": 2.009459972381592, + "learning_rate": 8.613849903631334e-05, + "loss": 0.4255, + "step": 6144 + }, + { + "epoch": 8.0, + "grad_norm": 1.8576405048370361, + "learning_rate": 8.602217186365655e-05, + "loss": 0.4211, + "step": 6168 + }, + { + "epoch": 8.03112840466926, + "grad_norm": 2.817073345184326, + "learning_rate": 8.590543787235252e-05, + "loss": 0.4156, + "step": 6192 + }, + { + "epoch": 8.062256809338521, + "grad_norm": 1.5011825561523438, + "learning_rate": 8.578829838074389e-05, + "loss": 0.41, + "step": 6216 + }, + { + "epoch": 8.093385214007782, + "grad_norm": 1.2293556928634644, + "learning_rate": 8.567075471175281e-05, + "loss": 0.417, + "step": 6240 + }, + { + "epoch": 8.124513618677042, + "grad_norm": 1.415345549583435, + "learning_rate": 8.555280819286603e-05, + "loss": 0.4148, + "step": 6264 + }, + { + "epoch": 8.155642023346303, + "grad_norm": 2.2379307746887207, + "learning_rate": 8.543446015611995e-05, + "loss": 0.4104, + "step": 6288 + }, + { + "epoch": 8.186770428015564, + "grad_norm": 1.0670602321624756, + "learning_rate": 8.531571193808549e-05, + "loss": 0.4131, + "step": 6312 + }, + { + "epoch": 8.217898832684824, + "grad_norm": 1.0915449857711792, + "learning_rate": 8.519656487985309e-05, + "loss": 0.4073, + "step": 6336 + }, + { + "epoch": 8.249027237354085, + "grad_norm": 1.4844944477081299, + "learning_rate": 8.507702032701748e-05, + "loss": 0.4109, + "step": 6360 + }, + { + "epoch": 8.280155642023347, + "grad_norm": 1.1173604726791382, + "learning_rate": 8.495707962966253e-05, + "loss": 0.4145, + "step": 6384 + }, + { + "epoch": 8.311284046692608, + "grad_norm": 1.5978012084960938, + "learning_rate": 8.4836744142346e-05, + "loss": 0.4108, + "step": 6408 + }, + { + "epoch": 8.342412451361868, + "grad_norm": 1.7912710905075073, + "learning_rate": 8.471601522408422e-05, + "loss": 0.4155, + "step": 6432 + }, + { + "epoch": 8.373540856031129, + "grad_norm": 2.182061195373535, + "learning_rate": 8.459489423833678e-05, + "loss": 0.4117, + "step": 6456 + }, + { + "epoch": 8.40466926070039, + "grad_norm": 1.8379067182540894, + "learning_rate": 8.447338255299106e-05, + "loss": 0.4104, + "step": 6480 + }, + { + "epoch": 8.43579766536965, + "grad_norm": 1.4474197626113892, + "learning_rate": 8.435148154034694e-05, + "loss": 0.4142, + "step": 6504 + }, + { + "epoch": 8.46692607003891, + "grad_norm": 2.309518575668335, + "learning_rate": 8.422919257710104e-05, + "loss": 0.4079, + "step": 6528 + }, + { + "epoch": 8.498054474708171, + "grad_norm": 1.2606794834136963, + "learning_rate": 8.410651704433146e-05, + "loss": 0.4125, + "step": 6552 + }, + { + "epoch": 8.529182879377432, + "grad_norm": 1.683693766593933, + "learning_rate": 8.398345632748194e-05, + "loss": 0.4132, + "step": 6576 + }, + { + "epoch": 8.560311284046692, + "grad_norm": 2.342796802520752, + "learning_rate": 8.386001181634642e-05, + "loss": 0.4125, + "step": 6600 + }, + { + "epoch": 8.591439688715953, + "grad_norm": 0.9687896370887756, + "learning_rate": 8.373618490505315e-05, + "loss": 0.4082, + "step": 6624 + }, + { + "epoch": 8.622568093385214, + "grad_norm": 1.2769346237182617, + "learning_rate": 8.361197699204911e-05, + "loss": 0.413, + "step": 6648 + }, + { + "epoch": 8.653696498054474, + "grad_norm": 1.4064596891403198, + "learning_rate": 8.348738948008413e-05, + "loss": 0.4172, + "step": 6672 + }, + { + "epoch": 8.684824902723735, + "grad_norm": 1.0059700012207031, + "learning_rate": 8.336242377619501e-05, + "loss": 0.4132, + "step": 6696 + }, + { + "epoch": 8.715953307392995, + "grad_norm": 1.5852705240249634, + "learning_rate": 8.323708129168979e-05, + "loss": 0.4129, + "step": 6720 + }, + { + "epoch": 8.747081712062258, + "grad_norm": 1.879469394683838, + "learning_rate": 8.31113634421316e-05, + "loss": 0.4104, + "step": 6744 + }, + { + "epoch": 8.778210116731518, + "grad_norm": 1.1461695432662964, + "learning_rate": 8.298527164732283e-05, + "loss": 0.4068, + "step": 6768 + }, + { + "epoch": 8.809338521400779, + "grad_norm": 1.1254854202270508, + "learning_rate": 8.285880733128907e-05, + "loss": 0.4118, + "step": 6792 + }, + { + "epoch": 8.84046692607004, + "grad_norm": 1.7840899229049683, + "learning_rate": 8.273197192226294e-05, + "loss": 0.4113, + "step": 6816 + }, + { + "epoch": 8.8715953307393, + "grad_norm": 1.618880271911621, + "learning_rate": 8.260476685266807e-05, + "loss": 0.4065, + "step": 6840 + }, + { + "epoch": 8.90272373540856, + "grad_norm": 1.2630411386489868, + "learning_rate": 8.247719355910284e-05, + "loss": 0.4029, + "step": 6864 + }, + { + "epoch": 8.933852140077821, + "grad_norm": 1.138664960861206, + "learning_rate": 8.234925348232421e-05, + "loss": 0.4012, + "step": 6888 + }, + { + "epoch": 8.964980544747082, + "grad_norm": 1.4435471296310425, + "learning_rate": 8.222094806723143e-05, + "loss": 0.4068, + "step": 6912 + }, + { + "epoch": 8.996108949416342, + "grad_norm": 1.9499974250793457, + "learning_rate": 8.209227876284972e-05, + "loss": 0.4092, + "step": 6936 + }, + { + "epoch": 9.027237354085603, + "grad_norm": 2.3621513843536377, + "learning_rate": 8.196324702231389e-05, + "loss": 0.4048, + "step": 6960 + }, + { + "epoch": 9.058365758754864, + "grad_norm": 1.2890691757202148, + "learning_rate": 8.183385430285197e-05, + "loss": 0.3996, + "step": 6984 + }, + { + "epoch": 9.089494163424124, + "grad_norm": 1.3257933855056763, + "learning_rate": 8.170410206576872e-05, + "loss": 0.3985, + "step": 7008 + }, + { + "epoch": 9.120622568093385, + "grad_norm": 1.485418677330017, + "learning_rate": 8.157399177642914e-05, + "loss": 0.3994, + "step": 7032 + }, + { + "epoch": 9.151750972762645, + "grad_norm": 1.115235686302185, + "learning_rate": 8.144352490424187e-05, + "loss": 0.3997, + "step": 7056 + }, + { + "epoch": 9.182879377431906, + "grad_norm": 1.565184473991394, + "learning_rate": 8.131270292264272e-05, + "loss": 0.4059, + "step": 7080 + }, + { + "epoch": 9.214007782101167, + "grad_norm": 1.3453902006149292, + "learning_rate": 8.118152730907788e-05, + "loss": 0.406, + "step": 7104 + }, + { + "epoch": 9.245136186770427, + "grad_norm": 1.4093341827392578, + "learning_rate": 8.104999954498734e-05, + "loss": 0.4029, + "step": 7128 + }, + { + "epoch": 9.27626459143969, + "grad_norm": 1.1250804662704468, + "learning_rate": 8.091812111578812e-05, + "loss": 0.4097, + "step": 7152 + }, + { + "epoch": 9.30739299610895, + "grad_norm": 1.6016291379928589, + "learning_rate": 8.07858935108575e-05, + "loss": 0.4078, + "step": 7176 + }, + { + "epoch": 9.33852140077821, + "grad_norm": 1.8599820137023926, + "learning_rate": 8.065331822351618e-05, + "loss": 0.4029, + "step": 7200 + }, + { + "epoch": 9.369649805447471, + "grad_norm": 1.2994579076766968, + "learning_rate": 8.052039675101143e-05, + "loss": 0.4079, + "step": 7224 + }, + { + "epoch": 9.400778210116732, + "grad_norm": 1.200239896774292, + "learning_rate": 8.038713059450026e-05, + "loss": 0.4017, + "step": 7248 + }, + { + "epoch": 9.431906614785992, + "grad_norm": 3.8246068954467773, + "learning_rate": 8.025352125903227e-05, + "loss": 0.4006, + "step": 7272 + }, + { + "epoch": 9.463035019455253, + "grad_norm": 1.4172035455703735, + "learning_rate": 8.011957025353287e-05, + "loss": 0.4028, + "step": 7296 + }, + { + "epoch": 9.494163424124514, + "grad_norm": 2.0654618740081787, + "learning_rate": 7.998527909078607e-05, + "loss": 0.4014, + "step": 7320 + }, + { + "epoch": 9.525291828793774, + "grad_norm": 1.3547816276550293, + "learning_rate": 7.985064928741754e-05, + "loss": 0.3981, + "step": 7344 + }, + { + "epoch": 9.556420233463035, + "grad_norm": 1.3812025785446167, + "learning_rate": 7.971568236387734e-05, + "loss": 0.406, + "step": 7368 + }, + { + "epoch": 9.587548638132295, + "grad_norm": 1.438240885734558, + "learning_rate": 7.958037984442285e-05, + "loss": 0.4011, + "step": 7392 + }, + { + "epoch": 9.618677042801556, + "grad_norm": 1.7840272188186646, + "learning_rate": 7.944474325710154e-05, + "loss": 0.401, + "step": 7416 + }, + { + "epoch": 9.649805447470817, + "grad_norm": 1.251658320426941, + "learning_rate": 7.930877413373367e-05, + "loss": 0.3969, + "step": 7440 + }, + { + "epoch": 9.680933852140077, + "grad_norm": 2.252761125564575, + "learning_rate": 7.917247400989505e-05, + "loss": 0.4049, + "step": 7464 + }, + { + "epoch": 9.712062256809338, + "grad_norm": 1.476012110710144, + "learning_rate": 7.903584442489958e-05, + "loss": 0.401, + "step": 7488 + }, + { + "epoch": 9.7431906614786, + "grad_norm": 2.692723035812378, + "learning_rate": 7.889888692178207e-05, + "loss": 0.4017, + "step": 7512 + }, + { + "epoch": 9.77431906614786, + "grad_norm": 3.0412638187408447, + "learning_rate": 7.87616030472806e-05, + "loss": 0.4093, + "step": 7536 + }, + { + "epoch": 9.805447470817121, + "grad_norm": 1.527076244354248, + "learning_rate": 7.862399435181917e-05, + "loss": 0.3988, + "step": 7560 + }, + { + "epoch": 9.836575875486382, + "grad_norm": 1.2038588523864746, + "learning_rate": 7.848606238949021e-05, + "loss": 0.4058, + "step": 7584 + }, + { + "epoch": 9.867704280155642, + "grad_norm": 1.9050565958023071, + "learning_rate": 7.834780871803693e-05, + "loss": 0.3943, + "step": 7608 + }, + { + "epoch": 9.898832684824903, + "grad_norm": 1.483185887336731, + "learning_rate": 7.82092348988358e-05, + "loss": 0.3992, + "step": 7632 + }, + { + "epoch": 9.929961089494164, + "grad_norm": 1.5043606758117676, + "learning_rate": 7.80703424968789e-05, + "loss": 0.3989, + "step": 7656 + }, + { + "epoch": 9.961089494163424, + "grad_norm": 1.194094181060791, + "learning_rate": 7.793113308075626e-05, + "loss": 0.4007, + "step": 7680 + }, + { + "epoch": 9.992217898832685, + "grad_norm": 1.5360095500946045, + "learning_rate": 7.77916082226381e-05, + "loss": 0.395, + "step": 7704 + }, + { + "epoch": 10.023346303501945, + "grad_norm": 1.1073459386825562, + "learning_rate": 7.76517694982571e-05, + "loss": 0.3989, + "step": 7728 + }, + { + "epoch": 10.054474708171206, + "grad_norm": 1.4059771299362183, + "learning_rate": 7.751161848689063e-05, + "loss": 0.3964, + "step": 7752 + }, + { + "epoch": 10.085603112840467, + "grad_norm": 1.8619714975357056, + "learning_rate": 7.737115677134294e-05, + "loss": 0.3964, + "step": 7776 + }, + { + "epoch": 10.116731517509727, + "grad_norm": 0.8621863722801208, + "learning_rate": 7.723038593792712e-05, + "loss": 0.4019, + "step": 7800 + }, + { + "epoch": 10.147859922178988, + "grad_norm": 1.542912483215332, + "learning_rate": 7.708930757644739e-05, + "loss": 0.3957, + "step": 7824 + }, + { + "epoch": 10.178988326848248, + "grad_norm": 1.8078597784042358, + "learning_rate": 7.694792328018106e-05, + "loss": 0.3991, + "step": 7848 + }, + { + "epoch": 10.210116731517509, + "grad_norm": 1.4210093021392822, + "learning_rate": 7.680623464586048e-05, + "loss": 0.3925, + "step": 7872 + }, + { + "epoch": 10.24124513618677, + "grad_norm": 1.6985816955566406, + "learning_rate": 7.66642432736551e-05, + "loss": 0.3984, + "step": 7896 + }, + { + "epoch": 10.272373540856032, + "grad_norm": 1.4291504621505737, + "learning_rate": 7.652195076715332e-05, + "loss": 0.4016, + "step": 7920 + }, + { + "epoch": 10.303501945525293, + "grad_norm": 1.3934870958328247, + "learning_rate": 7.637935873334448e-05, + "loss": 0.3992, + "step": 7944 + }, + { + "epoch": 10.334630350194553, + "grad_norm": 1.5841765403747559, + "learning_rate": 7.623646878260062e-05, + "loss": 0.3989, + "step": 7968 + }, + { + "epoch": 10.365758754863814, + "grad_norm": 1.1344020366668701, + "learning_rate": 7.60932825286583e-05, + "loss": 0.3934, + "step": 7992 + }, + { + "epoch": 10.396887159533074, + "grad_norm": 1.1252238750457764, + "learning_rate": 7.594980158860043e-05, + "loss": 0.3947, + "step": 8016 + }, + { + "epoch": 10.428015564202335, + "grad_norm": 1.5455870628356934, + "learning_rate": 7.580602758283796e-05, + "loss": 0.3897, + "step": 8040 + }, + { + "epoch": 10.459143968871595, + "grad_norm": 2.1351683139801025, + "learning_rate": 7.566196213509163e-05, + "loss": 0.3911, + "step": 8064 + }, + { + "epoch": 10.490272373540856, + "grad_norm": 1.9759098291397095, + "learning_rate": 7.551760687237351e-05, + "loss": 0.3973, + "step": 8088 + }, + { + "epoch": 10.521400778210117, + "grad_norm": 1.0132018327713013, + "learning_rate": 7.537296342496884e-05, + "loss": 0.3957, + "step": 8112 + }, + { + "epoch": 10.552529182879377, + "grad_norm": 2.219759464263916, + "learning_rate": 7.522803342641737e-05, + "loss": 0.3887, + "step": 8136 + }, + { + "epoch": 10.583657587548638, + "grad_norm": 2.361774206161499, + "learning_rate": 7.508281851349512e-05, + "loss": 0.3975, + "step": 8160 + }, + { + "epoch": 10.614785992217898, + "grad_norm": 1.4584128856658936, + "learning_rate": 7.493732032619578e-05, + "loss": 0.4, + "step": 8184 + }, + { + "epoch": 10.645914396887159, + "grad_norm": 1.375190019607544, + "learning_rate": 7.47915405077122e-05, + "loss": 0.4021, + "step": 8208 + }, + { + "epoch": 10.67704280155642, + "grad_norm": 1.5501540899276733, + "learning_rate": 7.464548070441785e-05, + "loss": 0.3943, + "step": 8232 + }, + { + "epoch": 10.70817120622568, + "grad_norm": 1.5805977582931519, + "learning_rate": 7.449914256584828e-05, + "loss": 0.3915, + "step": 8256 + }, + { + "epoch": 10.739299610894943, + "grad_norm": 1.0127402544021606, + "learning_rate": 7.435252774468237e-05, + "loss": 0.3899, + "step": 8280 + }, + { + "epoch": 10.770428015564203, + "grad_norm": 1.5114730596542358, + "learning_rate": 7.420563789672375e-05, + "loss": 0.3922, + "step": 8304 + }, + { + "epoch": 10.801556420233464, + "grad_norm": 1.1805211305618286, + "learning_rate": 7.405847468088209e-05, + "loss": 0.3951, + "step": 8328 + }, + { + "epoch": 10.832684824902724, + "grad_norm": 1.1337734460830688, + "learning_rate": 7.391103975915436e-05, + "loss": 0.3954, + "step": 8352 + }, + { + "epoch": 10.863813229571985, + "grad_norm": 1.024134874343872, + "learning_rate": 7.376333479660607e-05, + "loss": 0.3829, + "step": 8376 + }, + { + "epoch": 10.894941634241246, + "grad_norm": 1.2885181903839111, + "learning_rate": 7.361536146135243e-05, + "loss": 0.3904, + "step": 8400 + }, + { + "epoch": 10.926070038910506, + "grad_norm": 1.2240935564041138, + "learning_rate": 7.346712142453954e-05, + "loss": 0.3904, + "step": 8424 + }, + { + "epoch": 10.957198443579767, + "grad_norm": 1.2982319593429565, + "learning_rate": 7.33186163603255e-05, + "loss": 0.3944, + "step": 8448 + }, + { + "epoch": 10.988326848249027, + "grad_norm": 1.0359567403793335, + "learning_rate": 7.316984794586155e-05, + "loss": 0.3989, + "step": 8472 + }, + { + "epoch": 11.019455252918288, + "grad_norm": 2.0623931884765625, + "learning_rate": 7.302081786127304e-05, + "loss": 0.3853, + "step": 8496 + }, + { + "epoch": 11.050583657587548, + "grad_norm": 1.2377070188522339, + "learning_rate": 7.287152778964055e-05, + "loss": 0.3913, + "step": 8520 + }, + { + "epoch": 11.081712062256809, + "grad_norm": 1.016614556312561, + "learning_rate": 7.272197941698084e-05, + "loss": 0.3882, + "step": 8544 + }, + { + "epoch": 11.11284046692607, + "grad_norm": 1.5649337768554688, + "learning_rate": 7.257217443222777e-05, + "loss": 0.378, + "step": 8568 + }, + { + "epoch": 11.14396887159533, + "grad_norm": 1.4619653224945068, + "learning_rate": 7.242211452721331e-05, + "loss": 0.3874, + "step": 8592 + }, + { + "epoch": 11.17509727626459, + "grad_norm": 1.6870439052581787, + "learning_rate": 7.227180139664836e-05, + "loss": 0.3867, + "step": 8616 + }, + { + "epoch": 11.206225680933851, + "grad_norm": 1.0460180044174194, + "learning_rate": 7.212123673810363e-05, + "loss": 0.394, + "step": 8640 + }, + { + "epoch": 11.237354085603112, + "grad_norm": 1.0444591045379639, + "learning_rate": 7.19704222519905e-05, + "loss": 0.3877, + "step": 8664 + }, + { + "epoch": 11.268482490272374, + "grad_norm": 1.3924522399902344, + "learning_rate": 7.181935964154182e-05, + "loss": 0.3836, + "step": 8688 + }, + { + "epoch": 11.299610894941635, + "grad_norm": 2.0957131385803223, + "learning_rate": 7.166805061279257e-05, + "loss": 0.3879, + "step": 8712 + }, + { + "epoch": 11.330739299610896, + "grad_norm": 1.5147196054458618, + "learning_rate": 7.151649687456074e-05, + "loss": 0.3888, + "step": 8736 + }, + { + "epoch": 11.361867704280156, + "grad_norm": 1.5958192348480225, + "learning_rate": 7.136470013842791e-05, + "loss": 0.3883, + "step": 8760 + }, + { + "epoch": 11.392996108949417, + "grad_norm": 1.494354248046875, + "learning_rate": 7.121266211872004e-05, + "loss": 0.3847, + "step": 8784 + }, + { + "epoch": 11.424124513618677, + "grad_norm": 1.3116648197174072, + "learning_rate": 7.106038453248794e-05, + "loss": 0.3913, + "step": 8808 + }, + { + "epoch": 11.455252918287938, + "grad_norm": 2.947636842727661, + "learning_rate": 7.090786909948809e-05, + "loss": 0.3837, + "step": 8832 + }, + { + "epoch": 11.486381322957198, + "grad_norm": 1.8480781316757202, + "learning_rate": 7.075511754216304e-05, + "loss": 0.3816, + "step": 8856 + }, + { + "epoch": 11.517509727626459, + "grad_norm": 1.5083237886428833, + "learning_rate": 7.060213158562205e-05, + "loss": 0.3856, + "step": 8880 + }, + { + "epoch": 11.54863813229572, + "grad_norm": 1.2127504348754883, + "learning_rate": 7.044891295762154e-05, + "loss": 0.3861, + "step": 8904 + }, + { + "epoch": 11.57976653696498, + "grad_norm": 1.0090476274490356, + "learning_rate": 7.029546338854569e-05, + "loss": 0.3894, + "step": 8928 + }, + { + "epoch": 11.61089494163424, + "grad_norm": 0.9990460872650146, + "learning_rate": 7.014178461138676e-05, + "loss": 0.388, + "step": 8952 + }, + { + "epoch": 11.642023346303501, + "grad_norm": 1.7229726314544678, + "learning_rate": 6.998787836172564e-05, + "loss": 0.3883, + "step": 8976 + }, + { + "epoch": 11.673151750972762, + "grad_norm": 1.0046260356903076, + "learning_rate": 6.983374637771217e-05, + "loss": 0.3853, + "step": 9000 + }, + { + "epoch": 11.704280155642023, + "grad_norm": 1.4152393341064453, + "learning_rate": 6.967939040004551e-05, + "loss": 0.3829, + "step": 9024 + }, + { + "epoch": 11.735408560311285, + "grad_norm": 1.2723467350006104, + "learning_rate": 6.952481217195456e-05, + "loss": 0.3879, + "step": 9048 + }, + { + "epoch": 11.766536964980546, + "grad_norm": 1.7674216032028198, + "learning_rate": 6.937001343917818e-05, + "loss": 0.3909, + "step": 9072 + }, + { + "epoch": 11.797665369649806, + "grad_norm": 1.4604827165603638, + "learning_rate": 6.92149959499455e-05, + "loss": 0.3878, + "step": 9096 + }, + { + "epoch": 11.828793774319067, + "grad_norm": 1.5532753467559814, + "learning_rate": 6.905976145495628e-05, + "loss": 0.3884, + "step": 9120 + }, + { + "epoch": 11.859922178988327, + "grad_norm": 1.1423866748809814, + "learning_rate": 6.890431170736091e-05, + "loss": 0.3861, + "step": 9144 + }, + { + "epoch": 11.891050583657588, + "grad_norm": 1.350380778312683, + "learning_rate": 6.874864846274087e-05, + "loss": 0.3813, + "step": 9168 + }, + { + "epoch": 11.922178988326849, + "grad_norm": 1.2758312225341797, + "learning_rate": 6.85927734790887e-05, + "loss": 0.3877, + "step": 9192 + }, + { + "epoch": 11.95330739299611, + "grad_norm": 1.970986247062683, + "learning_rate": 6.843668851678831e-05, + "loss": 0.3828, + "step": 9216 + }, + { + "epoch": 11.98443579766537, + "grad_norm": 1.340889811515808, + "learning_rate": 6.828039533859489e-05, + "loss": 0.3875, + "step": 9240 + }, + { + "epoch": 12.01556420233463, + "grad_norm": 1.2335118055343628, + "learning_rate": 6.812389570961525e-05, + "loss": 0.3809, + "step": 9264 + }, + { + "epoch": 12.04669260700389, + "grad_norm": 1.2043426036834717, + "learning_rate": 6.796719139728777e-05, + "loss": 0.3835, + "step": 9288 + }, + { + "epoch": 12.077821011673151, + "grad_norm": 1.197809100151062, + "learning_rate": 6.781028417136231e-05, + "loss": 0.3792, + "step": 9312 + }, + { + "epoch": 12.108949416342412, + "grad_norm": 1.2524584531784058, + "learning_rate": 6.765317580388046e-05, + "loss": 0.3842, + "step": 9336 + }, + { + "epoch": 12.140077821011673, + "grad_norm": 1.082410454750061, + "learning_rate": 6.749586806915535e-05, + "loss": 0.3827, + "step": 9360 + }, + { + "epoch": 12.171206225680933, + "grad_norm": 1.2853772640228271, + "learning_rate": 6.733836274375176e-05, + "loss": 0.3755, + "step": 9384 + }, + { + "epoch": 12.202334630350194, + "grad_norm": 1.6849515438079834, + "learning_rate": 6.718066160646585e-05, + "loss": 0.38, + "step": 9408 + }, + { + "epoch": 12.233463035019454, + "grad_norm": 2.0715172290802, + "learning_rate": 6.702276643830531e-05, + "loss": 0.3799, + "step": 9432 + }, + { + "epoch": 12.264591439688717, + "grad_norm": 1.7511128187179565, + "learning_rate": 6.686467902246909e-05, + "loss": 0.3752, + "step": 9456 + }, + { + "epoch": 12.295719844357977, + "grad_norm": 1.1407638788223267, + "learning_rate": 6.670640114432724e-05, + "loss": 0.3834, + "step": 9480 + }, + { + "epoch": 12.326848249027238, + "grad_norm": 1.0695194005966187, + "learning_rate": 6.654793459140089e-05, + "loss": 0.3835, + "step": 9504 + }, + { + "epoch": 12.357976653696499, + "grad_norm": 1.285834789276123, + "learning_rate": 6.638928115334196e-05, + "loss": 0.3904, + "step": 9528 + }, + { + "epoch": 12.38910505836576, + "grad_norm": 1.508699893951416, + "learning_rate": 6.623044262191293e-05, + "loss": 0.3964, + "step": 9552 + }, + { + "epoch": 12.42023346303502, + "grad_norm": 1.287642002105713, + "learning_rate": 6.607142079096668e-05, + "loss": 0.3819, + "step": 9576 + }, + { + "epoch": 12.45136186770428, + "grad_norm": 2.893951892852783, + "learning_rate": 6.591221745642621e-05, + "loss": 0.3805, + "step": 9600 + }, + { + "epoch": 12.482490272373541, + "grad_norm": 1.4402974843978882, + "learning_rate": 6.575283441626433e-05, + "loss": 0.376, + "step": 9624 + }, + { + "epoch": 12.513618677042802, + "grad_norm": 1.156258225440979, + "learning_rate": 6.559327347048331e-05, + "loss": 0.3778, + "step": 9648 + }, + { + "epoch": 12.544747081712062, + "grad_norm": 1.5183446407318115, + "learning_rate": 6.543353642109469e-05, + "loss": 0.382, + "step": 9672 + }, + { + "epoch": 12.575875486381323, + "grad_norm": 1.611879825592041, + "learning_rate": 6.527362507209879e-05, + "loss": 0.3791, + "step": 9696 + }, + { + "epoch": 12.607003891050583, + "grad_norm": 1.3625446557998657, + "learning_rate": 6.511354122946443e-05, + "loss": 0.379, + "step": 9720 + }, + { + "epoch": 12.638132295719844, + "grad_norm": 1.2298206090927124, + "learning_rate": 6.495328670110848e-05, + "loss": 0.3773, + "step": 9744 + }, + { + "epoch": 12.669260700389104, + "grad_norm": 1.0427093505859375, + "learning_rate": 6.479286329687543e-05, + "loss": 0.3752, + "step": 9768 + }, + { + "epoch": 12.700389105058365, + "grad_norm": 1.6555167436599731, + "learning_rate": 6.463227282851708e-05, + "loss": 0.3771, + "step": 9792 + }, + { + "epoch": 12.731517509727626, + "grad_norm": 1.3086024522781372, + "learning_rate": 6.447151710967187e-05, + "loss": 0.377, + "step": 9816 + }, + { + "epoch": 12.762645914396888, + "grad_norm": 1.3003504276275635, + "learning_rate": 6.431059795584453e-05, + "loss": 0.3812, + "step": 9840 + }, + { + "epoch": 12.793774319066149, + "grad_norm": 1.4847590923309326, + "learning_rate": 6.414951718438561e-05, + "loss": 0.3778, + "step": 9864 + }, + { + "epoch": 12.82490272373541, + "grad_norm": 1.3426965475082397, + "learning_rate": 6.398827661447084e-05, + "loss": 0.3794, + "step": 9888 + }, + { + "epoch": 12.85603112840467, + "grad_norm": 1.2530086040496826, + "learning_rate": 6.382687806708067e-05, + "loss": 0.3728, + "step": 9912 + }, + { + "epoch": 12.88715953307393, + "grad_norm": 1.8029588460922241, + "learning_rate": 6.366532336497968e-05, + "loss": 0.3795, + "step": 9936 + }, + { + "epoch": 12.918287937743191, + "grad_norm": 1.9585580825805664, + "learning_rate": 6.350361433269599e-05, + "loss": 0.3769, + "step": 9960 + }, + { + "epoch": 12.949416342412452, + "grad_norm": 1.7418956756591797, + "learning_rate": 6.334175279650062e-05, + "loss": 0.3778, + "step": 9984 + }, + { + "epoch": 12.980544747081712, + "grad_norm": 1.6264042854309082, + "learning_rate": 6.317974058438697e-05, + "loss": 0.3821, + "step": 10008 + }, + { + "epoch": 13.011673151750973, + "grad_norm": 0.9489176869392395, + "learning_rate": 6.301757952605007e-05, + "loss": 0.374, + "step": 10032 + }, + { + "epoch": 13.042801556420233, + "grad_norm": 2.183706045150757, + "learning_rate": 6.285527145286594e-05, + "loss": 0.3736, + "step": 10056 + }, + { + "epoch": 13.073929961089494, + "grad_norm": 1.3998112678527832, + "learning_rate": 6.269281819787095e-05, + "loss": 0.3726, + "step": 10080 + }, + { + "epoch": 13.105058365758754, + "grad_norm": 1.5030006170272827, + "learning_rate": 6.253022159574108e-05, + "loss": 0.3741, + "step": 10104 + }, + { + "epoch": 13.136186770428015, + "grad_norm": 2.579502820968628, + "learning_rate": 6.23674834827712e-05, + "loss": 0.373, + "step": 10128 + }, + { + "epoch": 13.167315175097276, + "grad_norm": 1.5349212884902954, + "learning_rate": 6.220460569685437e-05, + "loss": 0.3739, + "step": 10152 + }, + { + "epoch": 13.198443579766536, + "grad_norm": 1.6323474645614624, + "learning_rate": 6.204159007746103e-05, + "loss": 0.3729, + "step": 10176 + }, + { + "epoch": 13.229571984435797, + "grad_norm": 1.1729427576065063, + "learning_rate": 6.187843846561824e-05, + "loss": 0.3759, + "step": 10200 + }, + { + "epoch": 13.26070038910506, + "grad_norm": 2.276395320892334, + "learning_rate": 6.171515270388892e-05, + "loss": 0.3657, + "step": 10224 + }, + { + "epoch": 13.29182879377432, + "grad_norm": 0.9925207495689392, + "learning_rate": 6.155173463635103e-05, + "loss": 0.3724, + "step": 10248 + }, + { + "epoch": 13.32295719844358, + "grad_norm": 0.9079545140266418, + "learning_rate": 6.13881861085767e-05, + "loss": 0.3675, + "step": 10272 + }, + { + "epoch": 13.354085603112841, + "grad_norm": 2.5486135482788086, + "learning_rate": 6.122450896761147e-05, + "loss": 0.3684, + "step": 10296 + }, + { + "epoch": 13.385214007782102, + "grad_norm": 1.5650309324264526, + "learning_rate": 6.106070506195332e-05, + "loss": 0.3765, + "step": 10320 + }, + { + "epoch": 13.416342412451362, + "grad_norm": 0.9130122065544128, + "learning_rate": 6.0896776241531916e-05, + "loss": 0.3788, + "step": 10344 + }, + { + "epoch": 13.447470817120623, + "grad_norm": 1.1227184534072876, + "learning_rate": 6.073272435768761e-05, + "loss": 0.3717, + "step": 10368 + }, + { + "epoch": 13.478599221789883, + "grad_norm": 2.312488079071045, + "learning_rate": 6.0568551263150606e-05, + "loss": 0.3775, + "step": 10392 + }, + { + "epoch": 13.509727626459144, + "grad_norm": 1.1797654628753662, + "learning_rate": 6.040425881201998e-05, + "loss": 0.3721, + "step": 10416 + }, + { + "epoch": 13.540856031128405, + "grad_norm": 3.0446395874023438, + "learning_rate": 6.0239848859742795e-05, + "loss": 0.3698, + "step": 10440 + }, + { + "epoch": 13.571984435797665, + "grad_norm": 1.0386089086532593, + "learning_rate": 6.007532326309313e-05, + "loss": 0.3724, + "step": 10464 + }, + { + "epoch": 13.603112840466926, + "grad_norm": 1.4335585832595825, + "learning_rate": 5.9910683880151064e-05, + "loss": 0.3749, + "step": 10488 + }, + { + "epoch": 13.634241245136186, + "grad_norm": 1.4243568181991577, + "learning_rate": 5.974593257028176e-05, + "loss": 0.3714, + "step": 10512 + }, + { + "epoch": 13.665369649805447, + "grad_norm": 1.3887135982513428, + "learning_rate": 5.958107119411441e-05, + "loss": 0.3763, + "step": 10536 + }, + { + "epoch": 13.696498054474707, + "grad_norm": 1.4939093589782715, + "learning_rate": 5.941610161352128e-05, + "loss": 0.3689, + "step": 10560 + }, + { + "epoch": 13.727626459143968, + "grad_norm": 1.3950523138046265, + "learning_rate": 5.925102569159661e-05, + "loss": 0.3721, + "step": 10584 + }, + { + "epoch": 13.75875486381323, + "grad_norm": 1.5457286834716797, + "learning_rate": 5.9085845292635645e-05, + "loss": 0.3736, + "step": 10608 + }, + { + "epoch": 13.789883268482491, + "grad_norm": 1.7134722471237183, + "learning_rate": 5.8920562282113534e-05, + "loss": 0.3705, + "step": 10632 + }, + { + "epoch": 13.821011673151752, + "grad_norm": 1.9264869689941406, + "learning_rate": 5.875517852666428e-05, + "loss": 0.3731, + "step": 10656 + }, + { + "epoch": 13.852140077821012, + "grad_norm": 1.9957599639892578, + "learning_rate": 5.8589695894059626e-05, + "loss": 0.3727, + "step": 10680 + }, + { + "epoch": 13.883268482490273, + "grad_norm": 1.0721269845962524, + "learning_rate": 5.842411625318805e-05, + "loss": 0.3717, + "step": 10704 + }, + { + "epoch": 13.914396887159533, + "grad_norm": 1.339650273323059, + "learning_rate": 5.825844147403353e-05, + "loss": 0.3781, + "step": 10728 + }, + { + "epoch": 13.945525291828794, + "grad_norm": 1.0256425142288208, + "learning_rate": 5.809267342765456e-05, + "loss": 0.3743, + "step": 10752 + }, + { + "epoch": 13.976653696498055, + "grad_norm": 1.1623256206512451, + "learning_rate": 5.792681398616293e-05, + "loss": 0.372, + "step": 10776 + }, + { + "epoch": 14.007782101167315, + "grad_norm": 2.1772332191467285, + "learning_rate": 5.776086502270258e-05, + "loss": 0.3768, + "step": 10800 + }, + { + "epoch": 14.038910505836576, + "grad_norm": 1.4126263856887817, + "learning_rate": 5.759482841142848e-05, + "loss": 0.3689, + "step": 10824 + }, + { + "epoch": 14.070038910505836, + "grad_norm": 1.1903387308120728, + "learning_rate": 5.742870602748547e-05, + "loss": 0.3667, + "step": 10848 + }, + { + "epoch": 14.101167315175097, + "grad_norm": 1.1915792226791382, + "learning_rate": 5.7262499746987094e-05, + "loss": 0.372, + "step": 10872 + }, + { + "epoch": 14.132295719844358, + "grad_norm": 1.3118023872375488, + "learning_rate": 5.7096211446994344e-05, + "loss": 0.3673, + "step": 10896 + }, + { + "epoch": 14.163424124513618, + "grad_norm": 1.0034823417663574, + "learning_rate": 5.692984300549451e-05, + "loss": 0.3743, + "step": 10920 + }, + { + "epoch": 14.194552529182879, + "grad_norm": 1.1173166036605835, + "learning_rate": 5.6763396301379976e-05, + "loss": 0.3722, + "step": 10944 + }, + { + "epoch": 14.22568093385214, + "grad_norm": 1.1479343175888062, + "learning_rate": 5.659687321442701e-05, + "loss": 0.3691, + "step": 10968 + }, + { + "epoch": 14.2568093385214, + "grad_norm": 1.3507132530212402, + "learning_rate": 5.6430275625274456e-05, + "loss": 0.3655, + "step": 10992 + }, + { + "epoch": 14.287937743190662, + "grad_norm": 1.1012446880340576, + "learning_rate": 5.626360541540261e-05, + "loss": 0.366, + "step": 11016 + }, + { + "epoch": 14.319066147859923, + "grad_norm": 1.2122224569320679, + "learning_rate": 5.609686446711191e-05, + "loss": 0.3608, + "step": 11040 + }, + { + "epoch": 14.350194552529183, + "grad_norm": 0.9675916433334351, + "learning_rate": 5.593005466350164e-05, + "loss": 0.3677, + "step": 11064 + }, + { + "epoch": 14.381322957198444, + "grad_norm": 1.0538902282714844, + "learning_rate": 5.576317788844875e-05, + "loss": 0.369, + "step": 11088 + }, + { + "epoch": 14.412451361867705, + "grad_norm": 2.077829122543335, + "learning_rate": 5.55962360265865e-05, + "loss": 0.3642, + "step": 11112 + }, + { + "epoch": 14.443579766536965, + "grad_norm": 1.2885998487472534, + "learning_rate": 5.542923096328325e-05, + "loss": 0.3685, + "step": 11136 + }, + { + "epoch": 14.474708171206226, + "grad_norm": 2.953463077545166, + "learning_rate": 5.526216458462111e-05, + "loss": 0.3683, + "step": 11160 + }, + { + "epoch": 14.505836575875486, + "grad_norm": 1.336449384689331, + "learning_rate": 5.509503877737465e-05, + "loss": 0.3627, + "step": 11184 + }, + { + "epoch": 14.536964980544747, + "grad_norm": 4.623841762542725, + "learning_rate": 5.4927855428989624e-05, + "loss": 0.3738, + "step": 11208 + }, + { + "epoch": 14.568093385214008, + "grad_norm": 1.4652122259140015, + "learning_rate": 5.476061642756161e-05, + "loss": 0.3722, + "step": 11232 + }, + { + "epoch": 14.599221789883268, + "grad_norm": 1.3524249792099, + "learning_rate": 5.4593323661814686e-05, + "loss": 0.3586, + "step": 11256 + }, + { + "epoch": 14.630350194552529, + "grad_norm": 1.833708643913269, + "learning_rate": 5.442597902108019e-05, + "loss": 0.3568, + "step": 11280 + }, + { + "epoch": 14.66147859922179, + "grad_norm": 1.4893455505371094, + "learning_rate": 5.425858439527525e-05, + "loss": 0.3698, + "step": 11304 + }, + { + "epoch": 14.69260700389105, + "grad_norm": 1.7463867664337158, + "learning_rate": 5.409114167488152e-05, + "loss": 0.3726, + "step": 11328 + }, + { + "epoch": 14.72373540856031, + "grad_norm": 1.5364842414855957, + "learning_rate": 5.392365275092383e-05, + "loss": 0.3656, + "step": 11352 + }, + { + "epoch": 14.754863813229573, + "grad_norm": 1.4161092042922974, + "learning_rate": 5.37561195149488e-05, + "loss": 0.3636, + "step": 11376 + }, + { + "epoch": 14.785992217898833, + "grad_norm": 1.125667691230774, + "learning_rate": 5.358854385900348e-05, + "loss": 0.3636, + "step": 11400 + }, + { + "epoch": 14.817120622568094, + "grad_norm": 1.9482998847961426, + "learning_rate": 5.342092767561402e-05, + "loss": 0.3646, + "step": 11424 + }, + { + "epoch": 14.848249027237355, + "grad_norm": 1.8707369565963745, + "learning_rate": 5.325327285776425e-05, + "loss": 0.3657, + "step": 11448 + }, + { + "epoch": 14.879377431906615, + "grad_norm": 1.7567267417907715, + "learning_rate": 5.308558129887431e-05, + "loss": 0.3628, + "step": 11472 + }, + { + "epoch": 14.910505836575876, + "grad_norm": 1.5714308023452759, + "learning_rate": 5.2917854892779304e-05, + "loss": 0.3667, + "step": 11496 + }, + { + "epoch": 14.941634241245136, + "grad_norm": 2.1905322074890137, + "learning_rate": 5.275009553370788e-05, + "loss": 0.371, + "step": 11520 + }, + { + "epoch": 14.972762645914397, + "grad_norm": 2.8119211196899414, + "learning_rate": 5.2582305116260835e-05, + "loss": 0.3704, + "step": 11544 + }, + { + "epoch": 15.003891050583658, + "grad_norm": 1.1872552633285522, + "learning_rate": 5.241448553538968e-05, + "loss": 0.3755, + "step": 11568 + }, + { + "epoch": 15.035019455252918, + "grad_norm": 1.4244314432144165, + "learning_rate": 5.224663868637538e-05, + "loss": 0.3599, + "step": 11592 + }, + { + "epoch": 15.066147859922179, + "grad_norm": 1.2808740139007568, + "learning_rate": 5.2078766464806796e-05, + "loss": 0.3683, + "step": 11616 + }, + { + "epoch": 15.09727626459144, + "grad_norm": 1.0528135299682617, + "learning_rate": 5.191087076655935e-05, + "loss": 0.3598, + "step": 11640 + }, + { + "epoch": 15.1284046692607, + "grad_norm": 1.8377207517623901, + "learning_rate": 5.174295348777357e-05, + "loss": 0.3553, + "step": 11664 + }, + { + "epoch": 15.15953307392996, + "grad_norm": 1.7853907346725464, + "learning_rate": 5.1575016524833754e-05, + "loss": 0.3614, + "step": 11688 + }, + { + "epoch": 15.190661478599221, + "grad_norm": 1.7978260517120361, + "learning_rate": 5.140706177434645e-05, + "loss": 0.3608, + "step": 11712 + }, + { + "epoch": 15.221789883268482, + "grad_norm": 1.1315481662750244, + "learning_rate": 5.123909113311915e-05, + "loss": 0.3635, + "step": 11736 + }, + { + "epoch": 15.252918287937742, + "grad_norm": 1.6177383661270142, + "learning_rate": 5.1071106498138764e-05, + "loss": 0.3624, + "step": 11760 + }, + { + "epoch": 15.284046692607005, + "grad_norm": 1.2278454303741455, + "learning_rate": 5.0903109766550264e-05, + "loss": 0.3658, + "step": 11784 + }, + { + "epoch": 15.315175097276265, + "grad_norm": 1.3733409643173218, + "learning_rate": 5.073510283563523e-05, + "loss": 0.3612, + "step": 11808 + }, + { + "epoch": 15.346303501945526, + "grad_norm": 1.3404691219329834, + "learning_rate": 5.05670876027904e-05, + "loss": 0.3629, + "step": 11832 + }, + { + "epoch": 15.377431906614786, + "grad_norm": 1.2201738357543945, + "learning_rate": 5.039906596550633e-05, + "loss": 0.3666, + "step": 11856 + }, + { + "epoch": 15.408560311284047, + "grad_norm": 2.0148181915283203, + "learning_rate": 5.023103982134586e-05, + "loss": 0.3665, + "step": 11880 + }, + { + "epoch": 15.439688715953308, + "grad_norm": 1.249961256980896, + "learning_rate": 5.006301106792274e-05, + "loss": 0.3647, + "step": 11904 + }, + { + "epoch": 15.470817120622568, + "grad_norm": 1.5822800397872925, + "learning_rate": 4.989498160288019e-05, + "loss": 0.3659, + "step": 11928 + }, + { + "epoch": 15.501945525291829, + "grad_norm": 1.1686407327651978, + "learning_rate": 4.9726953323869456e-05, + "loss": 0.363, + "step": 11952 + }, + { + "epoch": 15.53307392996109, + "grad_norm": 1.8801552057266235, + "learning_rate": 4.9558928128528414e-05, + "loss": 0.3623, + "step": 11976 + }, + { + "epoch": 15.56420233463035, + "grad_norm": 1.2335692644119263, + "learning_rate": 4.9390907914460105e-05, + "loss": 0.3664, + "step": 12000 + }, + { + "epoch": 15.59533073929961, + "grad_norm": 1.496955156326294, + "learning_rate": 4.9222894579211276e-05, + "loss": 0.3644, + "step": 12024 + }, + { + "epoch": 15.626459143968871, + "grad_norm": 1.6293377876281738, + "learning_rate": 4.905489002025106e-05, + "loss": 0.3605, + "step": 12048 + }, + { + "epoch": 15.657587548638132, + "grad_norm": 1.2555320262908936, + "learning_rate": 4.8886896134949415e-05, + "loss": 0.3594, + "step": 12072 + }, + { + "epoch": 15.688715953307392, + "grad_norm": 1.2741057872772217, + "learning_rate": 4.871891482055575e-05, + "loss": 0.3622, + "step": 12096 + }, + { + "epoch": 15.719844357976653, + "grad_norm": 2.100410223007202, + "learning_rate": 4.855094797417758e-05, + "loss": 0.3612, + "step": 12120 + }, + { + "epoch": 15.750972762645915, + "grad_norm": 0.88619464635849, + "learning_rate": 4.8382997492758936e-05, + "loss": 0.3589, + "step": 12144 + }, + { + "epoch": 15.782101167315176, + "grad_norm": 1.5951071977615356, + "learning_rate": 4.8215065273059085e-05, + "loss": 0.3613, + "step": 12168 + }, + { + "epoch": 15.813229571984436, + "grad_norm": 1.1034135818481445, + "learning_rate": 4.8047153211631e-05, + "loss": 0.3609, + "step": 12192 + }, + { + "epoch": 15.844357976653697, + "grad_norm": 1.9069421291351318, + "learning_rate": 4.787926320480009e-05, + "loss": 0.3617, + "step": 12216 + }, + { + "epoch": 15.875486381322958, + "grad_norm": 2.139292001724243, + "learning_rate": 4.7711397148642583e-05, + "loss": 0.3582, + "step": 12240 + }, + { + "epoch": 15.906614785992218, + "grad_norm": 1.134293556213379, + "learning_rate": 4.7543556938964275e-05, + "loss": 0.361, + "step": 12264 + }, + { + "epoch": 15.937743190661479, + "grad_norm": 1.2520484924316406, + "learning_rate": 4.7375744471279084e-05, + "loss": 0.3613, + "step": 12288 + }, + { + "epoch": 15.96887159533074, + "grad_norm": 1.2001314163208008, + "learning_rate": 4.720796164078755e-05, + "loss": 0.363, + "step": 12312 + }, + { + "epoch": 16.0, + "grad_norm": 1.0038580894470215, + "learning_rate": 4.7040210342355584e-05, + "loss": 0.3566, + "step": 12336 + }, + { + "epoch": 16.03112840466926, + "grad_norm": 1.0586698055267334, + "learning_rate": 4.6872492470492914e-05, + "loss": 0.3554, + "step": 12360 + }, + { + "epoch": 16.06225680933852, + "grad_norm": 1.4238923788070679, + "learning_rate": 4.670480991933182e-05, + "loss": 0.3598, + "step": 12384 + }, + { + "epoch": 16.09338521400778, + "grad_norm": 1.7448209524154663, + "learning_rate": 4.6537164582605674e-05, + "loss": 0.3523, + "step": 12408 + }, + { + "epoch": 16.124513618677042, + "grad_norm": 0.9236373901367188, + "learning_rate": 4.6369558353627517e-05, + "loss": 0.3556, + "step": 12432 + }, + { + "epoch": 16.155642023346303, + "grad_norm": 1.2013592720031738, + "learning_rate": 4.6201993125268804e-05, + "loss": 0.352, + "step": 12456 + }, + { + "epoch": 16.186770428015564, + "grad_norm": 1.267756700515747, + "learning_rate": 4.603447078993788e-05, + "loss": 0.3578, + "step": 12480 + }, + { + "epoch": 16.217898832684824, + "grad_norm": 1.0369305610656738, + "learning_rate": 4.586699323955871e-05, + "loss": 0.3476, + "step": 12504 + }, + { + "epoch": 16.249027237354085, + "grad_norm": 1.4075908660888672, + "learning_rate": 4.569956236554945e-05, + "loss": 0.3544, + "step": 12528 + }, + { + "epoch": 16.280155642023345, + "grad_norm": 1.3998584747314453, + "learning_rate": 4.5532180058801145e-05, + "loss": 0.3596, + "step": 12552 + }, + { + "epoch": 16.311284046692606, + "grad_norm": 1.5231702327728271, + "learning_rate": 4.5364848209656336e-05, + "loss": 0.3542, + "step": 12576 + }, + { + "epoch": 16.342412451361866, + "grad_norm": 1.283345103263855, + "learning_rate": 4.5197568707887675e-05, + "loss": 0.3526, + "step": 12600 + }, + { + "epoch": 16.373540856031127, + "grad_norm": 1.3944894075393677, + "learning_rate": 4.503034344267671e-05, + "loss": 0.357, + "step": 12624 + }, + { + "epoch": 16.404669260700388, + "grad_norm": 1.9900680780410767, + "learning_rate": 4.486317430259238e-05, + "loss": 0.3603, + "step": 12648 + }, + { + "epoch": 16.43579766536965, + "grad_norm": 0.9823328852653503, + "learning_rate": 4.4696063175569804e-05, + "loss": 0.3545, + "step": 12672 + }, + { + "epoch": 16.46692607003891, + "grad_norm": 1.634529709815979, + "learning_rate": 4.452901194888897e-05, + "loss": 0.3543, + "step": 12696 + }, + { + "epoch": 16.49805447470817, + "grad_norm": 1.4010380506515503, + "learning_rate": 4.436202250915329e-05, + "loss": 0.3524, + "step": 12720 + }, + { + "epoch": 16.529182879377434, + "grad_norm": 1.239943504333496, + "learning_rate": 4.419509674226846e-05, + "loss": 0.3648, + "step": 12744 + }, + { + "epoch": 16.560311284046694, + "grad_norm": 3.315246820449829, + "learning_rate": 4.4028236533421016e-05, + "loss": 0.3624, + "step": 12768 + }, + { + "epoch": 16.591439688715955, + "grad_norm": 1.0445722341537476, + "learning_rate": 4.3861443767057205e-05, + "loss": 0.3536, + "step": 12792 + }, + { + "epoch": 16.622568093385215, + "grad_norm": 1.154893398284912, + "learning_rate": 4.369472032686149e-05, + "loss": 0.3608, + "step": 12816 + }, + { + "epoch": 16.653696498054476, + "grad_norm": 2.0033769607543945, + "learning_rate": 4.352806809573547e-05, + "loss": 0.3511, + "step": 12840 + }, + { + "epoch": 16.684824902723737, + "grad_norm": 1.4693876504898071, + "learning_rate": 4.336148895577656e-05, + "loss": 0.3531, + "step": 12864 + }, + { + "epoch": 16.715953307392997, + "grad_norm": 1.8765549659729004, + "learning_rate": 4.319498478825663e-05, + "loss": 0.3563, + "step": 12888 + }, + { + "epoch": 16.747081712062258, + "grad_norm": 1.6893914937973022, + "learning_rate": 4.302855747360092e-05, + "loss": 0.3579, + "step": 12912 + }, + { + "epoch": 16.77821011673152, + "grad_norm": 1.183452844619751, + "learning_rate": 4.286220889136668e-05, + "loss": 0.3637, + "step": 12936 + }, + { + "epoch": 16.80933852140078, + "grad_norm": 1.102815866470337, + "learning_rate": 4.269594092022203e-05, + "loss": 0.3561, + "step": 12960 + }, + { + "epoch": 16.84046692607004, + "grad_norm": 0.9764434695243835, + "learning_rate": 4.252975543792468e-05, + "loss": 0.3581, + "step": 12984 + }, + { + "epoch": 16.8715953307393, + "grad_norm": 2.3779425621032715, + "learning_rate": 4.2363654321300735e-05, + "loss": 0.3531, + "step": 13008 + }, + { + "epoch": 16.90272373540856, + "grad_norm": 1.463118076324463, + "learning_rate": 4.219763944622356e-05, + "loss": 0.3562, + "step": 13032 + }, + { + "epoch": 16.93385214007782, + "grad_norm": 1.756101369857788, + "learning_rate": 4.203171268759248e-05, + "loss": 0.3566, + "step": 13056 + }, + { + "epoch": 16.964980544747082, + "grad_norm": 1.5917153358459473, + "learning_rate": 4.1865875919311726e-05, + "loss": 0.3504, + "step": 13080 + }, + { + "epoch": 16.996108949416342, + "grad_norm": 2.404031753540039, + "learning_rate": 4.170013101426917e-05, + "loss": 0.3581, + "step": 13104 + }, + { + "epoch": 17.027237354085603, + "grad_norm": 1.3285900354385376, + "learning_rate": 4.153447984431527e-05, + "loss": 0.3499, + "step": 13128 + }, + { + "epoch": 17.058365758754864, + "grad_norm": 1.0520793199539185, + "learning_rate": 4.136892428024187e-05, + "loss": 0.3547, + "step": 13152 + }, + { + "epoch": 17.089494163424124, + "grad_norm": 1.0784560441970825, + "learning_rate": 4.120346619176102e-05, + "loss": 0.3525, + "step": 13176 + }, + { + "epoch": 17.120622568093385, + "grad_norm": 1.9099761247634888, + "learning_rate": 4.103810744748403e-05, + "loss": 0.3531, + "step": 13200 + }, + { + "epoch": 17.151750972762645, + "grad_norm": 1.4144366979599, + "learning_rate": 4.0872849914900175e-05, + "loss": 0.3431, + "step": 13224 + }, + { + "epoch": 17.182879377431906, + "grad_norm": 1.078682541847229, + "learning_rate": 4.070769546035571e-05, + "loss": 0.3563, + "step": 13248 + }, + { + "epoch": 17.214007782101167, + "grad_norm": 2.5183982849121094, + "learning_rate": 4.054264594903281e-05, + "loss": 0.3534, + "step": 13272 + }, + { + "epoch": 17.245136186770427, + "grad_norm": 1.3110893964767456, + "learning_rate": 4.037770324492841e-05, + "loss": 0.351, + "step": 13296 + }, + { + "epoch": 17.276264591439688, + "grad_norm": 1.4684545993804932, + "learning_rate": 4.021286921083326e-05, + "loss": 0.3525, + "step": 13320 + }, + { + "epoch": 17.30739299610895, + "grad_norm": 1.3898323774337769, + "learning_rate": 4.004814570831078e-05, + "loss": 0.353, + "step": 13344 + }, + { + "epoch": 17.33852140077821, + "grad_norm": 1.7565838098526, + "learning_rate": 3.9883534597676177e-05, + "loss": 0.3566, + "step": 13368 + }, + { + "epoch": 17.36964980544747, + "grad_norm": 1.3672667741775513, + "learning_rate": 3.971903773797528e-05, + "loss": 0.3502, + "step": 13392 + }, + { + "epoch": 17.40077821011673, + "grad_norm": 1.2242878675460815, + "learning_rate": 3.955465698696363e-05, + "loss": 0.3518, + "step": 13416 + }, + { + "epoch": 17.43190661478599, + "grad_norm": 2.410991907119751, + "learning_rate": 3.939039420108556e-05, + "loss": 0.3503, + "step": 13440 + }, + { + "epoch": 17.46303501945525, + "grad_norm": 1.4282727241516113, + "learning_rate": 3.922625123545305e-05, + "loss": 0.3488, + "step": 13464 + }, + { + "epoch": 17.494163424124515, + "grad_norm": 1.5992825031280518, + "learning_rate": 3.906222994382495e-05, + "loss": 0.3567, + "step": 13488 + }, + { + "epoch": 17.525291828793776, + "grad_norm": 2.398169994354248, + "learning_rate": 3.889833217858594e-05, + "loss": 0.3542, + "step": 13512 + }, + { + "epoch": 17.556420233463037, + "grad_norm": 1.140195608139038, + "learning_rate": 3.873455979072569e-05, + "loss": 0.3493, + "step": 13536 + }, + { + "epoch": 17.587548638132297, + "grad_norm": 1.305156946182251, + "learning_rate": 3.8570914629817886e-05, + "loss": 0.3504, + "step": 13560 + }, + { + "epoch": 17.618677042801558, + "grad_norm": 9.382534980773926, + "learning_rate": 3.840739854399934e-05, + "loss": 0.3534, + "step": 13584 + }, + { + "epoch": 17.64980544747082, + "grad_norm": 1.1403177976608276, + "learning_rate": 3.824401337994923e-05, + "loss": 0.3461, + "step": 13608 + }, + { + "epoch": 17.68093385214008, + "grad_norm": 2.1274640560150146, + "learning_rate": 3.808076098286806e-05, + "loss": 0.3521, + "step": 13632 + }, + { + "epoch": 17.71206225680934, + "grad_norm": 1.9969298839569092, + "learning_rate": 3.7917643196457e-05, + "loss": 0.3521, + "step": 13656 + }, + { + "epoch": 17.7431906614786, + "grad_norm": 1.2433438301086426, + "learning_rate": 3.775466186289693e-05, + "loss": 0.3565, + "step": 13680 + }, + { + "epoch": 17.77431906614786, + "grad_norm": 1.7864729166030884, + "learning_rate": 3.7591818822827745e-05, + "loss": 0.3508, + "step": 13704 + }, + { + "epoch": 17.80544747081712, + "grad_norm": 1.7596447467803955, + "learning_rate": 3.7429115915327484e-05, + "loss": 0.3533, + "step": 13728 + }, + { + "epoch": 17.836575875486382, + "grad_norm": 1.7605047225952148, + "learning_rate": 3.726655497789156e-05, + "loss": 0.3553, + "step": 13752 + }, + { + "epoch": 17.867704280155642, + "grad_norm": 1.5380836725234985, + "learning_rate": 3.710413784641212e-05, + "loss": 0.3526, + "step": 13776 + }, + { + "epoch": 17.898832684824903, + "grad_norm": 1.448866844177246, + "learning_rate": 3.694186635515714e-05, + "loss": 0.3516, + "step": 13800 + }, + { + "epoch": 17.929961089494164, + "grad_norm": 1.527550458908081, + "learning_rate": 3.677974233674983e-05, + "loss": 0.3438, + "step": 13824 + }, + { + "epoch": 17.961089494163424, + "grad_norm": 1.3250521421432495, + "learning_rate": 3.661776762214797e-05, + "loss": 0.3551, + "step": 13848 + }, + { + "epoch": 17.992217898832685, + "grad_norm": 1.4741333723068237, + "learning_rate": 3.6455944040623075e-05, + "loss": 0.3529, + "step": 13872 + }, + { + "epoch": 18.023346303501945, + "grad_norm": 2.2234058380126953, + "learning_rate": 3.6294273419739874e-05, + "loss": 0.3486, + "step": 13896 + }, + { + "epoch": 18.054474708171206, + "grad_norm": 1.4099419116973877, + "learning_rate": 3.613275758533561e-05, + "loss": 0.3473, + "step": 13920 + }, + { + "epoch": 18.085603112840467, + "grad_norm": 1.9094316959381104, + "learning_rate": 3.5971398361499466e-05, + "loss": 0.3548, + "step": 13944 + }, + { + "epoch": 18.116731517509727, + "grad_norm": 1.2845815420150757, + "learning_rate": 3.581019757055188e-05, + "loss": 0.345, + "step": 13968 + }, + { + "epoch": 18.147859922178988, + "grad_norm": 2.0491998195648193, + "learning_rate": 3.564915703302407e-05, + "loss": 0.3474, + "step": 13992 + }, + { + "epoch": 18.17898832684825, + "grad_norm": 1.3620078563690186, + "learning_rate": 3.5488278567637426e-05, + "loss": 0.3452, + "step": 14016 + }, + { + "epoch": 18.21011673151751, + "grad_norm": 4.295355796813965, + "learning_rate": 3.53275639912829e-05, + "loss": 0.3474, + "step": 14040 + }, + { + "epoch": 18.24124513618677, + "grad_norm": 2.150200366973877, + "learning_rate": 3.516701511900062e-05, + "loss": 0.3465, + "step": 14064 + }, + { + "epoch": 18.27237354085603, + "grad_norm": 1.407614827156067, + "learning_rate": 3.500663376395927e-05, + "loss": 0.3453, + "step": 14088 + }, + { + "epoch": 18.30350194552529, + "grad_norm": 1.2066164016723633, + "learning_rate": 3.484642173743575e-05, + "loss": 0.3477, + "step": 14112 + }, + { + "epoch": 18.33463035019455, + "grad_norm": 1.1473839282989502, + "learning_rate": 3.4686380848794544e-05, + "loss": 0.3448, + "step": 14136 + }, + { + "epoch": 18.365758754863812, + "grad_norm": 2.0838565826416016, + "learning_rate": 3.452651290546742e-05, + "loss": 0.3451, + "step": 14160 + }, + { + "epoch": 18.396887159533073, + "grad_norm": 1.3917421102523804, + "learning_rate": 3.436681971293301e-05, + "loss": 0.3442, + "step": 14184 + }, + { + "epoch": 18.428015564202333, + "grad_norm": 1.2915924787521362, + "learning_rate": 3.420730307469632e-05, + "loss": 0.3409, + "step": 14208 + }, + { + "epoch": 18.459143968871594, + "grad_norm": 2.337096691131592, + "learning_rate": 3.404796479226852e-05, + "loss": 0.3471, + "step": 14232 + }, + { + "epoch": 18.490272373540854, + "grad_norm": 1.732359528541565, + "learning_rate": 3.3888806665146374e-05, + "loss": 0.3478, + "step": 14256 + }, + { + "epoch": 18.52140077821012, + "grad_norm": 1.1314399242401123, + "learning_rate": 3.3729830490792166e-05, + "loss": 0.345, + "step": 14280 + }, + { + "epoch": 18.55252918287938, + "grad_norm": 1.5127285718917847, + "learning_rate": 3.357103806461328e-05, + "loss": 0.3405, + "step": 14304 + }, + { + "epoch": 18.58365758754864, + "grad_norm": 1.306648850440979, + "learning_rate": 3.3412431179941847e-05, + "loss": 0.3443, + "step": 14328 + }, + { + "epoch": 18.6147859922179, + "grad_norm": 1.189726710319519, + "learning_rate": 3.3254011628014656e-05, + "loss": 0.3447, + "step": 14352 + }, + { + "epoch": 18.64591439688716, + "grad_norm": 1.2058913707733154, + "learning_rate": 3.309578119795278e-05, + "loss": 0.347, + "step": 14376 + }, + { + "epoch": 18.67704280155642, + "grad_norm": 1.702572226524353, + "learning_rate": 3.293774167674149e-05, + "loss": 0.3496, + "step": 14400 + }, + { + "epoch": 18.708171206225682, + "grad_norm": 1.8515872955322266, + "learning_rate": 3.277989484920996e-05, + "loss": 0.344, + "step": 14424 + }, + { + "epoch": 18.739299610894943, + "grad_norm": 1.8190243244171143, + "learning_rate": 3.26222424980112e-05, + "loss": 0.3499, + "step": 14448 + }, + { + "epoch": 18.770428015564203, + "grad_norm": 1.261648416519165, + "learning_rate": 3.246478640360191e-05, + "loss": 0.345, + "step": 14472 + }, + { + "epoch": 18.801556420233464, + "grad_norm": 1.3052914142608643, + "learning_rate": 3.2307528344222296e-05, + "loss": 0.3505, + "step": 14496 + }, + { + "epoch": 18.832684824902724, + "grad_norm": 1.5217386484146118, + "learning_rate": 3.215047009587609e-05, + "loss": 0.3507, + "step": 14520 + }, + { + "epoch": 18.863813229571985, + "grad_norm": 1.2934740781784058, + "learning_rate": 3.1993613432310384e-05, + "loss": 0.3459, + "step": 14544 + }, + { + "epoch": 18.894941634241246, + "grad_norm": 1.5978559255599976, + "learning_rate": 3.183696012499574e-05, + "loss": 0.3464, + "step": 14568 + }, + { + "epoch": 18.926070038910506, + "grad_norm": 1.2306820154190063, + "learning_rate": 3.168051194310609e-05, + "loss": 0.3446, + "step": 14592 + }, + { + "epoch": 18.957198443579767, + "grad_norm": 1.1488240957260132, + "learning_rate": 3.152427065349867e-05, + "loss": 0.3475, + "step": 14616 + }, + { + "epoch": 18.988326848249027, + "grad_norm": 3.1832704544067383, + "learning_rate": 3.1368238020694316e-05, + "loss": 0.3437, + "step": 14640 + }, + { + "epoch": 19.019455252918288, + "grad_norm": 2.3371617794036865, + "learning_rate": 3.121241580685727e-05, + "loss": 0.3465, + "step": 14664 + }, + { + "epoch": 19.05058365758755, + "grad_norm": 2.816099166870117, + "learning_rate": 3.1056805771775436e-05, + "loss": 0.3435, + "step": 14688 + }, + { + "epoch": 19.08171206225681, + "grad_norm": 1.3421522378921509, + "learning_rate": 3.090140967284046e-05, + "loss": 0.3418, + "step": 14712 + }, + { + "epoch": 19.11284046692607, + "grad_norm": 1.8488672971725464, + "learning_rate": 3.07462292650279e-05, + "loss": 0.348, + "step": 14736 + }, + { + "epoch": 19.14396887159533, + "grad_norm": 1.2293037176132202, + "learning_rate": 3.05912663008774e-05, + "loss": 0.342, + "step": 14760 + }, + { + "epoch": 19.17509727626459, + "grad_norm": 1.7620015144348145, + "learning_rate": 3.043652253047281e-05, + "loss": 0.3454, + "step": 14784 + }, + { + "epoch": 19.20622568093385, + "grad_norm": 1.6479402780532837, + "learning_rate": 3.0281999701422637e-05, + "loss": 0.3427, + "step": 14808 + }, + { + "epoch": 19.237354085603112, + "grad_norm": 1.5058902502059937, + "learning_rate": 3.012769955884005e-05, + "loss": 0.3328, + "step": 14832 + }, + { + "epoch": 19.268482490272373, + "grad_norm": 1.6616445779800415, + "learning_rate": 2.9973623845323347e-05, + "loss": 0.3441, + "step": 14856 + }, + { + "epoch": 19.299610894941633, + "grad_norm": 1.5390020608901978, + "learning_rate": 2.9819774300936255e-05, + "loss": 0.3434, + "step": 14880 + }, + { + "epoch": 19.330739299610894, + "grad_norm": 1.7172026634216309, + "learning_rate": 2.9666152663188172e-05, + "loss": 0.3439, + "step": 14904 + }, + { + "epoch": 19.361867704280154, + "grad_norm": 1.134320855140686, + "learning_rate": 2.9512760667014682e-05, + "loss": 0.3431, + "step": 14928 + }, + { + "epoch": 19.392996108949415, + "grad_norm": 4.418805122375488, + "learning_rate": 2.935960004475784e-05, + "loss": 0.344, + "step": 14952 + }, + { + "epoch": 19.424124513618676, + "grad_norm": 1.3951141834259033, + "learning_rate": 2.920667252614674e-05, + "loss": 0.3334, + "step": 14976 + }, + { + "epoch": 19.455252918287936, + "grad_norm": 2.0081377029418945, + "learning_rate": 2.9053979838277834e-05, + "loss": 0.3413, + "step": 15000 + }, + { + "epoch": 19.486381322957197, + "grad_norm": 1.0862860679626465, + "learning_rate": 2.890152370559552e-05, + "loss": 0.3406, + "step": 15024 + }, + { + "epoch": 19.51750972762646, + "grad_norm": 1.3487762212753296, + "learning_rate": 2.8749305849872686e-05, + "loss": 0.3335, + "step": 15048 + }, + { + "epoch": 19.54863813229572, + "grad_norm": 1.122753381729126, + "learning_rate": 2.8597327990191146e-05, + "loss": 0.3491, + "step": 15072 + }, + { + "epoch": 19.579766536964982, + "grad_norm": 1.518355131149292, + "learning_rate": 2.844559184292239e-05, + "loss": 0.3405, + "step": 15096 + }, + { + "epoch": 19.610894941634243, + "grad_norm": 1.0469350814819336, + "learning_rate": 2.829409912170806e-05, + "loss": 0.3395, + "step": 15120 + }, + { + "epoch": 19.642023346303503, + "grad_norm": 1.915490984916687, + "learning_rate": 2.814285153744064e-05, + "loss": 0.3426, + "step": 15144 + }, + { + "epoch": 19.673151750972764, + "grad_norm": 1.477184772491455, + "learning_rate": 2.7991850798244197e-05, + "loss": 0.3463, + "step": 15168 + }, + { + "epoch": 19.704280155642024, + "grad_norm": 1.3598774671554565, + "learning_rate": 2.7841098609454976e-05, + "loss": 0.3454, + "step": 15192 + }, + { + "epoch": 19.735408560311285, + "grad_norm": 2.6406991481781006, + "learning_rate": 2.769059667360227e-05, + "loss": 0.3422, + "step": 15216 + }, + { + "epoch": 19.766536964980546, + "grad_norm": 1.2698395252227783, + "learning_rate": 2.754034669038905e-05, + "loss": 0.3473, + "step": 15240 + }, + { + "epoch": 19.797665369649806, + "grad_norm": 1.3700004816055298, + "learning_rate": 2.7390350356672934e-05, + "loss": 0.3434, + "step": 15264 + }, + { + "epoch": 19.828793774319067, + "grad_norm": 1.1726247072219849, + "learning_rate": 2.7240609366446845e-05, + "loss": 0.3421, + "step": 15288 + }, + { + "epoch": 19.859922178988327, + "grad_norm": 1.5183639526367188, + "learning_rate": 2.709112541082e-05, + "loss": 0.3418, + "step": 15312 + }, + { + "epoch": 19.891050583657588, + "grad_norm": 1.1311919689178467, + "learning_rate": 2.6941900177998824e-05, + "loss": 0.3411, + "step": 15336 + }, + { + "epoch": 19.92217898832685, + "grad_norm": 1.6014869213104248, + "learning_rate": 2.6792935353267757e-05, + "loss": 0.339, + "step": 15360 + }, + { + "epoch": 19.95330739299611, + "grad_norm": 1.8378218412399292, + "learning_rate": 2.6644232618970382e-05, + "loss": 0.3464, + "step": 15384 + }, + { + "epoch": 19.98443579766537, + "grad_norm": 2.1291933059692383, + "learning_rate": 2.6495793654490292e-05, + "loss": 0.3409, + "step": 15408 + }, + { + "epoch": 20.01556420233463, + "grad_norm": 1.1774524450302124, + "learning_rate": 2.6347620136232232e-05, + "loss": 0.339, + "step": 15432 + }, + { + "epoch": 20.04669260700389, + "grad_norm": 1.3319616317749023, + "learning_rate": 2.6199713737603055e-05, + "loss": 0.3376, + "step": 15456 + }, + { + "epoch": 20.07782101167315, + "grad_norm": 1.488239049911499, + "learning_rate": 2.60520761289929e-05, + "loss": 0.3379, + "step": 15480 + }, + { + "epoch": 20.108949416342412, + "grad_norm": 1.2733827829360962, + "learning_rate": 2.590470897775636e-05, + "loss": 0.3352, + "step": 15504 + }, + { + "epoch": 20.140077821011673, + "grad_norm": 2.291374921798706, + "learning_rate": 2.575761394819351e-05, + "loss": 0.3395, + "step": 15528 + }, + { + "epoch": 20.171206225680933, + "grad_norm": 1.3169567584991455, + "learning_rate": 2.5610792701531298e-05, + "loss": 0.3365, + "step": 15552 + }, + { + "epoch": 20.202334630350194, + "grad_norm": 1.0463300943374634, + "learning_rate": 2.54642468959046e-05, + "loss": 0.337, + "step": 15576 + }, + { + "epoch": 20.233463035019454, + "grad_norm": 1.5346705913543701, + "learning_rate": 2.5317978186337664e-05, + "loss": 0.3394, + "step": 15600 + }, + { + "epoch": 20.264591439688715, + "grad_norm": 1.6092703342437744, + "learning_rate": 2.5171988224725267e-05, + "loss": 0.3308, + "step": 15624 + }, + { + "epoch": 20.295719844357976, + "grad_norm": 1.3011606931686401, + "learning_rate": 2.5026278659814144e-05, + "loss": 0.339, + "step": 15648 + }, + { + "epoch": 20.326848249027236, + "grad_norm": 1.2459102869033813, + "learning_rate": 2.4880851137184403e-05, + "loss": 0.3308, + "step": 15672 + }, + { + "epoch": 20.357976653696497, + "grad_norm": 1.4810408353805542, + "learning_rate": 2.4735707299230808e-05, + "loss": 0.3376, + "step": 15696 + }, + { + "epoch": 20.389105058365757, + "grad_norm": 1.2645267248153687, + "learning_rate": 2.4590848785144386e-05, + "loss": 0.3402, + "step": 15720 + }, + { + "epoch": 20.420233463035018, + "grad_norm": 2.001779556274414, + "learning_rate": 2.4446277230893823e-05, + "loss": 0.3358, + "step": 15744 + }, + { + "epoch": 20.45136186770428, + "grad_norm": 3.0970067977905273, + "learning_rate": 2.4301994269206968e-05, + "loss": 0.334, + "step": 15768 + }, + { + "epoch": 20.48249027237354, + "grad_norm": 1.4983640909194946, + "learning_rate": 2.415800152955247e-05, + "loss": 0.3424, + "step": 15792 + }, + { + "epoch": 20.5136186770428, + "grad_norm": 1.3392024040222168, + "learning_rate": 2.40143006381213e-05, + "loss": 0.3463, + "step": 15816 + }, + { + "epoch": 20.544747081712064, + "grad_norm": 1.4383450746536255, + "learning_rate": 2.3870893217808495e-05, + "loss": 0.3354, + "step": 15840 + }, + { + "epoch": 20.575875486381324, + "grad_norm": 1.4223530292510986, + "learning_rate": 2.3727780888194658e-05, + "loss": 0.333, + "step": 15864 + }, + { + "epoch": 20.607003891050585, + "grad_norm": 1.5441044569015503, + "learning_rate": 2.3584965265527847e-05, + "loss": 0.3335, + "step": 15888 + }, + { + "epoch": 20.638132295719846, + "grad_norm": 0.8291170597076416, + "learning_rate": 2.344244796270524e-05, + "loss": 0.3389, + "step": 15912 + }, + { + "epoch": 20.669260700389106, + "grad_norm": 2.7805609703063965, + "learning_rate": 2.330023058925486e-05, + "loss": 0.3353, + "step": 15936 + }, + { + "epoch": 20.700389105058367, + "grad_norm": 1.6097582578659058, + "learning_rate": 2.3158314751317513e-05, + "loss": 0.339, + "step": 15960 + }, + { + "epoch": 20.731517509727627, + "grad_norm": 1.4149878025054932, + "learning_rate": 2.3016702051628547e-05, + "loss": 0.3375, + "step": 15984 + }, + { + "epoch": 20.762645914396888, + "grad_norm": 1.2236443758010864, + "learning_rate": 2.2875394089499847e-05, + "loss": 0.3358, + "step": 16008 + }, + { + "epoch": 20.79377431906615, + "grad_norm": 1.0645393133163452, + "learning_rate": 2.2734392460801727e-05, + "loss": 0.3377, + "step": 16032 + }, + { + "epoch": 20.82490272373541, + "grad_norm": 1.2843340635299683, + "learning_rate": 2.259369875794485e-05, + "loss": 0.3332, + "step": 16056 + }, + { + "epoch": 20.85603112840467, + "grad_norm": 1.735514760017395, + "learning_rate": 2.2453314569862366e-05, + "loss": 0.3364, + "step": 16080 + }, + { + "epoch": 20.88715953307393, + "grad_norm": 1.3856208324432373, + "learning_rate": 2.2313241481991855e-05, + "loss": 0.3389, + "step": 16104 + }, + { + "epoch": 20.91828793774319, + "grad_norm": 1.7546725273132324, + "learning_rate": 2.217348107625748e-05, + "loss": 0.3373, + "step": 16128 + }, + { + "epoch": 20.94941634241245, + "grad_norm": 1.3664530515670776, + "learning_rate": 2.2034034931052096e-05, + "loss": 0.3398, + "step": 16152 + }, + { + "epoch": 20.980544747081712, + "grad_norm": 5.165532112121582, + "learning_rate": 2.1894904621219463e-05, + "loss": 0.3372, + "step": 16176 + }, + { + "epoch": 21.011673151750973, + "grad_norm": 1.3261635303497314, + "learning_rate": 2.175609171803644e-05, + "loss": 0.3381, + "step": 16200 + }, + { + "epoch": 21.042801556420233, + "grad_norm": 1.8854881525039673, + "learning_rate": 2.1617597789195193e-05, + "loss": 0.3347, + "step": 16224 + }, + { + "epoch": 21.073929961089494, + "grad_norm": 1.3904035091400146, + "learning_rate": 2.1479424398785573e-05, + "loss": 0.3346, + "step": 16248 + }, + { + "epoch": 21.105058365758754, + "grad_norm": 1.318601369857788, + "learning_rate": 2.1341573107277392e-05, + "loss": 0.3347, + "step": 16272 + }, + { + "epoch": 21.136186770428015, + "grad_norm": 1.0564274787902832, + "learning_rate": 2.1204045471502803e-05, + "loss": 0.3295, + "step": 16296 + }, + { + "epoch": 21.167315175097276, + "grad_norm": 0.9953235387802124, + "learning_rate": 2.106684304463874e-05, + "loss": 0.3339, + "step": 16320 + }, + { + "epoch": 21.198443579766536, + "grad_norm": 1.0253063440322876, + "learning_rate": 2.092996737618939e-05, + "loss": 0.3271, + "step": 16344 + }, + { + "epoch": 21.229571984435797, + "grad_norm": 1.5001134872436523, + "learning_rate": 2.079342001196869e-05, + "loss": 0.3359, + "step": 16368 + }, + { + "epoch": 21.260700389105057, + "grad_norm": 1.1106650829315186, + "learning_rate": 2.0657202494082773e-05, + "loss": 0.327, + "step": 16392 + }, + { + "epoch": 21.291828793774318, + "grad_norm": 1.0053423643112183, + "learning_rate": 2.052131636091273e-05, + "loss": 0.3398, + "step": 16416 + }, + { + "epoch": 21.32295719844358, + "grad_norm": 1.3083621263504028, + "learning_rate": 2.038576314709707e-05, + "loss": 0.3306, + "step": 16440 + }, + { + "epoch": 21.35408560311284, + "grad_norm": 1.4561755657196045, + "learning_rate": 2.0250544383514457e-05, + "loss": 0.3364, + "step": 16464 + }, + { + "epoch": 21.3852140077821, + "grad_norm": 1.0885835886001587, + "learning_rate": 2.0115661597266476e-05, + "loss": 0.3355, + "step": 16488 + }, + { + "epoch": 21.41634241245136, + "grad_norm": 1.3506430387496948, + "learning_rate": 1.998111631166027e-05, + "loss": 0.3334, + "step": 16512 + }, + { + "epoch": 21.44747081712062, + "grad_norm": 1.0331530570983887, + "learning_rate": 1.9846910046191446e-05, + "loss": 0.3303, + "step": 16536 + }, + { + "epoch": 21.47859922178988, + "grad_norm": 1.0616254806518555, + "learning_rate": 1.9713044316526813e-05, + "loss": 0.3348, + "step": 16560 + }, + { + "epoch": 21.509727626459146, + "grad_norm": 2.5577657222747803, + "learning_rate": 1.9579520634487386e-05, + "loss": 0.335, + "step": 16584 + }, + { + "epoch": 21.540856031128406, + "grad_norm": 1.5290476083755493, + "learning_rate": 1.9446340508031185e-05, + "loss": 0.3382, + "step": 16608 + }, + { + "epoch": 21.571984435797667, + "grad_norm": 0.8804724216461182, + "learning_rate": 1.931350544123627e-05, + "loss": 0.3257, + "step": 16632 + }, + { + "epoch": 21.603112840466927, + "grad_norm": 1.1799284219741821, + "learning_rate": 1.918101693428379e-05, + "loss": 0.3298, + "step": 16656 + }, + { + "epoch": 21.634241245136188, + "grad_norm": 1.3328742980957031, + "learning_rate": 1.9048876483440942e-05, + "loss": 0.3373, + "step": 16680 + }, + { + "epoch": 21.66536964980545, + "grad_norm": 0.9985073208808899, + "learning_rate": 1.8917085581044193e-05, + "loss": 0.3313, + "step": 16704 + }, + { + "epoch": 21.69649805447471, + "grad_norm": 1.498244047164917, + "learning_rate": 1.8785645715482285e-05, + "loss": 0.3303, + "step": 16728 + }, + { + "epoch": 21.72762645914397, + "grad_norm": 1.6468580961227417, + "learning_rate": 1.8654558371179583e-05, + "loss": 0.3252, + "step": 16752 + }, + { + "epoch": 21.75875486381323, + "grad_norm": 1.6541725397109985, + "learning_rate": 1.8523825028579212e-05, + "loss": 0.3299, + "step": 16776 + }, + { + "epoch": 21.78988326848249, + "grad_norm": 0.9805202484130859, + "learning_rate": 1.8393447164126282e-05, + "loss": 0.3342, + "step": 16800 + }, + { + "epoch": 21.82101167315175, + "grad_norm": 0.9097315073013306, + "learning_rate": 1.8263426250251388e-05, + "loss": 0.3309, + "step": 16824 + }, + { + "epoch": 21.852140077821012, + "grad_norm": 1.2603996992111206, + "learning_rate": 1.8133763755353816e-05, + "loss": 0.3387, + "step": 16848 + }, + { + "epoch": 21.883268482490273, + "grad_norm": 1.0283710956573486, + "learning_rate": 1.800446114378508e-05, + "loss": 0.3325, + "step": 16872 + }, + { + "epoch": 21.914396887159533, + "grad_norm": 2.601137399673462, + "learning_rate": 1.7875519875832254e-05, + "loss": 0.3356, + "step": 16896 + }, + { + "epoch": 21.945525291828794, + "grad_norm": 1.0405902862548828, + "learning_rate": 1.774694140770163e-05, + "loss": 0.3339, + "step": 16920 + }, + { + "epoch": 21.976653696498055, + "grad_norm": 1.504928708076477, + "learning_rate": 1.7618727191502188e-05, + "loss": 0.3329, + "step": 16944 + }, + { + "epoch": 22.007782101167315, + "grad_norm": 1.1356394290924072, + "learning_rate": 1.749087867522912e-05, + "loss": 0.331, + "step": 16968 + }, + { + "epoch": 22.038910505836576, + "grad_norm": 1.3053059577941895, + "learning_rate": 1.7363397302747687e-05, + "loss": 0.3316, + "step": 16992 + }, + { + "epoch": 22.070038910505836, + "grad_norm": 1.8512986898422241, + "learning_rate": 1.723628451377669e-05, + "loss": 0.3286, + "step": 17016 + }, + { + "epoch": 22.101167315175097, + "grad_norm": 1.1379419565200806, + "learning_rate": 1.7109541743872366e-05, + "loss": 0.3311, + "step": 17040 + }, + { + "epoch": 22.132295719844358, + "grad_norm": 1.0137568712234497, + "learning_rate": 1.698317042441211e-05, + "loss": 0.3294, + "step": 17064 + }, + { + "epoch": 22.163424124513618, + "grad_norm": 1.1163158416748047, + "learning_rate": 1.6857171982578286e-05, + "loss": 0.3247, + "step": 17088 + }, + { + "epoch": 22.19455252918288, + "grad_norm": 0.992064893245697, + "learning_rate": 1.6731547841342193e-05, + "loss": 0.3331, + "step": 17112 + }, + { + "epoch": 22.22568093385214, + "grad_norm": 1.2021843194961548, + "learning_rate": 1.6606299419447894e-05, + "loss": 0.3284, + "step": 17136 + }, + { + "epoch": 22.2568093385214, + "grad_norm": 2.352348566055298, + "learning_rate": 1.6481428131396275e-05, + "loss": 0.3315, + "step": 17160 + }, + { + "epoch": 22.28793774319066, + "grad_norm": 1.283078908920288, + "learning_rate": 1.6356935387428996e-05, + "loss": 0.3262, + "step": 17184 + }, + { + "epoch": 22.31906614785992, + "grad_norm": 1.2125391960144043, + "learning_rate": 1.6232822593512654e-05, + "loss": 0.3312, + "step": 17208 + }, + { + "epoch": 22.35019455252918, + "grad_norm": 1.2397364377975464, + "learning_rate": 1.610909115132286e-05, + "loss": 0.3268, + "step": 17232 + }, + { + "epoch": 22.381322957198442, + "grad_norm": 1.4817135334014893, + "learning_rate": 1.5985742458228338e-05, + "loss": 0.3283, + "step": 17256 + }, + { + "epoch": 22.412451361867703, + "grad_norm": 2.0548017024993896, + "learning_rate": 1.58627779072753e-05, + "loss": 0.3249, + "step": 17280 + }, + { + "epoch": 22.443579766536963, + "grad_norm": 1.4913387298583984, + "learning_rate": 1.574019888717155e-05, + "loss": 0.3277, + "step": 17304 + }, + { + "epoch": 22.474708171206224, + "grad_norm": 1.2476876974105835, + "learning_rate": 1.5618006782270904e-05, + "loss": 0.3298, + "step": 17328 + }, + { + "epoch": 22.505836575875485, + "grad_norm": 1.2181342840194702, + "learning_rate": 1.5496202972557556e-05, + "loss": 0.329, + "step": 17352 + }, + { + "epoch": 22.53696498054475, + "grad_norm": 1.3082391023635864, + "learning_rate": 1.5374788833630404e-05, + "loss": 0.328, + "step": 17376 + }, + { + "epoch": 22.56809338521401, + "grad_norm": 1.217458963394165, + "learning_rate": 1.5253765736687636e-05, + "loss": 0.3273, + "step": 17400 + }, + { + "epoch": 22.59922178988327, + "grad_norm": 1.1426113843917847, + "learning_rate": 1.5133135048511127e-05, + "loss": 0.3314, + "step": 17424 + }, + { + "epoch": 22.63035019455253, + "grad_norm": 1.8684285879135132, + "learning_rate": 1.5012898131451114e-05, + "loss": 0.3301, + "step": 17448 + }, + { + "epoch": 22.66147859922179, + "grad_norm": 1.1370235681533813, + "learning_rate": 1.489305634341071e-05, + "loss": 0.3315, + "step": 17472 + }, + { + "epoch": 22.69260700389105, + "grad_norm": 1.1359672546386719, + "learning_rate": 1.4773611037830626e-05, + "loss": 0.3283, + "step": 17496 + }, + { + "epoch": 22.723735408560312, + "grad_norm": 1.3090800046920776, + "learning_rate": 1.4654563563673901e-05, + "loss": 0.3282, + "step": 17520 + }, + { + "epoch": 22.754863813229573, + "grad_norm": 1.2736905813217163, + "learning_rate": 1.4535915265410593e-05, + "loss": 0.33, + "step": 17544 + }, + { + "epoch": 22.785992217898833, + "grad_norm": 1.189782977104187, + "learning_rate": 1.4417667483002688e-05, + "loss": 0.3267, + "step": 17568 + }, + { + "epoch": 22.817120622568094, + "grad_norm": 2.092562437057495, + "learning_rate": 1.4299821551888881e-05, + "loss": 0.3276, + "step": 17592 + }, + { + "epoch": 22.848249027237355, + "grad_norm": 1.8085280656814575, + "learning_rate": 1.4182378802969582e-05, + "loss": 0.3267, + "step": 17616 + }, + { + "epoch": 22.879377431906615, + "grad_norm": 1.2389247417449951, + "learning_rate": 1.4065340562591784e-05, + "loss": 0.3322, + "step": 17640 + }, + { + "epoch": 22.910505836575876, + "grad_norm": 2.3639073371887207, + "learning_rate": 1.3948708152534162e-05, + "loss": 0.3286, + "step": 17664 + }, + { + "epoch": 22.941634241245136, + "grad_norm": 1.4584684371948242, + "learning_rate": 1.3832482889992138e-05, + "loss": 0.3275, + "step": 17688 + }, + { + "epoch": 22.972762645914397, + "grad_norm": 1.2135454416275024, + "learning_rate": 1.3716666087562951e-05, + "loss": 0.3331, + "step": 17712 + }, + { + "epoch": 23.003891050583658, + "grad_norm": 1.1459728479385376, + "learning_rate": 1.3601259053230924e-05, + "loss": 0.3259, + "step": 17736 + }, + { + "epoch": 23.035019455252918, + "grad_norm": 1.1459057331085205, + "learning_rate": 1.3486263090352563e-05, + "loss": 0.3229, + "step": 17760 + }, + { + "epoch": 23.06614785992218, + "grad_norm": 1.3186362981796265, + "learning_rate": 1.3371679497641997e-05, + "loss": 0.3242, + "step": 17784 + }, + { + "epoch": 23.09727626459144, + "grad_norm": 0.9882354736328125, + "learning_rate": 1.3257509569156162e-05, + "loss": 0.3263, + "step": 17808 + }, + { + "epoch": 23.1284046692607, + "grad_norm": 1.146543264389038, + "learning_rate": 1.3143754594280266e-05, + "loss": 0.3239, + "step": 17832 + }, + { + "epoch": 23.15953307392996, + "grad_norm": 1.5829049348831177, + "learning_rate": 1.3030415857713246e-05, + "loss": 0.3274, + "step": 17856 + }, + { + "epoch": 23.19066147859922, + "grad_norm": 1.1690993309020996, + "learning_rate": 1.2917494639453171e-05, + "loss": 0.3266, + "step": 17880 + }, + { + "epoch": 23.22178988326848, + "grad_norm": 2.0189902782440186, + "learning_rate": 1.280499221478289e-05, + "loss": 0.3277, + "step": 17904 + }, + { + "epoch": 23.252918287937742, + "grad_norm": 2.8502254486083984, + "learning_rate": 1.269290985425557e-05, + "loss": 0.3309, + "step": 17928 + }, + { + "epoch": 23.284046692607003, + "grad_norm": 1.144399881362915, + "learning_rate": 1.2581248823680336e-05, + "loss": 0.3302, + "step": 17952 + }, + { + "epoch": 23.315175097276263, + "grad_norm": 1.0023480653762817, + "learning_rate": 1.2470010384108012e-05, + "loss": 0.3259, + "step": 17976 + }, + { + "epoch": 23.346303501945524, + "grad_norm": 1.0780220031738281, + "learning_rate": 1.2359195791816841e-05, + "loss": 0.3274, + "step": 18000 + }, + { + "epoch": 23.377431906614785, + "grad_norm": 1.4481017589569092, + "learning_rate": 1.2248806298298372e-05, + "loss": 0.3191, + "step": 18024 + }, + { + "epoch": 23.408560311284045, + "grad_norm": 0.9282727837562561, + "learning_rate": 1.2138843150243212e-05, + "loss": 0.326, + "step": 18048 + }, + { + "epoch": 23.439688715953306, + "grad_norm": 1.2329308986663818, + "learning_rate": 1.2029307589527062e-05, + "loss": 0.3245, + "step": 18072 + }, + { + "epoch": 23.470817120622566, + "grad_norm": 1.535043478012085, + "learning_rate": 1.1920200853196623e-05, + "loss": 0.3273, + "step": 18096 + }, + { + "epoch": 23.50194552529183, + "grad_norm": 1.5993396043777466, + "learning_rate": 1.1811524173455618e-05, + "loss": 0.3242, + "step": 18120 + }, + { + "epoch": 23.53307392996109, + "grad_norm": 2.646594762802124, + "learning_rate": 1.1703278777650929e-05, + "loss": 0.3323, + "step": 18144 + }, + { + "epoch": 23.56420233463035, + "grad_norm": 1.254061222076416, + "learning_rate": 1.1595465888258661e-05, + "loss": 0.3238, + "step": 18168 + }, + { + "epoch": 23.595330739299612, + "grad_norm": 1.3275645971298218, + "learning_rate": 1.1488086722870439e-05, + "loss": 0.328, + "step": 18192 + }, + { + "epoch": 23.626459143968873, + "grad_norm": 1.366665244102478, + "learning_rate": 1.1381142494179586e-05, + "loss": 0.3275, + "step": 18216 + }, + { + "epoch": 23.657587548638134, + "grad_norm": 1.2128342390060425, + "learning_rate": 1.1274634409967389e-05, + "loss": 0.3247, + "step": 18240 + }, + { + "epoch": 23.688715953307394, + "grad_norm": 1.168764591217041, + "learning_rate": 1.1168563673089589e-05, + "loss": 0.3239, + "step": 18264 + }, + { + "epoch": 23.719844357976655, + "grad_norm": 1.2446372509002686, + "learning_rate": 1.1062931481462647e-05, + "loss": 0.32, + "step": 18288 + }, + { + "epoch": 23.750972762645915, + "grad_norm": 1.4571527242660522, + "learning_rate": 1.095773902805033e-05, + "loss": 0.3272, + "step": 18312 + }, + { + "epoch": 23.782101167315176, + "grad_norm": 1.1576392650604248, + "learning_rate": 1.0852987500850148e-05, + "loss": 0.3251, + "step": 18336 + }, + { + "epoch": 23.813229571984436, + "grad_norm": 1.3691147565841675, + "learning_rate": 1.0748678082880049e-05, + "loss": 0.3253, + "step": 18360 + }, + { + "epoch": 23.844357976653697, + "grad_norm": 1.859039068222046, + "learning_rate": 1.0644811952164957e-05, + "loss": 0.3293, + "step": 18384 + }, + { + "epoch": 23.875486381322958, + "grad_norm": 1.2036535739898682, + "learning_rate": 1.0541390281723478e-05, + "loss": 0.3269, + "step": 18408 + }, + { + "epoch": 23.90661478599222, + "grad_norm": 1.459100365638733, + "learning_rate": 1.043841423955474e-05, + "loss": 0.3276, + "step": 18432 + }, + { + "epoch": 23.93774319066148, + "grad_norm": 1.2927861213684082, + "learning_rate": 1.0335884988625084e-05, + "loss": 0.3263, + "step": 18456 + }, + { + "epoch": 23.96887159533074, + "grad_norm": 1.4151058197021484, + "learning_rate": 1.0233803686855014e-05, + "loss": 0.321, + "step": 18480 + }, + { + "epoch": 24.0, + "grad_norm": 1.434226393699646, + "learning_rate": 1.0132171487106068e-05, + "loss": 0.3202, + "step": 18504 + }, + { + "epoch": 24.03112840466926, + "grad_norm": 1.2331753969192505, + "learning_rate": 1.0030989537167857e-05, + "loss": 0.3242, + "step": 18528 + }, + { + "epoch": 24.06225680933852, + "grad_norm": 1.6305173635482788, + "learning_rate": 9.930258979745055e-06, + "loss": 0.3221, + "step": 18552 + }, + { + "epoch": 24.09338521400778, + "grad_norm": 1.1515713930130005, + "learning_rate": 9.82998095244449e-06, + "loss": 0.3217, + "step": 18576 + }, + { + "epoch": 24.124513618677042, + "grad_norm": 1.1086283922195435, + "learning_rate": 9.730156587762335e-06, + "loss": 0.3225, + "step": 18600 + }, + { + "epoch": 24.155642023346303, + "grad_norm": 1.256364107131958, + "learning_rate": 9.630787013071286e-06, + "loss": 0.3218, + "step": 18624 + }, + { + "epoch": 24.186770428015564, + "grad_norm": 1.2893520593643188, + "learning_rate": 9.531873350607823e-06, + "loss": 0.3285, + "step": 18648 + }, + { + "epoch": 24.217898832684824, + "grad_norm": 1.1564453840255737, + "learning_rate": 9.433416717459592e-06, + "loss": 0.3234, + "step": 18672 + }, + { + "epoch": 24.249027237354085, + "grad_norm": 1.6299091577529907, + "learning_rate": 9.3354182255527e-06, + "loss": 0.3237, + "step": 18696 + }, + { + "epoch": 24.280155642023345, + "grad_norm": 0.9497871994972229, + "learning_rate": 9.237878981639264e-06, + "loss": 0.3226, + "step": 18720 + }, + { + "epoch": 24.311284046692606, + "grad_norm": 1.3882777690887451, + "learning_rate": 9.140800087284801e-06, + "loss": 0.322, + "step": 18744 + }, + { + "epoch": 24.342412451361866, + "grad_norm": 1.1506375074386597, + "learning_rate": 9.044182638855891e-06, + "loss": 0.3274, + "step": 18768 + }, + { + "epoch": 24.373540856031127, + "grad_norm": 0.8968532681465149, + "learning_rate": 8.948027727507708e-06, + "loss": 0.319, + "step": 18792 + }, + { + "epoch": 24.404669260700388, + "grad_norm": 1.5157815217971802, + "learning_rate": 8.852336439171733e-06, + "loss": 0.3254, + "step": 18816 + }, + { + "epoch": 24.43579766536965, + "grad_norm": 0.9984537959098816, + "learning_rate": 8.757109854543533e-06, + "loss": 0.3244, + "step": 18840 + }, + { + "epoch": 24.46692607003891, + "grad_norm": 1.8151588439941406, + "learning_rate": 8.662349049070463e-06, + "loss": 0.3198, + "step": 18864 + }, + { + "epoch": 24.49805447470817, + "grad_norm": 1.1167311668395996, + "learning_rate": 8.568055092939615e-06, + "loss": 0.3179, + "step": 18888 + }, + { + "epoch": 24.529182879377434, + "grad_norm": 1.3895347118377686, + "learning_rate": 8.474229051065657e-06, + "loss": 0.3211, + "step": 18912 + }, + { + "epoch": 24.560311284046694, + "grad_norm": 1.2524361610412598, + "learning_rate": 8.38087198307887e-06, + "loss": 0.32, + "step": 18936 + }, + { + "epoch": 24.591439688715955, + "grad_norm": 1.389087200164795, + "learning_rate": 8.287984943313114e-06, + "loss": 0.3251, + "step": 18960 + }, + { + "epoch": 24.622568093385215, + "grad_norm": 1.6150294542312622, + "learning_rate": 8.195568980793967e-06, + "loss": 0.3275, + "step": 18984 + }, + { + "epoch": 24.653696498054476, + "grad_norm": 1.6251153945922852, + "learning_rate": 8.103625139226895e-06, + "loss": 0.3225, + "step": 19008 + }, + { + "epoch": 24.684824902723737, + "grad_norm": 1.5373034477233887, + "learning_rate": 8.012154456985388e-06, + "loss": 0.3253, + "step": 19032 + }, + { + "epoch": 24.715953307392997, + "grad_norm": 0.9456262588500977, + "learning_rate": 7.921157967099336e-06, + "loss": 0.3151, + "step": 19056 + }, + { + "epoch": 24.747081712062258, + "grad_norm": 0.9828768372535706, + "learning_rate": 7.830636697243254e-06, + "loss": 0.3252, + "step": 19080 + }, + { + "epoch": 24.77821011673152, + "grad_norm": 1.8610461950302124, + "learning_rate": 7.740591669724772e-06, + "loss": 0.325, + "step": 19104 + }, + { + "epoch": 24.80933852140078, + "grad_norm": 1.8049260377883911, + "learning_rate": 7.651023901473032e-06, + "loss": 0.3204, + "step": 19128 + }, + { + "epoch": 24.84046692607004, + "grad_norm": 1.1601166725158691, + "learning_rate": 7.561934404027193e-06, + "loss": 0.3231, + "step": 19152 + }, + { + "epoch": 24.8715953307393, + "grad_norm": 1.2389658689498901, + "learning_rate": 7.473324183525088e-06, + "loss": 0.329, + "step": 19176 + }, + { + "epoch": 24.90272373540856, + "grad_norm": 1.0001511573791504, + "learning_rate": 7.385194240691751e-06, + "loss": 0.319, + "step": 19200 + }, + { + "epoch": 24.93385214007782, + "grad_norm": 1.7757816314697266, + "learning_rate": 7.297545570828207e-06, + "loss": 0.3267, + "step": 19224 + }, + { + "epoch": 24.964980544747082, + "grad_norm": 1.1014970541000366, + "learning_rate": 7.210379163800185e-06, + "loss": 0.3223, + "step": 19248 + }, + { + "epoch": 24.996108949416342, + "grad_norm": 1.6188836097717285, + "learning_rate": 7.123696004026947e-06, + "loss": 0.3227, + "step": 19272 + }, + { + "epoch": 25.027237354085603, + "grad_norm": 1.2841421365737915, + "learning_rate": 7.037497070470167e-06, + "loss": 0.32, + "step": 19296 + }, + { + "epoch": 25.058365758754864, + "grad_norm": 1.2222139835357666, + "learning_rate": 6.951783336622864e-06, + "loss": 0.3217, + "step": 19320 + }, + { + "epoch": 25.089494163424124, + "grad_norm": 1.0179907083511353, + "learning_rate": 6.866555770498473e-06, + "loss": 0.3182, + "step": 19344 + }, + { + "epoch": 25.120622568093385, + "grad_norm": 0.9595916271209717, + "learning_rate": 6.781815334619812e-06, + "loss": 0.3195, + "step": 19368 + }, + { + "epoch": 25.151750972762645, + "grad_norm": 1.2857320308685303, + "learning_rate": 6.6975629860082935e-06, + "loss": 0.3177, + "step": 19392 + }, + { + "epoch": 25.182879377431906, + "grad_norm": 1.7358510494232178, + "learning_rate": 6.613799676173088e-06, + "loss": 0.3208, + "step": 19416 + }, + { + "epoch": 25.214007782101167, + "grad_norm": 1.8369121551513672, + "learning_rate": 6.530526351100347e-06, + "loss": 0.3196, + "step": 19440 + }, + { + "epoch": 25.245136186770427, + "grad_norm": 2.4744224548339844, + "learning_rate": 6.447743951242591e-06, + "loss": 0.3239, + "step": 19464 + }, + { + "epoch": 25.276264591439688, + "grad_norm": 1.2925540208816528, + "learning_rate": 6.3654534115079936e-06, + "loss": 0.3157, + "step": 19488 + }, + { + "epoch": 25.30739299610895, + "grad_norm": 1.1039607524871826, + "learning_rate": 6.28365566124991e-06, + "loss": 0.3229, + "step": 19512 + }, + { + "epoch": 25.33852140077821, + "grad_norm": 0.8712733387947083, + "learning_rate": 6.202351624256359e-06, + "loss": 0.3181, + "step": 19536 + }, + { + "epoch": 25.36964980544747, + "grad_norm": 1.236718773841858, + "learning_rate": 6.1215422187395345e-06, + "loss": 0.3172, + "step": 19560 + }, + { + "epoch": 25.40077821011673, + "grad_norm": 1.4729557037353516, + "learning_rate": 6.041228357325529e-06, + "loss": 0.3244, + "step": 19584 + }, + { + "epoch": 25.43190661478599, + "grad_norm": 1.1015067100524902, + "learning_rate": 5.961410947043927e-06, + "loss": 0.3227, + "step": 19608 + }, + { + "epoch": 25.46303501945525, + "grad_norm": 1.4798215627670288, + "learning_rate": 5.882090889317671e-06, + "loss": 0.3208, + "step": 19632 + }, + { + "epoch": 25.494163424124515, + "grad_norm": 1.9315009117126465, + "learning_rate": 5.803269079952739e-06, + "loss": 0.3158, + "step": 19656 + }, + { + "epoch": 25.525291828793776, + "grad_norm": 1.1661323308944702, + "learning_rate": 5.724946409128179e-06, + "loss": 0.3194, + "step": 19680 + }, + { + "epoch": 25.556420233463037, + "grad_norm": 1.796525239944458, + "learning_rate": 5.647123761385975e-06, + "loss": 0.3236, + "step": 19704 + }, + { + "epoch": 25.587548638132297, + "grad_norm": 1.251969814300537, + "learning_rate": 5.569802015621039e-06, + "loss": 0.3228, + "step": 19728 + }, + { + "epoch": 25.618677042801558, + "grad_norm": 1.9998018741607666, + "learning_rate": 5.492982045071355e-06, + "loss": 0.3248, + "step": 19752 + }, + { + "epoch": 25.64980544747082, + "grad_norm": 1.0044583082199097, + "learning_rate": 5.4166647173080345e-06, + "loss": 0.3246, + "step": 19776 + }, + { + "epoch": 25.68093385214008, + "grad_norm": 1.0275497436523438, + "learning_rate": 5.340850894225607e-06, + "loss": 0.3253, + "step": 19800 + }, + { + "epoch": 25.71206225680934, + "grad_norm": 1.0156971216201782, + "learning_rate": 5.265541432032212e-06, + "loss": 0.3171, + "step": 19824 + }, + { + "epoch": 25.7431906614786, + "grad_norm": 1.4596341848373413, + "learning_rate": 5.190737181239941e-06, + "loss": 0.3212, + "step": 19848 + }, + { + "epoch": 25.77431906614786, + "grad_norm": 1.2357956171035767, + "learning_rate": 5.116438986655303e-06, + "loss": 0.3268, + "step": 19872 + }, + { + "epoch": 25.80544747081712, + "grad_norm": 1.335877537727356, + "learning_rate": 5.042647687369573e-06, + "loss": 0.3218, + "step": 19896 + }, + { + "epoch": 25.836575875486382, + "grad_norm": 1.5729907751083374, + "learning_rate": 4.969364116749414e-06, + "loss": 0.3205, + "step": 19920 + }, + { + "epoch": 25.867704280155642, + "grad_norm": 1.5255457162857056, + "learning_rate": 4.89658910242739e-06, + "loss": 0.3165, + "step": 19944 + }, + { + "epoch": 25.898832684824903, + "grad_norm": 1.195453405380249, + "learning_rate": 4.8243234662926905e-06, + "loss": 0.323, + "step": 19968 + }, + { + "epoch": 25.929961089494164, + "grad_norm": 1.1830676794052124, + "learning_rate": 4.75256802448178e-06, + "loss": 0.3173, + "step": 19992 + }, + { + "epoch": 25.961089494163424, + "grad_norm": 0.9383173584938049, + "learning_rate": 4.681323587369213e-06, + "loss": 0.3159, + "step": 20016 + }, + { + "epoch": 25.992217898832685, + "grad_norm": 1.3204113245010376, + "learning_rate": 4.610590959558497e-06, + "loss": 0.3217, + "step": 20040 + }, + { + "epoch": 26.023346303501945, + "grad_norm": 1.1940529346466064, + "learning_rate": 4.540370939872974e-06, + "loss": 0.3188, + "step": 20064 + }, + { + "epoch": 26.054474708171206, + "grad_norm": 1.7250840663909912, + "learning_rate": 4.470664321346829e-06, + "loss": 0.3192, + "step": 20088 + }, + { + "epoch": 26.085603112840467, + "grad_norm": 0.9612188339233398, + "learning_rate": 4.401471891216114e-06, + "loss": 0.3183, + "step": 20112 + }, + { + "epoch": 26.116731517509727, + "grad_norm": 1.175308108329773, + "learning_rate": 4.332794430909854e-06, + "loss": 0.3162, + "step": 20136 + }, + { + "epoch": 26.147859922178988, + "grad_norm": 1.3628140687942505, + "learning_rate": 4.264632716041234e-06, + "loss": 0.3173, + "step": 20160 + }, + { + "epoch": 26.17898832684825, + "grad_norm": 0.9504318237304688, + "learning_rate": 4.196987516398831e-06, + "loss": 0.3259, + "step": 20184 + }, + { + "epoch": 26.21011673151751, + "grad_norm": 1.6836086511611938, + "learning_rate": 4.129859595937946e-06, + "loss": 0.3188, + "step": 20208 + }, + { + "epoch": 26.24124513618677, + "grad_norm": 1.2717008590698242, + "learning_rate": 4.063249712771922e-06, + "loss": 0.321, + "step": 20232 + }, + { + "epoch": 26.27237354085603, + "grad_norm": 1.989966869354248, + "learning_rate": 3.997158619163644e-06, + "loss": 0.3215, + "step": 20256 + }, + { + "epoch": 26.30350194552529, + "grad_norm": 1.1739614009857178, + "learning_rate": 3.931587061517011e-06, + "loss": 0.3193, + "step": 20280 + }, + { + "epoch": 26.33463035019455, + "grad_norm": 1.1167713403701782, + "learning_rate": 3.8665357803685025e-06, + "loss": 0.3174, + "step": 20304 + }, + { + "epoch": 26.365758754863812, + "grad_norm": 1.379565715789795, + "learning_rate": 3.8020055103788144e-06, + "loss": 0.3218, + "step": 20328 + }, + { + "epoch": 26.396887159533073, + "grad_norm": 1.4840023517608643, + "learning_rate": 3.7379969803245763e-06, + "loss": 0.3213, + "step": 20352 + }, + { + "epoch": 26.428015564202333, + "grad_norm": 1.1443723440170288, + "learning_rate": 3.6745109130901288e-06, + "loss": 0.3141, + "step": 20376 + }, + { + "epoch": 26.459143968871594, + "grad_norm": 1.090888500213623, + "learning_rate": 3.6115480256593394e-06, + "loss": 0.3212, + "step": 20400 + }, + { + "epoch": 26.490272373540854, + "grad_norm": 1.472679615020752, + "learning_rate": 3.5491090291075004e-06, + "loss": 0.3151, + "step": 20424 + }, + { + "epoch": 26.52140077821012, + "grad_norm": 0.9774566292762756, + "learning_rate": 3.487194628593332e-06, + "loss": 0.3214, + "step": 20448 + }, + { + "epoch": 26.55252918287938, + "grad_norm": 2.1687231063842773, + "learning_rate": 3.4258055233509665e-06, + "loss": 0.324, + "step": 20472 + }, + { + "epoch": 26.58365758754864, + "grad_norm": 1.2352170944213867, + "learning_rate": 3.364942406682109e-06, + "loss": 0.3101, + "step": 20496 + }, + { + "epoch": 26.6147859922179, + "grad_norm": 2.996083974838257, + "learning_rate": 3.304605965948149e-06, + "loss": 0.3141, + "step": 20520 + }, + { + "epoch": 26.64591439688716, + "grad_norm": 1.5926743745803833, + "learning_rate": 3.244796882562462e-06, + "loss": 0.3229, + "step": 20544 + }, + { + "epoch": 26.67704280155642, + "grad_norm": 1.1748905181884766, + "learning_rate": 3.1855158319826774e-06, + "loss": 0.3213, + "step": 20568 + }, + { + "epoch": 26.708171206225682, + "grad_norm": 1.1093063354492188, + "learning_rate": 3.126763483703016e-06, + "loss": 0.3178, + "step": 20592 + }, + { + "epoch": 26.739299610894943, + "grad_norm": 1.1090799570083618, + "learning_rate": 3.0685405012468137e-06, + "loss": 0.3198, + "step": 20616 + }, + { + "epoch": 26.770428015564203, + "grad_norm": 1.0905050039291382, + "learning_rate": 3.010847542158951e-06, + "loss": 0.3192, + "step": 20640 + }, + { + "epoch": 26.801556420233464, + "grad_norm": 1.8493279218673706, + "learning_rate": 2.953685257998451e-06, + "loss": 0.3204, + "step": 20664 + }, + { + "epoch": 26.832684824902724, + "grad_norm": 1.2924058437347412, + "learning_rate": 2.8970542943311583e-06, + "loss": 0.3261, + "step": 20688 + }, + { + "epoch": 26.863813229571985, + "grad_norm": 0.9771651029586792, + "learning_rate": 2.8409552907223804e-06, + "loss": 0.3132, + "step": 20712 + }, + { + "epoch": 26.894941634241246, + "grad_norm": 1.0269138813018799, + "learning_rate": 2.785388880729739e-06, + "loss": 0.3199, + "step": 20736 + }, + { + "epoch": 26.926070038910506, + "grad_norm": 1.309114933013916, + "learning_rate": 2.7303556918959305e-06, + "loss": 0.3145, + "step": 20760 + }, + { + "epoch": 26.957198443579767, + "grad_norm": 1.0709702968597412, + "learning_rate": 2.6758563457417286e-06, + "loss": 0.3192, + "step": 20784 + }, + { + "epoch": 26.988326848249027, + "grad_norm": 1.4049859046936035, + "learning_rate": 2.621891457758896e-06, + "loss": 0.3206, + "step": 20808 + }, + { + "epoch": 27.019455252918288, + "grad_norm": 1.3224713802337646, + "learning_rate": 2.568461637403252e-06, + "loss": 0.312, + "step": 20832 + }, + { + "epoch": 27.05058365758755, + "grad_norm": 1.3082164525985718, + "learning_rate": 2.5155674880878334e-06, + "loss": 0.3108, + "step": 20856 + }, + { + "epoch": 27.08171206225681, + "grad_norm": 0.991944432258606, + "learning_rate": 2.4632096071759925e-06, + "loss": 0.3188, + "step": 20880 + }, + { + "epoch": 27.11284046692607, + "grad_norm": 1.2203731536865234, + "learning_rate": 2.4113885859747497e-06, + "loss": 0.3108, + "step": 20904 + }, + { + "epoch": 27.14396887159533, + "grad_norm": 1.203995704650879, + "learning_rate": 2.360105009728025e-06, + "loss": 0.3102, + "step": 20928 + }, + { + "epoch": 27.17509727626459, + "grad_norm": 1.6264797449111938, + "learning_rate": 2.3093594576101107e-06, + "loss": 0.3174, + "step": 20952 + }, + { + "epoch": 27.20622568093385, + "grad_norm": 1.3530755043029785, + "learning_rate": 2.2591525027190473e-06, + "loss": 0.3252, + "step": 20976 + }, + { + "epoch": 27.237354085603112, + "grad_norm": 2.048307418823242, + "learning_rate": 2.20948471207022e-06, + "loss": 0.3184, + "step": 21000 + }, + { + "epoch": 27.268482490272373, + "grad_norm": 1.320873737335205, + "learning_rate": 2.160356646589934e-06, + "loss": 0.3191, + "step": 21024 + }, + { + "epoch": 27.299610894941633, + "grad_norm": 1.1831213235855103, + "learning_rate": 2.111768861109048e-06, + "loss": 0.3183, + "step": 21048 + }, + { + "epoch": 27.330739299610894, + "grad_norm": 1.0811506509780884, + "learning_rate": 2.0637219043567636e-06, + "loss": 0.3177, + "step": 21072 + }, + { + "epoch": 27.361867704280154, + "grad_norm": 1.1472513675689697, + "learning_rate": 2.0162163189543838e-06, + "loss": 0.3171, + "step": 21096 + }, + { + "epoch": 27.392996108949415, + "grad_norm": 1.6906425952911377, + "learning_rate": 1.9692526414092084e-06, + "loss": 0.3223, + "step": 21120 + }, + { + "epoch": 27.424124513618676, + "grad_norm": 1.600865364074707, + "learning_rate": 1.9228314021084548e-06, + "loss": 0.3151, + "step": 21144 + }, + { + "epoch": 27.455252918287936, + "grad_norm": 1.7052664756774902, + "learning_rate": 1.8769531253132854e-06, + "loss": 0.3172, + "step": 21168 + }, + { + "epoch": 27.486381322957197, + "grad_norm": 1.2754665613174438, + "learning_rate": 1.83161832915289e-06, + "loss": 0.3181, + "step": 21192 + }, + { + "epoch": 27.51750972762646, + "grad_norm": 0.9670736193656921, + "learning_rate": 1.7868275256186174e-06, + "loss": 0.3209, + "step": 21216 + }, + { + "epoch": 27.54863813229572, + "grad_norm": 1.7570668458938599, + "learning_rate": 1.7425812205582147e-06, + "loss": 0.3151, + "step": 21240 + }, + { + "epoch": 27.579766536964982, + "grad_norm": 1.1468702554702759, + "learning_rate": 1.6988799136700706e-06, + "loss": 0.32, + "step": 21264 + }, + { + "epoch": 27.610894941634243, + "grad_norm": 1.837241768836975, + "learning_rate": 1.6557240984976408e-06, + "loss": 0.3176, + "step": 21288 + }, + { + "epoch": 27.642023346303503, + "grad_norm": 1.050024151802063, + "learning_rate": 1.613114262423815e-06, + "loss": 0.3169, + "step": 21312 + }, + { + "epoch": 27.673151750972764, + "grad_norm": 1.0731110572814941, + "learning_rate": 1.5710508866654261e-06, + "loss": 0.3204, + "step": 21336 + }, + { + "epoch": 27.704280155642024, + "grad_norm": 1.2539221048355103, + "learning_rate": 1.5295344462678495e-06, + "loss": 0.3168, + "step": 21360 + }, + { + "epoch": 27.735408560311285, + "grad_norm": 1.4090372323989868, + "learning_rate": 1.488565410099585e-06, + "loss": 0.3164, + "step": 21384 + }, + { + "epoch": 27.766536964980546, + "grad_norm": 1.5965330600738525, + "learning_rate": 1.4481442408470047e-06, + "loss": 0.3216, + "step": 21408 + }, + { + "epoch": 27.797665369649806, + "grad_norm": 1.1138761043548584, + "learning_rate": 1.4082713950091198e-06, + "loss": 0.3206, + "step": 21432 + }, + { + "epoch": 27.828793774319067, + "grad_norm": 1.1677641868591309, + "learning_rate": 1.3689473228923944e-06, + "loss": 0.3241, + "step": 21456 + }, + { + "epoch": 27.859922178988327, + "grad_norm": 2.1310067176818848, + "learning_rate": 1.3301724686056894e-06, + "loss": 0.3187, + "step": 21480 + }, + { + "epoch": 27.891050583657588, + "grad_norm": 1.3181018829345703, + "learning_rate": 1.2919472700552382e-06, + "loss": 0.3164, + "step": 21504 + }, + { + "epoch": 27.92217898832685, + "grad_norm": 1.476120114326477, + "learning_rate": 1.2542721589397234e-06, + "loss": 0.3184, + "step": 21528 + }, + { + "epoch": 27.95330739299611, + "grad_norm": 1.1621023416519165, + "learning_rate": 1.217147560745352e-06, + "loss": 0.319, + "step": 21552 + }, + { + "epoch": 27.98443579766537, + "grad_norm": 1.1426842212677002, + "learning_rate": 1.1805738947410938e-06, + "loss": 0.3155, + "step": 21576 + }, + { + "epoch": 28.01556420233463, + "grad_norm": 2.4093399047851562, + "learning_rate": 1.1445515739739399e-06, + "loss": 0.3135, + "step": 21600 + }, + { + "epoch": 28.04669260700389, + "grad_norm": 1.5340672731399536, + "learning_rate": 1.1090810052642064e-06, + "loss": 0.3181, + "step": 21624 + }, + { + "epoch": 28.07782101167315, + "grad_norm": 1.0847253799438477, + "learning_rate": 1.0741625892009833e-06, + "loss": 0.3165, + "step": 21648 + }, + { + "epoch": 28.108949416342412, + "grad_norm": 1.3261409997940063, + "learning_rate": 1.0397967201375814e-06, + "loss": 0.3204, + "step": 21672 + }, + { + "epoch": 28.140077821011673, + "grad_norm": 1.0757031440734863, + "learning_rate": 1.0059837861870812e-06, + "loss": 0.3187, + "step": 21696 + }, + { + "epoch": 28.171206225680933, + "grad_norm": 1.2534974813461304, + "learning_rate": 9.727241692179756e-07, + "loss": 0.3096, + "step": 21720 + }, + { + "epoch": 28.202334630350194, + "grad_norm": 1.2287142276763916, + "learning_rate": 9.400182448498163e-07, + "loss": 0.3169, + "step": 21744 + }, + { + "epoch": 28.233463035019454, + "grad_norm": 0.9463332891464233, + "learning_rate": 9.078663824490131e-07, + "loss": 0.3185, + "step": 21768 + }, + { + "epoch": 28.264591439688715, + "grad_norm": 2.7430317401885986, + "learning_rate": 8.762689451246198e-07, + "loss": 0.3178, + "step": 21792 + }, + { + "epoch": 28.295719844357976, + "grad_norm": 1.1905908584594727, + "learning_rate": 8.452262897242768e-07, + "loss": 0.3197, + "step": 21816 + }, + { + "epoch": 28.326848249027236, + "grad_norm": 0.894260823726654, + "learning_rate": 8.147387668301421e-07, + "loss": 0.3201, + "step": 21840 + }, + { + "epoch": 28.357976653696497, + "grad_norm": 1.122759222984314, + "learning_rate": 7.848067207549603e-07, + "loss": 0.3102, + "step": 21864 + }, + { + "epoch": 28.389105058365757, + "grad_norm": 1.454839825630188, + "learning_rate": 7.554304895381781e-07, + "loss": 0.3156, + "step": 21888 + }, + { + "epoch": 28.420233463035018, + "grad_norm": 1.348819613456726, + "learning_rate": 7.266104049420797e-07, + "loss": 0.3173, + "step": 21912 + }, + { + "epoch": 28.45136186770428, + "grad_norm": 1.397900104522705, + "learning_rate": 6.983467924480957e-07, + "loss": 0.3206, + "step": 21936 + }, + { + "epoch": 28.48249027237354, + "grad_norm": 2.4935896396636963, + "learning_rate": 6.706399712531009e-07, + "loss": 0.3227, + "step": 21960 + }, + { + "epoch": 28.5136186770428, + "grad_norm": 1.3364354372024536, + "learning_rate": 6.434902542658106e-07, + "loss": 0.3143, + "step": 21984 + }, + { + "epoch": 28.544747081712064, + "grad_norm": 1.0415703058242798, + "learning_rate": 6.168979481032455e-07, + "loss": 0.3204, + "step": 22008 + }, + { + "epoch": 28.575875486381324, + "grad_norm": 1.0268234014511108, + "learning_rate": 5.908633530872732e-07, + "loss": 0.3163, + "step": 22032 + }, + { + "epoch": 28.607003891050585, + "grad_norm": 1.0088456869125366, + "learning_rate": 5.653867632412269e-07, + "loss": 0.3118, + "step": 22056 + }, + { + "epoch": 28.638132295719846, + "grad_norm": 1.52815842628479, + "learning_rate": 5.404684662865589e-07, + "loss": 0.3166, + "step": 22080 + }, + { + "epoch": 28.669260700389106, + "grad_norm": 1.0740587711334229, + "learning_rate": 5.161087436396095e-07, + "loss": 0.3157, + "step": 22104 + }, + { + "epoch": 28.700389105058367, + "grad_norm": 1.263934850692749, + "learning_rate": 4.923078704084372e-07, + "loss": 0.3169, + "step": 22128 + }, + { + "epoch": 28.731517509727627, + "grad_norm": 1.1837375164031982, + "learning_rate": 4.690661153896825e-07, + "loss": 0.3177, + "step": 22152 + }, + { + "epoch": 28.762645914396888, + "grad_norm": 1.1407973766326904, + "learning_rate": 4.463837410655536e-07, + "loss": 0.3161, + "step": 22176 + }, + { + "epoch": 28.79377431906615, + "grad_norm": 1.019492268562317, + "learning_rate": 4.242610036008676e-07, + "loss": 0.3135, + "step": 22200 + }, + { + "epoch": 28.82490272373541, + "grad_norm": 1.7875498533248901, + "learning_rate": 4.026981528401419e-07, + "loss": 0.3213, + "step": 22224 + }, + { + "epoch": 28.85603112840467, + "grad_norm": 0.9684593677520752, + "learning_rate": 3.8169543230477387e-07, + "loss": 0.3151, + "step": 22248 + }, + { + "epoch": 28.88715953307393, + "grad_norm": 1.086421012878418, + "learning_rate": 3.612530791903046e-07, + "loss": 0.3172, + "step": 22272 + }, + { + "epoch": 28.91828793774319, + "grad_norm": 1.9420697689056396, + "learning_rate": 3.4137132436372064e-07, + "loss": 0.3181, + "step": 22296 + }, + { + "epoch": 28.94941634241245, + "grad_norm": 1.217786192893982, + "learning_rate": 3.2205039236086197e-07, + "loss": 0.3151, + "step": 22320 + }, + { + "epoch": 28.980544747081712, + "grad_norm": 1.1275442838668823, + "learning_rate": 3.0329050138388494e-07, + "loss": 0.3193, + "step": 22344 + }, + { + "epoch": 29.011673151750973, + "grad_norm": 0.9701781272888184, + "learning_rate": 2.850918632987809e-07, + "loss": 0.316, + "step": 22368 + }, + { + "epoch": 29.042801556420233, + "grad_norm": 1.0859931707382202, + "learning_rate": 2.674546836330172e-07, + "loss": 0.3169, + "step": 22392 + }, + { + "epoch": 29.073929961089494, + "grad_norm": 0.9976264834403992, + "learning_rate": 2.503791615731721e-07, + "loss": 0.3172, + "step": 22416 + }, + { + "epoch": 29.105058365758754, + "grad_norm": 2.1112818717956543, + "learning_rate": 2.3386548996272572e-07, + "loss": 0.3202, + "step": 22440 + }, + { + "epoch": 29.136186770428015, + "grad_norm": 1.3070718050003052, + "learning_rate": 2.1791385529986163e-07, + "loss": 0.3163, + "step": 22464 + }, + { + "epoch": 29.167315175097276, + "grad_norm": 1.5637389421463013, + "learning_rate": 2.02524437735363e-07, + "loss": 0.3183, + "step": 22488 + }, + { + "epoch": 29.198443579766536, + "grad_norm": 1.19569730758667, + "learning_rate": 1.876974110705698e-07, + "loss": 0.3176, + "step": 22512 + }, + { + "epoch": 29.229571984435797, + "grad_norm": 2.7948904037475586, + "learning_rate": 1.7343294275543599e-07, + "loss": 0.3181, + "step": 22536 + }, + { + "epoch": 29.260700389105057, + "grad_norm": 2.1853528022766113, + "learning_rate": 1.597311938866308e-07, + "loss": 0.3144, + "step": 22560 + }, + { + "epoch": 29.291828793774318, + "grad_norm": 1.4694305658340454, + "learning_rate": 1.4659231920571282e-07, + "loss": 0.318, + "step": 22584 + }, + { + "epoch": 29.32295719844358, + "grad_norm": 1.037607192993164, + "learning_rate": 1.3401646709736983e-07, + "loss": 0.3142, + "step": 22608 + }, + { + "epoch": 29.35408560311284, + "grad_norm": 0.9353266358375549, + "learning_rate": 1.2200377958778708e-07, + "loss": 0.3133, + "step": 22632 + }, + { + "epoch": 29.3852140077821, + "grad_norm": 1.4458966255187988, + "learning_rate": 1.1055439234299858e-07, + "loss": 0.3164, + "step": 22656 + }, + { + "epoch": 29.41634241245136, + "grad_norm": 0.9110085368156433, + "learning_rate": 9.966843466736597e-08, + "loss": 0.3157, + "step": 22680 + }, + { + "epoch": 29.44747081712062, + "grad_norm": 1.0257847309112549, + "learning_rate": 8.934602950213533e-08, + "loss": 0.319, + "step": 22704 + }, + { + "epoch": 29.47859922178988, + "grad_norm": 1.2331140041351318, + "learning_rate": 7.958729342403826e-08, + "loss": 0.3177, + "step": 22728 + }, + { + "epoch": 29.509727626459146, + "grad_norm": 2.199601650238037, + "learning_rate": 7.039233664396516e-08, + "loss": 0.3164, + "step": 22752 + }, + { + "epoch": 29.540856031128406, + "grad_norm": 1.1412527561187744, + "learning_rate": 6.176126300573848e-08, + "loss": 0.3127, + "step": 22776 + }, + { + "epoch": 29.571984435797667, + "grad_norm": 1.556688904762268, + "learning_rate": 5.369416998492471e-08, + "loss": 0.3181, + "step": 22800 + }, + { + "epoch": 29.603112840466927, + "grad_norm": 1.2471084594726562, + "learning_rate": 4.619114868774643e-08, + "loss": 0.3152, + "step": 22824 + }, + { + "epoch": 29.634241245136188, + "grad_norm": 1.3103766441345215, + "learning_rate": 3.92522838500331e-08, + "loss": 0.3171, + "step": 22848 + }, + { + "epoch": 29.66536964980545, + "grad_norm": 1.0881154537200928, + "learning_rate": 3.2877653836299594e-08, + "loss": 0.3162, + "step": 22872 + }, + { + "epoch": 29.69649805447471, + "grad_norm": 0.981332004070282, + "learning_rate": 2.7067330638824718e-08, + "loss": 0.3152, + "step": 22896 + }, + { + "epoch": 29.72762645914397, + "grad_norm": 2.1748950481414795, + "learning_rate": 2.1821379876851845e-08, + "loss": 0.3138, + "step": 22920 + }, + { + "epoch": 29.75875486381323, + "grad_norm": 1.0983901023864746, + "learning_rate": 1.7139860795861717e-08, + "loss": 0.3194, + "step": 22944 + }, + { + "epoch": 29.78988326848249, + "grad_norm": 0.9180955290794373, + "learning_rate": 1.3022826266873012e-08, + "loss": 0.3155, + "step": 22968 + }, + { + "epoch": 29.82101167315175, + "grad_norm": 4.426241397857666, + "learning_rate": 9.470322785881668e-09, + "loss": 0.3176, + "step": 22992 + }, + { + "epoch": 29.852140077821012, + "grad_norm": 1.521730661392212, + "learning_rate": 6.482390473294686e-09, + "loss": 0.3179, + "step": 23016 + }, + { + "epoch": 29.883268482490273, + "grad_norm": 1.1130119562149048, + "learning_rate": 4.059063073524882e-09, + "loss": 0.3199, + "step": 23040 + }, + { + "epoch": 29.914396887159533, + "grad_norm": 1.0622695684432983, + "learning_rate": 2.2003679545690158e-09, + "loss": 0.3167, + "step": 23064 + }, + { + "epoch": 29.945525291828794, + "grad_norm": 1.495850920677185, + "learning_rate": 9.063261077080221e-10, + "loss": 0.3201, + "step": 23088 + }, + { + "epoch": 29.976653696498055, + "grad_norm": 1.2298061847686768, + "learning_rate": 1.7695214729607224e-10, + "loss": 0.3134, + "step": 23112 + }, + { + "epoch": 30.0, + "step": 23130, + "total_flos": 9.11148472281858e+17, + "train_loss": 0.3991138265909079, + "train_runtime": 54856.7027, + "train_samples_per_second": 107.912, + "train_steps_per_second": 0.422 + } + ], + "logging_steps": 24, + "max_steps": 23130, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 1157, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.11148472281858e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}