diff --git "a/checkpoint-16000/trainer_state.json" "b/checkpoint-16000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-16000/trainer_state.json" @@ -0,0 +1,112033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8379185572339325, + "eval_steps": 500, + "global_step": 16000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00011486990982712079, + "grad_norm": 1.0950555801391602, + "learning_rate": 0.0001, + "loss": 2.0384, + "step": 1 + }, + { + "epoch": 0.00022973981965424158, + "grad_norm": 0.6104637384414673, + "learning_rate": 0.0001, + "loss": 2.0026, + "step": 2 + }, + { + "epoch": 0.00034460972948136237, + "grad_norm": 1.6990306377410889, + "learning_rate": 0.0001, + "loss": 2.2147, + "step": 3 + }, + { + "epoch": 0.00045947963930848316, + "grad_norm": 0.5962863564491272, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 4 + }, + { + "epoch": 0.0005743495491356039, + "grad_norm": 0.5962501764297485, + "learning_rate": 0.0001, + "loss": 1.9908, + "step": 5 + }, + { + "epoch": 0.0006892194589627247, + "grad_norm": 0.46677908301353455, + "learning_rate": 0.0001, + "loss": 1.8843, + "step": 6 + }, + { + "epoch": 0.0008040893687898455, + "grad_norm": 0.5024643540382385, + "learning_rate": 0.0001, + "loss": 2.0395, + "step": 7 + }, + { + "epoch": 0.0009189592786169663, + "grad_norm": 0.3814510703086853, + "learning_rate": 0.0001, + "loss": 1.7637, + "step": 8 + }, + { + "epoch": 0.0010338291884440872, + "grad_norm": 0.4189926087856293, + "learning_rate": 0.0001, + "loss": 1.9592, + "step": 9 + }, + { + "epoch": 0.0011486990982712078, + "grad_norm": 0.44599318504333496, + "learning_rate": 0.0001, + "loss": 1.7664, + "step": 10 + }, + { + "epoch": 0.0012635690080983286, + "grad_norm": 0.3737109899520874, + "learning_rate": 0.0001, + "loss": 1.7794, + "step": 11 + }, + { + "epoch": 0.0013784389179254495, + "grad_norm": 0.39896202087402344, + "learning_rate": 0.0001, + "loss": 1.8159, + "step": 12 + }, + { + "epoch": 0.0014933088277525703, + "grad_norm": 0.3495546281337738, + "learning_rate": 0.0001, + "loss": 1.739, + "step": 13 + }, + { + "epoch": 0.001608178737579691, + "grad_norm": 0.4069090187549591, + "learning_rate": 0.0001, + "loss": 1.9259, + "step": 14 + }, + { + "epoch": 0.0017230486474068118, + "grad_norm": 0.4087446331977844, + "learning_rate": 0.0001, + "loss": 1.8746, + "step": 15 + }, + { + "epoch": 0.0018379185572339326, + "grad_norm": 0.33482518792152405, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 16 + }, + { + "epoch": 0.0019527884670610533, + "grad_norm": 0.37447136640548706, + "learning_rate": 0.0001, + "loss": 2.0974, + "step": 17 + }, + { + "epoch": 0.0020676583768881743, + "grad_norm": 0.359221875667572, + "learning_rate": 0.0001, + "loss": 1.7629, + "step": 18 + }, + { + "epoch": 0.0021825282867152947, + "grad_norm": 0.38664090633392334, + "learning_rate": 0.0001, + "loss": 1.9719, + "step": 19 + }, + { + "epoch": 0.0022973981965424156, + "grad_norm": 0.326967716217041, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 20 + }, + { + "epoch": 0.0024122681063695364, + "grad_norm": 0.3577702045440674, + "learning_rate": 0.0001, + "loss": 1.9068, + "step": 21 + }, + { + "epoch": 0.0025271380161966573, + "grad_norm": 0.3471507728099823, + "learning_rate": 0.0001, + "loss": 1.8111, + "step": 22 + }, + { + "epoch": 0.002642007926023778, + "grad_norm": 0.3563218414783478, + "learning_rate": 0.0001, + "loss": 1.862, + "step": 23 + }, + { + "epoch": 0.002756877835850899, + "grad_norm": 0.38289228081703186, + "learning_rate": 0.0001, + "loss": 1.9097, + "step": 24 + }, + { + "epoch": 0.0028717477456780198, + "grad_norm": 0.37357401847839355, + "learning_rate": 0.0001, + "loss": 1.9897, + "step": 25 + }, + { + "epoch": 0.0029866176555051406, + "grad_norm": 0.3501838445663452, + "learning_rate": 0.0001, + "loss": 1.9073, + "step": 26 + }, + { + "epoch": 0.003101487565332261, + "grad_norm": 0.3973883390426636, + "learning_rate": 0.0001, + "loss": 1.961, + "step": 27 + }, + { + "epoch": 0.003216357475159382, + "grad_norm": 0.44121477007865906, + "learning_rate": 0.0001, + "loss": 1.9053, + "step": 28 + }, + { + "epoch": 0.0033312273849865027, + "grad_norm": 0.35977938771247864, + "learning_rate": 0.0001, + "loss": 1.8474, + "step": 29 + }, + { + "epoch": 0.0034460972948136236, + "grad_norm": 0.3839881420135498, + "learning_rate": 0.0001, + "loss": 1.9948, + "step": 30 + }, + { + "epoch": 0.0035609672046407444, + "grad_norm": 0.3463224768638611, + "learning_rate": 0.0001, + "loss": 1.7018, + "step": 31 + }, + { + "epoch": 0.0036758371144678652, + "grad_norm": 0.3363831639289856, + "learning_rate": 0.0001, + "loss": 1.7479, + "step": 32 + }, + { + "epoch": 0.003790707024294986, + "grad_norm": 0.3735474944114685, + "learning_rate": 0.0001, + "loss": 1.7217, + "step": 33 + }, + { + "epoch": 0.0039055769341221065, + "grad_norm": 0.3522309958934784, + "learning_rate": 0.0001, + "loss": 1.9002, + "step": 34 + }, + { + "epoch": 0.004020446843949228, + "grad_norm": 0.35659080743789673, + "learning_rate": 0.0001, + "loss": 1.9188, + "step": 35 + }, + { + "epoch": 0.004135316753776349, + "grad_norm": 0.3676978647708893, + "learning_rate": 0.0001, + "loss": 2.0147, + "step": 36 + }, + { + "epoch": 0.0042501866636034695, + "grad_norm": 0.3312574326992035, + "learning_rate": 0.0001, + "loss": 1.6601, + "step": 37 + }, + { + "epoch": 0.0043650565734305894, + "grad_norm": 0.31456542015075684, + "learning_rate": 0.0001, + "loss": 1.7814, + "step": 38 + }, + { + "epoch": 0.00447992648325771, + "grad_norm": 0.34201157093048096, + "learning_rate": 0.0001, + "loss": 1.8276, + "step": 39 + }, + { + "epoch": 0.004594796393084831, + "grad_norm": 0.3848654329776764, + "learning_rate": 0.0001, + "loss": 1.9418, + "step": 40 + }, + { + "epoch": 0.004709666302911952, + "grad_norm": 0.333026260137558, + "learning_rate": 0.0001, + "loss": 1.8563, + "step": 41 + }, + { + "epoch": 0.004824536212739073, + "grad_norm": 0.3494780659675598, + "learning_rate": 0.0001, + "loss": 1.8389, + "step": 42 + }, + { + "epoch": 0.004939406122566194, + "grad_norm": 0.3298174738883972, + "learning_rate": 0.0001, + "loss": 1.8113, + "step": 43 + }, + { + "epoch": 0.0050542760323933145, + "grad_norm": 0.33735814690589905, + "learning_rate": 0.0001, + "loss": 1.7348, + "step": 44 + }, + { + "epoch": 0.005169145942220435, + "grad_norm": 0.3232858180999756, + "learning_rate": 0.0001, + "loss": 1.7709, + "step": 45 + }, + { + "epoch": 0.005284015852047556, + "grad_norm": 0.34412050247192383, + "learning_rate": 0.0001, + "loss": 1.8392, + "step": 46 + }, + { + "epoch": 0.005398885761874677, + "grad_norm": 0.36024415493011475, + "learning_rate": 0.0001, + "loss": 1.8496, + "step": 47 + }, + { + "epoch": 0.005513755671701798, + "grad_norm": 0.36573317646980286, + "learning_rate": 0.0001, + "loss": 2.0788, + "step": 48 + }, + { + "epoch": 0.005628625581528919, + "grad_norm": 0.36239179968833923, + "learning_rate": 0.0001, + "loss": 1.8717, + "step": 49 + }, + { + "epoch": 0.0057434954913560396, + "grad_norm": 0.35284945368766785, + "learning_rate": 0.0001, + "loss": 2.0221, + "step": 50 + }, + { + "epoch": 0.00585836540118316, + "grad_norm": 0.33570870757102966, + "learning_rate": 0.0001, + "loss": 1.9219, + "step": 51 + }, + { + "epoch": 0.005973235311010281, + "grad_norm": 0.36989328265190125, + "learning_rate": 0.0001, + "loss": 1.848, + "step": 52 + }, + { + "epoch": 0.006088105220837401, + "grad_norm": 0.3059888482093811, + "learning_rate": 0.0001, + "loss": 1.7479, + "step": 53 + }, + { + "epoch": 0.006202975130664522, + "grad_norm": 0.3483046293258667, + "learning_rate": 0.0001, + "loss": 1.9025, + "step": 54 + }, + { + "epoch": 0.006317845040491643, + "grad_norm": 0.33112019300460815, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 55 + }, + { + "epoch": 0.006432714950318764, + "grad_norm": 0.3339669406414032, + "learning_rate": 0.0001, + "loss": 1.6784, + "step": 56 + }, + { + "epoch": 0.006547584860145885, + "grad_norm": 0.3434010446071625, + "learning_rate": 0.0001, + "loss": 1.9298, + "step": 57 + }, + { + "epoch": 0.0066624547699730054, + "grad_norm": 0.3272426724433899, + "learning_rate": 0.0001, + "loss": 1.7955, + "step": 58 + }, + { + "epoch": 0.006777324679800126, + "grad_norm": 0.3431703746318817, + "learning_rate": 0.0001, + "loss": 1.835, + "step": 59 + }, + { + "epoch": 0.006892194589627247, + "grad_norm": 0.36123043298721313, + "learning_rate": 0.0001, + "loss": 1.8674, + "step": 60 + }, + { + "epoch": 0.007007064499454368, + "grad_norm": 0.3534761071205139, + "learning_rate": 0.0001, + "loss": 1.8935, + "step": 61 + }, + { + "epoch": 0.007121934409281489, + "grad_norm": 0.3585789203643799, + "learning_rate": 0.0001, + "loss": 1.8238, + "step": 62 + }, + { + "epoch": 0.00723680431910861, + "grad_norm": 0.3434857726097107, + "learning_rate": 0.0001, + "loss": 1.7009, + "step": 63 + }, + { + "epoch": 0.0073516742289357305, + "grad_norm": 0.3498717248439789, + "learning_rate": 0.0001, + "loss": 1.854, + "step": 64 + }, + { + "epoch": 0.007466544138762851, + "grad_norm": 0.37057748436927795, + "learning_rate": 0.0001, + "loss": 1.9863, + "step": 65 + }, + { + "epoch": 0.007581414048589972, + "grad_norm": 0.36728590726852417, + "learning_rate": 0.0001, + "loss": 1.8498, + "step": 66 + }, + { + "epoch": 0.007696283958417093, + "grad_norm": 0.34580233693122864, + "learning_rate": 0.0001, + "loss": 2.0121, + "step": 67 + }, + { + "epoch": 0.007811153868244213, + "grad_norm": 0.3278733491897583, + "learning_rate": 0.0001, + "loss": 1.8673, + "step": 68 + }, + { + "epoch": 0.007926023778071334, + "grad_norm": 0.33521461486816406, + "learning_rate": 0.0001, + "loss": 1.7693, + "step": 69 + }, + { + "epoch": 0.008040893687898456, + "grad_norm": 0.34356561303138733, + "learning_rate": 0.0001, + "loss": 1.9444, + "step": 70 + }, + { + "epoch": 0.008155763597725576, + "grad_norm": 0.38897210359573364, + "learning_rate": 0.0001, + "loss": 1.9214, + "step": 71 + }, + { + "epoch": 0.008270633507552697, + "grad_norm": 0.34927767515182495, + "learning_rate": 0.0001, + "loss": 1.8995, + "step": 72 + }, + { + "epoch": 0.008385503417379817, + "grad_norm": 0.3214438259601593, + "learning_rate": 0.0001, + "loss": 1.6437, + "step": 73 + }, + { + "epoch": 0.008500373327206939, + "grad_norm": 0.32360997796058655, + "learning_rate": 0.0001, + "loss": 1.7942, + "step": 74 + }, + { + "epoch": 0.008615243237034059, + "grad_norm": 0.34416717290878296, + "learning_rate": 0.0001, + "loss": 1.8455, + "step": 75 + }, + { + "epoch": 0.008730113146861179, + "grad_norm": 0.3457936644554138, + "learning_rate": 0.0001, + "loss": 1.9531, + "step": 76 + }, + { + "epoch": 0.0088449830566883, + "grad_norm": 0.3502795398235321, + "learning_rate": 0.0001, + "loss": 1.7687, + "step": 77 + }, + { + "epoch": 0.00895985296651542, + "grad_norm": 0.348290354013443, + "learning_rate": 0.0001, + "loss": 1.7663, + "step": 78 + }, + { + "epoch": 0.009074722876342542, + "grad_norm": 0.32894769310951233, + "learning_rate": 0.0001, + "loss": 1.7323, + "step": 79 + }, + { + "epoch": 0.009189592786169662, + "grad_norm": 0.32729727029800415, + "learning_rate": 0.0001, + "loss": 1.7761, + "step": 80 + }, + { + "epoch": 0.009304462695996784, + "grad_norm": 0.3347381055355072, + "learning_rate": 0.0001, + "loss": 1.9055, + "step": 81 + }, + { + "epoch": 0.009419332605823904, + "grad_norm": 0.35440585017204285, + "learning_rate": 0.0001, + "loss": 1.9095, + "step": 82 + }, + { + "epoch": 0.009534202515651026, + "grad_norm": 0.33549365401268005, + "learning_rate": 0.0001, + "loss": 1.7806, + "step": 83 + }, + { + "epoch": 0.009649072425478146, + "grad_norm": 0.37921905517578125, + "learning_rate": 0.0001, + "loss": 2.0155, + "step": 84 + }, + { + "epoch": 0.009763942335305267, + "grad_norm": 0.3329186737537384, + "learning_rate": 0.0001, + "loss": 1.8357, + "step": 85 + }, + { + "epoch": 0.009878812245132387, + "grad_norm": 0.36343327164649963, + "learning_rate": 0.0001, + "loss": 1.9286, + "step": 86 + }, + { + "epoch": 0.009993682154959509, + "grad_norm": 0.3510076403617859, + "learning_rate": 0.0001, + "loss": 1.8747, + "step": 87 + }, + { + "epoch": 0.010108552064786629, + "grad_norm": 0.32831454277038574, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 88 + }, + { + "epoch": 0.01022342197461375, + "grad_norm": 0.3502967655658722, + "learning_rate": 0.0001, + "loss": 1.9506, + "step": 89 + }, + { + "epoch": 0.01033829188444087, + "grad_norm": 0.34648793935775757, + "learning_rate": 0.0001, + "loss": 1.8406, + "step": 90 + }, + { + "epoch": 0.01045316179426799, + "grad_norm": 0.35201138257980347, + "learning_rate": 0.0001, + "loss": 1.8329, + "step": 91 + }, + { + "epoch": 0.010568031704095112, + "grad_norm": 0.3722350299358368, + "learning_rate": 0.0001, + "loss": 1.9775, + "step": 92 + }, + { + "epoch": 0.010682901613922232, + "grad_norm": 0.3540225327014923, + "learning_rate": 0.0001, + "loss": 1.8165, + "step": 93 + }, + { + "epoch": 0.010797771523749354, + "grad_norm": 0.33188265562057495, + "learning_rate": 0.0001, + "loss": 1.9455, + "step": 94 + }, + { + "epoch": 0.010912641433576474, + "grad_norm": 0.32253599166870117, + "learning_rate": 0.0001, + "loss": 1.7694, + "step": 95 + }, + { + "epoch": 0.011027511343403596, + "grad_norm": 0.34879428148269653, + "learning_rate": 0.0001, + "loss": 1.9055, + "step": 96 + }, + { + "epoch": 0.011142381253230716, + "grad_norm": 0.3185734450817108, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 97 + }, + { + "epoch": 0.011257251163057837, + "grad_norm": 0.34270620346069336, + "learning_rate": 0.0001, + "loss": 1.8956, + "step": 98 + }, + { + "epoch": 0.011372121072884957, + "grad_norm": 0.3397463858127594, + "learning_rate": 0.0001, + "loss": 1.8395, + "step": 99 + }, + { + "epoch": 0.011486990982712079, + "grad_norm": 0.587956428527832, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 100 + }, + { + "epoch": 0.011601860892539199, + "grad_norm": 0.34749558568000793, + "learning_rate": 0.0001, + "loss": 1.9738, + "step": 101 + }, + { + "epoch": 0.01171673080236632, + "grad_norm": 0.31300491094589233, + "learning_rate": 0.0001, + "loss": 1.8049, + "step": 102 + }, + { + "epoch": 0.01183160071219344, + "grad_norm": 0.3428112864494324, + "learning_rate": 0.0001, + "loss": 1.8066, + "step": 103 + }, + { + "epoch": 0.011946470622020562, + "grad_norm": 0.3374922573566437, + "learning_rate": 0.0001, + "loss": 1.8526, + "step": 104 + }, + { + "epoch": 0.012061340531847682, + "grad_norm": 0.32223325967788696, + "learning_rate": 0.0001, + "loss": 1.7013, + "step": 105 + }, + { + "epoch": 0.012176210441674802, + "grad_norm": 0.3365132808685303, + "learning_rate": 0.0001, + "loss": 1.7933, + "step": 106 + }, + { + "epoch": 0.012291080351501924, + "grad_norm": 0.31686636805534363, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 107 + }, + { + "epoch": 0.012405950261329044, + "grad_norm": 0.3461359441280365, + "learning_rate": 0.0001, + "loss": 1.563, + "step": 108 + }, + { + "epoch": 0.012520820171156166, + "grad_norm": 0.31762605905532837, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 109 + }, + { + "epoch": 0.012635690080983286, + "grad_norm": 0.3150866627693176, + "learning_rate": 0.0001, + "loss": 1.6591, + "step": 110 + }, + { + "epoch": 0.012750559990810408, + "grad_norm": 0.3498469889163971, + "learning_rate": 0.0001, + "loss": 1.8973, + "step": 111 + }, + { + "epoch": 0.012865429900637528, + "grad_norm": 0.3512935936450958, + "learning_rate": 0.0001, + "loss": 1.9445, + "step": 112 + }, + { + "epoch": 0.01298029981046465, + "grad_norm": 0.3678135573863983, + "learning_rate": 0.0001, + "loss": 1.7957, + "step": 113 + }, + { + "epoch": 0.01309516972029177, + "grad_norm": 0.3330201208591461, + "learning_rate": 0.0001, + "loss": 1.7507, + "step": 114 + }, + { + "epoch": 0.013210039630118891, + "grad_norm": 0.33755484223365784, + "learning_rate": 0.0001, + "loss": 1.6139, + "step": 115 + }, + { + "epoch": 0.013324909539946011, + "grad_norm": 0.34242576360702515, + "learning_rate": 0.0001, + "loss": 1.8006, + "step": 116 + }, + { + "epoch": 0.013439779449773133, + "grad_norm": 0.35527321696281433, + "learning_rate": 0.0001, + "loss": 1.8855, + "step": 117 + }, + { + "epoch": 0.013554649359600253, + "grad_norm": 0.3338663578033447, + "learning_rate": 0.0001, + "loss": 1.7004, + "step": 118 + }, + { + "epoch": 0.013669519269427374, + "grad_norm": 0.3488336503505707, + "learning_rate": 0.0001, + "loss": 1.7168, + "step": 119 + }, + { + "epoch": 0.013784389179254494, + "grad_norm": 0.3300139605998993, + "learning_rate": 0.0001, + "loss": 1.7849, + "step": 120 + }, + { + "epoch": 0.013899259089081614, + "grad_norm": 0.29988691210746765, + "learning_rate": 0.0001, + "loss": 1.476, + "step": 121 + }, + { + "epoch": 0.014014128998908736, + "grad_norm": 0.3436170220375061, + "learning_rate": 0.0001, + "loss": 1.7939, + "step": 122 + }, + { + "epoch": 0.014128998908735856, + "grad_norm": 0.3263620436191559, + "learning_rate": 0.0001, + "loss": 1.7402, + "step": 123 + }, + { + "epoch": 0.014243868818562978, + "grad_norm": 0.3402661383152008, + "learning_rate": 0.0001, + "loss": 1.7146, + "step": 124 + }, + { + "epoch": 0.014358738728390098, + "grad_norm": 0.3439052700996399, + "learning_rate": 0.0001, + "loss": 1.7521, + "step": 125 + }, + { + "epoch": 0.01447360863821722, + "grad_norm": 0.3327753245830536, + "learning_rate": 0.0001, + "loss": 1.8139, + "step": 126 + }, + { + "epoch": 0.01458847854804434, + "grad_norm": 0.3490423560142517, + "learning_rate": 0.0001, + "loss": 1.9259, + "step": 127 + }, + { + "epoch": 0.014703348457871461, + "grad_norm": 0.33834782242774963, + "learning_rate": 0.0001, + "loss": 1.8205, + "step": 128 + }, + { + "epoch": 0.014818218367698581, + "grad_norm": 0.34202057123184204, + "learning_rate": 0.0001, + "loss": 1.7071, + "step": 129 + }, + { + "epoch": 0.014933088277525703, + "grad_norm": 0.31165608763694763, + "learning_rate": 0.0001, + "loss": 1.7411, + "step": 130 + }, + { + "epoch": 0.015047958187352823, + "grad_norm": 0.3511159420013428, + "learning_rate": 0.0001, + "loss": 1.8655, + "step": 131 + }, + { + "epoch": 0.015162828097179944, + "grad_norm": 0.3437829613685608, + "learning_rate": 0.0001, + "loss": 1.9517, + "step": 132 + }, + { + "epoch": 0.015277698007007064, + "grad_norm": 0.3470558822154999, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 133 + }, + { + "epoch": 0.015392567916834186, + "grad_norm": 0.3070822060108185, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 134 + }, + { + "epoch": 0.015507437826661306, + "grad_norm": 0.3400954306125641, + "learning_rate": 0.0001, + "loss": 1.8433, + "step": 135 + }, + { + "epoch": 0.015622307736488426, + "grad_norm": 0.37735289335250854, + "learning_rate": 0.0001, + "loss": 1.742, + "step": 136 + }, + { + "epoch": 0.015737177646315548, + "grad_norm": 0.33449429273605347, + "learning_rate": 0.0001, + "loss": 1.5548, + "step": 137 + }, + { + "epoch": 0.015852047556142668, + "grad_norm": 0.3531495928764343, + "learning_rate": 0.0001, + "loss": 1.9361, + "step": 138 + }, + { + "epoch": 0.015966917465969788, + "grad_norm": 0.3543091118335724, + "learning_rate": 0.0001, + "loss": 1.707, + "step": 139 + }, + { + "epoch": 0.01608178737579691, + "grad_norm": 0.3521219789981842, + "learning_rate": 0.0001, + "loss": 1.7885, + "step": 140 + }, + { + "epoch": 0.01619665728562403, + "grad_norm": 0.3496558368206024, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 141 + }, + { + "epoch": 0.01631152719545115, + "grad_norm": 0.35304608941078186, + "learning_rate": 0.0001, + "loss": 1.774, + "step": 142 + }, + { + "epoch": 0.01642639710527827, + "grad_norm": 0.3590501546859741, + "learning_rate": 0.0001, + "loss": 1.8743, + "step": 143 + }, + { + "epoch": 0.016541267015105394, + "grad_norm": 0.30004453659057617, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 144 + }, + { + "epoch": 0.016656136924932514, + "grad_norm": 0.3431392014026642, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 145 + }, + { + "epoch": 0.016771006834759634, + "grad_norm": 0.38148149847984314, + "learning_rate": 0.0001, + "loss": 1.9799, + "step": 146 + }, + { + "epoch": 0.016885876744586754, + "grad_norm": 0.35888367891311646, + "learning_rate": 0.0001, + "loss": 2.0782, + "step": 147 + }, + { + "epoch": 0.017000746654413878, + "grad_norm": 0.3419305980205536, + "learning_rate": 0.0001, + "loss": 1.8875, + "step": 148 + }, + { + "epoch": 0.017115616564240998, + "grad_norm": 0.33760684728622437, + "learning_rate": 0.0001, + "loss": 1.7646, + "step": 149 + }, + { + "epoch": 0.017230486474068118, + "grad_norm": 0.37631165981292725, + "learning_rate": 0.0001, + "loss": 1.7915, + "step": 150 + }, + { + "epoch": 0.017345356383895238, + "grad_norm": 0.3630012273788452, + "learning_rate": 0.0001, + "loss": 1.9777, + "step": 151 + }, + { + "epoch": 0.017460226293722358, + "grad_norm": 0.3230808973312378, + "learning_rate": 0.0001, + "loss": 1.73, + "step": 152 + }, + { + "epoch": 0.01757509620354948, + "grad_norm": 0.38511770963668823, + "learning_rate": 0.0001, + "loss": 1.9186, + "step": 153 + }, + { + "epoch": 0.0176899661133766, + "grad_norm": 0.32792919874191284, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 154 + }, + { + "epoch": 0.01780483602320372, + "grad_norm": 0.3158191442489624, + "learning_rate": 0.0001, + "loss": 1.6447, + "step": 155 + }, + { + "epoch": 0.01791970593303084, + "grad_norm": 0.36141133308410645, + "learning_rate": 0.0001, + "loss": 1.8623, + "step": 156 + }, + { + "epoch": 0.018034575842857965, + "grad_norm": 0.3470099866390228, + "learning_rate": 0.0001, + "loss": 1.7607, + "step": 157 + }, + { + "epoch": 0.018149445752685085, + "grad_norm": 0.31708627939224243, + "learning_rate": 0.0001, + "loss": 1.8103, + "step": 158 + }, + { + "epoch": 0.018264315662512205, + "grad_norm": 0.3248085677623749, + "learning_rate": 0.0001, + "loss": 1.8378, + "step": 159 + }, + { + "epoch": 0.018379185572339325, + "grad_norm": 0.32634925842285156, + "learning_rate": 0.0001, + "loss": 1.6819, + "step": 160 + }, + { + "epoch": 0.018494055482166448, + "grad_norm": 0.33397799730300903, + "learning_rate": 0.0001, + "loss": 1.5416, + "step": 161 + }, + { + "epoch": 0.018608925391993568, + "grad_norm": 0.3550772964954376, + "learning_rate": 0.0001, + "loss": 1.9381, + "step": 162 + }, + { + "epoch": 0.018723795301820688, + "grad_norm": 0.3159865438938141, + "learning_rate": 0.0001, + "loss": 1.6508, + "step": 163 + }, + { + "epoch": 0.018838665211647808, + "grad_norm": 0.3258025348186493, + "learning_rate": 0.0001, + "loss": 1.6776, + "step": 164 + }, + { + "epoch": 0.01895353512147493, + "grad_norm": 0.3488035798072815, + "learning_rate": 0.0001, + "loss": 2.0627, + "step": 165 + }, + { + "epoch": 0.01906840503130205, + "grad_norm": 0.3233996033668518, + "learning_rate": 0.0001, + "loss": 1.7189, + "step": 166 + }, + { + "epoch": 0.01918327494112917, + "grad_norm": 0.34753552079200745, + "learning_rate": 0.0001, + "loss": 1.9096, + "step": 167 + }, + { + "epoch": 0.01929814485095629, + "grad_norm": 0.3238770067691803, + "learning_rate": 0.0001, + "loss": 1.6883, + "step": 168 + }, + { + "epoch": 0.01941301476078341, + "grad_norm": 0.3520447611808777, + "learning_rate": 0.0001, + "loss": 1.8742, + "step": 169 + }, + { + "epoch": 0.019527884670610535, + "grad_norm": 0.35114407539367676, + "learning_rate": 0.0001, + "loss": 1.9098, + "step": 170 + }, + { + "epoch": 0.019642754580437655, + "grad_norm": 0.3472774922847748, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 171 + }, + { + "epoch": 0.019757624490264775, + "grad_norm": 0.33552825450897217, + "learning_rate": 0.0001, + "loss": 1.8656, + "step": 172 + }, + { + "epoch": 0.019872494400091895, + "grad_norm": 0.3363969027996063, + "learning_rate": 0.0001, + "loss": 1.8223, + "step": 173 + }, + { + "epoch": 0.019987364309919018, + "grad_norm": 0.3562895655632019, + "learning_rate": 0.0001, + "loss": 2.0164, + "step": 174 + }, + { + "epoch": 0.020102234219746138, + "grad_norm": 0.3177125155925751, + "learning_rate": 0.0001, + "loss": 1.6151, + "step": 175 + }, + { + "epoch": 0.020217104129573258, + "grad_norm": 0.33565396070480347, + "learning_rate": 0.0001, + "loss": 1.8483, + "step": 176 + }, + { + "epoch": 0.020331974039400378, + "grad_norm": 0.3205985128879547, + "learning_rate": 0.0001, + "loss": 1.9118, + "step": 177 + }, + { + "epoch": 0.0204468439492275, + "grad_norm": 0.32828885316848755, + "learning_rate": 0.0001, + "loss": 1.7084, + "step": 178 + }, + { + "epoch": 0.02056171385905462, + "grad_norm": 0.3119982182979584, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 179 + }, + { + "epoch": 0.02067658376888174, + "grad_norm": 0.34377679228782654, + "learning_rate": 0.0001, + "loss": 1.8481, + "step": 180 + }, + { + "epoch": 0.02079145367870886, + "grad_norm": 0.3235953152179718, + "learning_rate": 0.0001, + "loss": 1.8979, + "step": 181 + }, + { + "epoch": 0.02090632358853598, + "grad_norm": 0.35601556301116943, + "learning_rate": 0.0001, + "loss": 1.8469, + "step": 182 + }, + { + "epoch": 0.021021193498363105, + "grad_norm": 0.37358132004737854, + "learning_rate": 0.0001, + "loss": 2.0467, + "step": 183 + }, + { + "epoch": 0.021136063408190225, + "grad_norm": 0.31928551197052, + "learning_rate": 0.0001, + "loss": 1.6332, + "step": 184 + }, + { + "epoch": 0.021250933318017345, + "grad_norm": 0.33927685022354126, + "learning_rate": 0.0001, + "loss": 1.8849, + "step": 185 + }, + { + "epoch": 0.021365803227844465, + "grad_norm": 0.3599051237106323, + "learning_rate": 0.0001, + "loss": 1.9486, + "step": 186 + }, + { + "epoch": 0.021480673137671588, + "grad_norm": 0.32173585891723633, + "learning_rate": 0.0001, + "loss": 1.7674, + "step": 187 + }, + { + "epoch": 0.021595543047498708, + "grad_norm": 0.3629266023635864, + "learning_rate": 0.0001, + "loss": 1.7596, + "step": 188 + }, + { + "epoch": 0.021710412957325828, + "grad_norm": 0.31827229261398315, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 189 + }, + { + "epoch": 0.021825282867152948, + "grad_norm": 0.3418801426887512, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 190 + }, + { + "epoch": 0.02194015277698007, + "grad_norm": 0.3373776376247406, + "learning_rate": 0.0001, + "loss": 1.7593, + "step": 191 + }, + { + "epoch": 0.02205502268680719, + "grad_norm": 0.32532092928886414, + "learning_rate": 0.0001, + "loss": 1.8815, + "step": 192 + }, + { + "epoch": 0.02216989259663431, + "grad_norm": 0.3122539520263672, + "learning_rate": 0.0001, + "loss": 1.7756, + "step": 193 + }, + { + "epoch": 0.02228476250646143, + "grad_norm": 0.3185436725616455, + "learning_rate": 0.0001, + "loss": 1.6333, + "step": 194 + }, + { + "epoch": 0.022399632416288555, + "grad_norm": 0.3428870439529419, + "learning_rate": 0.0001, + "loss": 1.7126, + "step": 195 + }, + { + "epoch": 0.022514502326115675, + "grad_norm": 0.345682293176651, + "learning_rate": 0.0001, + "loss": 1.7324, + "step": 196 + }, + { + "epoch": 0.022629372235942795, + "grad_norm": 0.33158332109451294, + "learning_rate": 0.0001, + "loss": 1.78, + "step": 197 + }, + { + "epoch": 0.022744242145769915, + "grad_norm": 0.3336940109729767, + "learning_rate": 0.0001, + "loss": 1.7551, + "step": 198 + }, + { + "epoch": 0.022859112055597035, + "grad_norm": 0.3249858319759369, + "learning_rate": 0.0001, + "loss": 1.667, + "step": 199 + }, + { + "epoch": 0.022973981965424158, + "grad_norm": 0.3251815736293793, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 200 + }, + { + "epoch": 0.023088851875251278, + "grad_norm": 0.3444681763648987, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 201 + }, + { + "epoch": 0.023203721785078398, + "grad_norm": 0.33435139060020447, + "learning_rate": 0.0001, + "loss": 1.7646, + "step": 202 + }, + { + "epoch": 0.023318591694905518, + "grad_norm": 0.33999738097190857, + "learning_rate": 0.0001, + "loss": 1.8351, + "step": 203 + }, + { + "epoch": 0.02343346160473264, + "grad_norm": 0.33522528409957886, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 204 + }, + { + "epoch": 0.02354833151455976, + "grad_norm": 0.343474805355072, + "learning_rate": 0.0001, + "loss": 1.7712, + "step": 205 + }, + { + "epoch": 0.02366320142438688, + "grad_norm": 0.35150644183158875, + "learning_rate": 0.0001, + "loss": 1.9298, + "step": 206 + }, + { + "epoch": 0.023778071334214, + "grad_norm": 0.3332938253879547, + "learning_rate": 0.0001, + "loss": 1.6533, + "step": 207 + }, + { + "epoch": 0.023892941244041125, + "grad_norm": 0.34478330612182617, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 208 + }, + { + "epoch": 0.024007811153868245, + "grad_norm": 0.36427655816078186, + "learning_rate": 0.0001, + "loss": 1.7768, + "step": 209 + }, + { + "epoch": 0.024122681063695365, + "grad_norm": 0.33923599123954773, + "learning_rate": 0.0001, + "loss": 1.8095, + "step": 210 + }, + { + "epoch": 0.024237550973522485, + "grad_norm": 0.342052161693573, + "learning_rate": 0.0001, + "loss": 1.6883, + "step": 211 + }, + { + "epoch": 0.024352420883349605, + "grad_norm": 0.3708147704601288, + "learning_rate": 0.0001, + "loss": 1.9331, + "step": 212 + }, + { + "epoch": 0.02446729079317673, + "grad_norm": 0.3495383858680725, + "learning_rate": 0.0001, + "loss": 1.8306, + "step": 213 + }, + { + "epoch": 0.02458216070300385, + "grad_norm": 0.34303730726242065, + "learning_rate": 0.0001, + "loss": 1.848, + "step": 214 + }, + { + "epoch": 0.02469703061283097, + "grad_norm": 0.29449400305747986, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 215 + }, + { + "epoch": 0.024811900522658088, + "grad_norm": 0.31254515051841736, + "learning_rate": 0.0001, + "loss": 1.6091, + "step": 216 + }, + { + "epoch": 0.02492677043248521, + "grad_norm": 0.3257744014263153, + "learning_rate": 0.0001, + "loss": 1.5715, + "step": 217 + }, + { + "epoch": 0.02504164034231233, + "grad_norm": 0.37795010209083557, + "learning_rate": 0.0001, + "loss": 1.8833, + "step": 218 + }, + { + "epoch": 0.02515651025213945, + "grad_norm": 0.3558560311794281, + "learning_rate": 0.0001, + "loss": 1.7896, + "step": 219 + }, + { + "epoch": 0.02527138016196657, + "grad_norm": 0.3125206530094147, + "learning_rate": 0.0001, + "loss": 1.7799, + "step": 220 + }, + { + "epoch": 0.025386250071793695, + "grad_norm": 0.37031280994415283, + "learning_rate": 0.0001, + "loss": 1.7588, + "step": 221 + }, + { + "epoch": 0.025501119981620815, + "grad_norm": 0.32977885007858276, + "learning_rate": 0.0001, + "loss": 1.8772, + "step": 222 + }, + { + "epoch": 0.025615989891447935, + "grad_norm": 0.3385184407234192, + "learning_rate": 0.0001, + "loss": 1.7859, + "step": 223 + }, + { + "epoch": 0.025730859801275055, + "grad_norm": 0.3234337866306305, + "learning_rate": 0.0001, + "loss": 1.8324, + "step": 224 + }, + { + "epoch": 0.02584572971110218, + "grad_norm": 0.33667194843292236, + "learning_rate": 0.0001, + "loss": 1.8861, + "step": 225 + }, + { + "epoch": 0.0259605996209293, + "grad_norm": 0.3473112881183624, + "learning_rate": 0.0001, + "loss": 1.8953, + "step": 226 + }, + { + "epoch": 0.02607546953075642, + "grad_norm": 0.36767181754112244, + "learning_rate": 0.0001, + "loss": 1.8615, + "step": 227 + }, + { + "epoch": 0.02619033944058354, + "grad_norm": 0.2963101267814636, + "learning_rate": 0.0001, + "loss": 1.6676, + "step": 228 + }, + { + "epoch": 0.02630520935041066, + "grad_norm": 0.35011404752731323, + "learning_rate": 0.0001, + "loss": 1.9064, + "step": 229 + }, + { + "epoch": 0.026420079260237782, + "grad_norm": 0.3166825771331787, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 230 + }, + { + "epoch": 0.026534949170064902, + "grad_norm": 0.33136868476867676, + "learning_rate": 0.0001, + "loss": 1.9791, + "step": 231 + }, + { + "epoch": 0.026649819079892022, + "grad_norm": 0.3448893129825592, + "learning_rate": 0.0001, + "loss": 1.9482, + "step": 232 + }, + { + "epoch": 0.02676468898971914, + "grad_norm": 0.33245259523391724, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 233 + }, + { + "epoch": 0.026879558899546265, + "grad_norm": 0.29518410563468933, + "learning_rate": 0.0001, + "loss": 1.5645, + "step": 234 + }, + { + "epoch": 0.026994428809373385, + "grad_norm": 0.33720827102661133, + "learning_rate": 0.0001, + "loss": 1.7339, + "step": 235 + }, + { + "epoch": 0.027109298719200505, + "grad_norm": 0.3211282789707184, + "learning_rate": 0.0001, + "loss": 1.7116, + "step": 236 + }, + { + "epoch": 0.027224168629027625, + "grad_norm": 0.3239465355873108, + "learning_rate": 0.0001, + "loss": 1.7349, + "step": 237 + }, + { + "epoch": 0.02733903853885475, + "grad_norm": 0.32968419790267944, + "learning_rate": 0.0001, + "loss": 1.7973, + "step": 238 + }, + { + "epoch": 0.02745390844868187, + "grad_norm": 0.3076919615268707, + "learning_rate": 0.0001, + "loss": 1.7016, + "step": 239 + }, + { + "epoch": 0.02756877835850899, + "grad_norm": 0.3537105619907379, + "learning_rate": 0.0001, + "loss": 1.7679, + "step": 240 + }, + { + "epoch": 0.02768364826833611, + "grad_norm": 0.3480488955974579, + "learning_rate": 0.0001, + "loss": 1.8703, + "step": 241 + }, + { + "epoch": 0.02779851817816323, + "grad_norm": 0.3412216901779175, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 242 + }, + { + "epoch": 0.027913388087990352, + "grad_norm": 0.3362070918083191, + "learning_rate": 0.0001, + "loss": 1.7042, + "step": 243 + }, + { + "epoch": 0.028028257997817472, + "grad_norm": 0.3288329839706421, + "learning_rate": 0.0001, + "loss": 1.7347, + "step": 244 + }, + { + "epoch": 0.028143127907644592, + "grad_norm": 0.33331453800201416, + "learning_rate": 0.0001, + "loss": 1.7895, + "step": 245 + }, + { + "epoch": 0.028257997817471712, + "grad_norm": 0.34005260467529297, + "learning_rate": 0.0001, + "loss": 1.8154, + "step": 246 + }, + { + "epoch": 0.028372867727298835, + "grad_norm": 0.33353686332702637, + "learning_rate": 0.0001, + "loss": 1.7256, + "step": 247 + }, + { + "epoch": 0.028487737637125955, + "grad_norm": 0.30739593505859375, + "learning_rate": 0.0001, + "loss": 1.779, + "step": 248 + }, + { + "epoch": 0.028602607546953075, + "grad_norm": 0.333033949136734, + "learning_rate": 0.0001, + "loss": 1.9319, + "step": 249 + }, + { + "epoch": 0.028717477456780195, + "grad_norm": 0.3508240878582001, + "learning_rate": 0.0001, + "loss": 1.993, + "step": 250 + }, + { + "epoch": 0.02883234736660732, + "grad_norm": 0.32476675510406494, + "learning_rate": 0.0001, + "loss": 1.8096, + "step": 251 + }, + { + "epoch": 0.02894721727643444, + "grad_norm": 0.31880107522010803, + "learning_rate": 0.0001, + "loss": 1.6591, + "step": 252 + }, + { + "epoch": 0.02906208718626156, + "grad_norm": 0.33040839433670044, + "learning_rate": 0.0001, + "loss": 1.7228, + "step": 253 + }, + { + "epoch": 0.02917695709608868, + "grad_norm": 0.3504570424556732, + "learning_rate": 0.0001, + "loss": 1.9012, + "step": 254 + }, + { + "epoch": 0.029291827005915802, + "grad_norm": 0.3304044008255005, + "learning_rate": 0.0001, + "loss": 1.8114, + "step": 255 + }, + { + "epoch": 0.029406696915742922, + "grad_norm": 0.3044085204601288, + "learning_rate": 0.0001, + "loss": 1.4707, + "step": 256 + }, + { + "epoch": 0.029521566825570042, + "grad_norm": 0.314557284116745, + "learning_rate": 0.0001, + "loss": 1.6985, + "step": 257 + }, + { + "epoch": 0.029636436735397162, + "grad_norm": 0.3400304317474365, + "learning_rate": 0.0001, + "loss": 1.7826, + "step": 258 + }, + { + "epoch": 0.029751306645224282, + "grad_norm": 0.3435501158237457, + "learning_rate": 0.0001, + "loss": 1.8779, + "step": 259 + }, + { + "epoch": 0.029866176555051405, + "grad_norm": 0.35256657004356384, + "learning_rate": 0.0001, + "loss": 1.9058, + "step": 260 + }, + { + "epoch": 0.029981046464878525, + "grad_norm": 0.32517293095588684, + "learning_rate": 0.0001, + "loss": 1.7505, + "step": 261 + }, + { + "epoch": 0.030095916374705645, + "grad_norm": 0.32231926918029785, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 262 + }, + { + "epoch": 0.030210786284532765, + "grad_norm": 0.312837153673172, + "learning_rate": 0.0001, + "loss": 1.7485, + "step": 263 + }, + { + "epoch": 0.03032565619435989, + "grad_norm": 0.33711639046669006, + "learning_rate": 0.0001, + "loss": 1.8586, + "step": 264 + }, + { + "epoch": 0.03044052610418701, + "grad_norm": 0.33530449867248535, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 265 + }, + { + "epoch": 0.03055539601401413, + "grad_norm": 0.3420933783054352, + "learning_rate": 0.0001, + "loss": 1.8716, + "step": 266 + }, + { + "epoch": 0.03067026592384125, + "grad_norm": 0.3286440372467041, + "learning_rate": 0.0001, + "loss": 1.6937, + "step": 267 + }, + { + "epoch": 0.030785135833668372, + "grad_norm": 0.331714391708374, + "learning_rate": 0.0001, + "loss": 1.8212, + "step": 268 + }, + { + "epoch": 0.030900005743495492, + "grad_norm": 0.32993122935295105, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 269 + }, + { + "epoch": 0.031014875653322612, + "grad_norm": 0.32213708758354187, + "learning_rate": 0.0001, + "loss": 1.8358, + "step": 270 + }, + { + "epoch": 0.031129745563149732, + "grad_norm": 0.33745911717414856, + "learning_rate": 0.0001, + "loss": 1.7545, + "step": 271 + }, + { + "epoch": 0.031244615472976852, + "grad_norm": 0.32847779989242554, + "learning_rate": 0.0001, + "loss": 1.7301, + "step": 272 + }, + { + "epoch": 0.031359485382803975, + "grad_norm": 0.3093877136707306, + "learning_rate": 0.0001, + "loss": 1.684, + "step": 273 + }, + { + "epoch": 0.031474355292631095, + "grad_norm": 0.32528170943260193, + "learning_rate": 0.0001, + "loss": 1.7503, + "step": 274 + }, + { + "epoch": 0.031589225202458215, + "grad_norm": 0.3486316204071045, + "learning_rate": 0.0001, + "loss": 1.8535, + "step": 275 + }, + { + "epoch": 0.031704095112285335, + "grad_norm": 0.3452807068824768, + "learning_rate": 0.0001, + "loss": 1.791, + "step": 276 + }, + { + "epoch": 0.031818965022112455, + "grad_norm": 0.3220309615135193, + "learning_rate": 0.0001, + "loss": 1.7711, + "step": 277 + }, + { + "epoch": 0.031933834931939575, + "grad_norm": 0.3608686327934265, + "learning_rate": 0.0001, + "loss": 1.853, + "step": 278 + }, + { + "epoch": 0.0320487048417667, + "grad_norm": 0.3356455862522125, + "learning_rate": 0.0001, + "loss": 1.8301, + "step": 279 + }, + { + "epoch": 0.03216357475159382, + "grad_norm": 0.36832770705223083, + "learning_rate": 0.0001, + "loss": 1.9413, + "step": 280 + }, + { + "epoch": 0.03227844466142094, + "grad_norm": 0.31815385818481445, + "learning_rate": 0.0001, + "loss": 1.7599, + "step": 281 + }, + { + "epoch": 0.03239331457124806, + "grad_norm": 0.3182365894317627, + "learning_rate": 0.0001, + "loss": 1.8303, + "step": 282 + }, + { + "epoch": 0.03250818448107518, + "grad_norm": 0.3294694721698761, + "learning_rate": 0.0001, + "loss": 1.7363, + "step": 283 + }, + { + "epoch": 0.0326230543909023, + "grad_norm": 0.3316207826137543, + "learning_rate": 0.0001, + "loss": 1.8191, + "step": 284 + }, + { + "epoch": 0.03273792430072942, + "grad_norm": 0.33212903141975403, + "learning_rate": 0.0001, + "loss": 1.8683, + "step": 285 + }, + { + "epoch": 0.03285279421055654, + "grad_norm": 0.3195394277572632, + "learning_rate": 0.0001, + "loss": 1.726, + "step": 286 + }, + { + "epoch": 0.03296766412038366, + "grad_norm": 0.30052995681762695, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 287 + }, + { + "epoch": 0.03308253403021079, + "grad_norm": 0.34062662720680237, + "learning_rate": 0.0001, + "loss": 1.9087, + "step": 288 + }, + { + "epoch": 0.03319740394003791, + "grad_norm": 0.34171825647354126, + "learning_rate": 0.0001, + "loss": 1.8431, + "step": 289 + }, + { + "epoch": 0.03331227384986503, + "grad_norm": 0.3276672065258026, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 290 + }, + { + "epoch": 0.03342714375969215, + "grad_norm": 0.36738014221191406, + "learning_rate": 0.0001, + "loss": 2.1369, + "step": 291 + }, + { + "epoch": 0.03354201366951927, + "grad_norm": 0.33889520168304443, + "learning_rate": 0.0001, + "loss": 1.6107, + "step": 292 + }, + { + "epoch": 0.03365688357934639, + "grad_norm": 0.33499544858932495, + "learning_rate": 0.0001, + "loss": 1.9176, + "step": 293 + }, + { + "epoch": 0.03377175348917351, + "grad_norm": 0.32459279894828796, + "learning_rate": 0.0001, + "loss": 1.7669, + "step": 294 + }, + { + "epoch": 0.03388662339900063, + "grad_norm": 0.3338513672351837, + "learning_rate": 0.0001, + "loss": 1.8755, + "step": 295 + }, + { + "epoch": 0.034001493308827756, + "grad_norm": 0.3502792418003082, + "learning_rate": 0.0001, + "loss": 1.7341, + "step": 296 + }, + { + "epoch": 0.034116363218654876, + "grad_norm": 0.34327709674835205, + "learning_rate": 0.0001, + "loss": 1.8455, + "step": 297 + }, + { + "epoch": 0.034231233128481996, + "grad_norm": 0.34909358620643616, + "learning_rate": 0.0001, + "loss": 1.6939, + "step": 298 + }, + { + "epoch": 0.034346103038309116, + "grad_norm": 0.3473874032497406, + "learning_rate": 0.0001, + "loss": 1.7983, + "step": 299 + }, + { + "epoch": 0.034460972948136236, + "grad_norm": 0.3394108712673187, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 300 + }, + { + "epoch": 0.034575842857963356, + "grad_norm": 0.34042516350746155, + "learning_rate": 0.0001, + "loss": 1.9465, + "step": 301 + }, + { + "epoch": 0.034690712767790476, + "grad_norm": 0.3370528221130371, + "learning_rate": 0.0001, + "loss": 1.7753, + "step": 302 + }, + { + "epoch": 0.034805582677617596, + "grad_norm": 0.32830432057380676, + "learning_rate": 0.0001, + "loss": 1.9652, + "step": 303 + }, + { + "epoch": 0.034920452587444716, + "grad_norm": 0.33252227306365967, + "learning_rate": 0.0001, + "loss": 1.8031, + "step": 304 + }, + { + "epoch": 0.03503532249727184, + "grad_norm": 0.32804980874061584, + "learning_rate": 0.0001, + "loss": 1.8358, + "step": 305 + }, + { + "epoch": 0.03515019240709896, + "grad_norm": 0.29810619354248047, + "learning_rate": 0.0001, + "loss": 1.6119, + "step": 306 + }, + { + "epoch": 0.03526506231692608, + "grad_norm": 0.3392501473426819, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 307 + }, + { + "epoch": 0.0353799322267532, + "grad_norm": 0.31986865401268005, + "learning_rate": 0.0001, + "loss": 1.7034, + "step": 308 + }, + { + "epoch": 0.03549480213658032, + "grad_norm": 0.33063703775405884, + "learning_rate": 0.0001, + "loss": 1.7634, + "step": 309 + }, + { + "epoch": 0.03560967204640744, + "grad_norm": 0.3045586049556732, + "learning_rate": 0.0001, + "loss": 1.6945, + "step": 310 + }, + { + "epoch": 0.03572454195623456, + "grad_norm": 0.3407602608203888, + "learning_rate": 0.0001, + "loss": 1.6488, + "step": 311 + }, + { + "epoch": 0.03583941186606168, + "grad_norm": 0.3090314567089081, + "learning_rate": 0.0001, + "loss": 1.6678, + "step": 312 + }, + { + "epoch": 0.03595428177588881, + "grad_norm": 0.3623706102371216, + "learning_rate": 0.0001, + "loss": 1.8877, + "step": 313 + }, + { + "epoch": 0.03606915168571593, + "grad_norm": 0.3450353145599365, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 314 + }, + { + "epoch": 0.03618402159554305, + "grad_norm": 0.3811194598674774, + "learning_rate": 0.0001, + "loss": 2.0295, + "step": 315 + }, + { + "epoch": 0.03629889150537017, + "grad_norm": 0.3349141776561737, + "learning_rate": 0.0001, + "loss": 1.8964, + "step": 316 + }, + { + "epoch": 0.03641376141519729, + "grad_norm": 0.3339914381504059, + "learning_rate": 0.0001, + "loss": 1.6882, + "step": 317 + }, + { + "epoch": 0.03652863132502441, + "grad_norm": 0.32706162333488464, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 318 + }, + { + "epoch": 0.03664350123485153, + "grad_norm": 0.35195019841194153, + "learning_rate": 0.0001, + "loss": 1.8871, + "step": 319 + }, + { + "epoch": 0.03675837114467865, + "grad_norm": 0.2981555163860321, + "learning_rate": 0.0001, + "loss": 1.6054, + "step": 320 + }, + { + "epoch": 0.03687324105450577, + "grad_norm": 0.3194637894630432, + "learning_rate": 0.0001, + "loss": 1.7356, + "step": 321 + }, + { + "epoch": 0.036988110964332896, + "grad_norm": 0.3623133599758148, + "learning_rate": 0.0001, + "loss": 1.8527, + "step": 322 + }, + { + "epoch": 0.037102980874160016, + "grad_norm": 0.3471209704875946, + "learning_rate": 0.0001, + "loss": 1.7521, + "step": 323 + }, + { + "epoch": 0.037217850783987136, + "grad_norm": 0.35352832078933716, + "learning_rate": 0.0001, + "loss": 1.7211, + "step": 324 + }, + { + "epoch": 0.037332720693814256, + "grad_norm": 0.32329848408699036, + "learning_rate": 0.0001, + "loss": 1.7678, + "step": 325 + }, + { + "epoch": 0.037447590603641376, + "grad_norm": 0.32614487409591675, + "learning_rate": 0.0001, + "loss": 1.7791, + "step": 326 + }, + { + "epoch": 0.037562460513468496, + "grad_norm": 0.3770069479942322, + "learning_rate": 0.0001, + "loss": 1.7185, + "step": 327 + }, + { + "epoch": 0.037677330423295616, + "grad_norm": 0.32778894901275635, + "learning_rate": 0.0001, + "loss": 1.6565, + "step": 328 + }, + { + "epoch": 0.037792200333122736, + "grad_norm": 0.37443724274635315, + "learning_rate": 0.0001, + "loss": 2.0193, + "step": 329 + }, + { + "epoch": 0.03790707024294986, + "grad_norm": 0.3752276599407196, + "learning_rate": 0.0001, + "loss": 1.9978, + "step": 330 + }, + { + "epoch": 0.03802194015277698, + "grad_norm": 0.33359140157699585, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 331 + }, + { + "epoch": 0.0381368100626041, + "grad_norm": 0.3710048794746399, + "learning_rate": 0.0001, + "loss": 1.9855, + "step": 332 + }, + { + "epoch": 0.03825167997243122, + "grad_norm": 0.3335336446762085, + "learning_rate": 0.0001, + "loss": 1.7433, + "step": 333 + }, + { + "epoch": 0.03836654988225834, + "grad_norm": 0.33409109711647034, + "learning_rate": 0.0001, + "loss": 1.7517, + "step": 334 + }, + { + "epoch": 0.03848141979208546, + "grad_norm": 0.3645201623439789, + "learning_rate": 0.0001, + "loss": 1.8764, + "step": 335 + }, + { + "epoch": 0.03859628970191258, + "grad_norm": 0.33294662833213806, + "learning_rate": 0.0001, + "loss": 1.867, + "step": 336 + }, + { + "epoch": 0.0387111596117397, + "grad_norm": 0.32880741357803345, + "learning_rate": 0.0001, + "loss": 1.8049, + "step": 337 + }, + { + "epoch": 0.03882602952156682, + "grad_norm": 0.353669136762619, + "learning_rate": 0.0001, + "loss": 1.7027, + "step": 338 + }, + { + "epoch": 0.03894089943139395, + "grad_norm": 0.3438865840435028, + "learning_rate": 0.0001, + "loss": 1.6894, + "step": 339 + }, + { + "epoch": 0.03905576934122107, + "grad_norm": 0.3039886951446533, + "learning_rate": 0.0001, + "loss": 1.7142, + "step": 340 + }, + { + "epoch": 0.03917063925104819, + "grad_norm": 0.34314149618148804, + "learning_rate": 0.0001, + "loss": 1.7391, + "step": 341 + }, + { + "epoch": 0.03928550916087531, + "grad_norm": 0.3602879047393799, + "learning_rate": 0.0001, + "loss": 1.8054, + "step": 342 + }, + { + "epoch": 0.03940037907070243, + "grad_norm": 0.33207011222839355, + "learning_rate": 0.0001, + "loss": 1.7549, + "step": 343 + }, + { + "epoch": 0.03951524898052955, + "grad_norm": 0.36213555932044983, + "learning_rate": 0.0001, + "loss": 1.8795, + "step": 344 + }, + { + "epoch": 0.03963011889035667, + "grad_norm": 0.37275567650794983, + "learning_rate": 0.0001, + "loss": 1.7072, + "step": 345 + }, + { + "epoch": 0.03974498880018379, + "grad_norm": 0.3421449661254883, + "learning_rate": 0.0001, + "loss": 1.7917, + "step": 346 + }, + { + "epoch": 0.03985985871001091, + "grad_norm": 0.33920395374298096, + "learning_rate": 0.0001, + "loss": 1.8476, + "step": 347 + }, + { + "epoch": 0.039974728619838036, + "grad_norm": 0.3364729881286621, + "learning_rate": 0.0001, + "loss": 1.8673, + "step": 348 + }, + { + "epoch": 0.040089598529665156, + "grad_norm": 0.3258533179759979, + "learning_rate": 0.0001, + "loss": 1.5958, + "step": 349 + }, + { + "epoch": 0.040204468439492276, + "grad_norm": 0.31070706248283386, + "learning_rate": 0.0001, + "loss": 1.482, + "step": 350 + }, + { + "epoch": 0.040319338349319396, + "grad_norm": 0.3752513527870178, + "learning_rate": 0.0001, + "loss": 1.9127, + "step": 351 + }, + { + "epoch": 0.040434208259146516, + "grad_norm": 0.34795695543289185, + "learning_rate": 0.0001, + "loss": 1.7637, + "step": 352 + }, + { + "epoch": 0.040549078168973636, + "grad_norm": 0.3222607374191284, + "learning_rate": 0.0001, + "loss": 1.8052, + "step": 353 + }, + { + "epoch": 0.040663948078800756, + "grad_norm": 0.3587421476840973, + "learning_rate": 0.0001, + "loss": 1.8354, + "step": 354 + }, + { + "epoch": 0.040778817988627876, + "grad_norm": 0.3381020724773407, + "learning_rate": 0.0001, + "loss": 1.9156, + "step": 355 + }, + { + "epoch": 0.040893687898455, + "grad_norm": 0.3503625690937042, + "learning_rate": 0.0001, + "loss": 1.8307, + "step": 356 + }, + { + "epoch": 0.04100855780828212, + "grad_norm": 0.34590432047843933, + "learning_rate": 0.0001, + "loss": 1.7548, + "step": 357 + }, + { + "epoch": 0.04112342771810924, + "grad_norm": 0.38469988107681274, + "learning_rate": 0.0001, + "loss": 1.8686, + "step": 358 + }, + { + "epoch": 0.04123829762793636, + "grad_norm": 0.34001877903938293, + "learning_rate": 0.0001, + "loss": 1.5505, + "step": 359 + }, + { + "epoch": 0.04135316753776348, + "grad_norm": 0.35807985067367554, + "learning_rate": 0.0001, + "loss": 1.8079, + "step": 360 + }, + { + "epoch": 0.0414680374475906, + "grad_norm": 0.3272436857223511, + "learning_rate": 0.0001, + "loss": 1.8085, + "step": 361 + }, + { + "epoch": 0.04158290735741772, + "grad_norm": 0.31507229804992676, + "learning_rate": 0.0001, + "loss": 1.6678, + "step": 362 + }, + { + "epoch": 0.04169777726724484, + "grad_norm": 0.3048648238182068, + "learning_rate": 0.0001, + "loss": 1.5551, + "step": 363 + }, + { + "epoch": 0.04181264717707196, + "grad_norm": 0.3657607436180115, + "learning_rate": 0.0001, + "loss": 1.7309, + "step": 364 + }, + { + "epoch": 0.04192751708689909, + "grad_norm": 0.3520914316177368, + "learning_rate": 0.0001, + "loss": 1.746, + "step": 365 + }, + { + "epoch": 0.04204238699672621, + "grad_norm": 0.3329162299633026, + "learning_rate": 0.0001, + "loss": 1.7818, + "step": 366 + }, + { + "epoch": 0.04215725690655333, + "grad_norm": 0.342009574174881, + "learning_rate": 0.0001, + "loss": 1.6875, + "step": 367 + }, + { + "epoch": 0.04227212681638045, + "grad_norm": 0.3568592667579651, + "learning_rate": 0.0001, + "loss": 1.8474, + "step": 368 + }, + { + "epoch": 0.04238699672620757, + "grad_norm": 0.32136744260787964, + "learning_rate": 0.0001, + "loss": 1.7908, + "step": 369 + }, + { + "epoch": 0.04250186663603469, + "grad_norm": 0.34790685772895813, + "learning_rate": 0.0001, + "loss": 1.8666, + "step": 370 + }, + { + "epoch": 0.04261673654586181, + "grad_norm": 0.3257257044315338, + "learning_rate": 0.0001, + "loss": 1.8399, + "step": 371 + }, + { + "epoch": 0.04273160645568893, + "grad_norm": 0.319909006357193, + "learning_rate": 0.0001, + "loss": 1.5395, + "step": 372 + }, + { + "epoch": 0.042846476365516056, + "grad_norm": 0.34518831968307495, + "learning_rate": 0.0001, + "loss": 1.721, + "step": 373 + }, + { + "epoch": 0.042961346275343176, + "grad_norm": 0.350404292345047, + "learning_rate": 0.0001, + "loss": 1.7933, + "step": 374 + }, + { + "epoch": 0.043076216185170296, + "grad_norm": 0.3150002956390381, + "learning_rate": 0.0001, + "loss": 1.4682, + "step": 375 + }, + { + "epoch": 0.043191086094997416, + "grad_norm": 0.3526088297367096, + "learning_rate": 0.0001, + "loss": 1.7383, + "step": 376 + }, + { + "epoch": 0.043305956004824536, + "grad_norm": 0.3494514226913452, + "learning_rate": 0.0001, + "loss": 1.7906, + "step": 377 + }, + { + "epoch": 0.043420825914651656, + "grad_norm": 0.3323955833911896, + "learning_rate": 0.0001, + "loss": 1.8454, + "step": 378 + }, + { + "epoch": 0.043535695824478776, + "grad_norm": 0.31533968448638916, + "learning_rate": 0.0001, + "loss": 1.538, + "step": 379 + }, + { + "epoch": 0.043650565734305896, + "grad_norm": 0.3485229015350342, + "learning_rate": 0.0001, + "loss": 1.704, + "step": 380 + }, + { + "epoch": 0.043765435644133016, + "grad_norm": 0.3400436043739319, + "learning_rate": 0.0001, + "loss": 1.861, + "step": 381 + }, + { + "epoch": 0.04388030555396014, + "grad_norm": 0.32537248730659485, + "learning_rate": 0.0001, + "loss": 1.926, + "step": 382 + }, + { + "epoch": 0.04399517546378726, + "grad_norm": 0.35941800475120544, + "learning_rate": 0.0001, + "loss": 1.8136, + "step": 383 + }, + { + "epoch": 0.04411004537361438, + "grad_norm": 0.3239869773387909, + "learning_rate": 0.0001, + "loss": 1.5941, + "step": 384 + }, + { + "epoch": 0.0442249152834415, + "grad_norm": 0.34292441606521606, + "learning_rate": 0.0001, + "loss": 2.0001, + "step": 385 + }, + { + "epoch": 0.04433978519326862, + "grad_norm": 0.35059481859207153, + "learning_rate": 0.0001, + "loss": 1.8536, + "step": 386 + }, + { + "epoch": 0.04445465510309574, + "grad_norm": 0.35030195116996765, + "learning_rate": 0.0001, + "loss": 1.7471, + "step": 387 + }, + { + "epoch": 0.04456952501292286, + "grad_norm": 0.34561511874198914, + "learning_rate": 0.0001, + "loss": 1.6975, + "step": 388 + }, + { + "epoch": 0.04468439492274998, + "grad_norm": 0.3485510051250458, + "learning_rate": 0.0001, + "loss": 1.7986, + "step": 389 + }, + { + "epoch": 0.04479926483257711, + "grad_norm": 0.3528231978416443, + "learning_rate": 0.0001, + "loss": 1.7736, + "step": 390 + }, + { + "epoch": 0.04491413474240423, + "grad_norm": 0.3497932255268097, + "learning_rate": 0.0001, + "loss": 1.8213, + "step": 391 + }, + { + "epoch": 0.04502900465223135, + "grad_norm": 0.34640979766845703, + "learning_rate": 0.0001, + "loss": 2.0511, + "step": 392 + }, + { + "epoch": 0.04514387456205847, + "grad_norm": 0.3179776668548584, + "learning_rate": 0.0001, + "loss": 1.6705, + "step": 393 + }, + { + "epoch": 0.04525874447188559, + "grad_norm": 0.3492574691772461, + "learning_rate": 0.0001, + "loss": 1.7829, + "step": 394 + }, + { + "epoch": 0.04537361438171271, + "grad_norm": 0.32157397270202637, + "learning_rate": 0.0001, + "loss": 1.8084, + "step": 395 + }, + { + "epoch": 0.04548848429153983, + "grad_norm": 0.32758328318595886, + "learning_rate": 0.0001, + "loss": 1.6538, + "step": 396 + }, + { + "epoch": 0.04560335420136695, + "grad_norm": 0.34034964442253113, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 397 + }, + { + "epoch": 0.04571822411119407, + "grad_norm": 0.3422950506210327, + "learning_rate": 0.0001, + "loss": 1.9178, + "step": 398 + }, + { + "epoch": 0.045833094021021197, + "grad_norm": 0.3140711784362793, + "learning_rate": 0.0001, + "loss": 1.6569, + "step": 399 + }, + { + "epoch": 0.045947963930848316, + "grad_norm": 0.33444762229919434, + "learning_rate": 0.0001, + "loss": 1.7443, + "step": 400 + }, + { + "epoch": 0.046062833840675436, + "grad_norm": 0.3292236924171448, + "learning_rate": 0.0001, + "loss": 1.786, + "step": 401 + }, + { + "epoch": 0.046177703750502556, + "grad_norm": 0.3406231999397278, + "learning_rate": 0.0001, + "loss": 1.8477, + "step": 402 + }, + { + "epoch": 0.046292573660329676, + "grad_norm": 0.35850241780281067, + "learning_rate": 0.0001, + "loss": 1.9219, + "step": 403 + }, + { + "epoch": 0.046407443570156796, + "grad_norm": 0.39436420798301697, + "learning_rate": 0.0001, + "loss": 1.9564, + "step": 404 + }, + { + "epoch": 0.046522313479983916, + "grad_norm": 0.3540903627872467, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 405 + }, + { + "epoch": 0.046637183389811036, + "grad_norm": 0.4025273323059082, + "learning_rate": 0.0001, + "loss": 1.7531, + "step": 406 + }, + { + "epoch": 0.04675205329963816, + "grad_norm": 0.3019482493400574, + "learning_rate": 0.0001, + "loss": 1.5366, + "step": 407 + }, + { + "epoch": 0.04686692320946528, + "grad_norm": 0.3409338593482971, + "learning_rate": 0.0001, + "loss": 1.8024, + "step": 408 + }, + { + "epoch": 0.0469817931192924, + "grad_norm": 0.3111829161643982, + "learning_rate": 0.0001, + "loss": 1.486, + "step": 409 + }, + { + "epoch": 0.04709666302911952, + "grad_norm": 0.38814571499824524, + "learning_rate": 0.0001, + "loss": 2.0234, + "step": 410 + }, + { + "epoch": 0.04721153293894664, + "grad_norm": 0.3604361116886139, + "learning_rate": 0.0001, + "loss": 1.7389, + "step": 411 + }, + { + "epoch": 0.04732640284877376, + "grad_norm": 0.36267879605293274, + "learning_rate": 0.0001, + "loss": 1.9424, + "step": 412 + }, + { + "epoch": 0.04744127275860088, + "grad_norm": 0.3534083366394043, + "learning_rate": 0.0001, + "loss": 1.9132, + "step": 413 + }, + { + "epoch": 0.047556142668428, + "grad_norm": 0.31778329610824585, + "learning_rate": 0.0001, + "loss": 1.7196, + "step": 414 + }, + { + "epoch": 0.04767101257825512, + "grad_norm": 0.3307456970214844, + "learning_rate": 0.0001, + "loss": 1.7706, + "step": 415 + }, + { + "epoch": 0.04778588248808225, + "grad_norm": 0.33666694164276123, + "learning_rate": 0.0001, + "loss": 1.8634, + "step": 416 + }, + { + "epoch": 0.04790075239790937, + "grad_norm": 0.352847158908844, + "learning_rate": 0.0001, + "loss": 1.8828, + "step": 417 + }, + { + "epoch": 0.04801562230773649, + "grad_norm": 0.33423542976379395, + "learning_rate": 0.0001, + "loss": 1.9455, + "step": 418 + }, + { + "epoch": 0.04813049221756361, + "grad_norm": 0.35909709334373474, + "learning_rate": 0.0001, + "loss": 1.8293, + "step": 419 + }, + { + "epoch": 0.04824536212739073, + "grad_norm": 0.35591983795166016, + "learning_rate": 0.0001, + "loss": 1.8619, + "step": 420 + }, + { + "epoch": 0.04836023203721785, + "grad_norm": 0.33195993304252625, + "learning_rate": 0.0001, + "loss": 1.6672, + "step": 421 + }, + { + "epoch": 0.04847510194704497, + "grad_norm": 0.3257507383823395, + "learning_rate": 0.0001, + "loss": 1.7567, + "step": 422 + }, + { + "epoch": 0.04858997185687209, + "grad_norm": 0.31000784039497375, + "learning_rate": 0.0001, + "loss": 1.7718, + "step": 423 + }, + { + "epoch": 0.04870484176669921, + "grad_norm": 0.3311057686805725, + "learning_rate": 0.0001, + "loss": 1.8224, + "step": 424 + }, + { + "epoch": 0.04881971167652634, + "grad_norm": 0.32574066519737244, + "learning_rate": 0.0001, + "loss": 1.7199, + "step": 425 + }, + { + "epoch": 0.04893458158635346, + "grad_norm": 0.30064114928245544, + "learning_rate": 0.0001, + "loss": 1.6134, + "step": 426 + }, + { + "epoch": 0.04904945149618058, + "grad_norm": 0.331144779920578, + "learning_rate": 0.0001, + "loss": 1.8779, + "step": 427 + }, + { + "epoch": 0.0491643214060077, + "grad_norm": 0.3030913472175598, + "learning_rate": 0.0001, + "loss": 1.5864, + "step": 428 + }, + { + "epoch": 0.04927919131583482, + "grad_norm": 0.3529425263404846, + "learning_rate": 0.0001, + "loss": 1.743, + "step": 429 + }, + { + "epoch": 0.04939406122566194, + "grad_norm": 0.32856279611587524, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 430 + }, + { + "epoch": 0.04950893113548906, + "grad_norm": 0.32099583745002747, + "learning_rate": 0.0001, + "loss": 1.6706, + "step": 431 + }, + { + "epoch": 0.049623801045316177, + "grad_norm": 0.3427245616912842, + "learning_rate": 0.0001, + "loss": 1.8355, + "step": 432 + }, + { + "epoch": 0.0497386709551433, + "grad_norm": 0.34210020303726196, + "learning_rate": 0.0001, + "loss": 1.7778, + "step": 433 + }, + { + "epoch": 0.04985354086497042, + "grad_norm": 0.3216181695461273, + "learning_rate": 0.0001, + "loss": 1.8209, + "step": 434 + }, + { + "epoch": 0.04996841077479754, + "grad_norm": 0.33420485258102417, + "learning_rate": 0.0001, + "loss": 1.8087, + "step": 435 + }, + { + "epoch": 0.05008328068462466, + "grad_norm": 0.3496338725090027, + "learning_rate": 0.0001, + "loss": 1.737, + "step": 436 + }, + { + "epoch": 0.05019815059445178, + "grad_norm": 0.3511291444301605, + "learning_rate": 0.0001, + "loss": 2.0355, + "step": 437 + }, + { + "epoch": 0.0503130205042789, + "grad_norm": 0.34182974696159363, + "learning_rate": 0.0001, + "loss": 1.7236, + "step": 438 + }, + { + "epoch": 0.05042789041410602, + "grad_norm": 0.3205072581768036, + "learning_rate": 0.0001, + "loss": 1.7691, + "step": 439 + }, + { + "epoch": 0.05054276032393314, + "grad_norm": 0.3407754600048065, + "learning_rate": 0.0001, + "loss": 1.9003, + "step": 440 + }, + { + "epoch": 0.05065763023376026, + "grad_norm": 0.31289535760879517, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 441 + }, + { + "epoch": 0.05077250014358739, + "grad_norm": 0.2934773564338684, + "learning_rate": 0.0001, + "loss": 1.5488, + "step": 442 + }, + { + "epoch": 0.05088737005341451, + "grad_norm": 0.3349979817867279, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 443 + }, + { + "epoch": 0.05100223996324163, + "grad_norm": 0.3626347482204437, + "learning_rate": 0.0001, + "loss": 1.8299, + "step": 444 + }, + { + "epoch": 0.05111710987306875, + "grad_norm": 0.3510279059410095, + "learning_rate": 0.0001, + "loss": 1.6544, + "step": 445 + }, + { + "epoch": 0.05123197978289587, + "grad_norm": 0.3967926502227783, + "learning_rate": 0.0001, + "loss": 2.0913, + "step": 446 + }, + { + "epoch": 0.05134684969272299, + "grad_norm": 0.31338948011398315, + "learning_rate": 0.0001, + "loss": 1.5693, + "step": 447 + }, + { + "epoch": 0.05146171960255011, + "grad_norm": 0.32318705320358276, + "learning_rate": 0.0001, + "loss": 1.5317, + "step": 448 + }, + { + "epoch": 0.05157658951237723, + "grad_norm": 0.31790030002593994, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 449 + }, + { + "epoch": 0.05169145942220436, + "grad_norm": 0.3432879149913788, + "learning_rate": 0.0001, + "loss": 1.6502, + "step": 450 + }, + { + "epoch": 0.05180632933203148, + "grad_norm": 0.38444429636001587, + "learning_rate": 0.0001, + "loss": 1.844, + "step": 451 + }, + { + "epoch": 0.0519211992418586, + "grad_norm": 0.33826589584350586, + "learning_rate": 0.0001, + "loss": 1.7821, + "step": 452 + }, + { + "epoch": 0.05203606915168572, + "grad_norm": 0.340986967086792, + "learning_rate": 0.0001, + "loss": 1.7472, + "step": 453 + }, + { + "epoch": 0.05215093906151284, + "grad_norm": 0.34421804547309875, + "learning_rate": 0.0001, + "loss": 1.7193, + "step": 454 + }, + { + "epoch": 0.05226580897133996, + "grad_norm": 0.33938467502593994, + "learning_rate": 0.0001, + "loss": 1.8119, + "step": 455 + }, + { + "epoch": 0.05238067888116708, + "grad_norm": 0.3173467516899109, + "learning_rate": 0.0001, + "loss": 1.6541, + "step": 456 + }, + { + "epoch": 0.0524955487909942, + "grad_norm": 0.32488059997558594, + "learning_rate": 0.0001, + "loss": 1.5981, + "step": 457 + }, + { + "epoch": 0.05261041870082132, + "grad_norm": 0.35664987564086914, + "learning_rate": 0.0001, + "loss": 1.539, + "step": 458 + }, + { + "epoch": 0.052725288610648444, + "grad_norm": 0.3289016783237457, + "learning_rate": 0.0001, + "loss": 1.6183, + "step": 459 + }, + { + "epoch": 0.052840158520475564, + "grad_norm": 0.3202899992465973, + "learning_rate": 0.0001, + "loss": 1.6696, + "step": 460 + }, + { + "epoch": 0.052955028430302684, + "grad_norm": 0.34299010038375854, + "learning_rate": 0.0001, + "loss": 1.8016, + "step": 461 + }, + { + "epoch": 0.053069898340129804, + "grad_norm": 0.34644559025764465, + "learning_rate": 0.0001, + "loss": 1.7947, + "step": 462 + }, + { + "epoch": 0.053184768249956924, + "grad_norm": 0.32393816113471985, + "learning_rate": 0.0001, + "loss": 1.7503, + "step": 463 + }, + { + "epoch": 0.053299638159784044, + "grad_norm": 0.36531350016593933, + "learning_rate": 0.0001, + "loss": 1.6858, + "step": 464 + }, + { + "epoch": 0.053414508069611163, + "grad_norm": 0.29397326707839966, + "learning_rate": 0.0001, + "loss": 1.6449, + "step": 465 + }, + { + "epoch": 0.05352937797943828, + "grad_norm": 0.3278639614582062, + "learning_rate": 0.0001, + "loss": 1.8286, + "step": 466 + }, + { + "epoch": 0.05364424788926541, + "grad_norm": 0.3598267734050751, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 467 + }, + { + "epoch": 0.05375911779909253, + "grad_norm": 0.36339178681373596, + "learning_rate": 0.0001, + "loss": 1.9749, + "step": 468 + }, + { + "epoch": 0.05387398770891965, + "grad_norm": 0.31908461451530457, + "learning_rate": 0.0001, + "loss": 1.6836, + "step": 469 + }, + { + "epoch": 0.05398885761874677, + "grad_norm": 0.3400243818759918, + "learning_rate": 0.0001, + "loss": 1.7625, + "step": 470 + }, + { + "epoch": 0.05410372752857389, + "grad_norm": 0.3101711869239807, + "learning_rate": 0.0001, + "loss": 1.5419, + "step": 471 + }, + { + "epoch": 0.05421859743840101, + "grad_norm": 0.33695968985557556, + "learning_rate": 0.0001, + "loss": 1.8077, + "step": 472 + }, + { + "epoch": 0.05433346734822813, + "grad_norm": 0.3501220941543579, + "learning_rate": 0.0001, + "loss": 1.8684, + "step": 473 + }, + { + "epoch": 0.05444833725805525, + "grad_norm": 0.3185228705406189, + "learning_rate": 0.0001, + "loss": 1.7575, + "step": 474 + }, + { + "epoch": 0.05456320716788237, + "grad_norm": 0.33037081360816956, + "learning_rate": 0.0001, + "loss": 1.8286, + "step": 475 + }, + { + "epoch": 0.0546780770777095, + "grad_norm": 0.3522806763648987, + "learning_rate": 0.0001, + "loss": 1.8658, + "step": 476 + }, + { + "epoch": 0.05479294698753662, + "grad_norm": 0.35064372420310974, + "learning_rate": 0.0001, + "loss": 1.7537, + "step": 477 + }, + { + "epoch": 0.05490781689736374, + "grad_norm": 0.3410029411315918, + "learning_rate": 0.0001, + "loss": 1.8334, + "step": 478 + }, + { + "epoch": 0.05502268680719086, + "grad_norm": 0.34402716159820557, + "learning_rate": 0.0001, + "loss": 1.8506, + "step": 479 + }, + { + "epoch": 0.05513755671701798, + "grad_norm": 0.34367635846138, + "learning_rate": 0.0001, + "loss": 1.832, + "step": 480 + }, + { + "epoch": 0.0552524266268451, + "grad_norm": 0.33249610662460327, + "learning_rate": 0.0001, + "loss": 1.7356, + "step": 481 + }, + { + "epoch": 0.05536729653667222, + "grad_norm": 0.33778467774391174, + "learning_rate": 0.0001, + "loss": 1.6529, + "step": 482 + }, + { + "epoch": 0.05548216644649934, + "grad_norm": 0.31293541193008423, + "learning_rate": 0.0001, + "loss": 1.5809, + "step": 483 + }, + { + "epoch": 0.05559703635632646, + "grad_norm": 0.3448321521282196, + "learning_rate": 0.0001, + "loss": 1.8017, + "step": 484 + }, + { + "epoch": 0.055711906266153584, + "grad_norm": 0.36519747972488403, + "learning_rate": 0.0001, + "loss": 2.0294, + "step": 485 + }, + { + "epoch": 0.055826776175980704, + "grad_norm": 0.36726245284080505, + "learning_rate": 0.0001, + "loss": 1.9747, + "step": 486 + }, + { + "epoch": 0.055941646085807824, + "grad_norm": 0.35360264778137207, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 487 + }, + { + "epoch": 0.056056515995634944, + "grad_norm": 0.31348568201065063, + "learning_rate": 0.0001, + "loss": 1.5357, + "step": 488 + }, + { + "epoch": 0.056171385905462064, + "grad_norm": 0.3481610119342804, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 489 + }, + { + "epoch": 0.056286255815289184, + "grad_norm": 0.30713188648223877, + "learning_rate": 0.0001, + "loss": 1.5417, + "step": 490 + }, + { + "epoch": 0.056401125725116304, + "grad_norm": 0.33684420585632324, + "learning_rate": 0.0001, + "loss": 1.7113, + "step": 491 + }, + { + "epoch": 0.056515995634943424, + "grad_norm": 0.36983436346054077, + "learning_rate": 0.0001, + "loss": 1.8795, + "step": 492 + }, + { + "epoch": 0.05663086554477055, + "grad_norm": 0.32992857694625854, + "learning_rate": 0.0001, + "loss": 1.8728, + "step": 493 + }, + { + "epoch": 0.05674573545459767, + "grad_norm": 0.3567913770675659, + "learning_rate": 0.0001, + "loss": 1.7194, + "step": 494 + }, + { + "epoch": 0.05686060536442479, + "grad_norm": 0.3176042437553406, + "learning_rate": 0.0001, + "loss": 1.6712, + "step": 495 + }, + { + "epoch": 0.05697547527425191, + "grad_norm": 0.30702850222587585, + "learning_rate": 0.0001, + "loss": 1.6685, + "step": 496 + }, + { + "epoch": 0.05709034518407903, + "grad_norm": 0.33594316244125366, + "learning_rate": 0.0001, + "loss": 1.7385, + "step": 497 + }, + { + "epoch": 0.05720521509390615, + "grad_norm": 0.3079899549484253, + "learning_rate": 0.0001, + "loss": 1.6913, + "step": 498 + }, + { + "epoch": 0.05732008500373327, + "grad_norm": 0.3289260268211365, + "learning_rate": 0.0001, + "loss": 1.7654, + "step": 499 + }, + { + "epoch": 0.05743495491356039, + "grad_norm": 0.33289793133735657, + "learning_rate": 0.0001, + "loss": 1.7681, + "step": 500 + }, + { + "epoch": 0.05754982482338751, + "grad_norm": 0.34192466735839844, + "learning_rate": 0.0001, + "loss": 1.753, + "step": 501 + }, + { + "epoch": 0.05766469473321464, + "grad_norm": 0.40127864480018616, + "learning_rate": 0.0001, + "loss": 2.114, + "step": 502 + }, + { + "epoch": 0.05777956464304176, + "grad_norm": 0.3597794473171234, + "learning_rate": 0.0001, + "loss": 1.8872, + "step": 503 + }, + { + "epoch": 0.05789443455286888, + "grad_norm": 0.3503691256046295, + "learning_rate": 0.0001, + "loss": 1.764, + "step": 504 + }, + { + "epoch": 0.058009304462696, + "grad_norm": 0.33984240889549255, + "learning_rate": 0.0001, + "loss": 1.6724, + "step": 505 + }, + { + "epoch": 0.05812417437252312, + "grad_norm": 0.3279857337474823, + "learning_rate": 0.0001, + "loss": 1.6906, + "step": 506 + }, + { + "epoch": 0.05823904428235024, + "grad_norm": 0.3238902986049652, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 507 + }, + { + "epoch": 0.05835391419217736, + "grad_norm": 0.35532209277153015, + "learning_rate": 0.0001, + "loss": 1.8577, + "step": 508 + }, + { + "epoch": 0.05846878410200448, + "grad_norm": 0.3488325774669647, + "learning_rate": 0.0001, + "loss": 1.7485, + "step": 509 + }, + { + "epoch": 0.058583654011831604, + "grad_norm": 0.32330816984176636, + "learning_rate": 0.0001, + "loss": 1.5808, + "step": 510 + }, + { + "epoch": 0.058698523921658724, + "grad_norm": 0.32216721773147583, + "learning_rate": 0.0001, + "loss": 1.6325, + "step": 511 + }, + { + "epoch": 0.058813393831485844, + "grad_norm": 0.3667677640914917, + "learning_rate": 0.0001, + "loss": 1.916, + "step": 512 + }, + { + "epoch": 0.058928263741312964, + "grad_norm": 0.3243674635887146, + "learning_rate": 0.0001, + "loss": 1.6907, + "step": 513 + }, + { + "epoch": 0.059043133651140084, + "grad_norm": 0.3738958537578583, + "learning_rate": 0.0001, + "loss": 2.0036, + "step": 514 + }, + { + "epoch": 0.059158003560967204, + "grad_norm": 0.33784958720207214, + "learning_rate": 0.0001, + "loss": 1.8144, + "step": 515 + }, + { + "epoch": 0.059272873470794324, + "grad_norm": 0.34589648246765137, + "learning_rate": 0.0001, + "loss": 1.7907, + "step": 516 + }, + { + "epoch": 0.059387743380621444, + "grad_norm": 0.3396931290626526, + "learning_rate": 0.0001, + "loss": 1.8062, + "step": 517 + }, + { + "epoch": 0.059502613290448564, + "grad_norm": 0.33430787920951843, + "learning_rate": 0.0001, + "loss": 1.8861, + "step": 518 + }, + { + "epoch": 0.05961748320027569, + "grad_norm": 0.33483850955963135, + "learning_rate": 0.0001, + "loss": 1.7733, + "step": 519 + }, + { + "epoch": 0.05973235311010281, + "grad_norm": 0.3303142488002777, + "learning_rate": 0.0001, + "loss": 1.6669, + "step": 520 + }, + { + "epoch": 0.05984722301992993, + "grad_norm": 0.38823017477989197, + "learning_rate": 0.0001, + "loss": 1.9658, + "step": 521 + }, + { + "epoch": 0.05996209292975705, + "grad_norm": 0.3367463946342468, + "learning_rate": 0.0001, + "loss": 1.7768, + "step": 522 + }, + { + "epoch": 0.06007696283958417, + "grad_norm": 0.3448854684829712, + "learning_rate": 0.0001, + "loss": 1.8997, + "step": 523 + }, + { + "epoch": 0.06019183274941129, + "grad_norm": 0.33760958909988403, + "learning_rate": 0.0001, + "loss": 1.7803, + "step": 524 + }, + { + "epoch": 0.06030670265923841, + "grad_norm": 0.3285827934741974, + "learning_rate": 0.0001, + "loss": 1.7758, + "step": 525 + }, + { + "epoch": 0.06042157256906553, + "grad_norm": 0.32994288206100464, + "learning_rate": 0.0001, + "loss": 1.7558, + "step": 526 + }, + { + "epoch": 0.06053644247889266, + "grad_norm": 0.31827473640441895, + "learning_rate": 0.0001, + "loss": 1.6905, + "step": 527 + }, + { + "epoch": 0.06065131238871978, + "grad_norm": 0.35536283254623413, + "learning_rate": 0.0001, + "loss": 1.8389, + "step": 528 + }, + { + "epoch": 0.0607661822985469, + "grad_norm": 0.34808462858200073, + "learning_rate": 0.0001, + "loss": 1.7249, + "step": 529 + }, + { + "epoch": 0.06088105220837402, + "grad_norm": 0.32803285121917725, + "learning_rate": 0.0001, + "loss": 1.7431, + "step": 530 + }, + { + "epoch": 0.06099592211820114, + "grad_norm": 0.3689769506454468, + "learning_rate": 0.0001, + "loss": 1.95, + "step": 531 + }, + { + "epoch": 0.06111079202802826, + "grad_norm": 0.324849396944046, + "learning_rate": 0.0001, + "loss": 1.6858, + "step": 532 + }, + { + "epoch": 0.06122566193785538, + "grad_norm": 0.367699533700943, + "learning_rate": 0.0001, + "loss": 1.9701, + "step": 533 + }, + { + "epoch": 0.0613405318476825, + "grad_norm": 0.346123069524765, + "learning_rate": 0.0001, + "loss": 1.7326, + "step": 534 + }, + { + "epoch": 0.06145540175750962, + "grad_norm": 0.33581048250198364, + "learning_rate": 0.0001, + "loss": 1.7353, + "step": 535 + }, + { + "epoch": 0.061570271667336744, + "grad_norm": 0.31508779525756836, + "learning_rate": 0.0001, + "loss": 1.6533, + "step": 536 + }, + { + "epoch": 0.061685141577163864, + "grad_norm": 0.31980377435684204, + "learning_rate": 0.0001, + "loss": 1.7605, + "step": 537 + }, + { + "epoch": 0.061800011486990984, + "grad_norm": 0.324613481760025, + "learning_rate": 0.0001, + "loss": 1.6779, + "step": 538 + }, + { + "epoch": 0.061914881396818104, + "grad_norm": 0.36573949456214905, + "learning_rate": 0.0001, + "loss": 1.8471, + "step": 539 + }, + { + "epoch": 0.062029751306645224, + "grad_norm": 0.34220999479293823, + "learning_rate": 0.0001, + "loss": 1.8383, + "step": 540 + }, + { + "epoch": 0.062144621216472344, + "grad_norm": 0.3276033103466034, + "learning_rate": 0.0001, + "loss": 1.6503, + "step": 541 + }, + { + "epoch": 0.062259491126299464, + "grad_norm": 0.320403516292572, + "learning_rate": 0.0001, + "loss": 1.4869, + "step": 542 + }, + { + "epoch": 0.062374361036126584, + "grad_norm": 0.333486407995224, + "learning_rate": 0.0001, + "loss": 1.6474, + "step": 543 + }, + { + "epoch": 0.062489230945953704, + "grad_norm": 0.3390301764011383, + "learning_rate": 0.0001, + "loss": 1.6722, + "step": 544 + }, + { + "epoch": 0.06260410085578083, + "grad_norm": 0.3051248788833618, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 545 + }, + { + "epoch": 0.06271897076560795, + "grad_norm": 0.4015922248363495, + "learning_rate": 0.0001, + "loss": 1.9027, + "step": 546 + }, + { + "epoch": 0.06283384067543507, + "grad_norm": 0.35976481437683105, + "learning_rate": 0.0001, + "loss": 1.8451, + "step": 547 + }, + { + "epoch": 0.06294871058526219, + "grad_norm": 0.3470692038536072, + "learning_rate": 0.0001, + "loss": 1.7992, + "step": 548 + }, + { + "epoch": 0.06306358049508931, + "grad_norm": 0.324569433927536, + "learning_rate": 0.0001, + "loss": 1.7567, + "step": 549 + }, + { + "epoch": 0.06317845040491643, + "grad_norm": 0.36248597502708435, + "learning_rate": 0.0001, + "loss": 1.8184, + "step": 550 + }, + { + "epoch": 0.06329332031474355, + "grad_norm": 0.33431920409202576, + "learning_rate": 0.0001, + "loss": 1.7591, + "step": 551 + }, + { + "epoch": 0.06340819022457067, + "grad_norm": 0.3006725013256073, + "learning_rate": 0.0001, + "loss": 1.5713, + "step": 552 + }, + { + "epoch": 0.06352306013439779, + "grad_norm": 0.3330213725566864, + "learning_rate": 0.0001, + "loss": 1.7226, + "step": 553 + }, + { + "epoch": 0.06363793004422491, + "grad_norm": 0.34222641587257385, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 554 + }, + { + "epoch": 0.06375279995405203, + "grad_norm": 0.34243908524513245, + "learning_rate": 0.0001, + "loss": 1.7752, + "step": 555 + }, + { + "epoch": 0.06386766986387915, + "grad_norm": 0.3517223596572876, + "learning_rate": 0.0001, + "loss": 1.91, + "step": 556 + }, + { + "epoch": 0.06398253977370628, + "grad_norm": 0.38396772742271423, + "learning_rate": 0.0001, + "loss": 2.1684, + "step": 557 + }, + { + "epoch": 0.0640974096835334, + "grad_norm": 0.34429916739463806, + "learning_rate": 0.0001, + "loss": 1.9133, + "step": 558 + }, + { + "epoch": 0.06421227959336052, + "grad_norm": 0.31630486249923706, + "learning_rate": 0.0001, + "loss": 1.574, + "step": 559 + }, + { + "epoch": 0.06432714950318764, + "grad_norm": 0.3668649196624756, + "learning_rate": 0.0001, + "loss": 1.7739, + "step": 560 + }, + { + "epoch": 0.06444201941301476, + "grad_norm": 0.33105143904685974, + "learning_rate": 0.0001, + "loss": 1.6974, + "step": 561 + }, + { + "epoch": 0.06455688932284188, + "grad_norm": 0.3978722393512726, + "learning_rate": 0.0001, + "loss": 1.8365, + "step": 562 + }, + { + "epoch": 0.064671759232669, + "grad_norm": 0.3352854549884796, + "learning_rate": 0.0001, + "loss": 1.7237, + "step": 563 + }, + { + "epoch": 0.06478662914249612, + "grad_norm": 0.3484468460083008, + "learning_rate": 0.0001, + "loss": 1.8297, + "step": 564 + }, + { + "epoch": 0.06490149905232324, + "grad_norm": 0.3346973955631256, + "learning_rate": 0.0001, + "loss": 1.7631, + "step": 565 + }, + { + "epoch": 0.06501636896215036, + "grad_norm": 0.3363039195537567, + "learning_rate": 0.0001, + "loss": 1.6818, + "step": 566 + }, + { + "epoch": 0.06513123887197748, + "grad_norm": 0.34610244631767273, + "learning_rate": 0.0001, + "loss": 1.8759, + "step": 567 + }, + { + "epoch": 0.0652461087818046, + "grad_norm": 0.36252561211586, + "learning_rate": 0.0001, + "loss": 1.8793, + "step": 568 + }, + { + "epoch": 0.06536097869163172, + "grad_norm": 0.3493739068508148, + "learning_rate": 0.0001, + "loss": 1.7335, + "step": 569 + }, + { + "epoch": 0.06547584860145884, + "grad_norm": 0.3322302997112274, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 570 + }, + { + "epoch": 0.06559071851128596, + "grad_norm": 0.31431615352630615, + "learning_rate": 0.0001, + "loss": 1.5679, + "step": 571 + }, + { + "epoch": 0.06570558842111308, + "grad_norm": 0.3374696671962738, + "learning_rate": 0.0001, + "loss": 1.7067, + "step": 572 + }, + { + "epoch": 0.0658204583309402, + "grad_norm": 0.343352735042572, + "learning_rate": 0.0001, + "loss": 1.8222, + "step": 573 + }, + { + "epoch": 0.06593532824076732, + "grad_norm": 0.33851170539855957, + "learning_rate": 0.0001, + "loss": 1.6893, + "step": 574 + }, + { + "epoch": 0.06605019815059446, + "grad_norm": 0.36369964480400085, + "learning_rate": 0.0001, + "loss": 1.8065, + "step": 575 + }, + { + "epoch": 0.06616506806042158, + "grad_norm": 0.31349602341651917, + "learning_rate": 0.0001, + "loss": 1.5028, + "step": 576 + }, + { + "epoch": 0.0662799379702487, + "grad_norm": 0.3367163836956024, + "learning_rate": 0.0001, + "loss": 1.8033, + "step": 577 + }, + { + "epoch": 0.06639480788007582, + "grad_norm": 0.3456117510795593, + "learning_rate": 0.0001, + "loss": 1.711, + "step": 578 + }, + { + "epoch": 0.06650967778990294, + "grad_norm": 0.31135809421539307, + "learning_rate": 0.0001, + "loss": 1.3939, + "step": 579 + }, + { + "epoch": 0.06662454769973006, + "grad_norm": 0.327361136674881, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 580 + }, + { + "epoch": 0.06673941760955718, + "grad_norm": 0.345680296421051, + "learning_rate": 0.0001, + "loss": 1.7006, + "step": 581 + }, + { + "epoch": 0.0668542875193843, + "grad_norm": 0.33879801630973816, + "learning_rate": 0.0001, + "loss": 1.8236, + "step": 582 + }, + { + "epoch": 0.06696915742921142, + "grad_norm": 0.3614217936992645, + "learning_rate": 0.0001, + "loss": 1.8137, + "step": 583 + }, + { + "epoch": 0.06708402733903854, + "grad_norm": 0.34495973587036133, + "learning_rate": 0.0001, + "loss": 1.6859, + "step": 584 + }, + { + "epoch": 0.06719889724886566, + "grad_norm": 0.31370049715042114, + "learning_rate": 0.0001, + "loss": 1.7482, + "step": 585 + }, + { + "epoch": 0.06731376715869278, + "grad_norm": 0.3325652480125427, + "learning_rate": 0.0001, + "loss": 1.88, + "step": 586 + }, + { + "epoch": 0.0674286370685199, + "grad_norm": 0.351700097322464, + "learning_rate": 0.0001, + "loss": 1.8817, + "step": 587 + }, + { + "epoch": 0.06754350697834702, + "grad_norm": 0.3200022280216217, + "learning_rate": 0.0001, + "loss": 1.7419, + "step": 588 + }, + { + "epoch": 0.06765837688817414, + "grad_norm": 0.36215293407440186, + "learning_rate": 0.0001, + "loss": 1.9603, + "step": 589 + }, + { + "epoch": 0.06777324679800126, + "grad_norm": 0.3392893970012665, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 590 + }, + { + "epoch": 0.06788811670782838, + "grad_norm": 0.3454267084598541, + "learning_rate": 0.0001, + "loss": 1.6759, + "step": 591 + }, + { + "epoch": 0.06800298661765551, + "grad_norm": 0.35802924633026123, + "learning_rate": 0.0001, + "loss": 1.8607, + "step": 592 + }, + { + "epoch": 0.06811785652748263, + "grad_norm": 0.2966287434101105, + "learning_rate": 0.0001, + "loss": 1.594, + "step": 593 + }, + { + "epoch": 0.06823272643730975, + "grad_norm": 0.36141011118888855, + "learning_rate": 0.0001, + "loss": 1.7596, + "step": 594 + }, + { + "epoch": 0.06834759634713687, + "grad_norm": 0.36919906735420227, + "learning_rate": 0.0001, + "loss": 1.9334, + "step": 595 + }, + { + "epoch": 0.06846246625696399, + "grad_norm": 0.34811851382255554, + "learning_rate": 0.0001, + "loss": 1.92, + "step": 596 + }, + { + "epoch": 0.06857733616679111, + "grad_norm": 0.3520393669605255, + "learning_rate": 0.0001, + "loss": 1.5446, + "step": 597 + }, + { + "epoch": 0.06869220607661823, + "grad_norm": 0.3605727553367615, + "learning_rate": 0.0001, + "loss": 1.8132, + "step": 598 + }, + { + "epoch": 0.06880707598644535, + "grad_norm": 0.3948690593242645, + "learning_rate": 0.0001, + "loss": 1.7842, + "step": 599 + }, + { + "epoch": 0.06892194589627247, + "grad_norm": 0.34386035799980164, + "learning_rate": 0.0001, + "loss": 1.6827, + "step": 600 + }, + { + "epoch": 0.06903681580609959, + "grad_norm": 0.37180081009864807, + "learning_rate": 0.0001, + "loss": 1.7982, + "step": 601 + }, + { + "epoch": 0.06915168571592671, + "grad_norm": 0.3451867401599884, + "learning_rate": 0.0001, + "loss": 1.8339, + "step": 602 + }, + { + "epoch": 0.06926655562575383, + "grad_norm": 0.3325120210647583, + "learning_rate": 0.0001, + "loss": 1.7506, + "step": 603 + }, + { + "epoch": 0.06938142553558095, + "grad_norm": 0.3503422737121582, + "learning_rate": 0.0001, + "loss": 1.8154, + "step": 604 + }, + { + "epoch": 0.06949629544540807, + "grad_norm": 0.36198514699935913, + "learning_rate": 0.0001, + "loss": 1.7698, + "step": 605 + }, + { + "epoch": 0.06961116535523519, + "grad_norm": 0.35194844007492065, + "learning_rate": 0.0001, + "loss": 1.7223, + "step": 606 + }, + { + "epoch": 0.06972603526506231, + "grad_norm": 0.3265458643436432, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 607 + }, + { + "epoch": 0.06984090517488943, + "grad_norm": 0.3669833838939667, + "learning_rate": 0.0001, + "loss": 1.8176, + "step": 608 + }, + { + "epoch": 0.06995577508471656, + "grad_norm": 0.35220587253570557, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 609 + }, + { + "epoch": 0.07007064499454368, + "grad_norm": 0.3398061692714691, + "learning_rate": 0.0001, + "loss": 1.4962, + "step": 610 + }, + { + "epoch": 0.0701855149043708, + "grad_norm": 0.36075279116630554, + "learning_rate": 0.0001, + "loss": 1.8924, + "step": 611 + }, + { + "epoch": 0.07030038481419792, + "grad_norm": 0.34688082337379456, + "learning_rate": 0.0001, + "loss": 1.5272, + "step": 612 + }, + { + "epoch": 0.07041525472402504, + "grad_norm": 0.33742544054985046, + "learning_rate": 0.0001, + "loss": 1.75, + "step": 613 + }, + { + "epoch": 0.07053012463385216, + "grad_norm": 0.3314124345779419, + "learning_rate": 0.0001, + "loss": 1.7574, + "step": 614 + }, + { + "epoch": 0.07064499454367928, + "grad_norm": 0.36540499329566956, + "learning_rate": 0.0001, + "loss": 1.926, + "step": 615 + }, + { + "epoch": 0.0707598644535064, + "grad_norm": 0.3602568507194519, + "learning_rate": 0.0001, + "loss": 1.7833, + "step": 616 + }, + { + "epoch": 0.07087473436333352, + "grad_norm": 0.3252723515033722, + "learning_rate": 0.0001, + "loss": 1.8458, + "step": 617 + }, + { + "epoch": 0.07098960427316064, + "grad_norm": 0.3487424850463867, + "learning_rate": 0.0001, + "loss": 1.8981, + "step": 618 + }, + { + "epoch": 0.07110447418298776, + "grad_norm": 0.33129236102104187, + "learning_rate": 0.0001, + "loss": 1.6613, + "step": 619 + }, + { + "epoch": 0.07121934409281488, + "grad_norm": 0.33063703775405884, + "learning_rate": 0.0001, + "loss": 1.6859, + "step": 620 + }, + { + "epoch": 0.071334214002642, + "grad_norm": 0.32881104946136475, + "learning_rate": 0.0001, + "loss": 1.7321, + "step": 621 + }, + { + "epoch": 0.07144908391246912, + "grad_norm": 0.3607088029384613, + "learning_rate": 0.0001, + "loss": 1.698, + "step": 622 + }, + { + "epoch": 0.07156395382229624, + "grad_norm": 0.3350905179977417, + "learning_rate": 0.0001, + "loss": 1.7254, + "step": 623 + }, + { + "epoch": 0.07167882373212336, + "grad_norm": 0.35111361742019653, + "learning_rate": 0.0001, + "loss": 1.9324, + "step": 624 + }, + { + "epoch": 0.07179369364195048, + "grad_norm": 0.3444902002811432, + "learning_rate": 0.0001, + "loss": 1.8281, + "step": 625 + }, + { + "epoch": 0.07190856355177762, + "grad_norm": 0.34226348996162415, + "learning_rate": 0.0001, + "loss": 1.7824, + "step": 626 + }, + { + "epoch": 0.07202343346160474, + "grad_norm": 0.33303242921829224, + "learning_rate": 0.0001, + "loss": 1.8413, + "step": 627 + }, + { + "epoch": 0.07213830337143186, + "grad_norm": 0.3243139088153839, + "learning_rate": 0.0001, + "loss": 1.8731, + "step": 628 + }, + { + "epoch": 0.07225317328125898, + "grad_norm": 0.3446861505508423, + "learning_rate": 0.0001, + "loss": 1.7179, + "step": 629 + }, + { + "epoch": 0.0723680431910861, + "grad_norm": 0.337261438369751, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 630 + }, + { + "epoch": 0.07248291310091322, + "grad_norm": 0.32150766253471375, + "learning_rate": 0.0001, + "loss": 1.8097, + "step": 631 + }, + { + "epoch": 0.07259778301074034, + "grad_norm": 0.3300226032733917, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 632 + }, + { + "epoch": 0.07271265292056746, + "grad_norm": 0.3479246199131012, + "learning_rate": 0.0001, + "loss": 1.7969, + "step": 633 + }, + { + "epoch": 0.07282752283039458, + "grad_norm": 0.33039695024490356, + "learning_rate": 0.0001, + "loss": 1.4106, + "step": 634 + }, + { + "epoch": 0.0729423927402217, + "grad_norm": 0.3294956088066101, + "learning_rate": 0.0001, + "loss": 1.7817, + "step": 635 + }, + { + "epoch": 0.07305726265004882, + "grad_norm": 0.3452272415161133, + "learning_rate": 0.0001, + "loss": 1.7148, + "step": 636 + }, + { + "epoch": 0.07317213255987594, + "grad_norm": 0.33393406867980957, + "learning_rate": 0.0001, + "loss": 1.8335, + "step": 637 + }, + { + "epoch": 0.07328700246970306, + "grad_norm": 0.3169970214366913, + "learning_rate": 0.0001, + "loss": 1.6022, + "step": 638 + }, + { + "epoch": 0.07340187237953018, + "grad_norm": 0.3217456340789795, + "learning_rate": 0.0001, + "loss": 1.579, + "step": 639 + }, + { + "epoch": 0.0735167422893573, + "grad_norm": 0.34844133257865906, + "learning_rate": 0.0001, + "loss": 1.8499, + "step": 640 + }, + { + "epoch": 0.07363161219918442, + "grad_norm": 0.33645370602607727, + "learning_rate": 0.0001, + "loss": 1.862, + "step": 641 + }, + { + "epoch": 0.07374648210901154, + "grad_norm": 0.3201218843460083, + "learning_rate": 0.0001, + "loss": 1.7561, + "step": 642 + }, + { + "epoch": 0.07386135201883867, + "grad_norm": 0.34113234281539917, + "learning_rate": 0.0001, + "loss": 1.8113, + "step": 643 + }, + { + "epoch": 0.07397622192866579, + "grad_norm": 0.33081957697868347, + "learning_rate": 0.0001, + "loss": 1.7435, + "step": 644 + }, + { + "epoch": 0.07409109183849291, + "grad_norm": 0.3413662314414978, + "learning_rate": 0.0001, + "loss": 1.8755, + "step": 645 + }, + { + "epoch": 0.07420596174832003, + "grad_norm": 0.3311666250228882, + "learning_rate": 0.0001, + "loss": 1.769, + "step": 646 + }, + { + "epoch": 0.07432083165814715, + "grad_norm": 0.33269715309143066, + "learning_rate": 0.0001, + "loss": 1.7321, + "step": 647 + }, + { + "epoch": 0.07443570156797427, + "grad_norm": 0.34695979952812195, + "learning_rate": 0.0001, + "loss": 1.7628, + "step": 648 + }, + { + "epoch": 0.07455057147780139, + "grad_norm": 0.3331931233406067, + "learning_rate": 0.0001, + "loss": 1.7498, + "step": 649 + }, + { + "epoch": 0.07466544138762851, + "grad_norm": 0.3264698088169098, + "learning_rate": 0.0001, + "loss": 1.6114, + "step": 650 + }, + { + "epoch": 0.07478031129745563, + "grad_norm": 0.3626859486103058, + "learning_rate": 0.0001, + "loss": 1.8154, + "step": 651 + }, + { + "epoch": 0.07489518120728275, + "grad_norm": 0.32509714365005493, + "learning_rate": 0.0001, + "loss": 1.4671, + "step": 652 + }, + { + "epoch": 0.07501005111710987, + "grad_norm": 0.3228186070919037, + "learning_rate": 0.0001, + "loss": 1.8735, + "step": 653 + }, + { + "epoch": 0.07512492102693699, + "grad_norm": 0.3465333878993988, + "learning_rate": 0.0001, + "loss": 1.872, + "step": 654 + }, + { + "epoch": 0.07523979093676411, + "grad_norm": 0.3378332555294037, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 655 + }, + { + "epoch": 0.07535466084659123, + "grad_norm": 0.3364262282848358, + "learning_rate": 0.0001, + "loss": 1.8611, + "step": 656 + }, + { + "epoch": 0.07546953075641835, + "grad_norm": 0.34226563572883606, + "learning_rate": 0.0001, + "loss": 1.7843, + "step": 657 + }, + { + "epoch": 0.07558440066624547, + "grad_norm": 0.3533295691013336, + "learning_rate": 0.0001, + "loss": 1.7962, + "step": 658 + }, + { + "epoch": 0.07569927057607259, + "grad_norm": 0.3422401249408722, + "learning_rate": 0.0001, + "loss": 1.8801, + "step": 659 + }, + { + "epoch": 0.07581414048589973, + "grad_norm": 0.3540160357952118, + "learning_rate": 0.0001, + "loss": 1.8129, + "step": 660 + }, + { + "epoch": 0.07592901039572685, + "grad_norm": 0.334587424993515, + "learning_rate": 0.0001, + "loss": 1.8578, + "step": 661 + }, + { + "epoch": 0.07604388030555397, + "grad_norm": 0.32655155658721924, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 662 + }, + { + "epoch": 0.07615875021538109, + "grad_norm": 0.36004751920700073, + "learning_rate": 0.0001, + "loss": 1.7735, + "step": 663 + }, + { + "epoch": 0.0762736201252082, + "grad_norm": 0.32442474365234375, + "learning_rate": 0.0001, + "loss": 1.5389, + "step": 664 + }, + { + "epoch": 0.07638849003503533, + "grad_norm": 0.344626784324646, + "learning_rate": 0.0001, + "loss": 1.6952, + "step": 665 + }, + { + "epoch": 0.07650335994486245, + "grad_norm": 0.31557947397232056, + "learning_rate": 0.0001, + "loss": 1.5595, + "step": 666 + }, + { + "epoch": 0.07661822985468957, + "grad_norm": 0.3274221420288086, + "learning_rate": 0.0001, + "loss": 1.6471, + "step": 667 + }, + { + "epoch": 0.07673309976451669, + "grad_norm": 0.3906736969947815, + "learning_rate": 0.0001, + "loss": 1.6702, + "step": 668 + }, + { + "epoch": 0.0768479696743438, + "grad_norm": 0.35045820474624634, + "learning_rate": 0.0001, + "loss": 1.8738, + "step": 669 + }, + { + "epoch": 0.07696283958417093, + "grad_norm": 0.3959348201751709, + "learning_rate": 0.0001, + "loss": 2.094, + "step": 670 + }, + { + "epoch": 0.07707770949399805, + "grad_norm": 0.3369539976119995, + "learning_rate": 0.0001, + "loss": 1.6899, + "step": 671 + }, + { + "epoch": 0.07719257940382517, + "grad_norm": 0.34965980052948, + "learning_rate": 0.0001, + "loss": 1.8694, + "step": 672 + }, + { + "epoch": 0.07730744931365229, + "grad_norm": 0.33249253034591675, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 673 + }, + { + "epoch": 0.0774223192234794, + "grad_norm": 0.32257145643234253, + "learning_rate": 0.0001, + "loss": 1.7565, + "step": 674 + }, + { + "epoch": 0.07753718913330652, + "grad_norm": 0.33610349893569946, + "learning_rate": 0.0001, + "loss": 1.8517, + "step": 675 + }, + { + "epoch": 0.07765205904313364, + "grad_norm": 0.3666530251502991, + "learning_rate": 0.0001, + "loss": 1.6426, + "step": 676 + }, + { + "epoch": 0.07776692895296078, + "grad_norm": 0.3422529697418213, + "learning_rate": 0.0001, + "loss": 1.6884, + "step": 677 + }, + { + "epoch": 0.0778817988627879, + "grad_norm": 0.3361228108406067, + "learning_rate": 0.0001, + "loss": 1.7452, + "step": 678 + }, + { + "epoch": 0.07799666877261502, + "grad_norm": 0.3532163202762604, + "learning_rate": 0.0001, + "loss": 1.6878, + "step": 679 + }, + { + "epoch": 0.07811153868244214, + "grad_norm": 0.3304887115955353, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 680 + }, + { + "epoch": 0.07822640859226926, + "grad_norm": 0.3171667456626892, + "learning_rate": 0.0001, + "loss": 1.5896, + "step": 681 + }, + { + "epoch": 0.07834127850209638, + "grad_norm": 0.36117199063301086, + "learning_rate": 0.0001, + "loss": 1.7566, + "step": 682 + }, + { + "epoch": 0.0784561484119235, + "grad_norm": 0.37346988916397095, + "learning_rate": 0.0001, + "loss": 1.8458, + "step": 683 + }, + { + "epoch": 0.07857101832175062, + "grad_norm": 0.34928634762763977, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 684 + }, + { + "epoch": 0.07868588823157774, + "grad_norm": 0.34768396615982056, + "learning_rate": 0.0001, + "loss": 1.8074, + "step": 685 + }, + { + "epoch": 0.07880075814140486, + "grad_norm": 0.3412458300590515, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 686 + }, + { + "epoch": 0.07891562805123198, + "grad_norm": 0.3672271966934204, + "learning_rate": 0.0001, + "loss": 1.8826, + "step": 687 + }, + { + "epoch": 0.0790304979610591, + "grad_norm": 0.33574315905570984, + "learning_rate": 0.0001, + "loss": 1.6836, + "step": 688 + }, + { + "epoch": 0.07914536787088622, + "grad_norm": 0.3387349247932434, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 689 + }, + { + "epoch": 0.07926023778071334, + "grad_norm": 0.3550173342227936, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 690 + }, + { + "epoch": 0.07937510769054046, + "grad_norm": 0.380522221326828, + "learning_rate": 0.0001, + "loss": 1.9264, + "step": 691 + }, + { + "epoch": 0.07948997760036758, + "grad_norm": 0.374406099319458, + "learning_rate": 0.0001, + "loss": 1.8652, + "step": 692 + }, + { + "epoch": 0.0796048475101947, + "grad_norm": 0.34742456674575806, + "learning_rate": 0.0001, + "loss": 1.9069, + "step": 693 + }, + { + "epoch": 0.07971971742002182, + "grad_norm": 0.35284706950187683, + "learning_rate": 0.0001, + "loss": 1.8065, + "step": 694 + }, + { + "epoch": 0.07983458732984895, + "grad_norm": 0.3327619135379791, + "learning_rate": 0.0001, + "loss": 1.7721, + "step": 695 + }, + { + "epoch": 0.07994945723967607, + "grad_norm": 0.3591189384460449, + "learning_rate": 0.0001, + "loss": 1.6802, + "step": 696 + }, + { + "epoch": 0.08006432714950319, + "grad_norm": 0.34091916680336, + "learning_rate": 0.0001, + "loss": 1.6535, + "step": 697 + }, + { + "epoch": 0.08017919705933031, + "grad_norm": 0.352022260427475, + "learning_rate": 0.0001, + "loss": 1.7479, + "step": 698 + }, + { + "epoch": 0.08029406696915743, + "grad_norm": 0.33692196011543274, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 699 + }, + { + "epoch": 0.08040893687898455, + "grad_norm": 0.370638370513916, + "learning_rate": 0.0001, + "loss": 2.0285, + "step": 700 + }, + { + "epoch": 0.08052380678881167, + "grad_norm": 0.3345963656902313, + "learning_rate": 0.0001, + "loss": 1.7766, + "step": 701 + }, + { + "epoch": 0.08063867669863879, + "grad_norm": 0.34501156210899353, + "learning_rate": 0.0001, + "loss": 1.8179, + "step": 702 + }, + { + "epoch": 0.08075354660846591, + "grad_norm": 0.33231601119041443, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 703 + }, + { + "epoch": 0.08086841651829303, + "grad_norm": 0.34279513359069824, + "learning_rate": 0.0001, + "loss": 1.695, + "step": 704 + }, + { + "epoch": 0.08098328642812015, + "grad_norm": 0.3368370234966278, + "learning_rate": 0.0001, + "loss": 1.7584, + "step": 705 + }, + { + "epoch": 0.08109815633794727, + "grad_norm": 0.34584423899650574, + "learning_rate": 0.0001, + "loss": 1.6271, + "step": 706 + }, + { + "epoch": 0.08121302624777439, + "grad_norm": 0.35114485025405884, + "learning_rate": 0.0001, + "loss": 1.7287, + "step": 707 + }, + { + "epoch": 0.08132789615760151, + "grad_norm": 0.31173431873321533, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 708 + }, + { + "epoch": 0.08144276606742863, + "grad_norm": 0.3620467782020569, + "learning_rate": 0.0001, + "loss": 1.8973, + "step": 709 + }, + { + "epoch": 0.08155763597725575, + "grad_norm": 0.3193514049053192, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 710 + }, + { + "epoch": 0.08167250588708287, + "grad_norm": 0.34377214312553406, + "learning_rate": 0.0001, + "loss": 1.8458, + "step": 711 + }, + { + "epoch": 0.08178737579691, + "grad_norm": 0.3406418263912201, + "learning_rate": 0.0001, + "loss": 1.6495, + "step": 712 + }, + { + "epoch": 0.08190224570673713, + "grad_norm": 0.33058011531829834, + "learning_rate": 0.0001, + "loss": 1.7232, + "step": 713 + }, + { + "epoch": 0.08201711561656425, + "grad_norm": 0.32958585023880005, + "learning_rate": 0.0001, + "loss": 1.588, + "step": 714 + }, + { + "epoch": 0.08213198552639137, + "grad_norm": 0.3345566689968109, + "learning_rate": 0.0001, + "loss": 1.7372, + "step": 715 + }, + { + "epoch": 0.08224685543621849, + "grad_norm": 0.3363969027996063, + "learning_rate": 0.0001, + "loss": 1.5248, + "step": 716 + }, + { + "epoch": 0.0823617253460456, + "grad_norm": 0.3310002386569977, + "learning_rate": 0.0001, + "loss": 1.4364, + "step": 717 + }, + { + "epoch": 0.08247659525587273, + "grad_norm": 0.35177963972091675, + "learning_rate": 0.0001, + "loss": 1.8578, + "step": 718 + }, + { + "epoch": 0.08259146516569985, + "grad_norm": 0.37990766763687134, + "learning_rate": 0.0001, + "loss": 1.8266, + "step": 719 + }, + { + "epoch": 0.08270633507552697, + "grad_norm": 0.3506127893924713, + "learning_rate": 0.0001, + "loss": 1.7454, + "step": 720 + }, + { + "epoch": 0.08282120498535409, + "grad_norm": 0.3488253653049469, + "learning_rate": 0.0001, + "loss": 1.8147, + "step": 721 + }, + { + "epoch": 0.0829360748951812, + "grad_norm": 0.34471482038497925, + "learning_rate": 0.0001, + "loss": 1.7288, + "step": 722 + }, + { + "epoch": 0.08305094480500833, + "grad_norm": 0.33807575702667236, + "learning_rate": 0.0001, + "loss": 1.8128, + "step": 723 + }, + { + "epoch": 0.08316581471483545, + "grad_norm": 0.3196840286254883, + "learning_rate": 0.0001, + "loss": 1.6008, + "step": 724 + }, + { + "epoch": 0.08328068462466257, + "grad_norm": 0.36051392555236816, + "learning_rate": 0.0001, + "loss": 1.7202, + "step": 725 + }, + { + "epoch": 0.08339555453448969, + "grad_norm": 0.3275487720966339, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 726 + }, + { + "epoch": 0.0835104244443168, + "grad_norm": 0.3423649072647095, + "learning_rate": 0.0001, + "loss": 1.7988, + "step": 727 + }, + { + "epoch": 0.08362529435414393, + "grad_norm": 0.3507118225097656, + "learning_rate": 0.0001, + "loss": 1.5723, + "step": 728 + }, + { + "epoch": 0.08374016426397106, + "grad_norm": 0.3463688790798187, + "learning_rate": 0.0001, + "loss": 1.8294, + "step": 729 + }, + { + "epoch": 0.08385503417379818, + "grad_norm": 0.3360377848148346, + "learning_rate": 0.0001, + "loss": 1.7275, + "step": 730 + }, + { + "epoch": 0.0839699040836253, + "grad_norm": 0.3324314057826996, + "learning_rate": 0.0001, + "loss": 1.7487, + "step": 731 + }, + { + "epoch": 0.08408477399345242, + "grad_norm": 0.3390614688396454, + "learning_rate": 0.0001, + "loss": 1.7351, + "step": 732 + }, + { + "epoch": 0.08419964390327954, + "grad_norm": 0.35863199830055237, + "learning_rate": 0.0001, + "loss": 1.908, + "step": 733 + }, + { + "epoch": 0.08431451381310666, + "grad_norm": 0.3417515158653259, + "learning_rate": 0.0001, + "loss": 1.8832, + "step": 734 + }, + { + "epoch": 0.08442938372293378, + "grad_norm": 0.31135502457618713, + "learning_rate": 0.0001, + "loss": 1.432, + "step": 735 + }, + { + "epoch": 0.0845442536327609, + "grad_norm": 0.33590710163116455, + "learning_rate": 0.0001, + "loss": 1.7915, + "step": 736 + }, + { + "epoch": 0.08465912354258802, + "grad_norm": 0.33955830335617065, + "learning_rate": 0.0001, + "loss": 1.7333, + "step": 737 + }, + { + "epoch": 0.08477399345241514, + "grad_norm": 0.3825172483921051, + "learning_rate": 0.0001, + "loss": 1.7999, + "step": 738 + }, + { + "epoch": 0.08488886336224226, + "grad_norm": 0.33067587018013, + "learning_rate": 0.0001, + "loss": 1.7567, + "step": 739 + }, + { + "epoch": 0.08500373327206938, + "grad_norm": 0.344756156206131, + "learning_rate": 0.0001, + "loss": 1.7844, + "step": 740 + }, + { + "epoch": 0.0851186031818965, + "grad_norm": 0.3478997051715851, + "learning_rate": 0.0001, + "loss": 1.8903, + "step": 741 + }, + { + "epoch": 0.08523347309172362, + "grad_norm": 0.3651295006275177, + "learning_rate": 0.0001, + "loss": 1.8882, + "step": 742 + }, + { + "epoch": 0.08534834300155074, + "grad_norm": 0.30588778853416443, + "learning_rate": 0.0001, + "loss": 1.6636, + "step": 743 + }, + { + "epoch": 0.08546321291137786, + "grad_norm": 0.3360552489757538, + "learning_rate": 0.0001, + "loss": 1.865, + "step": 744 + }, + { + "epoch": 0.08557808282120498, + "grad_norm": 0.3278788626194, + "learning_rate": 0.0001, + "loss": 1.5782, + "step": 745 + }, + { + "epoch": 0.08569295273103211, + "grad_norm": 0.3437139391899109, + "learning_rate": 0.0001, + "loss": 1.8981, + "step": 746 + }, + { + "epoch": 0.08580782264085923, + "grad_norm": 0.34554487466812134, + "learning_rate": 0.0001, + "loss": 1.7921, + "step": 747 + }, + { + "epoch": 0.08592269255068635, + "grad_norm": 0.35071298480033875, + "learning_rate": 0.0001, + "loss": 1.8354, + "step": 748 + }, + { + "epoch": 0.08603756246051347, + "grad_norm": 0.36390817165374756, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 749 + }, + { + "epoch": 0.08615243237034059, + "grad_norm": 0.35594916343688965, + "learning_rate": 0.0001, + "loss": 1.4535, + "step": 750 + }, + { + "epoch": 0.08626730228016771, + "grad_norm": 0.3606272339820862, + "learning_rate": 0.0001, + "loss": 1.8661, + "step": 751 + }, + { + "epoch": 0.08638217218999483, + "grad_norm": 0.3557438552379608, + "learning_rate": 0.0001, + "loss": 1.7762, + "step": 752 + }, + { + "epoch": 0.08649704209982195, + "grad_norm": 0.37106749415397644, + "learning_rate": 0.0001, + "loss": 1.913, + "step": 753 + }, + { + "epoch": 0.08661191200964907, + "grad_norm": 0.34176573157310486, + "learning_rate": 0.0001, + "loss": 1.7081, + "step": 754 + }, + { + "epoch": 0.08672678191947619, + "grad_norm": 0.32890862226486206, + "learning_rate": 0.0001, + "loss": 1.7647, + "step": 755 + }, + { + "epoch": 0.08684165182930331, + "grad_norm": 0.3404117822647095, + "learning_rate": 0.0001, + "loss": 1.7522, + "step": 756 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 0.3754931092262268, + "learning_rate": 0.0001, + "loss": 2.0646, + "step": 757 + }, + { + "epoch": 0.08707139164895755, + "grad_norm": 0.32435300946235657, + "learning_rate": 0.0001, + "loss": 1.5691, + "step": 758 + }, + { + "epoch": 0.08718626155878467, + "grad_norm": 0.3234858810901642, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 759 + }, + { + "epoch": 0.08730113146861179, + "grad_norm": 0.34800320863723755, + "learning_rate": 0.0001, + "loss": 1.7462, + "step": 760 + }, + { + "epoch": 0.08741600137843891, + "grad_norm": 0.3534897565841675, + "learning_rate": 0.0001, + "loss": 1.7654, + "step": 761 + }, + { + "epoch": 0.08753087128826603, + "grad_norm": 0.3358789384365082, + "learning_rate": 0.0001, + "loss": 1.8423, + "step": 762 + }, + { + "epoch": 0.08764574119809317, + "grad_norm": 0.35216909646987915, + "learning_rate": 0.0001, + "loss": 1.9625, + "step": 763 + }, + { + "epoch": 0.08776061110792029, + "grad_norm": 0.32955506443977356, + "learning_rate": 0.0001, + "loss": 1.7347, + "step": 764 + }, + { + "epoch": 0.0878754810177474, + "grad_norm": 0.3332022428512573, + "learning_rate": 0.0001, + "loss": 1.7732, + "step": 765 + }, + { + "epoch": 0.08799035092757453, + "grad_norm": 0.3248218894004822, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 766 + }, + { + "epoch": 0.08810522083740165, + "grad_norm": 0.3316574692726135, + "learning_rate": 0.0001, + "loss": 1.8211, + "step": 767 + }, + { + "epoch": 0.08822009074722877, + "grad_norm": 0.33877885341644287, + "learning_rate": 0.0001, + "loss": 1.7671, + "step": 768 + }, + { + "epoch": 0.08833496065705589, + "grad_norm": 0.34064409136772156, + "learning_rate": 0.0001, + "loss": 1.8061, + "step": 769 + }, + { + "epoch": 0.088449830566883, + "grad_norm": 0.3360152542591095, + "learning_rate": 0.0001, + "loss": 1.762, + "step": 770 + }, + { + "epoch": 0.08856470047671013, + "grad_norm": 0.3164994716644287, + "learning_rate": 0.0001, + "loss": 1.7382, + "step": 771 + }, + { + "epoch": 0.08867957038653725, + "grad_norm": 0.3785625994205475, + "learning_rate": 0.0001, + "loss": 1.9568, + "step": 772 + }, + { + "epoch": 0.08879444029636437, + "grad_norm": 0.3452891409397125, + "learning_rate": 0.0001, + "loss": 1.7799, + "step": 773 + }, + { + "epoch": 0.08890931020619149, + "grad_norm": 0.3608962595462799, + "learning_rate": 0.0001, + "loss": 1.7845, + "step": 774 + }, + { + "epoch": 0.0890241801160186, + "grad_norm": 0.3125813901424408, + "learning_rate": 0.0001, + "loss": 1.7237, + "step": 775 + }, + { + "epoch": 0.08913905002584573, + "grad_norm": 0.34034648537635803, + "learning_rate": 0.0001, + "loss": 1.7136, + "step": 776 + }, + { + "epoch": 0.08925391993567285, + "grad_norm": 0.31160974502563477, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 777 + }, + { + "epoch": 0.08936878984549997, + "grad_norm": 0.3525000512599945, + "learning_rate": 0.0001, + "loss": 1.8185, + "step": 778 + }, + { + "epoch": 0.08948365975532709, + "grad_norm": 0.3665052056312561, + "learning_rate": 0.0001, + "loss": 1.5812, + "step": 779 + }, + { + "epoch": 0.08959852966515422, + "grad_norm": 0.37317225337028503, + "learning_rate": 0.0001, + "loss": 1.7051, + "step": 780 + }, + { + "epoch": 0.08971339957498134, + "grad_norm": 0.3269886374473572, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 781 + }, + { + "epoch": 0.08982826948480846, + "grad_norm": 0.34751588106155396, + "learning_rate": 0.0001, + "loss": 1.7788, + "step": 782 + }, + { + "epoch": 0.08994313939463558, + "grad_norm": 0.35242506861686707, + "learning_rate": 0.0001, + "loss": 1.8759, + "step": 783 + }, + { + "epoch": 0.0900580093044627, + "grad_norm": 0.3438495695590973, + "learning_rate": 0.0001, + "loss": 1.6846, + "step": 784 + }, + { + "epoch": 0.09017287921428982, + "grad_norm": 0.32642510533332825, + "learning_rate": 0.0001, + "loss": 1.6951, + "step": 785 + }, + { + "epoch": 0.09028774912411694, + "grad_norm": 0.32845741510391235, + "learning_rate": 0.0001, + "loss": 1.7464, + "step": 786 + }, + { + "epoch": 0.09040261903394406, + "grad_norm": 0.36917203664779663, + "learning_rate": 0.0001, + "loss": 1.7882, + "step": 787 + }, + { + "epoch": 0.09051748894377118, + "grad_norm": 0.3328888416290283, + "learning_rate": 0.0001, + "loss": 1.6509, + "step": 788 + }, + { + "epoch": 0.0906323588535983, + "grad_norm": 0.3314300775527954, + "learning_rate": 0.0001, + "loss": 1.7402, + "step": 789 + }, + { + "epoch": 0.09074722876342542, + "grad_norm": 0.34112605452537537, + "learning_rate": 0.0001, + "loss": 1.8362, + "step": 790 + }, + { + "epoch": 0.09086209867325254, + "grad_norm": 0.3412010669708252, + "learning_rate": 0.0001, + "loss": 1.9103, + "step": 791 + }, + { + "epoch": 0.09097696858307966, + "grad_norm": 0.34920912981033325, + "learning_rate": 0.0001, + "loss": 1.756, + "step": 792 + }, + { + "epoch": 0.09109183849290678, + "grad_norm": 0.36219510436058044, + "learning_rate": 0.0001, + "loss": 1.8899, + "step": 793 + }, + { + "epoch": 0.0912067084027339, + "grad_norm": 0.34478431940078735, + "learning_rate": 0.0001, + "loss": 1.8184, + "step": 794 + }, + { + "epoch": 0.09132157831256102, + "grad_norm": 0.32101911306381226, + "learning_rate": 0.0001, + "loss": 1.8286, + "step": 795 + }, + { + "epoch": 0.09143644822238814, + "grad_norm": 0.3403749465942383, + "learning_rate": 0.0001, + "loss": 1.7323, + "step": 796 + }, + { + "epoch": 0.09155131813221527, + "grad_norm": 0.3531983196735382, + "learning_rate": 0.0001, + "loss": 1.9426, + "step": 797 + }, + { + "epoch": 0.09166618804204239, + "grad_norm": 0.33084288239479065, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 798 + }, + { + "epoch": 0.09178105795186951, + "grad_norm": 0.3422669768333435, + "learning_rate": 0.0001, + "loss": 1.8586, + "step": 799 + }, + { + "epoch": 0.09189592786169663, + "grad_norm": 0.333498477935791, + "learning_rate": 0.0001, + "loss": 1.6855, + "step": 800 + }, + { + "epoch": 0.09201079777152375, + "grad_norm": 0.3705277740955353, + "learning_rate": 0.0001, + "loss": 1.7961, + "step": 801 + }, + { + "epoch": 0.09212566768135087, + "grad_norm": 0.3193943500518799, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 802 + }, + { + "epoch": 0.09224053759117799, + "grad_norm": 0.3290367126464844, + "learning_rate": 0.0001, + "loss": 1.6078, + "step": 803 + }, + { + "epoch": 0.09235540750100511, + "grad_norm": 0.3147204518318176, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 804 + }, + { + "epoch": 0.09247027741083223, + "grad_norm": 0.33811822533607483, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 805 + }, + { + "epoch": 0.09258514732065935, + "grad_norm": 0.34665969014167786, + "learning_rate": 0.0001, + "loss": 1.8406, + "step": 806 + }, + { + "epoch": 0.09270001723048647, + "grad_norm": 0.367832213640213, + "learning_rate": 0.0001, + "loss": 1.8237, + "step": 807 + }, + { + "epoch": 0.09281488714031359, + "grad_norm": 0.3354632258415222, + "learning_rate": 0.0001, + "loss": 1.7281, + "step": 808 + }, + { + "epoch": 0.09292975705014071, + "grad_norm": 0.3267354965209961, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 809 + }, + { + "epoch": 0.09304462695996783, + "grad_norm": 0.3412437438964844, + "learning_rate": 0.0001, + "loss": 1.8719, + "step": 810 + }, + { + "epoch": 0.09315949686979495, + "grad_norm": 0.3305417597293854, + "learning_rate": 0.0001, + "loss": 1.8327, + "step": 811 + }, + { + "epoch": 0.09327436677962207, + "grad_norm": 0.3420979678630829, + "learning_rate": 0.0001, + "loss": 1.7677, + "step": 812 + }, + { + "epoch": 0.09338923668944919, + "grad_norm": 0.3424234092235565, + "learning_rate": 0.0001, + "loss": 1.8198, + "step": 813 + }, + { + "epoch": 0.09350410659927633, + "grad_norm": 0.3489876985549927, + "learning_rate": 0.0001, + "loss": 1.8778, + "step": 814 + }, + { + "epoch": 0.09361897650910345, + "grad_norm": 0.3356654942035675, + "learning_rate": 0.0001, + "loss": 1.6532, + "step": 815 + }, + { + "epoch": 0.09373384641893057, + "grad_norm": 0.33615395426750183, + "learning_rate": 0.0001, + "loss": 1.6045, + "step": 816 + }, + { + "epoch": 0.09384871632875769, + "grad_norm": 0.3469023108482361, + "learning_rate": 0.0001, + "loss": 1.7758, + "step": 817 + }, + { + "epoch": 0.0939635862385848, + "grad_norm": 0.3822448253631592, + "learning_rate": 0.0001, + "loss": 1.8071, + "step": 818 + }, + { + "epoch": 0.09407845614841193, + "grad_norm": 0.32721102237701416, + "learning_rate": 0.0001, + "loss": 1.7461, + "step": 819 + }, + { + "epoch": 0.09419332605823905, + "grad_norm": 0.3481324017047882, + "learning_rate": 0.0001, + "loss": 1.5403, + "step": 820 + }, + { + "epoch": 0.09430819596806617, + "grad_norm": 0.34965309500694275, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 821 + }, + { + "epoch": 0.09442306587789329, + "grad_norm": 0.3382103443145752, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 822 + }, + { + "epoch": 0.0945379357877204, + "grad_norm": 0.33241525292396545, + "learning_rate": 0.0001, + "loss": 1.6408, + "step": 823 + }, + { + "epoch": 0.09465280569754753, + "grad_norm": 0.34981343150138855, + "learning_rate": 0.0001, + "loss": 1.9116, + "step": 824 + }, + { + "epoch": 0.09476767560737465, + "grad_norm": 0.3368913531303406, + "learning_rate": 0.0001, + "loss": 1.8079, + "step": 825 + }, + { + "epoch": 0.09488254551720177, + "grad_norm": 0.3350062370300293, + "learning_rate": 0.0001, + "loss": 1.7194, + "step": 826 + }, + { + "epoch": 0.09499741542702889, + "grad_norm": 0.365464448928833, + "learning_rate": 0.0001, + "loss": 1.9002, + "step": 827 + }, + { + "epoch": 0.095112285336856, + "grad_norm": 0.3544370234012604, + "learning_rate": 0.0001, + "loss": 1.9647, + "step": 828 + }, + { + "epoch": 0.09522715524668313, + "grad_norm": 0.3434012532234192, + "learning_rate": 0.0001, + "loss": 1.8148, + "step": 829 + }, + { + "epoch": 0.09534202515651025, + "grad_norm": 0.33346980810165405, + "learning_rate": 0.0001, + "loss": 1.5846, + "step": 830 + }, + { + "epoch": 0.09545689506633737, + "grad_norm": 0.3305695652961731, + "learning_rate": 0.0001, + "loss": 1.7361, + "step": 831 + }, + { + "epoch": 0.0955717649761645, + "grad_norm": 0.32634860277175903, + "learning_rate": 0.0001, + "loss": 1.5721, + "step": 832 + }, + { + "epoch": 0.09568663488599162, + "grad_norm": 0.3320368528366089, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 833 + }, + { + "epoch": 0.09580150479581874, + "grad_norm": 0.33831140398979187, + "learning_rate": 0.0001, + "loss": 1.9525, + "step": 834 + }, + { + "epoch": 0.09591637470564586, + "grad_norm": 0.37599530816078186, + "learning_rate": 0.0001, + "loss": 1.8062, + "step": 835 + }, + { + "epoch": 0.09603124461547298, + "grad_norm": 0.3151451647281647, + "learning_rate": 0.0001, + "loss": 1.5515, + "step": 836 + }, + { + "epoch": 0.0961461145253001, + "grad_norm": 0.3700610101222992, + "learning_rate": 0.0001, + "loss": 1.9294, + "step": 837 + }, + { + "epoch": 0.09626098443512722, + "grad_norm": 0.3530856668949127, + "learning_rate": 0.0001, + "loss": 1.9053, + "step": 838 + }, + { + "epoch": 0.09637585434495434, + "grad_norm": 0.3287757933139801, + "learning_rate": 0.0001, + "loss": 1.7148, + "step": 839 + }, + { + "epoch": 0.09649072425478146, + "grad_norm": 0.33994293212890625, + "learning_rate": 0.0001, + "loss": 1.6897, + "step": 840 + }, + { + "epoch": 0.09660559416460858, + "grad_norm": 0.3461272418498993, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 841 + }, + { + "epoch": 0.0967204640744357, + "grad_norm": 0.3444170653820038, + "learning_rate": 0.0001, + "loss": 1.6429, + "step": 842 + }, + { + "epoch": 0.09683533398426282, + "grad_norm": 0.33912792801856995, + "learning_rate": 0.0001, + "loss": 1.7618, + "step": 843 + }, + { + "epoch": 0.09695020389408994, + "grad_norm": 0.36995047330856323, + "learning_rate": 0.0001, + "loss": 1.7405, + "step": 844 + }, + { + "epoch": 0.09706507380391706, + "grad_norm": 0.318174809217453, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 845 + }, + { + "epoch": 0.09717994371374418, + "grad_norm": 0.340555876493454, + "learning_rate": 0.0001, + "loss": 1.7104, + "step": 846 + }, + { + "epoch": 0.0972948136235713, + "grad_norm": 0.3448858857154846, + "learning_rate": 0.0001, + "loss": 1.6208, + "step": 847 + }, + { + "epoch": 0.09740968353339842, + "grad_norm": 0.34647923707962036, + "learning_rate": 0.0001, + "loss": 1.8009, + "step": 848 + }, + { + "epoch": 0.09752455344322555, + "grad_norm": 0.3762481212615967, + "learning_rate": 0.0001, + "loss": 1.6759, + "step": 849 + }, + { + "epoch": 0.09763942335305267, + "grad_norm": 0.3799351751804352, + "learning_rate": 0.0001, + "loss": 1.9706, + "step": 850 + }, + { + "epoch": 0.0977542932628798, + "grad_norm": 0.33727753162384033, + "learning_rate": 0.0001, + "loss": 1.6215, + "step": 851 + }, + { + "epoch": 0.09786916317270691, + "grad_norm": 0.3406698703765869, + "learning_rate": 0.0001, + "loss": 1.7837, + "step": 852 + }, + { + "epoch": 0.09798403308253403, + "grad_norm": 0.3939476013183594, + "learning_rate": 0.0001, + "loss": 1.6971, + "step": 853 + }, + { + "epoch": 0.09809890299236115, + "grad_norm": 0.3444227874279022, + "learning_rate": 0.0001, + "loss": 1.5426, + "step": 854 + }, + { + "epoch": 0.09821377290218827, + "grad_norm": 0.38864031434059143, + "learning_rate": 0.0001, + "loss": 1.7997, + "step": 855 + }, + { + "epoch": 0.0983286428120154, + "grad_norm": 0.36266061663627625, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 856 + }, + { + "epoch": 0.09844351272184251, + "grad_norm": 0.3754750192165375, + "learning_rate": 0.0001, + "loss": 1.922, + "step": 857 + }, + { + "epoch": 0.09855838263166963, + "grad_norm": 0.3244558274745941, + "learning_rate": 0.0001, + "loss": 1.6094, + "step": 858 + }, + { + "epoch": 0.09867325254149675, + "grad_norm": 0.3381154537200928, + "learning_rate": 0.0001, + "loss": 1.5764, + "step": 859 + }, + { + "epoch": 0.09878812245132387, + "grad_norm": 0.37016791105270386, + "learning_rate": 0.0001, + "loss": 1.7252, + "step": 860 + }, + { + "epoch": 0.098902992361151, + "grad_norm": 0.3377515971660614, + "learning_rate": 0.0001, + "loss": 1.7457, + "step": 861 + }, + { + "epoch": 0.09901786227097811, + "grad_norm": 0.3582906126976013, + "learning_rate": 0.0001, + "loss": 1.7201, + "step": 862 + }, + { + "epoch": 0.09913273218080523, + "grad_norm": 0.39218735694885254, + "learning_rate": 0.0001, + "loss": 2.1357, + "step": 863 + }, + { + "epoch": 0.09924760209063235, + "grad_norm": 0.3586115837097168, + "learning_rate": 0.0001, + "loss": 1.7406, + "step": 864 + }, + { + "epoch": 0.09936247200045947, + "grad_norm": 0.3303786516189575, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 865 + }, + { + "epoch": 0.0994773419102866, + "grad_norm": 0.34086883068084717, + "learning_rate": 0.0001, + "loss": 1.6499, + "step": 866 + }, + { + "epoch": 0.09959221182011373, + "grad_norm": 0.3244491219520569, + "learning_rate": 0.0001, + "loss": 1.621, + "step": 867 + }, + { + "epoch": 0.09970708172994085, + "grad_norm": 0.3519229292869568, + "learning_rate": 0.0001, + "loss": 1.7324, + "step": 868 + }, + { + "epoch": 0.09982195163976797, + "grad_norm": 0.3447989523410797, + "learning_rate": 0.0001, + "loss": 1.7337, + "step": 869 + }, + { + "epoch": 0.09993682154959509, + "grad_norm": 0.3334493935108185, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 870 + }, + { + "epoch": 0.1000516914594222, + "grad_norm": 0.3429296612739563, + "learning_rate": 0.0001, + "loss": 1.7162, + "step": 871 + }, + { + "epoch": 0.10016656136924933, + "grad_norm": 0.3839278519153595, + "learning_rate": 0.0001, + "loss": 1.7534, + "step": 872 + }, + { + "epoch": 0.10028143127907645, + "grad_norm": 0.33588939905166626, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 873 + }, + { + "epoch": 0.10039630118890357, + "grad_norm": 0.34705570340156555, + "learning_rate": 0.0001, + "loss": 1.841, + "step": 874 + }, + { + "epoch": 0.10051117109873069, + "grad_norm": 0.36733031272888184, + "learning_rate": 0.0001, + "loss": 1.852, + "step": 875 + }, + { + "epoch": 0.1006260410085578, + "grad_norm": 0.3770068883895874, + "learning_rate": 0.0001, + "loss": 1.6318, + "step": 876 + }, + { + "epoch": 0.10074091091838493, + "grad_norm": 0.3212607800960541, + "learning_rate": 0.0001, + "loss": 1.7223, + "step": 877 + }, + { + "epoch": 0.10085578082821205, + "grad_norm": 0.36928656697273254, + "learning_rate": 0.0001, + "loss": 1.7392, + "step": 878 + }, + { + "epoch": 0.10097065073803917, + "grad_norm": 0.34793323278427124, + "learning_rate": 0.0001, + "loss": 1.7446, + "step": 879 + }, + { + "epoch": 0.10108552064786629, + "grad_norm": 0.3587128221988678, + "learning_rate": 0.0001, + "loss": 1.8248, + "step": 880 + }, + { + "epoch": 0.1012003905576934, + "grad_norm": 0.37394312024116516, + "learning_rate": 0.0001, + "loss": 1.8343, + "step": 881 + }, + { + "epoch": 0.10131526046752053, + "grad_norm": 0.3411267101764679, + "learning_rate": 0.0001, + "loss": 1.7313, + "step": 882 + }, + { + "epoch": 0.10143013037734766, + "grad_norm": 0.3414922058582306, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 883 + }, + { + "epoch": 0.10154500028717478, + "grad_norm": 0.3430386781692505, + "learning_rate": 0.0001, + "loss": 1.6358, + "step": 884 + }, + { + "epoch": 0.1016598701970019, + "grad_norm": 0.38398411870002747, + "learning_rate": 0.0001, + "loss": 1.8655, + "step": 885 + }, + { + "epoch": 0.10177474010682902, + "grad_norm": 0.35513511300086975, + "learning_rate": 0.0001, + "loss": 1.7441, + "step": 886 + }, + { + "epoch": 0.10188961001665614, + "grad_norm": 0.37063589692115784, + "learning_rate": 0.0001, + "loss": 1.8402, + "step": 887 + }, + { + "epoch": 0.10200447992648326, + "grad_norm": 0.3473655581474304, + "learning_rate": 0.0001, + "loss": 1.7719, + "step": 888 + }, + { + "epoch": 0.10211934983631038, + "grad_norm": 0.3341835141181946, + "learning_rate": 0.0001, + "loss": 1.9165, + "step": 889 + }, + { + "epoch": 0.1022342197461375, + "grad_norm": 0.33776983618736267, + "learning_rate": 0.0001, + "loss": 1.6519, + "step": 890 + }, + { + "epoch": 0.10234908965596462, + "grad_norm": 0.34027940034866333, + "learning_rate": 0.0001, + "loss": 1.6075, + "step": 891 + }, + { + "epoch": 0.10246395956579174, + "grad_norm": 0.3332427442073822, + "learning_rate": 0.0001, + "loss": 1.6832, + "step": 892 + }, + { + "epoch": 0.10257882947561886, + "grad_norm": 0.3453330099582672, + "learning_rate": 0.0001, + "loss": 1.8543, + "step": 893 + }, + { + "epoch": 0.10269369938544598, + "grad_norm": 0.3439124524593353, + "learning_rate": 0.0001, + "loss": 1.8294, + "step": 894 + }, + { + "epoch": 0.1028085692952731, + "grad_norm": 0.3373720645904541, + "learning_rate": 0.0001, + "loss": 1.816, + "step": 895 + }, + { + "epoch": 0.10292343920510022, + "grad_norm": 0.3701860308647156, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 896 + }, + { + "epoch": 0.10303830911492734, + "grad_norm": 0.3510701358318329, + "learning_rate": 0.0001, + "loss": 1.8181, + "step": 897 + }, + { + "epoch": 0.10315317902475446, + "grad_norm": 0.383064329624176, + "learning_rate": 0.0001, + "loss": 1.7649, + "step": 898 + }, + { + "epoch": 0.10326804893458158, + "grad_norm": 0.33695435523986816, + "learning_rate": 0.0001, + "loss": 1.7667, + "step": 899 + }, + { + "epoch": 0.10338291884440871, + "grad_norm": 0.32354670763015747, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 900 + }, + { + "epoch": 0.10349778875423583, + "grad_norm": 0.3468325138092041, + "learning_rate": 0.0001, + "loss": 1.897, + "step": 901 + }, + { + "epoch": 0.10361265866406295, + "grad_norm": 0.33674290776252747, + "learning_rate": 0.0001, + "loss": 1.7802, + "step": 902 + }, + { + "epoch": 0.10372752857389007, + "grad_norm": 0.3250616490840912, + "learning_rate": 0.0001, + "loss": 1.7262, + "step": 903 + }, + { + "epoch": 0.1038423984837172, + "grad_norm": 0.3321487009525299, + "learning_rate": 0.0001, + "loss": 1.7408, + "step": 904 + }, + { + "epoch": 0.10395726839354431, + "grad_norm": 0.3137947916984558, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 905 + }, + { + "epoch": 0.10407213830337143, + "grad_norm": 0.35805246233940125, + "learning_rate": 0.0001, + "loss": 1.7689, + "step": 906 + }, + { + "epoch": 0.10418700821319855, + "grad_norm": 0.35889506340026855, + "learning_rate": 0.0001, + "loss": 1.8557, + "step": 907 + }, + { + "epoch": 0.10430187812302567, + "grad_norm": 0.3665921092033386, + "learning_rate": 0.0001, + "loss": 1.8241, + "step": 908 + }, + { + "epoch": 0.1044167480328528, + "grad_norm": 0.32144057750701904, + "learning_rate": 0.0001, + "loss": 1.6686, + "step": 909 + }, + { + "epoch": 0.10453161794267991, + "grad_norm": 0.3347714841365814, + "learning_rate": 0.0001, + "loss": 1.5503, + "step": 910 + }, + { + "epoch": 0.10464648785250703, + "grad_norm": 0.3488980233669281, + "learning_rate": 0.0001, + "loss": 1.8368, + "step": 911 + }, + { + "epoch": 0.10476135776233415, + "grad_norm": 0.3683393895626068, + "learning_rate": 0.0001, + "loss": 1.8511, + "step": 912 + }, + { + "epoch": 0.10487622767216127, + "grad_norm": 0.36709287762641907, + "learning_rate": 0.0001, + "loss": 1.8485, + "step": 913 + }, + { + "epoch": 0.1049910975819884, + "grad_norm": 0.3464992642402649, + "learning_rate": 0.0001, + "loss": 1.8211, + "step": 914 + }, + { + "epoch": 0.10510596749181551, + "grad_norm": 0.3259120285511017, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 915 + }, + { + "epoch": 0.10522083740164263, + "grad_norm": 0.3253241777420044, + "learning_rate": 0.0001, + "loss": 1.721, + "step": 916 + }, + { + "epoch": 0.10533570731146977, + "grad_norm": 0.3512059152126312, + "learning_rate": 0.0001, + "loss": 1.7504, + "step": 917 + }, + { + "epoch": 0.10545057722129689, + "grad_norm": 0.34296149015426636, + "learning_rate": 0.0001, + "loss": 1.655, + "step": 918 + }, + { + "epoch": 0.10556544713112401, + "grad_norm": 0.34532561898231506, + "learning_rate": 0.0001, + "loss": 1.7067, + "step": 919 + }, + { + "epoch": 0.10568031704095113, + "grad_norm": 0.4030701816082001, + "learning_rate": 0.0001, + "loss": 1.8292, + "step": 920 + }, + { + "epoch": 0.10579518695077825, + "grad_norm": 0.3220067322254181, + "learning_rate": 0.0001, + "loss": 1.7312, + "step": 921 + }, + { + "epoch": 0.10591005686060537, + "grad_norm": 0.35036081075668335, + "learning_rate": 0.0001, + "loss": 1.9524, + "step": 922 + }, + { + "epoch": 0.10602492677043249, + "grad_norm": 0.34477829933166504, + "learning_rate": 0.0001, + "loss": 1.6833, + "step": 923 + }, + { + "epoch": 0.10613979668025961, + "grad_norm": 0.3312055468559265, + "learning_rate": 0.0001, + "loss": 1.7349, + "step": 924 + }, + { + "epoch": 0.10625466659008673, + "grad_norm": 0.3699291944503784, + "learning_rate": 0.0001, + "loss": 1.8016, + "step": 925 + }, + { + "epoch": 0.10636953649991385, + "grad_norm": 0.35141250491142273, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 926 + }, + { + "epoch": 0.10648440640974097, + "grad_norm": 0.3546350300312042, + "learning_rate": 0.0001, + "loss": 1.792, + "step": 927 + }, + { + "epoch": 0.10659927631956809, + "grad_norm": 0.32933250069618225, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 928 + }, + { + "epoch": 0.10671414622939521, + "grad_norm": 0.31342101097106934, + "learning_rate": 0.0001, + "loss": 1.4959, + "step": 929 + }, + { + "epoch": 0.10682901613922233, + "grad_norm": 0.3823557496070862, + "learning_rate": 0.0001, + "loss": 1.7735, + "step": 930 + }, + { + "epoch": 0.10694388604904945, + "grad_norm": 0.37668943405151367, + "learning_rate": 0.0001, + "loss": 1.7153, + "step": 931 + }, + { + "epoch": 0.10705875595887657, + "grad_norm": 0.3394898772239685, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 932 + }, + { + "epoch": 0.10717362586870369, + "grad_norm": 0.3595438301563263, + "learning_rate": 0.0001, + "loss": 1.5773, + "step": 933 + }, + { + "epoch": 0.10728849577853082, + "grad_norm": 0.3278176486492157, + "learning_rate": 0.0001, + "loss": 1.4859, + "step": 934 + }, + { + "epoch": 0.10740336568835794, + "grad_norm": 0.35469111800193787, + "learning_rate": 0.0001, + "loss": 1.7812, + "step": 935 + }, + { + "epoch": 0.10751823559818506, + "grad_norm": 0.35269084572792053, + "learning_rate": 0.0001, + "loss": 1.7469, + "step": 936 + }, + { + "epoch": 0.10763310550801218, + "grad_norm": 0.3482814133167267, + "learning_rate": 0.0001, + "loss": 1.4346, + "step": 937 + }, + { + "epoch": 0.1077479754178393, + "grad_norm": 0.3587512671947479, + "learning_rate": 0.0001, + "loss": 1.433, + "step": 938 + }, + { + "epoch": 0.10786284532766642, + "grad_norm": 0.3574024736881256, + "learning_rate": 0.0001, + "loss": 1.9325, + "step": 939 + }, + { + "epoch": 0.10797771523749354, + "grad_norm": 0.3392809331417084, + "learning_rate": 0.0001, + "loss": 1.7022, + "step": 940 + }, + { + "epoch": 0.10809258514732066, + "grad_norm": 0.3584175109863281, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 941 + }, + { + "epoch": 0.10820745505714778, + "grad_norm": 0.35646557807922363, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 942 + }, + { + "epoch": 0.1083223249669749, + "grad_norm": 0.3471522331237793, + "learning_rate": 0.0001, + "loss": 1.7296, + "step": 943 + }, + { + "epoch": 0.10843719487680202, + "grad_norm": 0.36520814895629883, + "learning_rate": 0.0001, + "loss": 1.7646, + "step": 944 + }, + { + "epoch": 0.10855206478662914, + "grad_norm": 0.3618639409542084, + "learning_rate": 0.0001, + "loss": 1.879, + "step": 945 + }, + { + "epoch": 0.10866693469645626, + "grad_norm": 0.3887125849723816, + "learning_rate": 0.0001, + "loss": 1.9636, + "step": 946 + }, + { + "epoch": 0.10878180460628338, + "grad_norm": 0.36584070324897766, + "learning_rate": 0.0001, + "loss": 1.8166, + "step": 947 + }, + { + "epoch": 0.1088966745161105, + "grad_norm": 0.4011528491973877, + "learning_rate": 0.0001, + "loss": 1.8934, + "step": 948 + }, + { + "epoch": 0.10901154442593762, + "grad_norm": 0.35552287101745605, + "learning_rate": 0.0001, + "loss": 1.7606, + "step": 949 + }, + { + "epoch": 0.10912641433576474, + "grad_norm": 0.3487032949924469, + "learning_rate": 0.0001, + "loss": 1.6729, + "step": 950 + }, + { + "epoch": 0.10924128424559186, + "grad_norm": 0.34331613779067993, + "learning_rate": 0.0001, + "loss": 1.5127, + "step": 951 + }, + { + "epoch": 0.109356154155419, + "grad_norm": 0.35324063897132874, + "learning_rate": 0.0001, + "loss": 1.7534, + "step": 952 + }, + { + "epoch": 0.10947102406524611, + "grad_norm": 0.33275148272514343, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 953 + }, + { + "epoch": 0.10958589397507323, + "grad_norm": 0.329111784696579, + "learning_rate": 0.0001, + "loss": 1.682, + "step": 954 + }, + { + "epoch": 0.10970076388490035, + "grad_norm": 0.3550437092781067, + "learning_rate": 0.0001, + "loss": 1.8724, + "step": 955 + }, + { + "epoch": 0.10981563379472747, + "grad_norm": 0.3454737663269043, + "learning_rate": 0.0001, + "loss": 1.7989, + "step": 956 + }, + { + "epoch": 0.1099305037045546, + "grad_norm": 0.3605286777019501, + "learning_rate": 0.0001, + "loss": 1.7102, + "step": 957 + }, + { + "epoch": 0.11004537361438171, + "grad_norm": 0.3494301438331604, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 958 + }, + { + "epoch": 0.11016024352420883, + "grad_norm": 0.3383633494377136, + "learning_rate": 0.0001, + "loss": 1.8941, + "step": 959 + }, + { + "epoch": 0.11027511343403595, + "grad_norm": 0.3342844247817993, + "learning_rate": 0.0001, + "loss": 1.7833, + "step": 960 + }, + { + "epoch": 0.11038998334386307, + "grad_norm": 0.3412388861179352, + "learning_rate": 0.0001, + "loss": 1.6875, + "step": 961 + }, + { + "epoch": 0.1105048532536902, + "grad_norm": 0.34491872787475586, + "learning_rate": 0.0001, + "loss": 1.6936, + "step": 962 + }, + { + "epoch": 0.11061972316351731, + "grad_norm": 0.3250080943107605, + "learning_rate": 0.0001, + "loss": 1.5619, + "step": 963 + }, + { + "epoch": 0.11073459307334443, + "grad_norm": 0.3618282973766327, + "learning_rate": 0.0001, + "loss": 1.7687, + "step": 964 + }, + { + "epoch": 0.11084946298317155, + "grad_norm": 0.37689584493637085, + "learning_rate": 0.0001, + "loss": 1.6914, + "step": 965 + }, + { + "epoch": 0.11096433289299867, + "grad_norm": 0.35625600814819336, + "learning_rate": 0.0001, + "loss": 1.7087, + "step": 966 + }, + { + "epoch": 0.1110792028028258, + "grad_norm": 0.36949923634529114, + "learning_rate": 0.0001, + "loss": 1.8479, + "step": 967 + }, + { + "epoch": 0.11119407271265291, + "grad_norm": 0.32159894704818726, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 968 + }, + { + "epoch": 0.11130894262248005, + "grad_norm": 0.35336339473724365, + "learning_rate": 0.0001, + "loss": 1.7206, + "step": 969 + }, + { + "epoch": 0.11142381253230717, + "grad_norm": 0.3411477506160736, + "learning_rate": 0.0001, + "loss": 1.7505, + "step": 970 + }, + { + "epoch": 0.11153868244213429, + "grad_norm": 0.37740951776504517, + "learning_rate": 0.0001, + "loss": 1.8982, + "step": 971 + }, + { + "epoch": 0.11165355235196141, + "grad_norm": 0.32587164640426636, + "learning_rate": 0.0001, + "loss": 1.7459, + "step": 972 + }, + { + "epoch": 0.11176842226178853, + "grad_norm": 0.40314409136772156, + "learning_rate": 0.0001, + "loss": 1.752, + "step": 973 + }, + { + "epoch": 0.11188329217161565, + "grad_norm": 0.3304173946380615, + "learning_rate": 0.0001, + "loss": 1.5715, + "step": 974 + }, + { + "epoch": 0.11199816208144277, + "grad_norm": 0.3680497407913208, + "learning_rate": 0.0001, + "loss": 1.6852, + "step": 975 + }, + { + "epoch": 0.11211303199126989, + "grad_norm": 0.3391849100589752, + "learning_rate": 0.0001, + "loss": 1.7026, + "step": 976 + }, + { + "epoch": 0.11222790190109701, + "grad_norm": 0.37164103984832764, + "learning_rate": 0.0001, + "loss": 1.847, + "step": 977 + }, + { + "epoch": 0.11234277181092413, + "grad_norm": 0.3532995581626892, + "learning_rate": 0.0001, + "loss": 1.8528, + "step": 978 + }, + { + "epoch": 0.11245764172075125, + "grad_norm": 0.35095518827438354, + "learning_rate": 0.0001, + "loss": 1.8725, + "step": 979 + }, + { + "epoch": 0.11257251163057837, + "grad_norm": 0.3317796289920807, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 980 + }, + { + "epoch": 0.11268738154040549, + "grad_norm": 0.3477189838886261, + "learning_rate": 0.0001, + "loss": 1.6846, + "step": 981 + }, + { + "epoch": 0.11280225145023261, + "grad_norm": 0.34023842215538025, + "learning_rate": 0.0001, + "loss": 1.6921, + "step": 982 + }, + { + "epoch": 0.11291712136005973, + "grad_norm": 0.3701896369457245, + "learning_rate": 0.0001, + "loss": 1.545, + "step": 983 + }, + { + "epoch": 0.11303199126988685, + "grad_norm": 0.3711189329624176, + "learning_rate": 0.0001, + "loss": 1.4718, + "step": 984 + }, + { + "epoch": 0.11314686117971397, + "grad_norm": 0.35467204451560974, + "learning_rate": 0.0001, + "loss": 1.7369, + "step": 985 + }, + { + "epoch": 0.1132617310895411, + "grad_norm": 0.3388875722885132, + "learning_rate": 0.0001, + "loss": 1.7018, + "step": 986 + }, + { + "epoch": 0.11337660099936822, + "grad_norm": 0.34032291173934937, + "learning_rate": 0.0001, + "loss": 1.6629, + "step": 987 + }, + { + "epoch": 0.11349147090919534, + "grad_norm": 0.3326657712459564, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 988 + }, + { + "epoch": 0.11360634081902246, + "grad_norm": 0.3543054163455963, + "learning_rate": 0.0001, + "loss": 1.6426, + "step": 989 + }, + { + "epoch": 0.11372121072884958, + "grad_norm": 0.3578903377056122, + "learning_rate": 0.0001, + "loss": 1.7261, + "step": 990 + }, + { + "epoch": 0.1138360806386767, + "grad_norm": 0.3672734498977661, + "learning_rate": 0.0001, + "loss": 1.8798, + "step": 991 + }, + { + "epoch": 0.11395095054850382, + "grad_norm": 0.3719589114189148, + "learning_rate": 0.0001, + "loss": 1.6408, + "step": 992 + }, + { + "epoch": 0.11406582045833094, + "grad_norm": 0.3530134856700897, + "learning_rate": 0.0001, + "loss": 1.7737, + "step": 993 + }, + { + "epoch": 0.11418069036815806, + "grad_norm": 0.371442586183548, + "learning_rate": 0.0001, + "loss": 1.5202, + "step": 994 + }, + { + "epoch": 0.11429556027798518, + "grad_norm": 0.33966636657714844, + "learning_rate": 0.0001, + "loss": 1.8347, + "step": 995 + }, + { + "epoch": 0.1144104301878123, + "grad_norm": 0.3425755202770233, + "learning_rate": 0.0001, + "loss": 1.7236, + "step": 996 + }, + { + "epoch": 0.11452530009763942, + "grad_norm": 0.3630322515964508, + "learning_rate": 0.0001, + "loss": 1.6771, + "step": 997 + }, + { + "epoch": 0.11464017000746654, + "grad_norm": 0.31861042976379395, + "learning_rate": 0.0001, + "loss": 1.5433, + "step": 998 + }, + { + "epoch": 0.11475503991729366, + "grad_norm": 0.34319064021110535, + "learning_rate": 0.0001, + "loss": 1.7079, + "step": 999 + }, + { + "epoch": 0.11486990982712078, + "grad_norm": 0.33769530057907104, + "learning_rate": 0.0001, + "loss": 1.6176, + "step": 1000 + }, + { + "epoch": 0.1149847797369479, + "grad_norm": 0.3288537859916687, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 1001 + }, + { + "epoch": 0.11509964964677502, + "grad_norm": 0.41031739115715027, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 1002 + }, + { + "epoch": 0.11521451955660215, + "grad_norm": 0.33957231044769287, + "learning_rate": 0.0001, + "loss": 1.608, + "step": 1003 + }, + { + "epoch": 0.11532938946642927, + "grad_norm": 0.35768911242485046, + "learning_rate": 0.0001, + "loss": 1.8097, + "step": 1004 + }, + { + "epoch": 0.1154442593762564, + "grad_norm": 0.3527219593524933, + "learning_rate": 0.0001, + "loss": 1.6337, + "step": 1005 + }, + { + "epoch": 0.11555912928608351, + "grad_norm": 0.3403669595718384, + "learning_rate": 0.0001, + "loss": 1.7182, + "step": 1006 + }, + { + "epoch": 0.11567399919591063, + "grad_norm": 0.33553534746170044, + "learning_rate": 0.0001, + "loss": 1.7559, + "step": 1007 + }, + { + "epoch": 0.11578886910573775, + "grad_norm": 0.39612364768981934, + "learning_rate": 0.0001, + "loss": 2.1183, + "step": 1008 + }, + { + "epoch": 0.11590373901556487, + "grad_norm": 0.3312492072582245, + "learning_rate": 0.0001, + "loss": 1.6856, + "step": 1009 + }, + { + "epoch": 0.116018608925392, + "grad_norm": 0.35512349009513855, + "learning_rate": 0.0001, + "loss": 1.8648, + "step": 1010 + }, + { + "epoch": 0.11613347883521911, + "grad_norm": 0.34850746393203735, + "learning_rate": 0.0001, + "loss": 1.7185, + "step": 1011 + }, + { + "epoch": 0.11624834874504623, + "grad_norm": 0.34042853116989136, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 1012 + }, + { + "epoch": 0.11636321865487335, + "grad_norm": 0.3678249716758728, + "learning_rate": 0.0001, + "loss": 1.9585, + "step": 1013 + }, + { + "epoch": 0.11647808856470047, + "grad_norm": 0.38127046823501587, + "learning_rate": 0.0001, + "loss": 1.7736, + "step": 1014 + }, + { + "epoch": 0.1165929584745276, + "grad_norm": 0.33116331696510315, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 1015 + }, + { + "epoch": 0.11670782838435471, + "grad_norm": 0.3540381193161011, + "learning_rate": 0.0001, + "loss": 1.666, + "step": 1016 + }, + { + "epoch": 0.11682269829418183, + "grad_norm": 0.34696200489997864, + "learning_rate": 0.0001, + "loss": 1.88, + "step": 1017 + }, + { + "epoch": 0.11693756820400895, + "grad_norm": 0.3833637833595276, + "learning_rate": 0.0001, + "loss": 1.9577, + "step": 1018 + }, + { + "epoch": 0.11705243811383607, + "grad_norm": 0.34638655185699463, + "learning_rate": 0.0001, + "loss": 1.7792, + "step": 1019 + }, + { + "epoch": 0.11716730802366321, + "grad_norm": 0.3427535593509674, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 1020 + }, + { + "epoch": 0.11728217793349033, + "grad_norm": 0.3441438376903534, + "learning_rate": 0.0001, + "loss": 1.7116, + "step": 1021 + }, + { + "epoch": 0.11739704784331745, + "grad_norm": 0.3629339039325714, + "learning_rate": 0.0001, + "loss": 1.7716, + "step": 1022 + }, + { + "epoch": 0.11751191775314457, + "grad_norm": 0.35050028562545776, + "learning_rate": 0.0001, + "loss": 1.7887, + "step": 1023 + }, + { + "epoch": 0.11762678766297169, + "grad_norm": 0.3770378828048706, + "learning_rate": 0.0001, + "loss": 1.9182, + "step": 1024 + }, + { + "epoch": 0.11774165757279881, + "grad_norm": 0.3678201138973236, + "learning_rate": 0.0001, + "loss": 1.7746, + "step": 1025 + }, + { + "epoch": 0.11785652748262593, + "grad_norm": 0.3541167080402374, + "learning_rate": 0.0001, + "loss": 1.7004, + "step": 1026 + }, + { + "epoch": 0.11797139739245305, + "grad_norm": 0.34430310130119324, + "learning_rate": 0.0001, + "loss": 1.6821, + "step": 1027 + }, + { + "epoch": 0.11808626730228017, + "grad_norm": 0.35517948865890503, + "learning_rate": 0.0001, + "loss": 1.8264, + "step": 1028 + }, + { + "epoch": 0.11820113721210729, + "grad_norm": 0.35066622495651245, + "learning_rate": 0.0001, + "loss": 1.6285, + "step": 1029 + }, + { + "epoch": 0.11831600712193441, + "grad_norm": 0.3344581425189972, + "learning_rate": 0.0001, + "loss": 1.6052, + "step": 1030 + }, + { + "epoch": 0.11843087703176153, + "grad_norm": 0.3826616406440735, + "learning_rate": 0.0001, + "loss": 1.74, + "step": 1031 + }, + { + "epoch": 0.11854574694158865, + "grad_norm": 0.3551161587238312, + "learning_rate": 0.0001, + "loss": 1.7829, + "step": 1032 + }, + { + "epoch": 0.11866061685141577, + "grad_norm": 0.3458511233329773, + "learning_rate": 0.0001, + "loss": 1.7166, + "step": 1033 + }, + { + "epoch": 0.11877548676124289, + "grad_norm": 0.38753172755241394, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 1034 + }, + { + "epoch": 0.11889035667107001, + "grad_norm": 0.36152660846710205, + "learning_rate": 0.0001, + "loss": 1.7422, + "step": 1035 + }, + { + "epoch": 0.11900522658089713, + "grad_norm": 0.3490775525569916, + "learning_rate": 0.0001, + "loss": 1.7268, + "step": 1036 + }, + { + "epoch": 0.11912009649072426, + "grad_norm": 0.3666988015174866, + "learning_rate": 0.0001, + "loss": 1.7706, + "step": 1037 + }, + { + "epoch": 0.11923496640055138, + "grad_norm": 0.3366585671901703, + "learning_rate": 0.0001, + "loss": 1.5956, + "step": 1038 + }, + { + "epoch": 0.1193498363103785, + "grad_norm": 0.36800581216812134, + "learning_rate": 0.0001, + "loss": 1.7542, + "step": 1039 + }, + { + "epoch": 0.11946470622020562, + "grad_norm": 0.36081400513648987, + "learning_rate": 0.0001, + "loss": 1.882, + "step": 1040 + }, + { + "epoch": 0.11957957613003274, + "grad_norm": 0.3410419821739197, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 1041 + }, + { + "epoch": 0.11969444603985986, + "grad_norm": 0.3386545777320862, + "learning_rate": 0.0001, + "loss": 1.6777, + "step": 1042 + }, + { + "epoch": 0.11980931594968698, + "grad_norm": 0.31798994541168213, + "learning_rate": 0.0001, + "loss": 1.5473, + "step": 1043 + }, + { + "epoch": 0.1199241858595141, + "grad_norm": 0.3879840672016144, + "learning_rate": 0.0001, + "loss": 1.9508, + "step": 1044 + }, + { + "epoch": 0.12003905576934122, + "grad_norm": 0.3498954176902771, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 1045 + }, + { + "epoch": 0.12015392567916834, + "grad_norm": 0.3412124216556549, + "learning_rate": 0.0001, + "loss": 1.6106, + "step": 1046 + }, + { + "epoch": 0.12026879558899546, + "grad_norm": 0.35735565423965454, + "learning_rate": 0.0001, + "loss": 1.7393, + "step": 1047 + }, + { + "epoch": 0.12038366549882258, + "grad_norm": 0.339915931224823, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 1048 + }, + { + "epoch": 0.1204985354086497, + "grad_norm": 0.3441682457923889, + "learning_rate": 0.0001, + "loss": 1.7686, + "step": 1049 + }, + { + "epoch": 0.12061340531847682, + "grad_norm": 0.36500778794288635, + "learning_rate": 0.0001, + "loss": 1.7857, + "step": 1050 + }, + { + "epoch": 0.12072827522830394, + "grad_norm": 0.34607526659965515, + "learning_rate": 0.0001, + "loss": 1.8434, + "step": 1051 + }, + { + "epoch": 0.12084314513813106, + "grad_norm": 0.33628061413764954, + "learning_rate": 0.0001, + "loss": 1.7286, + "step": 1052 + }, + { + "epoch": 0.12095801504795818, + "grad_norm": 0.3468761146068573, + "learning_rate": 0.0001, + "loss": 1.8219, + "step": 1053 + }, + { + "epoch": 0.12107288495778532, + "grad_norm": 0.34483084082603455, + "learning_rate": 0.0001, + "loss": 1.7694, + "step": 1054 + }, + { + "epoch": 0.12118775486761243, + "grad_norm": 0.3372325003147125, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 1055 + }, + { + "epoch": 0.12130262477743955, + "grad_norm": 0.33837631344795227, + "learning_rate": 0.0001, + "loss": 1.6369, + "step": 1056 + }, + { + "epoch": 0.12141749468726667, + "grad_norm": 0.3314455449581146, + "learning_rate": 0.0001, + "loss": 1.6946, + "step": 1057 + }, + { + "epoch": 0.1215323645970938, + "grad_norm": 0.33914583921432495, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 1058 + }, + { + "epoch": 0.12164723450692091, + "grad_norm": 0.33204326033592224, + "learning_rate": 0.0001, + "loss": 1.6924, + "step": 1059 + }, + { + "epoch": 0.12176210441674803, + "grad_norm": 0.3244878649711609, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 1060 + }, + { + "epoch": 0.12187697432657515, + "grad_norm": 0.37732627987861633, + "learning_rate": 0.0001, + "loss": 1.7643, + "step": 1061 + }, + { + "epoch": 0.12199184423640227, + "grad_norm": 0.3585972785949707, + "learning_rate": 0.0001, + "loss": 1.6568, + "step": 1062 + }, + { + "epoch": 0.1221067141462294, + "grad_norm": 0.36080244183540344, + "learning_rate": 0.0001, + "loss": 1.5489, + "step": 1063 + }, + { + "epoch": 0.12222158405605651, + "grad_norm": 0.3640299141407013, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 1064 + }, + { + "epoch": 0.12233645396588363, + "grad_norm": 0.3357522189617157, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 1065 + }, + { + "epoch": 0.12245132387571075, + "grad_norm": 0.37440431118011475, + "learning_rate": 0.0001, + "loss": 1.4352, + "step": 1066 + }, + { + "epoch": 0.12256619378553787, + "grad_norm": 0.3403536379337311, + "learning_rate": 0.0001, + "loss": 1.7168, + "step": 1067 + }, + { + "epoch": 0.122681063695365, + "grad_norm": 0.34183967113494873, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 1068 + }, + { + "epoch": 0.12279593360519211, + "grad_norm": 0.34776198863983154, + "learning_rate": 0.0001, + "loss": 1.7219, + "step": 1069 + }, + { + "epoch": 0.12291080351501923, + "grad_norm": 0.36906319856643677, + "learning_rate": 0.0001, + "loss": 1.8587, + "step": 1070 + }, + { + "epoch": 0.12302567342484635, + "grad_norm": 0.33320197463035583, + "learning_rate": 0.0001, + "loss": 1.6826, + "step": 1071 + }, + { + "epoch": 0.12314054333467349, + "grad_norm": 0.35025712847709656, + "learning_rate": 0.0001, + "loss": 1.7588, + "step": 1072 + }, + { + "epoch": 0.12325541324450061, + "grad_norm": 0.33611828088760376, + "learning_rate": 0.0001, + "loss": 1.6779, + "step": 1073 + }, + { + "epoch": 0.12337028315432773, + "grad_norm": 0.34194374084472656, + "learning_rate": 0.0001, + "loss": 1.6713, + "step": 1074 + }, + { + "epoch": 0.12348515306415485, + "grad_norm": 0.3618099093437195, + "learning_rate": 0.0001, + "loss": 1.928, + "step": 1075 + }, + { + "epoch": 0.12360002297398197, + "grad_norm": 0.36288702487945557, + "learning_rate": 0.0001, + "loss": 1.825, + "step": 1076 + }, + { + "epoch": 0.12371489288380909, + "grad_norm": 0.3502649664878845, + "learning_rate": 0.0001, + "loss": 1.805, + "step": 1077 + }, + { + "epoch": 0.12382976279363621, + "grad_norm": 0.3414769470691681, + "learning_rate": 0.0001, + "loss": 1.769, + "step": 1078 + }, + { + "epoch": 0.12394463270346333, + "grad_norm": 0.3354913592338562, + "learning_rate": 0.0001, + "loss": 1.8155, + "step": 1079 + }, + { + "epoch": 0.12405950261329045, + "grad_norm": 0.34553056955337524, + "learning_rate": 0.0001, + "loss": 1.5901, + "step": 1080 + }, + { + "epoch": 0.12417437252311757, + "grad_norm": 0.3307434320449829, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 1081 + }, + { + "epoch": 0.12428924243294469, + "grad_norm": 0.3462519347667694, + "learning_rate": 0.0001, + "loss": 1.7164, + "step": 1082 + }, + { + "epoch": 0.12440411234277181, + "grad_norm": 0.3434395492076874, + "learning_rate": 0.0001, + "loss": 1.9303, + "step": 1083 + }, + { + "epoch": 0.12451898225259893, + "grad_norm": 0.32828354835510254, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 1084 + }, + { + "epoch": 0.12463385216242605, + "grad_norm": 0.3581448793411255, + "learning_rate": 0.0001, + "loss": 1.7733, + "step": 1085 + }, + { + "epoch": 0.12474872207225317, + "grad_norm": 0.33728450536727905, + "learning_rate": 0.0001, + "loss": 1.6101, + "step": 1086 + }, + { + "epoch": 0.12486359198208029, + "grad_norm": 0.36441946029663086, + "learning_rate": 0.0001, + "loss": 1.8154, + "step": 1087 + }, + { + "epoch": 0.12497846189190741, + "grad_norm": 0.3511005640029907, + "learning_rate": 0.0001, + "loss": 1.8249, + "step": 1088 + }, + { + "epoch": 0.12509333180173454, + "grad_norm": 0.36078888177871704, + "learning_rate": 0.0001, + "loss": 1.7089, + "step": 1089 + }, + { + "epoch": 0.12520820171156166, + "grad_norm": 0.3429834246635437, + "learning_rate": 0.0001, + "loss": 1.66, + "step": 1090 + }, + { + "epoch": 0.12532307162138878, + "grad_norm": 0.3412310779094696, + "learning_rate": 0.0001, + "loss": 1.704, + "step": 1091 + }, + { + "epoch": 0.1254379415312159, + "grad_norm": 0.377566933631897, + "learning_rate": 0.0001, + "loss": 2.0321, + "step": 1092 + }, + { + "epoch": 0.12555281144104302, + "grad_norm": 0.3425246477127075, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 1093 + }, + { + "epoch": 0.12566768135087014, + "grad_norm": 0.36676526069641113, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 1094 + }, + { + "epoch": 0.12578255126069726, + "grad_norm": 0.3748563528060913, + "learning_rate": 0.0001, + "loss": 1.6054, + "step": 1095 + }, + { + "epoch": 0.12589742117052438, + "grad_norm": 0.3399527370929718, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 1096 + }, + { + "epoch": 0.1260122910803515, + "grad_norm": 0.32709023356437683, + "learning_rate": 0.0001, + "loss": 1.606, + "step": 1097 + }, + { + "epoch": 0.12612716099017862, + "grad_norm": 0.3199503421783447, + "learning_rate": 0.0001, + "loss": 1.5727, + "step": 1098 + }, + { + "epoch": 0.12624203090000574, + "grad_norm": 0.36548885703086853, + "learning_rate": 0.0001, + "loss": 1.5882, + "step": 1099 + }, + { + "epoch": 0.12635690080983286, + "grad_norm": 0.35938435792922974, + "learning_rate": 0.0001, + "loss": 1.8585, + "step": 1100 + }, + { + "epoch": 0.12647177071965998, + "grad_norm": 0.3288070857524872, + "learning_rate": 0.0001, + "loss": 1.8322, + "step": 1101 + }, + { + "epoch": 0.1265866406294871, + "grad_norm": 0.3379286527633667, + "learning_rate": 0.0001, + "loss": 1.7366, + "step": 1102 + }, + { + "epoch": 0.12670151053931422, + "grad_norm": 0.38364285230636597, + "learning_rate": 0.0001, + "loss": 1.9132, + "step": 1103 + }, + { + "epoch": 0.12681638044914134, + "grad_norm": 0.348711222410202, + "learning_rate": 0.0001, + "loss": 1.8004, + "step": 1104 + }, + { + "epoch": 0.12693125035896846, + "grad_norm": 0.3605641722679138, + "learning_rate": 0.0001, + "loss": 1.7316, + "step": 1105 + }, + { + "epoch": 0.12704612026879558, + "grad_norm": 0.3259005546569824, + "learning_rate": 0.0001, + "loss": 1.7338, + "step": 1106 + }, + { + "epoch": 0.1271609901786227, + "grad_norm": 0.374476820230484, + "learning_rate": 0.0001, + "loss": 1.9204, + "step": 1107 + }, + { + "epoch": 0.12727586008844982, + "grad_norm": 0.36476418375968933, + "learning_rate": 0.0001, + "loss": 1.7403, + "step": 1108 + }, + { + "epoch": 0.12739072999827694, + "grad_norm": 0.3357693552970886, + "learning_rate": 0.0001, + "loss": 1.6227, + "step": 1109 + }, + { + "epoch": 0.12750559990810406, + "grad_norm": 0.34735891222953796, + "learning_rate": 0.0001, + "loss": 1.5068, + "step": 1110 + }, + { + "epoch": 0.12762046981793118, + "grad_norm": 0.34894609451293945, + "learning_rate": 0.0001, + "loss": 1.6742, + "step": 1111 + }, + { + "epoch": 0.1277353397277583, + "grad_norm": 0.34470030665397644, + "learning_rate": 0.0001, + "loss": 1.6905, + "step": 1112 + }, + { + "epoch": 0.12785020963758542, + "grad_norm": 0.34968432784080505, + "learning_rate": 0.0001, + "loss": 1.535, + "step": 1113 + }, + { + "epoch": 0.12796507954741257, + "grad_norm": 0.35253041982650757, + "learning_rate": 0.0001, + "loss": 1.5959, + "step": 1114 + }, + { + "epoch": 0.1280799494572397, + "grad_norm": 0.36112409830093384, + "learning_rate": 0.0001, + "loss": 1.7485, + "step": 1115 + }, + { + "epoch": 0.1281948193670668, + "grad_norm": 0.361447274684906, + "learning_rate": 0.0001, + "loss": 1.8199, + "step": 1116 + }, + { + "epoch": 0.12830968927689393, + "grad_norm": 0.3329083323478699, + "learning_rate": 0.0001, + "loss": 1.8049, + "step": 1117 + }, + { + "epoch": 0.12842455918672105, + "grad_norm": 0.37761425971984863, + "learning_rate": 0.0001, + "loss": 1.8367, + "step": 1118 + }, + { + "epoch": 0.12853942909654817, + "grad_norm": 0.36730146408081055, + "learning_rate": 0.0001, + "loss": 1.7953, + "step": 1119 + }, + { + "epoch": 0.1286542990063753, + "grad_norm": 0.3333646357059479, + "learning_rate": 0.0001, + "loss": 1.6046, + "step": 1120 + }, + { + "epoch": 0.1287691689162024, + "grad_norm": 0.37335070967674255, + "learning_rate": 0.0001, + "loss": 1.9296, + "step": 1121 + }, + { + "epoch": 0.12888403882602953, + "grad_norm": 0.36722204089164734, + "learning_rate": 0.0001, + "loss": 1.7312, + "step": 1122 + }, + { + "epoch": 0.12899890873585665, + "grad_norm": 0.34996774792671204, + "learning_rate": 0.0001, + "loss": 1.7276, + "step": 1123 + }, + { + "epoch": 0.12911377864568377, + "grad_norm": 0.3631379306316376, + "learning_rate": 0.0001, + "loss": 1.7279, + "step": 1124 + }, + { + "epoch": 0.1292286485555109, + "grad_norm": 0.3820423483848572, + "learning_rate": 0.0001, + "loss": 1.7714, + "step": 1125 + }, + { + "epoch": 0.129343518465338, + "grad_norm": 0.3379541039466858, + "learning_rate": 0.0001, + "loss": 1.6302, + "step": 1126 + }, + { + "epoch": 0.12945838837516513, + "grad_norm": 0.35675719380378723, + "learning_rate": 0.0001, + "loss": 1.5973, + "step": 1127 + }, + { + "epoch": 0.12957325828499225, + "grad_norm": 0.34876230359077454, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 1128 + }, + { + "epoch": 0.12968812819481937, + "grad_norm": 0.35678061842918396, + "learning_rate": 0.0001, + "loss": 1.8694, + "step": 1129 + }, + { + "epoch": 0.1298029981046465, + "grad_norm": 0.3658314347267151, + "learning_rate": 0.0001, + "loss": 1.9091, + "step": 1130 + }, + { + "epoch": 0.1299178680144736, + "grad_norm": 0.3842300474643707, + "learning_rate": 0.0001, + "loss": 1.7441, + "step": 1131 + }, + { + "epoch": 0.13003273792430073, + "grad_norm": 0.3557139039039612, + "learning_rate": 0.0001, + "loss": 1.9989, + "step": 1132 + }, + { + "epoch": 0.13014760783412785, + "grad_norm": 0.3451981246471405, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 1133 + }, + { + "epoch": 0.13026247774395497, + "grad_norm": 0.34899675846099854, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 1134 + }, + { + "epoch": 0.1303773476537821, + "grad_norm": 0.3958018124103546, + "learning_rate": 0.0001, + "loss": 2.2029, + "step": 1135 + }, + { + "epoch": 0.1304922175636092, + "grad_norm": 0.33789554238319397, + "learning_rate": 0.0001, + "loss": 1.677, + "step": 1136 + }, + { + "epoch": 0.13060708747343633, + "grad_norm": 0.3390919864177704, + "learning_rate": 0.0001, + "loss": 1.7488, + "step": 1137 + }, + { + "epoch": 0.13072195738326345, + "grad_norm": 0.32724031805992126, + "learning_rate": 0.0001, + "loss": 1.6631, + "step": 1138 + }, + { + "epoch": 0.13083682729309057, + "grad_norm": 0.35171130299568176, + "learning_rate": 0.0001, + "loss": 1.8461, + "step": 1139 + }, + { + "epoch": 0.1309516972029177, + "grad_norm": 0.37492451071739197, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 1140 + }, + { + "epoch": 0.1310665671127448, + "grad_norm": 0.3509044051170349, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 1141 + }, + { + "epoch": 0.13118143702257193, + "grad_norm": 0.379950612783432, + "learning_rate": 0.0001, + "loss": 1.8488, + "step": 1142 + }, + { + "epoch": 0.13129630693239905, + "grad_norm": 0.3770541846752167, + "learning_rate": 0.0001, + "loss": 1.9027, + "step": 1143 + }, + { + "epoch": 0.13141117684222617, + "grad_norm": 0.3268395960330963, + "learning_rate": 0.0001, + "loss": 1.6642, + "step": 1144 + }, + { + "epoch": 0.1315260467520533, + "grad_norm": 0.3465530276298523, + "learning_rate": 0.0001, + "loss": 1.5535, + "step": 1145 + }, + { + "epoch": 0.1316409166618804, + "grad_norm": 0.3526296317577362, + "learning_rate": 0.0001, + "loss": 1.9214, + "step": 1146 + }, + { + "epoch": 0.13175578657170753, + "grad_norm": 0.3607937693595886, + "learning_rate": 0.0001, + "loss": 1.9947, + "step": 1147 + }, + { + "epoch": 0.13187065648153465, + "grad_norm": 0.35963696241378784, + "learning_rate": 0.0001, + "loss": 1.7277, + "step": 1148 + }, + { + "epoch": 0.1319855263913618, + "grad_norm": 0.3521120846271515, + "learning_rate": 0.0001, + "loss": 1.7295, + "step": 1149 + }, + { + "epoch": 0.13210039630118892, + "grad_norm": 0.34942013025283813, + "learning_rate": 0.0001, + "loss": 1.6966, + "step": 1150 + }, + { + "epoch": 0.13221526621101604, + "grad_norm": 0.3466810882091522, + "learning_rate": 0.0001, + "loss": 1.5861, + "step": 1151 + }, + { + "epoch": 0.13233013612084316, + "grad_norm": 0.36123740673065186, + "learning_rate": 0.0001, + "loss": 1.7322, + "step": 1152 + }, + { + "epoch": 0.13244500603067028, + "grad_norm": 0.36202317476272583, + "learning_rate": 0.0001, + "loss": 1.844, + "step": 1153 + }, + { + "epoch": 0.1325598759404974, + "grad_norm": 0.3521657884120941, + "learning_rate": 0.0001, + "loss": 1.778, + "step": 1154 + }, + { + "epoch": 0.13267474585032452, + "grad_norm": 0.35501620173454285, + "learning_rate": 0.0001, + "loss": 1.7139, + "step": 1155 + }, + { + "epoch": 0.13278961576015164, + "grad_norm": 0.35365980863571167, + "learning_rate": 0.0001, + "loss": 1.5055, + "step": 1156 + }, + { + "epoch": 0.13290448566997876, + "grad_norm": 0.3492221534252167, + "learning_rate": 0.0001, + "loss": 1.7555, + "step": 1157 + }, + { + "epoch": 0.13301935557980588, + "grad_norm": 0.4094052016735077, + "learning_rate": 0.0001, + "loss": 1.7331, + "step": 1158 + }, + { + "epoch": 0.133134225489633, + "grad_norm": 0.385098934173584, + "learning_rate": 0.0001, + "loss": 1.6894, + "step": 1159 + }, + { + "epoch": 0.13324909539946012, + "grad_norm": 0.3582170605659485, + "learning_rate": 0.0001, + "loss": 1.8903, + "step": 1160 + }, + { + "epoch": 0.13336396530928724, + "grad_norm": 0.35824549198150635, + "learning_rate": 0.0001, + "loss": 1.6978, + "step": 1161 + }, + { + "epoch": 0.13347883521911436, + "grad_norm": 0.36423972249031067, + "learning_rate": 0.0001, + "loss": 1.7682, + "step": 1162 + }, + { + "epoch": 0.13359370512894148, + "grad_norm": 0.3333613872528076, + "learning_rate": 0.0001, + "loss": 1.7676, + "step": 1163 + }, + { + "epoch": 0.1337085750387686, + "grad_norm": 0.3693676292896271, + "learning_rate": 0.0001, + "loss": 1.8687, + "step": 1164 + }, + { + "epoch": 0.13382344494859572, + "grad_norm": 0.36510157585144043, + "learning_rate": 0.0001, + "loss": 1.8022, + "step": 1165 + }, + { + "epoch": 0.13393831485842284, + "grad_norm": 0.35315313935279846, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 1166 + }, + { + "epoch": 0.13405318476824996, + "grad_norm": 0.3328644037246704, + "learning_rate": 0.0001, + "loss": 1.5863, + "step": 1167 + }, + { + "epoch": 0.13416805467807708, + "grad_norm": 0.34875044226646423, + "learning_rate": 0.0001, + "loss": 1.6538, + "step": 1168 + }, + { + "epoch": 0.1342829245879042, + "grad_norm": 0.3806982636451721, + "learning_rate": 0.0001, + "loss": 1.8317, + "step": 1169 + }, + { + "epoch": 0.13439779449773132, + "grad_norm": 0.35632237792015076, + "learning_rate": 0.0001, + "loss": 1.757, + "step": 1170 + }, + { + "epoch": 0.13451266440755844, + "grad_norm": 0.3340839147567749, + "learning_rate": 0.0001, + "loss": 1.6582, + "step": 1171 + }, + { + "epoch": 0.13462753431738556, + "grad_norm": 0.3517102599143982, + "learning_rate": 0.0001, + "loss": 1.6821, + "step": 1172 + }, + { + "epoch": 0.13474240422721268, + "grad_norm": 0.33846724033355713, + "learning_rate": 0.0001, + "loss": 1.7031, + "step": 1173 + }, + { + "epoch": 0.1348572741370398, + "grad_norm": 0.3715790808200836, + "learning_rate": 0.0001, + "loss": 1.6864, + "step": 1174 + }, + { + "epoch": 0.13497214404686692, + "grad_norm": 0.30750709772109985, + "learning_rate": 0.0001, + "loss": 1.4213, + "step": 1175 + }, + { + "epoch": 0.13508701395669404, + "grad_norm": 0.35879287123680115, + "learning_rate": 0.0001, + "loss": 1.6197, + "step": 1176 + }, + { + "epoch": 0.13520188386652116, + "grad_norm": 0.35407108068466187, + "learning_rate": 0.0001, + "loss": 1.6578, + "step": 1177 + }, + { + "epoch": 0.13531675377634828, + "grad_norm": 0.35184237360954285, + "learning_rate": 0.0001, + "loss": 1.7593, + "step": 1178 + }, + { + "epoch": 0.1354316236861754, + "grad_norm": 0.3242950737476349, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 1179 + }, + { + "epoch": 0.13554649359600252, + "grad_norm": 0.3433818817138672, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 1180 + }, + { + "epoch": 0.13566136350582964, + "grad_norm": 0.34305769205093384, + "learning_rate": 0.0001, + "loss": 1.5726, + "step": 1181 + }, + { + "epoch": 0.13577623341565676, + "grad_norm": 0.35595643520355225, + "learning_rate": 0.0001, + "loss": 1.7659, + "step": 1182 + }, + { + "epoch": 0.1358911033254839, + "grad_norm": 0.3613067865371704, + "learning_rate": 0.0001, + "loss": 1.8036, + "step": 1183 + }, + { + "epoch": 0.13600597323531102, + "grad_norm": 0.38018158078193665, + "learning_rate": 0.0001, + "loss": 1.9313, + "step": 1184 + }, + { + "epoch": 0.13612084314513814, + "grad_norm": 0.3490848243236542, + "learning_rate": 0.0001, + "loss": 1.7979, + "step": 1185 + }, + { + "epoch": 0.13623571305496526, + "grad_norm": 0.35665586590766907, + "learning_rate": 0.0001, + "loss": 1.7383, + "step": 1186 + }, + { + "epoch": 0.13635058296479238, + "grad_norm": 0.3440529704093933, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 1187 + }, + { + "epoch": 0.1364654528746195, + "grad_norm": 0.31870830059051514, + "learning_rate": 0.0001, + "loss": 1.5472, + "step": 1188 + }, + { + "epoch": 0.13658032278444662, + "grad_norm": 0.3264746367931366, + "learning_rate": 0.0001, + "loss": 1.6856, + "step": 1189 + }, + { + "epoch": 0.13669519269427374, + "grad_norm": 0.364711731672287, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 1190 + }, + { + "epoch": 0.13681006260410086, + "grad_norm": 0.34434038400650024, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 1191 + }, + { + "epoch": 0.13692493251392798, + "grad_norm": 0.36517781019210815, + "learning_rate": 0.0001, + "loss": 1.8602, + "step": 1192 + }, + { + "epoch": 0.1370398024237551, + "grad_norm": 0.36987563967704773, + "learning_rate": 0.0001, + "loss": 1.7108, + "step": 1193 + }, + { + "epoch": 0.13715467233358222, + "grad_norm": 0.34185507893562317, + "learning_rate": 0.0001, + "loss": 1.7236, + "step": 1194 + }, + { + "epoch": 0.13726954224340934, + "grad_norm": 0.3375924527645111, + "learning_rate": 0.0001, + "loss": 1.7451, + "step": 1195 + }, + { + "epoch": 0.13738441215323646, + "grad_norm": 0.3625550866127014, + "learning_rate": 0.0001, + "loss": 1.8872, + "step": 1196 + }, + { + "epoch": 0.13749928206306358, + "grad_norm": 0.36060526967048645, + "learning_rate": 0.0001, + "loss": 1.9059, + "step": 1197 + }, + { + "epoch": 0.1376141519728907, + "grad_norm": 0.32978108525276184, + "learning_rate": 0.0001, + "loss": 1.5382, + "step": 1198 + }, + { + "epoch": 0.13772902188271782, + "grad_norm": 0.3556724488735199, + "learning_rate": 0.0001, + "loss": 1.659, + "step": 1199 + }, + { + "epoch": 0.13784389179254494, + "grad_norm": 0.36621591448783875, + "learning_rate": 0.0001, + "loss": 1.9125, + "step": 1200 + }, + { + "epoch": 0.13795876170237206, + "grad_norm": 0.3547092080116272, + "learning_rate": 0.0001, + "loss": 1.7579, + "step": 1201 + }, + { + "epoch": 0.13807363161219918, + "grad_norm": 0.3358149826526642, + "learning_rate": 0.0001, + "loss": 1.7776, + "step": 1202 + }, + { + "epoch": 0.1381885015220263, + "grad_norm": 0.37259694933891296, + "learning_rate": 0.0001, + "loss": 1.7184, + "step": 1203 + }, + { + "epoch": 0.13830337143185342, + "grad_norm": 0.35611268877983093, + "learning_rate": 0.0001, + "loss": 1.7256, + "step": 1204 + }, + { + "epoch": 0.13841824134168054, + "grad_norm": 0.3654508590698242, + "learning_rate": 0.0001, + "loss": 1.8226, + "step": 1205 + }, + { + "epoch": 0.13853311125150766, + "grad_norm": 0.3912622928619385, + "learning_rate": 0.0001, + "loss": 1.7948, + "step": 1206 + }, + { + "epoch": 0.13864798116133478, + "grad_norm": 0.37169212102890015, + "learning_rate": 0.0001, + "loss": 1.7112, + "step": 1207 + }, + { + "epoch": 0.1387628510711619, + "grad_norm": 0.34099262952804565, + "learning_rate": 0.0001, + "loss": 1.5217, + "step": 1208 + }, + { + "epoch": 0.13887772098098902, + "grad_norm": 0.34905219078063965, + "learning_rate": 0.0001, + "loss": 1.734, + "step": 1209 + }, + { + "epoch": 0.13899259089081614, + "grad_norm": 0.35248732566833496, + "learning_rate": 0.0001, + "loss": 1.6179, + "step": 1210 + }, + { + "epoch": 0.13910746080064326, + "grad_norm": 0.343364417552948, + "learning_rate": 0.0001, + "loss": 1.8933, + "step": 1211 + }, + { + "epoch": 0.13922233071047038, + "grad_norm": 0.3398562967777252, + "learning_rate": 0.0001, + "loss": 1.7221, + "step": 1212 + }, + { + "epoch": 0.1393372006202975, + "grad_norm": 0.3378565013408661, + "learning_rate": 0.0001, + "loss": 1.7439, + "step": 1213 + }, + { + "epoch": 0.13945207053012462, + "grad_norm": 0.36921605467796326, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 1214 + }, + { + "epoch": 0.13956694043995174, + "grad_norm": 0.3712867200374603, + "learning_rate": 0.0001, + "loss": 1.8628, + "step": 1215 + }, + { + "epoch": 0.13968181034977886, + "grad_norm": 0.3491092324256897, + "learning_rate": 0.0001, + "loss": 1.7379, + "step": 1216 + }, + { + "epoch": 0.139796680259606, + "grad_norm": 0.3602731227874756, + "learning_rate": 0.0001, + "loss": 1.7088, + "step": 1217 + }, + { + "epoch": 0.13991155016943313, + "grad_norm": 0.39251574873924255, + "learning_rate": 0.0001, + "loss": 1.8268, + "step": 1218 + }, + { + "epoch": 0.14002642007926025, + "grad_norm": 0.3653351664543152, + "learning_rate": 0.0001, + "loss": 1.7597, + "step": 1219 + }, + { + "epoch": 0.14014128998908737, + "grad_norm": 0.3393062353134155, + "learning_rate": 0.0001, + "loss": 1.6635, + "step": 1220 + }, + { + "epoch": 0.1402561598989145, + "grad_norm": 0.3772340416908264, + "learning_rate": 0.0001, + "loss": 1.7796, + "step": 1221 + }, + { + "epoch": 0.1403710298087416, + "grad_norm": 0.3478431701660156, + "learning_rate": 0.0001, + "loss": 1.6802, + "step": 1222 + }, + { + "epoch": 0.14048589971856873, + "grad_norm": 0.33005356788635254, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 1223 + }, + { + "epoch": 0.14060076962839585, + "grad_norm": 0.348294734954834, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 1224 + }, + { + "epoch": 0.14071563953822297, + "grad_norm": 0.35383546352386475, + "learning_rate": 0.0001, + "loss": 1.5475, + "step": 1225 + }, + { + "epoch": 0.1408305094480501, + "grad_norm": 0.37990522384643555, + "learning_rate": 0.0001, + "loss": 1.8762, + "step": 1226 + }, + { + "epoch": 0.1409453793578772, + "grad_norm": 0.3329927623271942, + "learning_rate": 0.0001, + "loss": 1.752, + "step": 1227 + }, + { + "epoch": 0.14106024926770433, + "grad_norm": 0.3647679090499878, + "learning_rate": 0.0001, + "loss": 1.7886, + "step": 1228 + }, + { + "epoch": 0.14117511917753145, + "grad_norm": 0.3833819627761841, + "learning_rate": 0.0001, + "loss": 1.8977, + "step": 1229 + }, + { + "epoch": 0.14128998908735857, + "grad_norm": 0.3702079951763153, + "learning_rate": 0.0001, + "loss": 1.6843, + "step": 1230 + }, + { + "epoch": 0.1414048589971857, + "grad_norm": 0.3493407368659973, + "learning_rate": 0.0001, + "loss": 1.7134, + "step": 1231 + }, + { + "epoch": 0.1415197289070128, + "grad_norm": 0.3664112687110901, + "learning_rate": 0.0001, + "loss": 1.7641, + "step": 1232 + }, + { + "epoch": 0.14163459881683993, + "grad_norm": 0.3498665988445282, + "learning_rate": 0.0001, + "loss": 1.7903, + "step": 1233 + }, + { + "epoch": 0.14174946872666705, + "grad_norm": 0.37883898615837097, + "learning_rate": 0.0001, + "loss": 1.7601, + "step": 1234 + }, + { + "epoch": 0.14186433863649417, + "grad_norm": 0.37011733651161194, + "learning_rate": 0.0001, + "loss": 1.8304, + "step": 1235 + }, + { + "epoch": 0.1419792085463213, + "grad_norm": 0.3556150197982788, + "learning_rate": 0.0001, + "loss": 1.7222, + "step": 1236 + }, + { + "epoch": 0.1420940784561484, + "grad_norm": 0.3517720103263855, + "learning_rate": 0.0001, + "loss": 1.5597, + "step": 1237 + }, + { + "epoch": 0.14220894836597553, + "grad_norm": 0.3677484691143036, + "learning_rate": 0.0001, + "loss": 1.9557, + "step": 1238 + }, + { + "epoch": 0.14232381827580265, + "grad_norm": 0.35142794251441956, + "learning_rate": 0.0001, + "loss": 1.7057, + "step": 1239 + }, + { + "epoch": 0.14243868818562977, + "grad_norm": 0.33948856592178345, + "learning_rate": 0.0001, + "loss": 1.5986, + "step": 1240 + }, + { + "epoch": 0.1425535580954569, + "grad_norm": 0.340010404586792, + "learning_rate": 0.0001, + "loss": 1.6787, + "step": 1241 + }, + { + "epoch": 0.142668428005284, + "grad_norm": 0.34367939829826355, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 1242 + }, + { + "epoch": 0.14278329791511113, + "grad_norm": 0.38786405324935913, + "learning_rate": 0.0001, + "loss": 1.7947, + "step": 1243 + }, + { + "epoch": 0.14289816782493825, + "grad_norm": 0.35119712352752686, + "learning_rate": 0.0001, + "loss": 1.6828, + "step": 1244 + }, + { + "epoch": 0.14301303773476537, + "grad_norm": 0.3436744809150696, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 1245 + }, + { + "epoch": 0.1431279076445925, + "grad_norm": 0.3338393270969391, + "learning_rate": 0.0001, + "loss": 1.536, + "step": 1246 + }, + { + "epoch": 0.1432427775544196, + "grad_norm": 0.38156992197036743, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 1247 + }, + { + "epoch": 0.14335764746424673, + "grad_norm": 0.34734535217285156, + "learning_rate": 0.0001, + "loss": 1.6203, + "step": 1248 + }, + { + "epoch": 0.14347251737407385, + "grad_norm": 0.34041401743888855, + "learning_rate": 0.0001, + "loss": 1.7274, + "step": 1249 + }, + { + "epoch": 0.14358738728390097, + "grad_norm": 0.37156790494918823, + "learning_rate": 0.0001, + "loss": 1.7642, + "step": 1250 + }, + { + "epoch": 0.14370225719372812, + "grad_norm": 0.3499716818332672, + "learning_rate": 0.0001, + "loss": 1.7686, + "step": 1251 + }, + { + "epoch": 0.14381712710355524, + "grad_norm": 0.32781967520713806, + "learning_rate": 0.0001, + "loss": 1.5751, + "step": 1252 + }, + { + "epoch": 0.14393199701338236, + "grad_norm": 0.3511214852333069, + "learning_rate": 0.0001, + "loss": 1.6698, + "step": 1253 + }, + { + "epoch": 0.14404686692320948, + "grad_norm": 0.3647572994232178, + "learning_rate": 0.0001, + "loss": 1.6883, + "step": 1254 + }, + { + "epoch": 0.1441617368330366, + "grad_norm": 0.3573339879512787, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 1255 + }, + { + "epoch": 0.14427660674286372, + "grad_norm": 0.34564316272735596, + "learning_rate": 0.0001, + "loss": 1.7221, + "step": 1256 + }, + { + "epoch": 0.14439147665269084, + "grad_norm": 0.3462870717048645, + "learning_rate": 0.0001, + "loss": 1.7767, + "step": 1257 + }, + { + "epoch": 0.14450634656251796, + "grad_norm": 0.3375271260738373, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 1258 + }, + { + "epoch": 0.14462121647234508, + "grad_norm": 0.3306554853916168, + "learning_rate": 0.0001, + "loss": 1.6813, + "step": 1259 + }, + { + "epoch": 0.1447360863821722, + "grad_norm": 0.3348354995250702, + "learning_rate": 0.0001, + "loss": 1.5897, + "step": 1260 + }, + { + "epoch": 0.14485095629199932, + "grad_norm": 0.3619769811630249, + "learning_rate": 0.0001, + "loss": 1.6966, + "step": 1261 + }, + { + "epoch": 0.14496582620182644, + "grad_norm": 0.3743123412132263, + "learning_rate": 0.0001, + "loss": 1.8824, + "step": 1262 + }, + { + "epoch": 0.14508069611165356, + "grad_norm": 0.33532246947288513, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 1263 + }, + { + "epoch": 0.14519556602148068, + "grad_norm": 0.37136563658714294, + "learning_rate": 0.0001, + "loss": 1.6995, + "step": 1264 + }, + { + "epoch": 0.1453104359313078, + "grad_norm": 0.4397648274898529, + "learning_rate": 0.0001, + "loss": 2.1078, + "step": 1265 + }, + { + "epoch": 0.14542530584113492, + "grad_norm": 0.3656454086303711, + "learning_rate": 0.0001, + "loss": 1.8501, + "step": 1266 + }, + { + "epoch": 0.14554017575096204, + "grad_norm": 0.3662125766277313, + "learning_rate": 0.0001, + "loss": 1.8535, + "step": 1267 + }, + { + "epoch": 0.14565504566078916, + "grad_norm": 0.4044576585292816, + "learning_rate": 0.0001, + "loss": 1.9077, + "step": 1268 + }, + { + "epoch": 0.14576991557061628, + "grad_norm": 0.3433517515659332, + "learning_rate": 0.0001, + "loss": 1.4109, + "step": 1269 + }, + { + "epoch": 0.1458847854804434, + "grad_norm": 0.3514901101589203, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 1270 + }, + { + "epoch": 0.14599965539027052, + "grad_norm": 0.3364449143409729, + "learning_rate": 0.0001, + "loss": 1.6527, + "step": 1271 + }, + { + "epoch": 0.14611452530009764, + "grad_norm": 0.3305410146713257, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 1272 + }, + { + "epoch": 0.14622939520992476, + "grad_norm": 0.35502052307128906, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 1273 + }, + { + "epoch": 0.14634426511975188, + "grad_norm": 0.3556758165359497, + "learning_rate": 0.0001, + "loss": 1.6573, + "step": 1274 + }, + { + "epoch": 0.146459135029579, + "grad_norm": 0.3576909899711609, + "learning_rate": 0.0001, + "loss": 1.6997, + "step": 1275 + }, + { + "epoch": 0.14657400493940612, + "grad_norm": 0.34596705436706543, + "learning_rate": 0.0001, + "loss": 1.7428, + "step": 1276 + }, + { + "epoch": 0.14668887484923324, + "grad_norm": 0.38846471905708313, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 1277 + }, + { + "epoch": 0.14680374475906036, + "grad_norm": 0.37087368965148926, + "learning_rate": 0.0001, + "loss": 1.443, + "step": 1278 + }, + { + "epoch": 0.14691861466888748, + "grad_norm": 0.3498583137989044, + "learning_rate": 0.0001, + "loss": 1.6488, + "step": 1279 + }, + { + "epoch": 0.1470334845787146, + "grad_norm": 0.34810611605644226, + "learning_rate": 0.0001, + "loss": 1.6464, + "step": 1280 + }, + { + "epoch": 0.14714835448854172, + "grad_norm": 0.3536350727081299, + "learning_rate": 0.0001, + "loss": 1.8913, + "step": 1281 + }, + { + "epoch": 0.14726322439836884, + "grad_norm": 0.3864216208457947, + "learning_rate": 0.0001, + "loss": 1.9487, + "step": 1282 + }, + { + "epoch": 0.14737809430819596, + "grad_norm": 0.36202993988990784, + "learning_rate": 0.0001, + "loss": 1.7515, + "step": 1283 + }, + { + "epoch": 0.14749296421802308, + "grad_norm": 0.3679467439651489, + "learning_rate": 0.0001, + "loss": 1.8294, + "step": 1284 + }, + { + "epoch": 0.1476078341278502, + "grad_norm": 0.3474784195423126, + "learning_rate": 0.0001, + "loss": 1.5674, + "step": 1285 + }, + { + "epoch": 0.14772270403767734, + "grad_norm": 0.3244973123073578, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 1286 + }, + { + "epoch": 0.14783757394750446, + "grad_norm": 0.36702069640159607, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 1287 + }, + { + "epoch": 0.14795244385733158, + "grad_norm": 0.3686244487762451, + "learning_rate": 0.0001, + "loss": 1.9021, + "step": 1288 + }, + { + "epoch": 0.1480673137671587, + "grad_norm": 0.36901167035102844, + "learning_rate": 0.0001, + "loss": 1.8618, + "step": 1289 + }, + { + "epoch": 0.14818218367698582, + "grad_norm": 0.3515526354312897, + "learning_rate": 0.0001, + "loss": 1.6961, + "step": 1290 + }, + { + "epoch": 0.14829705358681294, + "grad_norm": 0.36879947781562805, + "learning_rate": 0.0001, + "loss": 1.7333, + "step": 1291 + }, + { + "epoch": 0.14841192349664006, + "grad_norm": 0.3325467109680176, + "learning_rate": 0.0001, + "loss": 1.5058, + "step": 1292 + }, + { + "epoch": 0.14852679340646718, + "grad_norm": 0.3682822287082672, + "learning_rate": 0.0001, + "loss": 1.8649, + "step": 1293 + }, + { + "epoch": 0.1486416633162943, + "grad_norm": 0.3390534222126007, + "learning_rate": 0.0001, + "loss": 1.7462, + "step": 1294 + }, + { + "epoch": 0.14875653322612142, + "grad_norm": 0.3411741554737091, + "learning_rate": 0.0001, + "loss": 1.733, + "step": 1295 + }, + { + "epoch": 0.14887140313594854, + "grad_norm": 0.33508262038230896, + "learning_rate": 0.0001, + "loss": 1.7306, + "step": 1296 + }, + { + "epoch": 0.14898627304577566, + "grad_norm": 0.3365498483181, + "learning_rate": 0.0001, + "loss": 1.6963, + "step": 1297 + }, + { + "epoch": 0.14910114295560278, + "grad_norm": 0.3673759400844574, + "learning_rate": 0.0001, + "loss": 1.3957, + "step": 1298 + }, + { + "epoch": 0.1492160128654299, + "grad_norm": 0.32368943095207214, + "learning_rate": 0.0001, + "loss": 1.4204, + "step": 1299 + }, + { + "epoch": 0.14933088277525702, + "grad_norm": 0.37173643708229065, + "learning_rate": 0.0001, + "loss": 1.7783, + "step": 1300 + }, + { + "epoch": 0.14944575268508414, + "grad_norm": 0.3451181650161743, + "learning_rate": 0.0001, + "loss": 1.6048, + "step": 1301 + }, + { + "epoch": 0.14956062259491126, + "grad_norm": 0.3521316945552826, + "learning_rate": 0.0001, + "loss": 1.7907, + "step": 1302 + }, + { + "epoch": 0.14967549250473838, + "grad_norm": 0.3625703454017639, + "learning_rate": 0.0001, + "loss": 1.9229, + "step": 1303 + }, + { + "epoch": 0.1497903624145655, + "grad_norm": 0.32430633902549744, + "learning_rate": 0.0001, + "loss": 1.5798, + "step": 1304 + }, + { + "epoch": 0.14990523232439262, + "grad_norm": 0.3538731038570404, + "learning_rate": 0.0001, + "loss": 1.6985, + "step": 1305 + }, + { + "epoch": 0.15002010223421974, + "grad_norm": 0.362496018409729, + "learning_rate": 0.0001, + "loss": 1.6863, + "step": 1306 + }, + { + "epoch": 0.15013497214404686, + "grad_norm": 0.33480730652809143, + "learning_rate": 0.0001, + "loss": 1.6784, + "step": 1307 + }, + { + "epoch": 0.15024984205387398, + "grad_norm": 0.3331620991230011, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 1308 + }, + { + "epoch": 0.1503647119637011, + "grad_norm": 0.39281463623046875, + "learning_rate": 0.0001, + "loss": 1.791, + "step": 1309 + }, + { + "epoch": 0.15047958187352822, + "grad_norm": 0.3314681351184845, + "learning_rate": 0.0001, + "loss": 1.6435, + "step": 1310 + }, + { + "epoch": 0.15059445178335534, + "grad_norm": 0.349573016166687, + "learning_rate": 0.0001, + "loss": 1.6755, + "step": 1311 + }, + { + "epoch": 0.15070932169318246, + "grad_norm": 0.35691556334495544, + "learning_rate": 0.0001, + "loss": 1.7388, + "step": 1312 + }, + { + "epoch": 0.15082419160300958, + "grad_norm": 0.3481789827346802, + "learning_rate": 0.0001, + "loss": 1.6805, + "step": 1313 + }, + { + "epoch": 0.1509390615128367, + "grad_norm": 0.36650916934013367, + "learning_rate": 0.0001, + "loss": 1.8396, + "step": 1314 + }, + { + "epoch": 0.15105393142266382, + "grad_norm": 0.3846674859523773, + "learning_rate": 0.0001, + "loss": 1.9883, + "step": 1315 + }, + { + "epoch": 0.15116880133249094, + "grad_norm": 0.3474387228488922, + "learning_rate": 0.0001, + "loss": 1.4629, + "step": 1316 + }, + { + "epoch": 0.15128367124231806, + "grad_norm": 0.37427589297294617, + "learning_rate": 0.0001, + "loss": 1.7977, + "step": 1317 + }, + { + "epoch": 0.15139854115214518, + "grad_norm": 0.38520902395248413, + "learning_rate": 0.0001, + "loss": 1.8338, + "step": 1318 + }, + { + "epoch": 0.1515134110619723, + "grad_norm": 0.3556143641471863, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 1319 + }, + { + "epoch": 0.15162828097179945, + "grad_norm": 0.36979398131370544, + "learning_rate": 0.0001, + "loss": 1.7625, + "step": 1320 + }, + { + "epoch": 0.15174315088162657, + "grad_norm": 0.35435059666633606, + "learning_rate": 0.0001, + "loss": 1.6956, + "step": 1321 + }, + { + "epoch": 0.1518580207914537, + "grad_norm": 0.35871556401252747, + "learning_rate": 0.0001, + "loss": 1.8573, + "step": 1322 + }, + { + "epoch": 0.1519728907012808, + "grad_norm": 0.3628619611263275, + "learning_rate": 0.0001, + "loss": 1.8769, + "step": 1323 + }, + { + "epoch": 0.15208776061110793, + "grad_norm": 0.3393780291080475, + "learning_rate": 0.0001, + "loss": 1.5716, + "step": 1324 + }, + { + "epoch": 0.15220263052093505, + "grad_norm": 0.3389873802661896, + "learning_rate": 0.0001, + "loss": 1.6257, + "step": 1325 + }, + { + "epoch": 0.15231750043076217, + "grad_norm": 0.3557165563106537, + "learning_rate": 0.0001, + "loss": 1.8152, + "step": 1326 + }, + { + "epoch": 0.1524323703405893, + "grad_norm": 0.36443039774894714, + "learning_rate": 0.0001, + "loss": 1.7758, + "step": 1327 + }, + { + "epoch": 0.1525472402504164, + "grad_norm": 0.3473578989505768, + "learning_rate": 0.0001, + "loss": 1.6577, + "step": 1328 + }, + { + "epoch": 0.15266211016024353, + "grad_norm": 0.36054491996765137, + "learning_rate": 0.0001, + "loss": 1.7633, + "step": 1329 + }, + { + "epoch": 0.15277698007007065, + "grad_norm": 0.3966304659843445, + "learning_rate": 0.0001, + "loss": 1.8216, + "step": 1330 + }, + { + "epoch": 0.15289184997989777, + "grad_norm": 0.3621263802051544, + "learning_rate": 0.0001, + "loss": 1.8102, + "step": 1331 + }, + { + "epoch": 0.1530067198897249, + "grad_norm": 0.346164733171463, + "learning_rate": 0.0001, + "loss": 1.8498, + "step": 1332 + }, + { + "epoch": 0.153121589799552, + "grad_norm": 0.3580979108810425, + "learning_rate": 0.0001, + "loss": 1.4788, + "step": 1333 + }, + { + "epoch": 0.15323645970937913, + "grad_norm": 0.4236307740211487, + "learning_rate": 0.0001, + "loss": 2.0173, + "step": 1334 + }, + { + "epoch": 0.15335132961920625, + "grad_norm": 0.36115512251853943, + "learning_rate": 0.0001, + "loss": 1.7246, + "step": 1335 + }, + { + "epoch": 0.15346619952903337, + "grad_norm": 0.35453563928604126, + "learning_rate": 0.0001, + "loss": 1.7424, + "step": 1336 + }, + { + "epoch": 0.1535810694388605, + "grad_norm": 0.37019240856170654, + "learning_rate": 0.0001, + "loss": 1.7979, + "step": 1337 + }, + { + "epoch": 0.1536959393486876, + "grad_norm": 0.34816619753837585, + "learning_rate": 0.0001, + "loss": 1.8167, + "step": 1338 + }, + { + "epoch": 0.15381080925851473, + "grad_norm": 0.3500208556652069, + "learning_rate": 0.0001, + "loss": 1.79, + "step": 1339 + }, + { + "epoch": 0.15392567916834185, + "grad_norm": 0.34298229217529297, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 1340 + }, + { + "epoch": 0.15404054907816897, + "grad_norm": 0.34139198064804077, + "learning_rate": 0.0001, + "loss": 1.694, + "step": 1341 + }, + { + "epoch": 0.1541554189879961, + "grad_norm": 0.3685595989227295, + "learning_rate": 0.0001, + "loss": 1.748, + "step": 1342 + }, + { + "epoch": 0.1542702888978232, + "grad_norm": 0.33886897563934326, + "learning_rate": 0.0001, + "loss": 1.7177, + "step": 1343 + }, + { + "epoch": 0.15438515880765033, + "grad_norm": 0.3549462854862213, + "learning_rate": 0.0001, + "loss": 1.7208, + "step": 1344 + }, + { + "epoch": 0.15450002871747745, + "grad_norm": 0.40802568197250366, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 1345 + }, + { + "epoch": 0.15461489862730457, + "grad_norm": 0.37750929594039917, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 1346 + }, + { + "epoch": 0.1547297685371317, + "grad_norm": 0.3724265694618225, + "learning_rate": 0.0001, + "loss": 1.8619, + "step": 1347 + }, + { + "epoch": 0.1548446384469588, + "grad_norm": 0.37521660327911377, + "learning_rate": 0.0001, + "loss": 1.7973, + "step": 1348 + }, + { + "epoch": 0.15495950835678593, + "grad_norm": 0.35817015171051025, + "learning_rate": 0.0001, + "loss": 1.8391, + "step": 1349 + }, + { + "epoch": 0.15507437826661305, + "grad_norm": 0.35718318819999695, + "learning_rate": 0.0001, + "loss": 1.5746, + "step": 1350 + }, + { + "epoch": 0.15518924817644017, + "grad_norm": 0.3497890830039978, + "learning_rate": 0.0001, + "loss": 1.7013, + "step": 1351 + }, + { + "epoch": 0.1553041180862673, + "grad_norm": 0.3606964349746704, + "learning_rate": 0.0001, + "loss": 1.6408, + "step": 1352 + }, + { + "epoch": 0.1554189879960944, + "grad_norm": 0.367531955242157, + "learning_rate": 0.0001, + "loss": 2.0184, + "step": 1353 + }, + { + "epoch": 0.15553385790592156, + "grad_norm": 0.364165723323822, + "learning_rate": 0.0001, + "loss": 1.7831, + "step": 1354 + }, + { + "epoch": 0.15564872781574868, + "grad_norm": 0.3798482418060303, + "learning_rate": 0.0001, + "loss": 1.9703, + "step": 1355 + }, + { + "epoch": 0.1557635977255758, + "grad_norm": 0.38180306553840637, + "learning_rate": 0.0001, + "loss": 1.9124, + "step": 1356 + }, + { + "epoch": 0.15587846763540292, + "grad_norm": 0.34865236282348633, + "learning_rate": 0.0001, + "loss": 1.6616, + "step": 1357 + }, + { + "epoch": 0.15599333754523004, + "grad_norm": 0.3822656571865082, + "learning_rate": 0.0001, + "loss": 1.8847, + "step": 1358 + }, + { + "epoch": 0.15610820745505716, + "grad_norm": 0.37447991967201233, + "learning_rate": 0.0001, + "loss": 1.7326, + "step": 1359 + }, + { + "epoch": 0.15622307736488428, + "grad_norm": 0.39164337515830994, + "learning_rate": 0.0001, + "loss": 1.9483, + "step": 1360 + }, + { + "epoch": 0.1563379472747114, + "grad_norm": 0.37871086597442627, + "learning_rate": 0.0001, + "loss": 1.6851, + "step": 1361 + }, + { + "epoch": 0.15645281718453852, + "grad_norm": 0.4217022657394409, + "learning_rate": 0.0001, + "loss": 1.7808, + "step": 1362 + }, + { + "epoch": 0.15656768709436564, + "grad_norm": 0.35998785495758057, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 1363 + }, + { + "epoch": 0.15668255700419276, + "grad_norm": 0.36455628275871277, + "learning_rate": 0.0001, + "loss": 1.5796, + "step": 1364 + }, + { + "epoch": 0.15679742691401988, + "grad_norm": 0.33390358090400696, + "learning_rate": 0.0001, + "loss": 1.5069, + "step": 1365 + }, + { + "epoch": 0.156912296823847, + "grad_norm": 0.35853371024131775, + "learning_rate": 0.0001, + "loss": 1.8491, + "step": 1366 + }, + { + "epoch": 0.15702716673367412, + "grad_norm": 0.39624473452568054, + "learning_rate": 0.0001, + "loss": 1.8059, + "step": 1367 + }, + { + "epoch": 0.15714203664350124, + "grad_norm": 0.341155081987381, + "learning_rate": 0.0001, + "loss": 1.6845, + "step": 1368 + }, + { + "epoch": 0.15725690655332836, + "grad_norm": 0.3553493320941925, + "learning_rate": 0.0001, + "loss": 1.7542, + "step": 1369 + }, + { + "epoch": 0.15737177646315548, + "grad_norm": 0.3464072644710541, + "learning_rate": 0.0001, + "loss": 1.6961, + "step": 1370 + }, + { + "epoch": 0.1574866463729826, + "grad_norm": 0.32570725679397583, + "learning_rate": 0.0001, + "loss": 1.4859, + "step": 1371 + }, + { + "epoch": 0.15760151628280972, + "grad_norm": 0.3374817371368408, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 1372 + }, + { + "epoch": 0.15771638619263684, + "grad_norm": 0.3570788502693176, + "learning_rate": 0.0001, + "loss": 1.7114, + "step": 1373 + }, + { + "epoch": 0.15783125610246396, + "grad_norm": 0.3562948703765869, + "learning_rate": 0.0001, + "loss": 1.6656, + "step": 1374 + }, + { + "epoch": 0.15794612601229108, + "grad_norm": 0.3416786789894104, + "learning_rate": 0.0001, + "loss": 1.5716, + "step": 1375 + }, + { + "epoch": 0.1580609959221182, + "grad_norm": 0.36627301573753357, + "learning_rate": 0.0001, + "loss": 1.8055, + "step": 1376 + }, + { + "epoch": 0.15817586583194532, + "grad_norm": 0.38520297408103943, + "learning_rate": 0.0001, + "loss": 1.7763, + "step": 1377 + }, + { + "epoch": 0.15829073574177244, + "grad_norm": 0.35688209533691406, + "learning_rate": 0.0001, + "loss": 1.7664, + "step": 1378 + }, + { + "epoch": 0.15840560565159956, + "grad_norm": 0.3223513960838318, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 1379 + }, + { + "epoch": 0.15852047556142668, + "grad_norm": 0.3319501578807831, + "learning_rate": 0.0001, + "loss": 1.6991, + "step": 1380 + }, + { + "epoch": 0.1586353454712538, + "grad_norm": 0.3719445765018463, + "learning_rate": 0.0001, + "loss": 1.7745, + "step": 1381 + }, + { + "epoch": 0.15875021538108092, + "grad_norm": 0.41066795587539673, + "learning_rate": 0.0001, + "loss": 1.7897, + "step": 1382 + }, + { + "epoch": 0.15886508529090804, + "grad_norm": 0.38358965516090393, + "learning_rate": 0.0001, + "loss": 1.9239, + "step": 1383 + }, + { + "epoch": 0.15897995520073516, + "grad_norm": 0.35356229543685913, + "learning_rate": 0.0001, + "loss": 1.7281, + "step": 1384 + }, + { + "epoch": 0.15909482511056228, + "grad_norm": 0.36122894287109375, + "learning_rate": 0.0001, + "loss": 1.7863, + "step": 1385 + }, + { + "epoch": 0.1592096950203894, + "grad_norm": 0.35605597496032715, + "learning_rate": 0.0001, + "loss": 1.7772, + "step": 1386 + }, + { + "epoch": 0.15932456493021652, + "grad_norm": 0.3338839113712311, + "learning_rate": 0.0001, + "loss": 1.4497, + "step": 1387 + }, + { + "epoch": 0.15943943484004364, + "grad_norm": 0.34843042492866516, + "learning_rate": 0.0001, + "loss": 1.8037, + "step": 1388 + }, + { + "epoch": 0.15955430474987078, + "grad_norm": 0.3467456102371216, + "learning_rate": 0.0001, + "loss": 1.6538, + "step": 1389 + }, + { + "epoch": 0.1596691746596979, + "grad_norm": 0.37024548649787903, + "learning_rate": 0.0001, + "loss": 1.888, + "step": 1390 + }, + { + "epoch": 0.15978404456952502, + "grad_norm": 0.3692200481891632, + "learning_rate": 0.0001, + "loss": 1.7917, + "step": 1391 + }, + { + "epoch": 0.15989891447935214, + "grad_norm": 0.32846784591674805, + "learning_rate": 0.0001, + "loss": 1.5709, + "step": 1392 + }, + { + "epoch": 0.16001378438917926, + "grad_norm": 0.3437194228172302, + "learning_rate": 0.0001, + "loss": 1.7055, + "step": 1393 + }, + { + "epoch": 0.16012865429900638, + "grad_norm": 0.346202552318573, + "learning_rate": 0.0001, + "loss": 1.7074, + "step": 1394 + }, + { + "epoch": 0.1602435242088335, + "grad_norm": 0.3795225918292999, + "learning_rate": 0.0001, + "loss": 1.9109, + "step": 1395 + }, + { + "epoch": 0.16035839411866062, + "grad_norm": 0.33237701654434204, + "learning_rate": 0.0001, + "loss": 1.4413, + "step": 1396 + }, + { + "epoch": 0.16047326402848774, + "grad_norm": 0.38135427236557007, + "learning_rate": 0.0001, + "loss": 1.767, + "step": 1397 + }, + { + "epoch": 0.16058813393831486, + "grad_norm": 0.37453657388687134, + "learning_rate": 0.0001, + "loss": 1.7438, + "step": 1398 + }, + { + "epoch": 0.16070300384814198, + "grad_norm": 0.3387562334537506, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 1399 + }, + { + "epoch": 0.1608178737579691, + "grad_norm": 0.33894360065460205, + "learning_rate": 0.0001, + "loss": 1.5218, + "step": 1400 + }, + { + "epoch": 0.16093274366779622, + "grad_norm": 0.3334555923938751, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 1401 + }, + { + "epoch": 0.16104761357762334, + "grad_norm": 0.36844760179519653, + "learning_rate": 0.0001, + "loss": 1.7755, + "step": 1402 + }, + { + "epoch": 0.16116248348745046, + "grad_norm": 0.34594935178756714, + "learning_rate": 0.0001, + "loss": 1.8161, + "step": 1403 + }, + { + "epoch": 0.16127735339727758, + "grad_norm": 0.37070515751838684, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 1404 + }, + { + "epoch": 0.1613922233071047, + "grad_norm": 0.3500889241695404, + "learning_rate": 0.0001, + "loss": 1.6533, + "step": 1405 + }, + { + "epoch": 0.16150709321693182, + "grad_norm": 0.3509732186794281, + "learning_rate": 0.0001, + "loss": 1.7667, + "step": 1406 + }, + { + "epoch": 0.16162196312675894, + "grad_norm": 0.33721843361854553, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 1407 + }, + { + "epoch": 0.16173683303658606, + "grad_norm": 0.3420223593711853, + "learning_rate": 0.0001, + "loss": 1.3828, + "step": 1408 + }, + { + "epoch": 0.16185170294641318, + "grad_norm": 0.3664703965187073, + "learning_rate": 0.0001, + "loss": 1.8106, + "step": 1409 + }, + { + "epoch": 0.1619665728562403, + "grad_norm": 0.4004788398742676, + "learning_rate": 0.0001, + "loss": 1.8107, + "step": 1410 + }, + { + "epoch": 0.16208144276606742, + "grad_norm": 0.3599262237548828, + "learning_rate": 0.0001, + "loss": 1.7453, + "step": 1411 + }, + { + "epoch": 0.16219631267589454, + "grad_norm": 0.37064090371131897, + "learning_rate": 0.0001, + "loss": 1.7842, + "step": 1412 + }, + { + "epoch": 0.16231118258572166, + "grad_norm": 0.3801650404930115, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 1413 + }, + { + "epoch": 0.16242605249554878, + "grad_norm": 0.3450910449028015, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 1414 + }, + { + "epoch": 0.1625409224053759, + "grad_norm": 0.35267990827560425, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 1415 + }, + { + "epoch": 0.16265579231520302, + "grad_norm": 0.3844035267829895, + "learning_rate": 0.0001, + "loss": 1.9421, + "step": 1416 + }, + { + "epoch": 0.16277066222503014, + "grad_norm": 0.35612425208091736, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 1417 + }, + { + "epoch": 0.16288553213485726, + "grad_norm": 0.3794462978839874, + "learning_rate": 0.0001, + "loss": 1.8599, + "step": 1418 + }, + { + "epoch": 0.16300040204468438, + "grad_norm": 0.33937835693359375, + "learning_rate": 0.0001, + "loss": 1.6891, + "step": 1419 + }, + { + "epoch": 0.1631152719545115, + "grad_norm": 0.3379872441291809, + "learning_rate": 0.0001, + "loss": 1.708, + "step": 1420 + }, + { + "epoch": 0.16323014186433862, + "grad_norm": 0.35873672366142273, + "learning_rate": 0.0001, + "loss": 1.7832, + "step": 1421 + }, + { + "epoch": 0.16334501177416574, + "grad_norm": 0.37622302770614624, + "learning_rate": 0.0001, + "loss": 1.716, + "step": 1422 + }, + { + "epoch": 0.1634598816839929, + "grad_norm": 0.34433531761169434, + "learning_rate": 0.0001, + "loss": 1.6682, + "step": 1423 + }, + { + "epoch": 0.16357475159382, + "grad_norm": 0.35809025168418884, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 1424 + }, + { + "epoch": 0.16368962150364713, + "grad_norm": 0.35675248503685, + "learning_rate": 0.0001, + "loss": 1.6158, + "step": 1425 + }, + { + "epoch": 0.16380449141347425, + "grad_norm": 0.356037437915802, + "learning_rate": 0.0001, + "loss": 1.5049, + "step": 1426 + }, + { + "epoch": 0.16391936132330137, + "grad_norm": 0.3485028147697449, + "learning_rate": 0.0001, + "loss": 1.7554, + "step": 1427 + }, + { + "epoch": 0.1640342312331285, + "grad_norm": 0.36230984330177307, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 1428 + }, + { + "epoch": 0.1641491011429556, + "grad_norm": 0.35187479853630066, + "learning_rate": 0.0001, + "loss": 1.7184, + "step": 1429 + }, + { + "epoch": 0.16426397105278273, + "grad_norm": 0.34478455781936646, + "learning_rate": 0.0001, + "loss": 1.7086, + "step": 1430 + }, + { + "epoch": 0.16437884096260985, + "grad_norm": 0.4025746285915375, + "learning_rate": 0.0001, + "loss": 1.8836, + "step": 1431 + }, + { + "epoch": 0.16449371087243697, + "grad_norm": 0.358385294675827, + "learning_rate": 0.0001, + "loss": 1.8405, + "step": 1432 + }, + { + "epoch": 0.1646085807822641, + "grad_norm": 0.3677537143230438, + "learning_rate": 0.0001, + "loss": 1.6616, + "step": 1433 + }, + { + "epoch": 0.1647234506920912, + "grad_norm": 0.3488091826438904, + "learning_rate": 0.0001, + "loss": 1.6911, + "step": 1434 + }, + { + "epoch": 0.16483832060191833, + "grad_norm": 0.3559654951095581, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 1435 + }, + { + "epoch": 0.16495319051174545, + "grad_norm": 0.4024467170238495, + "learning_rate": 0.0001, + "loss": 1.8107, + "step": 1436 + }, + { + "epoch": 0.16506806042157257, + "grad_norm": 0.3618294298648834, + "learning_rate": 0.0001, + "loss": 1.8111, + "step": 1437 + }, + { + "epoch": 0.1651829303313997, + "grad_norm": 0.3504227101802826, + "learning_rate": 0.0001, + "loss": 1.639, + "step": 1438 + }, + { + "epoch": 0.1652978002412268, + "grad_norm": 0.36997562646865845, + "learning_rate": 0.0001, + "loss": 1.8394, + "step": 1439 + }, + { + "epoch": 0.16541267015105393, + "grad_norm": 0.3473089635372162, + "learning_rate": 0.0001, + "loss": 1.7694, + "step": 1440 + }, + { + "epoch": 0.16552754006088105, + "grad_norm": 0.3754732012748718, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 1441 + }, + { + "epoch": 0.16564240997070817, + "grad_norm": 0.318775475025177, + "learning_rate": 0.0001, + "loss": 1.4992, + "step": 1442 + }, + { + "epoch": 0.1657572798805353, + "grad_norm": 0.3740909695625305, + "learning_rate": 0.0001, + "loss": 1.7752, + "step": 1443 + }, + { + "epoch": 0.1658721497903624, + "grad_norm": 0.34481796622276306, + "learning_rate": 0.0001, + "loss": 1.7427, + "step": 1444 + }, + { + "epoch": 0.16598701970018953, + "grad_norm": 0.3509489595890045, + "learning_rate": 0.0001, + "loss": 1.8072, + "step": 1445 + }, + { + "epoch": 0.16610188961001665, + "grad_norm": 0.364310085773468, + "learning_rate": 0.0001, + "loss": 1.7988, + "step": 1446 + }, + { + "epoch": 0.16621675951984377, + "grad_norm": 0.36774981021881104, + "learning_rate": 0.0001, + "loss": 1.8602, + "step": 1447 + }, + { + "epoch": 0.1663316294296709, + "grad_norm": 0.34403902292251587, + "learning_rate": 0.0001, + "loss": 1.7574, + "step": 1448 + }, + { + "epoch": 0.166446499339498, + "grad_norm": 0.3631366193294525, + "learning_rate": 0.0001, + "loss": 1.7609, + "step": 1449 + }, + { + "epoch": 0.16656136924932513, + "grad_norm": 0.4153347313404083, + "learning_rate": 0.0001, + "loss": 1.8153, + "step": 1450 + }, + { + "epoch": 0.16667623915915225, + "grad_norm": 0.3547952175140381, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 1451 + }, + { + "epoch": 0.16679110906897937, + "grad_norm": 0.3516184091567993, + "learning_rate": 0.0001, + "loss": 1.6451, + "step": 1452 + }, + { + "epoch": 0.1669059789788065, + "grad_norm": 0.35548651218414307, + "learning_rate": 0.0001, + "loss": 1.7025, + "step": 1453 + }, + { + "epoch": 0.1670208488886336, + "grad_norm": 0.3711240291595459, + "learning_rate": 0.0001, + "loss": 1.6624, + "step": 1454 + }, + { + "epoch": 0.16713571879846073, + "grad_norm": 0.3621267080307007, + "learning_rate": 0.0001, + "loss": 1.587, + "step": 1455 + }, + { + "epoch": 0.16725058870828785, + "grad_norm": 0.36165568232536316, + "learning_rate": 0.0001, + "loss": 1.7507, + "step": 1456 + }, + { + "epoch": 0.167365458618115, + "grad_norm": 0.3373228907585144, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 1457 + }, + { + "epoch": 0.16748032852794212, + "grad_norm": 0.3716026842594147, + "learning_rate": 0.0001, + "loss": 1.7762, + "step": 1458 + }, + { + "epoch": 0.16759519843776924, + "grad_norm": 0.3812613785266876, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 1459 + }, + { + "epoch": 0.16771006834759636, + "grad_norm": 0.3630736470222473, + "learning_rate": 0.0001, + "loss": 1.7054, + "step": 1460 + }, + { + "epoch": 0.16782493825742348, + "grad_norm": 0.3346702754497528, + "learning_rate": 0.0001, + "loss": 1.6076, + "step": 1461 + }, + { + "epoch": 0.1679398081672506, + "grad_norm": 0.3383999764919281, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 1462 + }, + { + "epoch": 0.16805467807707772, + "grad_norm": 0.3548593521118164, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 1463 + }, + { + "epoch": 0.16816954798690484, + "grad_norm": 0.3538789451122284, + "learning_rate": 0.0001, + "loss": 1.657, + "step": 1464 + }, + { + "epoch": 0.16828441789673196, + "grad_norm": 0.3505842387676239, + "learning_rate": 0.0001, + "loss": 1.7951, + "step": 1465 + }, + { + "epoch": 0.16839928780655908, + "grad_norm": 0.3387717604637146, + "learning_rate": 0.0001, + "loss": 1.6237, + "step": 1466 + }, + { + "epoch": 0.1685141577163862, + "grad_norm": 0.3535427153110504, + "learning_rate": 0.0001, + "loss": 1.7087, + "step": 1467 + }, + { + "epoch": 0.16862902762621332, + "grad_norm": 0.35021501779556274, + "learning_rate": 0.0001, + "loss": 1.6706, + "step": 1468 + }, + { + "epoch": 0.16874389753604044, + "grad_norm": 0.36078891158103943, + "learning_rate": 0.0001, + "loss": 1.7588, + "step": 1469 + }, + { + "epoch": 0.16885876744586756, + "grad_norm": 0.3418395221233368, + "learning_rate": 0.0001, + "loss": 1.5688, + "step": 1470 + }, + { + "epoch": 0.16897363735569468, + "grad_norm": 0.37558332085609436, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 1471 + }, + { + "epoch": 0.1690885072655218, + "grad_norm": 0.3494422137737274, + "learning_rate": 0.0001, + "loss": 1.7267, + "step": 1472 + }, + { + "epoch": 0.16920337717534892, + "grad_norm": 0.35918739438056946, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 1473 + }, + { + "epoch": 0.16931824708517604, + "grad_norm": 0.36204949021339417, + "learning_rate": 0.0001, + "loss": 1.8109, + "step": 1474 + }, + { + "epoch": 0.16943311699500316, + "grad_norm": 0.35251131653785706, + "learning_rate": 0.0001, + "loss": 1.6827, + "step": 1475 + }, + { + "epoch": 0.16954798690483028, + "grad_norm": 0.35120296478271484, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 1476 + }, + { + "epoch": 0.1696628568146574, + "grad_norm": 0.34975898265838623, + "learning_rate": 0.0001, + "loss": 1.7992, + "step": 1477 + }, + { + "epoch": 0.16977772672448452, + "grad_norm": 0.33744266629219055, + "learning_rate": 0.0001, + "loss": 1.6606, + "step": 1478 + }, + { + "epoch": 0.16989259663431164, + "grad_norm": 0.34000107645988464, + "learning_rate": 0.0001, + "loss": 1.6484, + "step": 1479 + }, + { + "epoch": 0.17000746654413876, + "grad_norm": 0.36184847354888916, + "learning_rate": 0.0001, + "loss": 1.7398, + "step": 1480 + }, + { + "epoch": 0.17012233645396588, + "grad_norm": 0.3685035705566406, + "learning_rate": 0.0001, + "loss": 1.8032, + "step": 1481 + }, + { + "epoch": 0.170237206363793, + "grad_norm": 0.38592687249183655, + "learning_rate": 0.0001, + "loss": 1.7185, + "step": 1482 + }, + { + "epoch": 0.17035207627362012, + "grad_norm": 0.3724033832550049, + "learning_rate": 0.0001, + "loss": 1.7696, + "step": 1483 + }, + { + "epoch": 0.17046694618344724, + "grad_norm": 0.3662974238395691, + "learning_rate": 0.0001, + "loss": 1.727, + "step": 1484 + }, + { + "epoch": 0.17058181609327436, + "grad_norm": 0.38893744349479675, + "learning_rate": 0.0001, + "loss": 1.746, + "step": 1485 + }, + { + "epoch": 0.17069668600310148, + "grad_norm": 0.368671178817749, + "learning_rate": 0.0001, + "loss": 1.7863, + "step": 1486 + }, + { + "epoch": 0.1708115559129286, + "grad_norm": 0.3590819835662842, + "learning_rate": 0.0001, + "loss": 1.8859, + "step": 1487 + }, + { + "epoch": 0.17092642582275572, + "grad_norm": 0.36079901456832886, + "learning_rate": 0.0001, + "loss": 1.7423, + "step": 1488 + }, + { + "epoch": 0.17104129573258284, + "grad_norm": 0.355546772480011, + "learning_rate": 0.0001, + "loss": 1.7369, + "step": 1489 + }, + { + "epoch": 0.17115616564240996, + "grad_norm": 0.3821921944618225, + "learning_rate": 0.0001, + "loss": 1.9179, + "step": 1490 + }, + { + "epoch": 0.1712710355522371, + "grad_norm": 0.3505462110042572, + "learning_rate": 0.0001, + "loss": 1.871, + "step": 1491 + }, + { + "epoch": 0.17138590546206423, + "grad_norm": 0.3656969368457794, + "learning_rate": 0.0001, + "loss": 1.6599, + "step": 1492 + }, + { + "epoch": 0.17150077537189135, + "grad_norm": 0.3786547780036926, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 1493 + }, + { + "epoch": 0.17161564528171847, + "grad_norm": 0.37065404653549194, + "learning_rate": 0.0001, + "loss": 1.6333, + "step": 1494 + }, + { + "epoch": 0.17173051519154559, + "grad_norm": 0.3699958622455597, + "learning_rate": 0.0001, + "loss": 1.7745, + "step": 1495 + }, + { + "epoch": 0.1718453851013727, + "grad_norm": 0.3573478162288666, + "learning_rate": 0.0001, + "loss": 1.6497, + "step": 1496 + }, + { + "epoch": 0.17196025501119983, + "grad_norm": 0.3474213778972626, + "learning_rate": 0.0001, + "loss": 1.5043, + "step": 1497 + }, + { + "epoch": 0.17207512492102695, + "grad_norm": 0.3627040684223175, + "learning_rate": 0.0001, + "loss": 1.7553, + "step": 1498 + }, + { + "epoch": 0.17218999483085407, + "grad_norm": 0.34735116362571716, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 1499 + }, + { + "epoch": 0.17230486474068119, + "grad_norm": 0.4130633771419525, + "learning_rate": 0.0001, + "loss": 1.8155, + "step": 1500 + }, + { + "epoch": 0.1724197346505083, + "grad_norm": 0.38091927766799927, + "learning_rate": 0.0001, + "loss": 1.7231, + "step": 1501 + }, + { + "epoch": 0.17253460456033543, + "grad_norm": 0.39104804396629333, + "learning_rate": 0.0001, + "loss": 1.756, + "step": 1502 + }, + { + "epoch": 0.17264947447016255, + "grad_norm": 0.39437583088874817, + "learning_rate": 0.0001, + "loss": 1.7184, + "step": 1503 + }, + { + "epoch": 0.17276434437998966, + "grad_norm": 0.37000584602355957, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 1504 + }, + { + "epoch": 0.17287921428981678, + "grad_norm": 0.37976545095443726, + "learning_rate": 0.0001, + "loss": 1.8923, + "step": 1505 + }, + { + "epoch": 0.1729940841996439, + "grad_norm": 0.36573851108551025, + "learning_rate": 0.0001, + "loss": 1.8202, + "step": 1506 + }, + { + "epoch": 0.17310895410947102, + "grad_norm": 0.37791380286216736, + "learning_rate": 0.0001, + "loss": 1.7888, + "step": 1507 + }, + { + "epoch": 0.17322382401929814, + "grad_norm": 0.3388189673423767, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 1508 + }, + { + "epoch": 0.17333869392912526, + "grad_norm": 0.36455753445625305, + "learning_rate": 0.0001, + "loss": 1.8167, + "step": 1509 + }, + { + "epoch": 0.17345356383895238, + "grad_norm": 0.3855915665626526, + "learning_rate": 0.0001, + "loss": 1.8208, + "step": 1510 + }, + { + "epoch": 0.1735684337487795, + "grad_norm": 0.3601621687412262, + "learning_rate": 0.0001, + "loss": 1.8135, + "step": 1511 + }, + { + "epoch": 0.17368330365860662, + "grad_norm": 0.34166282415390015, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 1512 + }, + { + "epoch": 0.17379817356843374, + "grad_norm": 0.36744624376296997, + "learning_rate": 0.0001, + "loss": 1.7702, + "step": 1513 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.3522723913192749, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 1514 + }, + { + "epoch": 0.17402791338808798, + "grad_norm": 0.34597423672676086, + "learning_rate": 0.0001, + "loss": 1.6167, + "step": 1515 + }, + { + "epoch": 0.1741427832979151, + "grad_norm": 0.33401763439178467, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 1516 + }, + { + "epoch": 0.17425765320774222, + "grad_norm": 0.3676266074180603, + "learning_rate": 0.0001, + "loss": 1.7157, + "step": 1517 + }, + { + "epoch": 0.17437252311756934, + "grad_norm": 0.3546941578388214, + "learning_rate": 0.0001, + "loss": 1.8186, + "step": 1518 + }, + { + "epoch": 0.17448739302739646, + "grad_norm": 0.37473762035369873, + "learning_rate": 0.0001, + "loss": 1.7717, + "step": 1519 + }, + { + "epoch": 0.17460226293722358, + "grad_norm": 0.35405465960502625, + "learning_rate": 0.0001, + "loss": 1.683, + "step": 1520 + }, + { + "epoch": 0.1747171328470507, + "grad_norm": 0.3613182306289673, + "learning_rate": 0.0001, + "loss": 1.7042, + "step": 1521 + }, + { + "epoch": 0.17483200275687782, + "grad_norm": 0.3764897882938385, + "learning_rate": 0.0001, + "loss": 1.834, + "step": 1522 + }, + { + "epoch": 0.17494687266670494, + "grad_norm": 0.3841586410999298, + "learning_rate": 0.0001, + "loss": 1.8038, + "step": 1523 + }, + { + "epoch": 0.17506174257653206, + "grad_norm": 0.37141889333724976, + "learning_rate": 0.0001, + "loss": 1.5864, + "step": 1524 + }, + { + "epoch": 0.17517661248635918, + "grad_norm": 0.37981778383255005, + "learning_rate": 0.0001, + "loss": 1.772, + "step": 1525 + }, + { + "epoch": 0.17529148239618633, + "grad_norm": 0.40624651312828064, + "learning_rate": 0.0001, + "loss": 1.5573, + "step": 1526 + }, + { + "epoch": 0.17540635230601345, + "grad_norm": 0.37434014678001404, + "learning_rate": 0.0001, + "loss": 1.7501, + "step": 1527 + }, + { + "epoch": 0.17552122221584057, + "grad_norm": 0.3867623805999756, + "learning_rate": 0.0001, + "loss": 1.6846, + "step": 1528 + }, + { + "epoch": 0.1756360921256677, + "grad_norm": 0.384644478559494, + "learning_rate": 0.0001, + "loss": 1.7358, + "step": 1529 + }, + { + "epoch": 0.1757509620354948, + "grad_norm": 0.36406537890434265, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 1530 + }, + { + "epoch": 0.17586583194532193, + "grad_norm": 0.3523077070713043, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 1531 + }, + { + "epoch": 0.17598070185514905, + "grad_norm": 0.3456611633300781, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 1532 + }, + { + "epoch": 0.17609557176497617, + "grad_norm": 0.4034580588340759, + "learning_rate": 0.0001, + "loss": 1.8922, + "step": 1533 + }, + { + "epoch": 0.1762104416748033, + "grad_norm": 0.3668345510959625, + "learning_rate": 0.0001, + "loss": 1.7633, + "step": 1534 + }, + { + "epoch": 0.1763253115846304, + "grad_norm": 0.3617863059043884, + "learning_rate": 0.0001, + "loss": 1.766, + "step": 1535 + }, + { + "epoch": 0.17644018149445753, + "grad_norm": 0.4189690053462982, + "learning_rate": 0.0001, + "loss": 1.8517, + "step": 1536 + }, + { + "epoch": 0.17655505140428465, + "grad_norm": 0.36103829741477966, + "learning_rate": 0.0001, + "loss": 1.7356, + "step": 1537 + }, + { + "epoch": 0.17666992131411177, + "grad_norm": 0.3502132296562195, + "learning_rate": 0.0001, + "loss": 1.7039, + "step": 1538 + }, + { + "epoch": 0.1767847912239389, + "grad_norm": 0.4156895577907562, + "learning_rate": 0.0001, + "loss": 1.8255, + "step": 1539 + }, + { + "epoch": 0.176899661133766, + "grad_norm": 0.35794487595558167, + "learning_rate": 0.0001, + "loss": 1.762, + "step": 1540 + }, + { + "epoch": 0.17701453104359313, + "grad_norm": 0.3665020763874054, + "learning_rate": 0.0001, + "loss": 1.7417, + "step": 1541 + }, + { + "epoch": 0.17712940095342025, + "grad_norm": 0.42144718766212463, + "learning_rate": 0.0001, + "loss": 1.9003, + "step": 1542 + }, + { + "epoch": 0.17724427086324737, + "grad_norm": 0.3615649342536926, + "learning_rate": 0.0001, + "loss": 1.4503, + "step": 1543 + }, + { + "epoch": 0.1773591407730745, + "grad_norm": 0.36150482296943665, + "learning_rate": 0.0001, + "loss": 1.7157, + "step": 1544 + }, + { + "epoch": 0.1774740106829016, + "grad_norm": 0.39197593927383423, + "learning_rate": 0.0001, + "loss": 1.717, + "step": 1545 + }, + { + "epoch": 0.17758888059272873, + "grad_norm": 0.40486010909080505, + "learning_rate": 0.0001, + "loss": 1.7411, + "step": 1546 + }, + { + "epoch": 0.17770375050255585, + "grad_norm": 0.3970898389816284, + "learning_rate": 0.0001, + "loss": 2.0669, + "step": 1547 + }, + { + "epoch": 0.17781862041238297, + "grad_norm": 0.3371671736240387, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 1548 + }, + { + "epoch": 0.1779334903222101, + "grad_norm": 0.33657070994377136, + "learning_rate": 0.0001, + "loss": 1.5829, + "step": 1549 + }, + { + "epoch": 0.1780483602320372, + "grad_norm": 0.34936872124671936, + "learning_rate": 0.0001, + "loss": 1.6385, + "step": 1550 + }, + { + "epoch": 0.17816323014186433, + "grad_norm": 0.353533536195755, + "learning_rate": 0.0001, + "loss": 1.6985, + "step": 1551 + }, + { + "epoch": 0.17827810005169145, + "grad_norm": 0.3584658205509186, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 1552 + }, + { + "epoch": 0.17839296996151857, + "grad_norm": 0.3596382141113281, + "learning_rate": 0.0001, + "loss": 1.8855, + "step": 1553 + }, + { + "epoch": 0.1785078398713457, + "grad_norm": 0.3663223683834076, + "learning_rate": 0.0001, + "loss": 1.5073, + "step": 1554 + }, + { + "epoch": 0.1786227097811728, + "grad_norm": 0.39811477065086365, + "learning_rate": 0.0001, + "loss": 1.7019, + "step": 1555 + }, + { + "epoch": 0.17873757969099993, + "grad_norm": 0.3574909567832947, + "learning_rate": 0.0001, + "loss": 1.7678, + "step": 1556 + }, + { + "epoch": 0.17885244960082705, + "grad_norm": 0.3777164816856384, + "learning_rate": 0.0001, + "loss": 1.8767, + "step": 1557 + }, + { + "epoch": 0.17896731951065417, + "grad_norm": 0.3364102244377136, + "learning_rate": 0.0001, + "loss": 1.6808, + "step": 1558 + }, + { + "epoch": 0.1790821894204813, + "grad_norm": 0.36213961243629456, + "learning_rate": 0.0001, + "loss": 1.7973, + "step": 1559 + }, + { + "epoch": 0.17919705933030844, + "grad_norm": 0.4215514361858368, + "learning_rate": 0.0001, + "loss": 1.8289, + "step": 1560 + }, + { + "epoch": 0.17931192924013556, + "grad_norm": 0.35058748722076416, + "learning_rate": 0.0001, + "loss": 1.5582, + "step": 1561 + }, + { + "epoch": 0.17942679914996268, + "grad_norm": 0.4215516448020935, + "learning_rate": 0.0001, + "loss": 1.6928, + "step": 1562 + }, + { + "epoch": 0.1795416690597898, + "grad_norm": 0.3747852146625519, + "learning_rate": 0.0001, + "loss": 1.6683, + "step": 1563 + }, + { + "epoch": 0.17965653896961692, + "grad_norm": 0.3510657846927643, + "learning_rate": 0.0001, + "loss": 1.6948, + "step": 1564 + }, + { + "epoch": 0.17977140887944404, + "grad_norm": 0.3715681731700897, + "learning_rate": 0.0001, + "loss": 1.7083, + "step": 1565 + }, + { + "epoch": 0.17988627878927116, + "grad_norm": 0.3528061509132385, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 1566 + }, + { + "epoch": 0.18000114869909828, + "grad_norm": 0.3377302587032318, + "learning_rate": 0.0001, + "loss": 1.7079, + "step": 1567 + }, + { + "epoch": 0.1801160186089254, + "grad_norm": 0.40321823954582214, + "learning_rate": 0.0001, + "loss": 1.8162, + "step": 1568 + }, + { + "epoch": 0.18023088851875252, + "grad_norm": 0.3601834774017334, + "learning_rate": 0.0001, + "loss": 1.6868, + "step": 1569 + }, + { + "epoch": 0.18034575842857964, + "grad_norm": 0.3452896475791931, + "learning_rate": 0.0001, + "loss": 1.4845, + "step": 1570 + }, + { + "epoch": 0.18046062833840676, + "grad_norm": 0.3979194462299347, + "learning_rate": 0.0001, + "loss": 1.9325, + "step": 1571 + }, + { + "epoch": 0.18057549824823388, + "grad_norm": 0.3308473229408264, + "learning_rate": 0.0001, + "loss": 1.455, + "step": 1572 + }, + { + "epoch": 0.180690368158061, + "grad_norm": 0.3808495104312897, + "learning_rate": 0.0001, + "loss": 1.8788, + "step": 1573 + }, + { + "epoch": 0.18080523806788812, + "grad_norm": 0.35744503140449524, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 1574 + }, + { + "epoch": 0.18092010797771524, + "grad_norm": 0.35669422149658203, + "learning_rate": 0.0001, + "loss": 1.5716, + "step": 1575 + }, + { + "epoch": 0.18103497788754236, + "grad_norm": 0.3452187776565552, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 1576 + }, + { + "epoch": 0.18114984779736948, + "grad_norm": 0.36027148365974426, + "learning_rate": 0.0001, + "loss": 1.5917, + "step": 1577 + }, + { + "epoch": 0.1812647177071966, + "grad_norm": 0.32262781262397766, + "learning_rate": 0.0001, + "loss": 1.5746, + "step": 1578 + }, + { + "epoch": 0.18137958761702372, + "grad_norm": 0.3979918658733368, + "learning_rate": 0.0001, + "loss": 1.8699, + "step": 1579 + }, + { + "epoch": 0.18149445752685084, + "grad_norm": 0.42020371556282043, + "learning_rate": 0.0001, + "loss": 1.9041, + "step": 1580 + }, + { + "epoch": 0.18160932743667796, + "grad_norm": 0.34996458888053894, + "learning_rate": 0.0001, + "loss": 1.6857, + "step": 1581 + }, + { + "epoch": 0.18172419734650508, + "grad_norm": 0.3742469549179077, + "learning_rate": 0.0001, + "loss": 1.934, + "step": 1582 + }, + { + "epoch": 0.1818390672563322, + "grad_norm": 0.37955376505851746, + "learning_rate": 0.0001, + "loss": 1.7681, + "step": 1583 + }, + { + "epoch": 0.18195393716615932, + "grad_norm": 0.31765666604042053, + "learning_rate": 0.0001, + "loss": 1.5858, + "step": 1584 + }, + { + "epoch": 0.18206880707598644, + "grad_norm": 0.39358144998550415, + "learning_rate": 0.0001, + "loss": 1.8361, + "step": 1585 + }, + { + "epoch": 0.18218367698581356, + "grad_norm": 0.35237935185432434, + "learning_rate": 0.0001, + "loss": 1.7351, + "step": 1586 + }, + { + "epoch": 0.18229854689564068, + "grad_norm": 0.38084107637405396, + "learning_rate": 0.0001, + "loss": 1.7978, + "step": 1587 + }, + { + "epoch": 0.1824134168054678, + "grad_norm": 0.37168824672698975, + "learning_rate": 0.0001, + "loss": 1.7858, + "step": 1588 + }, + { + "epoch": 0.18252828671529492, + "grad_norm": 0.33814021944999695, + "learning_rate": 0.0001, + "loss": 1.738, + "step": 1589 + }, + { + "epoch": 0.18264315662512204, + "grad_norm": 0.39129403233528137, + "learning_rate": 0.0001, + "loss": 1.9242, + "step": 1590 + }, + { + "epoch": 0.18275802653494916, + "grad_norm": 0.3859502971172333, + "learning_rate": 0.0001, + "loss": 1.8557, + "step": 1591 + }, + { + "epoch": 0.18287289644477628, + "grad_norm": 0.3586483895778656, + "learning_rate": 0.0001, + "loss": 1.7178, + "step": 1592 + }, + { + "epoch": 0.1829877663546034, + "grad_norm": 0.3621407449245453, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 1593 + }, + { + "epoch": 0.18310263626443055, + "grad_norm": 0.3588270843029022, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 1594 + }, + { + "epoch": 0.18321750617425767, + "grad_norm": 0.36701640486717224, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 1595 + }, + { + "epoch": 0.18333237608408479, + "grad_norm": 0.3731893301010132, + "learning_rate": 0.0001, + "loss": 1.5814, + "step": 1596 + }, + { + "epoch": 0.1834472459939119, + "grad_norm": 0.3400730788707733, + "learning_rate": 0.0001, + "loss": 1.5258, + "step": 1597 + }, + { + "epoch": 0.18356211590373903, + "grad_norm": 0.3284122943878174, + "learning_rate": 0.0001, + "loss": 1.3967, + "step": 1598 + }, + { + "epoch": 0.18367698581356615, + "grad_norm": 0.3313588798046112, + "learning_rate": 0.0001, + "loss": 1.5584, + "step": 1599 + }, + { + "epoch": 0.18379185572339327, + "grad_norm": 0.4088406562805176, + "learning_rate": 0.0001, + "loss": 1.8716, + "step": 1600 + }, + { + "epoch": 0.18390672563322039, + "grad_norm": 0.3627072870731354, + "learning_rate": 0.0001, + "loss": 1.7801, + "step": 1601 + }, + { + "epoch": 0.1840215955430475, + "grad_norm": 0.3515871465206146, + "learning_rate": 0.0001, + "loss": 1.6491, + "step": 1602 + }, + { + "epoch": 0.18413646545287463, + "grad_norm": 0.4161235988140106, + "learning_rate": 0.0001, + "loss": 1.749, + "step": 1603 + }, + { + "epoch": 0.18425133536270175, + "grad_norm": 0.35696670413017273, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 1604 + }, + { + "epoch": 0.18436620527252887, + "grad_norm": 0.3441614508628845, + "learning_rate": 0.0001, + "loss": 1.6761, + "step": 1605 + }, + { + "epoch": 0.18448107518235599, + "grad_norm": 0.35174059867858887, + "learning_rate": 0.0001, + "loss": 1.5445, + "step": 1606 + }, + { + "epoch": 0.1845959450921831, + "grad_norm": 0.3795402944087982, + "learning_rate": 0.0001, + "loss": 1.7702, + "step": 1607 + }, + { + "epoch": 0.18471081500201023, + "grad_norm": 0.3490031957626343, + "learning_rate": 0.0001, + "loss": 1.6797, + "step": 1608 + }, + { + "epoch": 0.18482568491183735, + "grad_norm": 0.39439791440963745, + "learning_rate": 0.0001, + "loss": 1.8993, + "step": 1609 + }, + { + "epoch": 0.18494055482166447, + "grad_norm": 0.3577129542827606, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 1610 + }, + { + "epoch": 0.18505542473149159, + "grad_norm": 0.34242141246795654, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 1611 + }, + { + "epoch": 0.1851702946413187, + "grad_norm": 0.3696388304233551, + "learning_rate": 0.0001, + "loss": 1.7205, + "step": 1612 + }, + { + "epoch": 0.18528516455114583, + "grad_norm": 0.3409230411052704, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 1613 + }, + { + "epoch": 0.18540003446097295, + "grad_norm": 0.34659913182258606, + "learning_rate": 0.0001, + "loss": 1.6436, + "step": 1614 + }, + { + "epoch": 0.18551490437080007, + "grad_norm": 0.3633543848991394, + "learning_rate": 0.0001, + "loss": 1.8687, + "step": 1615 + }, + { + "epoch": 0.18562977428062719, + "grad_norm": 0.3456427752971649, + "learning_rate": 0.0001, + "loss": 1.6913, + "step": 1616 + }, + { + "epoch": 0.1857446441904543, + "grad_norm": 0.3466663062572479, + "learning_rate": 0.0001, + "loss": 1.7164, + "step": 1617 + }, + { + "epoch": 0.18585951410028143, + "grad_norm": 0.373751163482666, + "learning_rate": 0.0001, + "loss": 1.7618, + "step": 1618 + }, + { + "epoch": 0.18597438401010855, + "grad_norm": 0.3516460657119751, + "learning_rate": 0.0001, + "loss": 1.587, + "step": 1619 + }, + { + "epoch": 0.18608925391993567, + "grad_norm": 0.36074620485305786, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 1620 + }, + { + "epoch": 0.18620412382976279, + "grad_norm": 0.37144434452056885, + "learning_rate": 0.0001, + "loss": 1.6111, + "step": 1621 + }, + { + "epoch": 0.1863189937395899, + "grad_norm": 0.34920889139175415, + "learning_rate": 0.0001, + "loss": 1.5325, + "step": 1622 + }, + { + "epoch": 0.18643386364941703, + "grad_norm": 0.3639899790287018, + "learning_rate": 0.0001, + "loss": 1.773, + "step": 1623 + }, + { + "epoch": 0.18654873355924415, + "grad_norm": 0.38695764541625977, + "learning_rate": 0.0001, + "loss": 1.9671, + "step": 1624 + }, + { + "epoch": 0.18666360346907127, + "grad_norm": 0.3656146228313446, + "learning_rate": 0.0001, + "loss": 1.8799, + "step": 1625 + }, + { + "epoch": 0.18677847337889839, + "grad_norm": 0.3708580732345581, + "learning_rate": 0.0001, + "loss": 1.8234, + "step": 1626 + }, + { + "epoch": 0.1868933432887255, + "grad_norm": 0.3623522222042084, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 1627 + }, + { + "epoch": 0.18700821319855265, + "grad_norm": 0.3340558409690857, + "learning_rate": 0.0001, + "loss": 1.5646, + "step": 1628 + }, + { + "epoch": 0.18712308310837977, + "grad_norm": 0.3819306790828705, + "learning_rate": 0.0001, + "loss": 1.9233, + "step": 1629 + }, + { + "epoch": 0.1872379530182069, + "grad_norm": 0.35179227590560913, + "learning_rate": 0.0001, + "loss": 1.6158, + "step": 1630 + }, + { + "epoch": 0.187352822928034, + "grad_norm": 0.3724440634250641, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 1631 + }, + { + "epoch": 0.18746769283786113, + "grad_norm": 0.35643836855888367, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 1632 + }, + { + "epoch": 0.18758256274768825, + "grad_norm": 0.3557715117931366, + "learning_rate": 0.0001, + "loss": 1.8076, + "step": 1633 + }, + { + "epoch": 0.18769743265751537, + "grad_norm": 0.3418234586715698, + "learning_rate": 0.0001, + "loss": 1.7002, + "step": 1634 + }, + { + "epoch": 0.1878123025673425, + "grad_norm": 0.3681597113609314, + "learning_rate": 0.0001, + "loss": 1.7233, + "step": 1635 + }, + { + "epoch": 0.1879271724771696, + "grad_norm": 0.34994348883628845, + "learning_rate": 0.0001, + "loss": 1.4936, + "step": 1636 + }, + { + "epoch": 0.18804204238699673, + "grad_norm": 0.35400843620300293, + "learning_rate": 0.0001, + "loss": 1.5871, + "step": 1637 + }, + { + "epoch": 0.18815691229682385, + "grad_norm": 0.3841044008731842, + "learning_rate": 0.0001, + "loss": 1.8312, + "step": 1638 + }, + { + "epoch": 0.18827178220665097, + "grad_norm": 0.3630238473415375, + "learning_rate": 0.0001, + "loss": 1.7448, + "step": 1639 + }, + { + "epoch": 0.1883866521164781, + "grad_norm": 0.3915660083293915, + "learning_rate": 0.0001, + "loss": 1.8261, + "step": 1640 + }, + { + "epoch": 0.1885015220263052, + "grad_norm": 0.3483685851097107, + "learning_rate": 0.0001, + "loss": 1.5901, + "step": 1641 + }, + { + "epoch": 0.18861639193613233, + "grad_norm": 0.35304656624794006, + "learning_rate": 0.0001, + "loss": 1.6511, + "step": 1642 + }, + { + "epoch": 0.18873126184595945, + "grad_norm": 0.3424839973449707, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 1643 + }, + { + "epoch": 0.18884613175578657, + "grad_norm": 0.3762650489807129, + "learning_rate": 0.0001, + "loss": 1.7361, + "step": 1644 + }, + { + "epoch": 0.1889610016656137, + "grad_norm": 0.3635323643684387, + "learning_rate": 0.0001, + "loss": 1.7845, + "step": 1645 + }, + { + "epoch": 0.1890758715754408, + "grad_norm": 0.36258241534233093, + "learning_rate": 0.0001, + "loss": 1.8612, + "step": 1646 + }, + { + "epoch": 0.18919074148526793, + "grad_norm": 0.35478660464286804, + "learning_rate": 0.0001, + "loss": 1.7232, + "step": 1647 + }, + { + "epoch": 0.18930561139509505, + "grad_norm": 0.38521048426628113, + "learning_rate": 0.0001, + "loss": 1.7481, + "step": 1648 + }, + { + "epoch": 0.18942048130492217, + "grad_norm": 0.34351152181625366, + "learning_rate": 0.0001, + "loss": 1.5854, + "step": 1649 + }, + { + "epoch": 0.1895353512147493, + "grad_norm": 0.38085636496543884, + "learning_rate": 0.0001, + "loss": 1.8211, + "step": 1650 + }, + { + "epoch": 0.1896502211245764, + "grad_norm": 0.3599552512168884, + "learning_rate": 0.0001, + "loss": 1.6675, + "step": 1651 + }, + { + "epoch": 0.18976509103440353, + "grad_norm": 0.3752254247665405, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 1652 + }, + { + "epoch": 0.18987996094423065, + "grad_norm": 0.3557283580303192, + "learning_rate": 0.0001, + "loss": 1.8026, + "step": 1653 + }, + { + "epoch": 0.18999483085405777, + "grad_norm": 0.3593176603317261, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 1654 + }, + { + "epoch": 0.1901097007638849, + "grad_norm": 0.36632347106933594, + "learning_rate": 0.0001, + "loss": 1.751, + "step": 1655 + }, + { + "epoch": 0.190224570673712, + "grad_norm": 0.36848095059394836, + "learning_rate": 0.0001, + "loss": 1.7481, + "step": 1656 + }, + { + "epoch": 0.19033944058353913, + "grad_norm": 0.35497191548347473, + "learning_rate": 0.0001, + "loss": 1.6606, + "step": 1657 + }, + { + "epoch": 0.19045431049336625, + "grad_norm": 0.3646465837955475, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 1658 + }, + { + "epoch": 0.19056918040319337, + "grad_norm": 0.3539585471153259, + "learning_rate": 0.0001, + "loss": 1.6751, + "step": 1659 + }, + { + "epoch": 0.1906840503130205, + "grad_norm": 0.36736389994621277, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 1660 + }, + { + "epoch": 0.1907989202228476, + "grad_norm": 0.38588473200798035, + "learning_rate": 0.0001, + "loss": 1.7653, + "step": 1661 + }, + { + "epoch": 0.19091379013267473, + "grad_norm": 0.38840097188949585, + "learning_rate": 0.0001, + "loss": 1.6896, + "step": 1662 + }, + { + "epoch": 0.19102866004250188, + "grad_norm": 0.34677135944366455, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 1663 + }, + { + "epoch": 0.191143529952329, + "grad_norm": 0.3521466553211212, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 1664 + }, + { + "epoch": 0.19125839986215612, + "grad_norm": 0.34969663619995117, + "learning_rate": 0.0001, + "loss": 1.7715, + "step": 1665 + }, + { + "epoch": 0.19137326977198324, + "grad_norm": 0.3782643973827362, + "learning_rate": 0.0001, + "loss": 1.777, + "step": 1666 + }, + { + "epoch": 0.19148813968181036, + "grad_norm": 0.3731124699115753, + "learning_rate": 0.0001, + "loss": 1.7143, + "step": 1667 + }, + { + "epoch": 0.19160300959163748, + "grad_norm": 0.37945446372032166, + "learning_rate": 0.0001, + "loss": 1.7618, + "step": 1668 + }, + { + "epoch": 0.1917178795014646, + "grad_norm": 0.331589937210083, + "learning_rate": 0.0001, + "loss": 1.6613, + "step": 1669 + }, + { + "epoch": 0.19183274941129172, + "grad_norm": 0.3730468451976776, + "learning_rate": 0.0001, + "loss": 1.611, + "step": 1670 + }, + { + "epoch": 0.19194761932111884, + "grad_norm": 0.40152809023857117, + "learning_rate": 0.0001, + "loss": 1.9885, + "step": 1671 + }, + { + "epoch": 0.19206248923094596, + "grad_norm": 0.3776914179325104, + "learning_rate": 0.0001, + "loss": 1.8431, + "step": 1672 + }, + { + "epoch": 0.19217735914077308, + "grad_norm": 0.3614851236343384, + "learning_rate": 0.0001, + "loss": 1.7252, + "step": 1673 + }, + { + "epoch": 0.1922922290506002, + "grad_norm": 0.33890727162361145, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 1674 + }, + { + "epoch": 0.19240709896042732, + "grad_norm": 0.3711831867694855, + "learning_rate": 0.0001, + "loss": 1.8968, + "step": 1675 + }, + { + "epoch": 0.19252196887025444, + "grad_norm": 0.407746285200119, + "learning_rate": 0.0001, + "loss": 1.6691, + "step": 1676 + }, + { + "epoch": 0.19263683878008156, + "grad_norm": 0.3913660943508148, + "learning_rate": 0.0001, + "loss": 1.6698, + "step": 1677 + }, + { + "epoch": 0.19275170868990868, + "grad_norm": 0.3649699091911316, + "learning_rate": 0.0001, + "loss": 1.8171, + "step": 1678 + }, + { + "epoch": 0.1928665785997358, + "grad_norm": 0.34938865900039673, + "learning_rate": 0.0001, + "loss": 1.7187, + "step": 1679 + }, + { + "epoch": 0.19298144850956292, + "grad_norm": 0.3610716462135315, + "learning_rate": 0.0001, + "loss": 1.7097, + "step": 1680 + }, + { + "epoch": 0.19309631841939004, + "grad_norm": 0.3534272015094757, + "learning_rate": 0.0001, + "loss": 1.7907, + "step": 1681 + }, + { + "epoch": 0.19321118832921716, + "grad_norm": 0.3574727177619934, + "learning_rate": 0.0001, + "loss": 1.7394, + "step": 1682 + }, + { + "epoch": 0.19332605823904428, + "grad_norm": 0.32285967469215393, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 1683 + }, + { + "epoch": 0.1934409281488714, + "grad_norm": 0.36247870326042175, + "learning_rate": 0.0001, + "loss": 1.7193, + "step": 1684 + }, + { + "epoch": 0.19355579805869852, + "grad_norm": 0.3250444233417511, + "learning_rate": 0.0001, + "loss": 1.5945, + "step": 1685 + }, + { + "epoch": 0.19367066796852564, + "grad_norm": 0.3848918676376343, + "learning_rate": 0.0001, + "loss": 1.8016, + "step": 1686 + }, + { + "epoch": 0.19378553787835276, + "grad_norm": 0.3321680426597595, + "learning_rate": 0.0001, + "loss": 1.5136, + "step": 1687 + }, + { + "epoch": 0.19390040778817988, + "grad_norm": 0.3534335196018219, + "learning_rate": 0.0001, + "loss": 1.8101, + "step": 1688 + }, + { + "epoch": 0.194015277698007, + "grad_norm": 0.3084717094898224, + "learning_rate": 0.0001, + "loss": 1.2239, + "step": 1689 + }, + { + "epoch": 0.19413014760783412, + "grad_norm": 0.36308553814888, + "learning_rate": 0.0001, + "loss": 1.699, + "step": 1690 + }, + { + "epoch": 0.19424501751766124, + "grad_norm": 0.35767173767089844, + "learning_rate": 0.0001, + "loss": 1.7313, + "step": 1691 + }, + { + "epoch": 0.19435988742748836, + "grad_norm": 0.3621061444282532, + "learning_rate": 0.0001, + "loss": 1.5605, + "step": 1692 + }, + { + "epoch": 0.19447475733731548, + "grad_norm": 0.3489883840084076, + "learning_rate": 0.0001, + "loss": 1.7473, + "step": 1693 + }, + { + "epoch": 0.1945896272471426, + "grad_norm": 0.35943081974983215, + "learning_rate": 0.0001, + "loss": 1.8841, + "step": 1694 + }, + { + "epoch": 0.19470449715696972, + "grad_norm": 0.3859713673591614, + "learning_rate": 0.0001, + "loss": 1.7675, + "step": 1695 + }, + { + "epoch": 0.19481936706679684, + "grad_norm": 0.3559940457344055, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 1696 + }, + { + "epoch": 0.194934236976624, + "grad_norm": 0.37918147444725037, + "learning_rate": 0.0001, + "loss": 1.7507, + "step": 1697 + }, + { + "epoch": 0.1950491068864511, + "grad_norm": 0.36371055245399475, + "learning_rate": 0.0001, + "loss": 1.7621, + "step": 1698 + }, + { + "epoch": 0.19516397679627823, + "grad_norm": 0.34695690870285034, + "learning_rate": 0.0001, + "loss": 1.7555, + "step": 1699 + }, + { + "epoch": 0.19527884670610535, + "grad_norm": 0.35389262437820435, + "learning_rate": 0.0001, + "loss": 1.6632, + "step": 1700 + }, + { + "epoch": 0.19539371661593247, + "grad_norm": 0.35778507590293884, + "learning_rate": 0.0001, + "loss": 1.7939, + "step": 1701 + }, + { + "epoch": 0.1955085865257596, + "grad_norm": 0.37663915753364563, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 1702 + }, + { + "epoch": 0.1956234564355867, + "grad_norm": 0.3157816529273987, + "learning_rate": 0.0001, + "loss": 1.3818, + "step": 1703 + }, + { + "epoch": 0.19573832634541383, + "grad_norm": 0.35252466797828674, + "learning_rate": 0.0001, + "loss": 1.7476, + "step": 1704 + }, + { + "epoch": 0.19585319625524095, + "grad_norm": 0.3793637752532959, + "learning_rate": 0.0001, + "loss": 1.8395, + "step": 1705 + }, + { + "epoch": 0.19596806616506807, + "grad_norm": 0.3287891149520874, + "learning_rate": 0.0001, + "loss": 1.4472, + "step": 1706 + }, + { + "epoch": 0.1960829360748952, + "grad_norm": 0.3720473349094391, + "learning_rate": 0.0001, + "loss": 1.8553, + "step": 1707 + }, + { + "epoch": 0.1961978059847223, + "grad_norm": 0.3807579576969147, + "learning_rate": 0.0001, + "loss": 1.8815, + "step": 1708 + }, + { + "epoch": 0.19631267589454943, + "grad_norm": 0.3885481655597687, + "learning_rate": 0.0001, + "loss": 1.8668, + "step": 1709 + }, + { + "epoch": 0.19642754580437655, + "grad_norm": 0.3450814187526703, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 1710 + }, + { + "epoch": 0.19654241571420367, + "grad_norm": 0.3540419936180115, + "learning_rate": 0.0001, + "loss": 1.6886, + "step": 1711 + }, + { + "epoch": 0.1966572856240308, + "grad_norm": 0.38482004404067993, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 1712 + }, + { + "epoch": 0.1967721555338579, + "grad_norm": 0.37782052159309387, + "learning_rate": 0.0001, + "loss": 1.6786, + "step": 1713 + }, + { + "epoch": 0.19688702544368503, + "grad_norm": 0.3788130283355713, + "learning_rate": 0.0001, + "loss": 1.6707, + "step": 1714 + }, + { + "epoch": 0.19700189535351215, + "grad_norm": 0.3536849617958069, + "learning_rate": 0.0001, + "loss": 1.7248, + "step": 1715 + }, + { + "epoch": 0.19711676526333927, + "grad_norm": 0.4016459584236145, + "learning_rate": 0.0001, + "loss": 1.4351, + "step": 1716 + }, + { + "epoch": 0.1972316351731664, + "grad_norm": 0.34957659244537354, + "learning_rate": 0.0001, + "loss": 1.7459, + "step": 1717 + }, + { + "epoch": 0.1973465050829935, + "grad_norm": 0.36093124747276306, + "learning_rate": 0.0001, + "loss": 1.7523, + "step": 1718 + }, + { + "epoch": 0.19746137499282063, + "grad_norm": 0.36018863320350647, + "learning_rate": 0.0001, + "loss": 1.607, + "step": 1719 + }, + { + "epoch": 0.19757624490264775, + "grad_norm": 0.37135326862335205, + "learning_rate": 0.0001, + "loss": 1.4374, + "step": 1720 + }, + { + "epoch": 0.19769111481247487, + "grad_norm": 0.3906667232513428, + "learning_rate": 0.0001, + "loss": 1.7177, + "step": 1721 + }, + { + "epoch": 0.197805984722302, + "grad_norm": 0.3404282331466675, + "learning_rate": 0.0001, + "loss": 1.4111, + "step": 1722 + }, + { + "epoch": 0.1979208546321291, + "grad_norm": 0.39864858984947205, + "learning_rate": 0.0001, + "loss": 1.7668, + "step": 1723 + }, + { + "epoch": 0.19803572454195623, + "grad_norm": 0.38260820508003235, + "learning_rate": 0.0001, + "loss": 1.4592, + "step": 1724 + }, + { + "epoch": 0.19815059445178335, + "grad_norm": 0.35546496510505676, + "learning_rate": 0.0001, + "loss": 1.7121, + "step": 1725 + }, + { + "epoch": 0.19826546436161047, + "grad_norm": 0.34984710812568665, + "learning_rate": 0.0001, + "loss": 1.7286, + "step": 1726 + }, + { + "epoch": 0.1983803342714376, + "grad_norm": 0.38462913036346436, + "learning_rate": 0.0001, + "loss": 1.9247, + "step": 1727 + }, + { + "epoch": 0.1984952041812647, + "grad_norm": 0.3563911020755768, + "learning_rate": 0.0001, + "loss": 1.6917, + "step": 1728 + }, + { + "epoch": 0.19861007409109183, + "grad_norm": 0.3691602647304535, + "learning_rate": 0.0001, + "loss": 1.6081, + "step": 1729 + }, + { + "epoch": 0.19872494400091895, + "grad_norm": 0.36970895528793335, + "learning_rate": 0.0001, + "loss": 1.7104, + "step": 1730 + }, + { + "epoch": 0.1988398139107461, + "grad_norm": 0.3286248445510864, + "learning_rate": 0.0001, + "loss": 1.4056, + "step": 1731 + }, + { + "epoch": 0.1989546838205732, + "grad_norm": 0.3443751037120819, + "learning_rate": 0.0001, + "loss": 1.5495, + "step": 1732 + }, + { + "epoch": 0.19906955373040033, + "grad_norm": 0.3585871458053589, + "learning_rate": 0.0001, + "loss": 1.8211, + "step": 1733 + }, + { + "epoch": 0.19918442364022745, + "grad_norm": 0.37067654728889465, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 1734 + }, + { + "epoch": 0.19929929355005457, + "grad_norm": 0.3535691797733307, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 1735 + }, + { + "epoch": 0.1994141634598817, + "grad_norm": 0.37914103269577026, + "learning_rate": 0.0001, + "loss": 1.6576, + "step": 1736 + }, + { + "epoch": 0.1995290333697088, + "grad_norm": 0.3962135910987854, + "learning_rate": 0.0001, + "loss": 1.625, + "step": 1737 + }, + { + "epoch": 0.19964390327953593, + "grad_norm": 0.37456363439559937, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 1738 + }, + { + "epoch": 0.19975877318936305, + "grad_norm": 0.36762315034866333, + "learning_rate": 0.0001, + "loss": 1.7478, + "step": 1739 + }, + { + "epoch": 0.19987364309919017, + "grad_norm": 0.37871554493904114, + "learning_rate": 0.0001, + "loss": 1.7603, + "step": 1740 + }, + { + "epoch": 0.1999885130090173, + "grad_norm": 0.39336419105529785, + "learning_rate": 0.0001, + "loss": 1.7888, + "step": 1741 + }, + { + "epoch": 0.2001033829188444, + "grad_norm": 0.366931289434433, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 1742 + }, + { + "epoch": 0.20021825282867153, + "grad_norm": 0.37387847900390625, + "learning_rate": 0.0001, + "loss": 1.6847, + "step": 1743 + }, + { + "epoch": 0.20033312273849865, + "grad_norm": 0.3491780459880829, + "learning_rate": 0.0001, + "loss": 1.8099, + "step": 1744 + }, + { + "epoch": 0.20044799264832577, + "grad_norm": 0.39339229464530945, + "learning_rate": 0.0001, + "loss": 1.7889, + "step": 1745 + }, + { + "epoch": 0.2005628625581529, + "grad_norm": 0.3711383640766144, + "learning_rate": 0.0001, + "loss": 1.853, + "step": 1746 + }, + { + "epoch": 0.20067773246798, + "grad_norm": 0.33763977885246277, + "learning_rate": 0.0001, + "loss": 1.6071, + "step": 1747 + }, + { + "epoch": 0.20079260237780713, + "grad_norm": 0.34965288639068604, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 1748 + }, + { + "epoch": 0.20090747228763425, + "grad_norm": 0.3817383646965027, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 1749 + }, + { + "epoch": 0.20102234219746137, + "grad_norm": 0.37496039271354675, + "learning_rate": 0.0001, + "loss": 1.711, + "step": 1750 + }, + { + "epoch": 0.2011372121072885, + "grad_norm": 0.37966370582580566, + "learning_rate": 0.0001, + "loss": 1.8306, + "step": 1751 + }, + { + "epoch": 0.2012520820171156, + "grad_norm": 0.3994872272014618, + "learning_rate": 0.0001, + "loss": 1.9473, + "step": 1752 + }, + { + "epoch": 0.20136695192694273, + "grad_norm": 0.3680518865585327, + "learning_rate": 0.0001, + "loss": 1.7061, + "step": 1753 + }, + { + "epoch": 0.20148182183676985, + "grad_norm": 0.31908658146858215, + "learning_rate": 0.0001, + "loss": 1.4684, + "step": 1754 + }, + { + "epoch": 0.20159669174659697, + "grad_norm": 0.362386554479599, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 1755 + }, + { + "epoch": 0.2017115616564241, + "grad_norm": 0.35823360085487366, + "learning_rate": 0.0001, + "loss": 1.7714, + "step": 1756 + }, + { + "epoch": 0.2018264315662512, + "grad_norm": 0.3523258566856384, + "learning_rate": 0.0001, + "loss": 1.8227, + "step": 1757 + }, + { + "epoch": 0.20194130147607833, + "grad_norm": 0.348457396030426, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 1758 + }, + { + "epoch": 0.20205617138590545, + "grad_norm": 0.35159286856651306, + "learning_rate": 0.0001, + "loss": 1.4644, + "step": 1759 + }, + { + "epoch": 0.20217104129573257, + "grad_norm": 0.38442832231521606, + "learning_rate": 0.0001, + "loss": 1.7378, + "step": 1760 + }, + { + "epoch": 0.2022859112055597, + "grad_norm": 0.3663921356201172, + "learning_rate": 0.0001, + "loss": 1.8518, + "step": 1761 + }, + { + "epoch": 0.2024007811153868, + "grad_norm": 0.3565858006477356, + "learning_rate": 0.0001, + "loss": 1.6532, + "step": 1762 + }, + { + "epoch": 0.20251565102521393, + "grad_norm": 0.37562236189842224, + "learning_rate": 0.0001, + "loss": 1.7504, + "step": 1763 + }, + { + "epoch": 0.20263052093504105, + "grad_norm": 0.3319898247718811, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 1764 + }, + { + "epoch": 0.20274539084486817, + "grad_norm": 0.35017985105514526, + "learning_rate": 0.0001, + "loss": 1.6769, + "step": 1765 + }, + { + "epoch": 0.20286026075469532, + "grad_norm": 0.3883030116558075, + "learning_rate": 0.0001, + "loss": 1.6532, + "step": 1766 + }, + { + "epoch": 0.20297513066452244, + "grad_norm": 0.34447354078292847, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 1767 + }, + { + "epoch": 0.20309000057434956, + "grad_norm": 0.3480866253376007, + "learning_rate": 0.0001, + "loss": 1.6489, + "step": 1768 + }, + { + "epoch": 0.20320487048417668, + "grad_norm": 0.3792140781879425, + "learning_rate": 0.0001, + "loss": 1.7436, + "step": 1769 + }, + { + "epoch": 0.2033197403940038, + "grad_norm": 0.3636226952075958, + "learning_rate": 0.0001, + "loss": 1.4727, + "step": 1770 + }, + { + "epoch": 0.20343461030383092, + "grad_norm": 0.3649579584598541, + "learning_rate": 0.0001, + "loss": 1.5061, + "step": 1771 + }, + { + "epoch": 0.20354948021365804, + "grad_norm": 0.34622448682785034, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 1772 + }, + { + "epoch": 0.20366435012348516, + "grad_norm": 0.4073683023452759, + "learning_rate": 0.0001, + "loss": 1.9036, + "step": 1773 + }, + { + "epoch": 0.20377922003331228, + "grad_norm": 0.35996997356414795, + "learning_rate": 0.0001, + "loss": 1.8344, + "step": 1774 + }, + { + "epoch": 0.2038940899431394, + "grad_norm": 0.35649874806404114, + "learning_rate": 0.0001, + "loss": 1.4739, + "step": 1775 + }, + { + "epoch": 0.20400895985296652, + "grad_norm": 0.3954346477985382, + "learning_rate": 0.0001, + "loss": 1.9226, + "step": 1776 + }, + { + "epoch": 0.20412382976279364, + "grad_norm": 0.3589356243610382, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 1777 + }, + { + "epoch": 0.20423869967262076, + "grad_norm": 0.36763888597488403, + "learning_rate": 0.0001, + "loss": 1.7394, + "step": 1778 + }, + { + "epoch": 0.20435356958244788, + "grad_norm": 0.38217705488204956, + "learning_rate": 0.0001, + "loss": 1.9139, + "step": 1779 + }, + { + "epoch": 0.204468439492275, + "grad_norm": 0.39067110419273376, + "learning_rate": 0.0001, + "loss": 1.8865, + "step": 1780 + }, + { + "epoch": 0.20458330940210212, + "grad_norm": 0.369056761264801, + "learning_rate": 0.0001, + "loss": 1.7165, + "step": 1781 + }, + { + "epoch": 0.20469817931192924, + "grad_norm": 0.3984009623527527, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 1782 + }, + { + "epoch": 0.20481304922175636, + "grad_norm": 0.3637178838253021, + "learning_rate": 0.0001, + "loss": 1.7317, + "step": 1783 + }, + { + "epoch": 0.20492791913158348, + "grad_norm": 0.3644060790538788, + "learning_rate": 0.0001, + "loss": 1.7301, + "step": 1784 + }, + { + "epoch": 0.2050427890414106, + "grad_norm": 0.36648574471473694, + "learning_rate": 0.0001, + "loss": 1.793, + "step": 1785 + }, + { + "epoch": 0.20515765895123772, + "grad_norm": 0.37490981817245483, + "learning_rate": 0.0001, + "loss": 1.7762, + "step": 1786 + }, + { + "epoch": 0.20527252886106484, + "grad_norm": 0.35913270711898804, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 1787 + }, + { + "epoch": 0.20538739877089196, + "grad_norm": 0.35344579815864563, + "learning_rate": 0.0001, + "loss": 1.7904, + "step": 1788 + }, + { + "epoch": 0.20550226868071908, + "grad_norm": 0.4043060839176178, + "learning_rate": 0.0001, + "loss": 1.8669, + "step": 1789 + }, + { + "epoch": 0.2056171385905462, + "grad_norm": 0.35678645968437195, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 1790 + }, + { + "epoch": 0.20573200850037332, + "grad_norm": 0.3554267883300781, + "learning_rate": 0.0001, + "loss": 1.5462, + "step": 1791 + }, + { + "epoch": 0.20584687841020044, + "grad_norm": 0.3699265122413635, + "learning_rate": 0.0001, + "loss": 1.8192, + "step": 1792 + }, + { + "epoch": 0.20596174832002756, + "grad_norm": 0.3494689464569092, + "learning_rate": 0.0001, + "loss": 1.4697, + "step": 1793 + }, + { + "epoch": 0.20607661822985468, + "grad_norm": 0.3618113398551941, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 1794 + }, + { + "epoch": 0.2061914881396818, + "grad_norm": 0.36774349212646484, + "learning_rate": 0.0001, + "loss": 1.769, + "step": 1795 + }, + { + "epoch": 0.20630635804950892, + "grad_norm": 0.3754447400569916, + "learning_rate": 0.0001, + "loss": 1.549, + "step": 1796 + }, + { + "epoch": 0.20642122795933604, + "grad_norm": 0.4026842713356018, + "learning_rate": 0.0001, + "loss": 1.832, + "step": 1797 + }, + { + "epoch": 0.20653609786916316, + "grad_norm": 0.34931662678718567, + "learning_rate": 0.0001, + "loss": 1.6936, + "step": 1798 + }, + { + "epoch": 0.20665096777899028, + "grad_norm": 0.35631394386291504, + "learning_rate": 0.0001, + "loss": 1.7474, + "step": 1799 + }, + { + "epoch": 0.20676583768881743, + "grad_norm": 0.35016921162605286, + "learning_rate": 0.0001, + "loss": 1.8252, + "step": 1800 + }, + { + "epoch": 0.20688070759864455, + "grad_norm": 0.36527907848358154, + "learning_rate": 0.0001, + "loss": 1.7728, + "step": 1801 + }, + { + "epoch": 0.20699557750847167, + "grad_norm": 0.375942200422287, + "learning_rate": 0.0001, + "loss": 1.76, + "step": 1802 + }, + { + "epoch": 0.2071104474182988, + "grad_norm": 0.3598606586456299, + "learning_rate": 0.0001, + "loss": 1.7765, + "step": 1803 + }, + { + "epoch": 0.2072253173281259, + "grad_norm": 0.3487381041049957, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 1804 + }, + { + "epoch": 0.20734018723795303, + "grad_norm": 0.33189913630485535, + "learning_rate": 0.0001, + "loss": 1.5354, + "step": 1805 + }, + { + "epoch": 0.20745505714778015, + "grad_norm": 0.3427751660346985, + "learning_rate": 0.0001, + "loss": 1.4288, + "step": 1806 + }, + { + "epoch": 0.20756992705760727, + "grad_norm": 0.3979537785053253, + "learning_rate": 0.0001, + "loss": 1.6666, + "step": 1807 + }, + { + "epoch": 0.2076847969674344, + "grad_norm": 0.37315791845321655, + "learning_rate": 0.0001, + "loss": 1.5901, + "step": 1808 + }, + { + "epoch": 0.2077996668772615, + "grad_norm": 0.37909650802612305, + "learning_rate": 0.0001, + "loss": 1.7111, + "step": 1809 + }, + { + "epoch": 0.20791453678708863, + "grad_norm": 0.3696240186691284, + "learning_rate": 0.0001, + "loss": 1.7612, + "step": 1810 + }, + { + "epoch": 0.20802940669691575, + "grad_norm": 0.38484904170036316, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 1811 + }, + { + "epoch": 0.20814427660674287, + "grad_norm": 0.3398043215274811, + "learning_rate": 0.0001, + "loss": 1.6615, + "step": 1812 + }, + { + "epoch": 0.20825914651657, + "grad_norm": 0.3796899616718292, + "learning_rate": 0.0001, + "loss": 1.8264, + "step": 1813 + }, + { + "epoch": 0.2083740164263971, + "grad_norm": 0.38819029927253723, + "learning_rate": 0.0001, + "loss": 1.8616, + "step": 1814 + }, + { + "epoch": 0.20848888633622423, + "grad_norm": 0.4355449378490448, + "learning_rate": 0.0001, + "loss": 1.8042, + "step": 1815 + }, + { + "epoch": 0.20860375624605135, + "grad_norm": 0.37194108963012695, + "learning_rate": 0.0001, + "loss": 1.7524, + "step": 1816 + }, + { + "epoch": 0.20871862615587847, + "grad_norm": 0.3792515993118286, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 1817 + }, + { + "epoch": 0.2088334960657056, + "grad_norm": 0.37876373529434204, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 1818 + }, + { + "epoch": 0.2089483659755327, + "grad_norm": 0.34558865427970886, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 1819 + }, + { + "epoch": 0.20906323588535983, + "grad_norm": 0.43524423241615295, + "learning_rate": 0.0001, + "loss": 2.0707, + "step": 1820 + }, + { + "epoch": 0.20917810579518695, + "grad_norm": 0.4018253982067108, + "learning_rate": 0.0001, + "loss": 1.8651, + "step": 1821 + }, + { + "epoch": 0.20929297570501407, + "grad_norm": 0.3796786367893219, + "learning_rate": 0.0001, + "loss": 1.6014, + "step": 1822 + }, + { + "epoch": 0.2094078456148412, + "grad_norm": 0.35122066736221313, + "learning_rate": 0.0001, + "loss": 1.7398, + "step": 1823 + }, + { + "epoch": 0.2095227155246683, + "grad_norm": 0.3692324161529541, + "learning_rate": 0.0001, + "loss": 1.4884, + "step": 1824 + }, + { + "epoch": 0.20963758543449543, + "grad_norm": 0.39032313227653503, + "learning_rate": 0.0001, + "loss": 1.7603, + "step": 1825 + }, + { + "epoch": 0.20975245534432255, + "grad_norm": 0.34031352400779724, + "learning_rate": 0.0001, + "loss": 1.4029, + "step": 1826 + }, + { + "epoch": 0.20986732525414967, + "grad_norm": 0.40523627400398254, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 1827 + }, + { + "epoch": 0.2099821951639768, + "grad_norm": 0.3522171378135681, + "learning_rate": 0.0001, + "loss": 1.7032, + "step": 1828 + }, + { + "epoch": 0.2100970650738039, + "grad_norm": 0.3715449273586273, + "learning_rate": 0.0001, + "loss": 1.7735, + "step": 1829 + }, + { + "epoch": 0.21021193498363103, + "grad_norm": 0.39641836285591125, + "learning_rate": 0.0001, + "loss": 1.8061, + "step": 1830 + }, + { + "epoch": 0.21032680489345815, + "grad_norm": 0.3934457302093506, + "learning_rate": 0.0001, + "loss": 1.7792, + "step": 1831 + }, + { + "epoch": 0.21044167480328527, + "grad_norm": 0.35226938128471375, + "learning_rate": 0.0001, + "loss": 1.7576, + "step": 1832 + }, + { + "epoch": 0.2105565447131124, + "grad_norm": 0.36764612793922424, + "learning_rate": 0.0001, + "loss": 1.819, + "step": 1833 + }, + { + "epoch": 0.21067141462293953, + "grad_norm": 0.36354872584342957, + "learning_rate": 0.0001, + "loss": 1.5628, + "step": 1834 + }, + { + "epoch": 0.21078628453276665, + "grad_norm": 0.3655260503292084, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 1835 + }, + { + "epoch": 0.21090115444259377, + "grad_norm": 0.36574462056159973, + "learning_rate": 0.0001, + "loss": 1.8352, + "step": 1836 + }, + { + "epoch": 0.2110160243524209, + "grad_norm": 0.3510647118091583, + "learning_rate": 0.0001, + "loss": 1.671, + "step": 1837 + }, + { + "epoch": 0.21113089426224801, + "grad_norm": 0.38021084666252136, + "learning_rate": 0.0001, + "loss": 1.7853, + "step": 1838 + }, + { + "epoch": 0.21124576417207513, + "grad_norm": 0.38449275493621826, + "learning_rate": 0.0001, + "loss": 1.68, + "step": 1839 + }, + { + "epoch": 0.21136063408190225, + "grad_norm": 0.378221720457077, + "learning_rate": 0.0001, + "loss": 1.7663, + "step": 1840 + }, + { + "epoch": 0.21147550399172937, + "grad_norm": 0.3675908148288727, + "learning_rate": 0.0001, + "loss": 1.7717, + "step": 1841 + }, + { + "epoch": 0.2115903739015565, + "grad_norm": 0.3863251805305481, + "learning_rate": 0.0001, + "loss": 1.686, + "step": 1842 + }, + { + "epoch": 0.21170524381138361, + "grad_norm": 0.4383453130722046, + "learning_rate": 0.0001, + "loss": 1.9812, + "step": 1843 + }, + { + "epoch": 0.21182011372121073, + "grad_norm": 0.3583828806877136, + "learning_rate": 0.0001, + "loss": 1.7918, + "step": 1844 + }, + { + "epoch": 0.21193498363103785, + "grad_norm": 0.39561352133750916, + "learning_rate": 0.0001, + "loss": 1.673, + "step": 1845 + }, + { + "epoch": 0.21204985354086497, + "grad_norm": 0.37109145522117615, + "learning_rate": 0.0001, + "loss": 1.7545, + "step": 1846 + }, + { + "epoch": 0.2121647234506921, + "grad_norm": 0.3461610972881317, + "learning_rate": 0.0001, + "loss": 1.4589, + "step": 1847 + }, + { + "epoch": 0.21227959336051921, + "grad_norm": 0.43045949935913086, + "learning_rate": 0.0001, + "loss": 1.7546, + "step": 1848 + }, + { + "epoch": 0.21239446327034633, + "grad_norm": 0.4053126871585846, + "learning_rate": 0.0001, + "loss": 1.6525, + "step": 1849 + }, + { + "epoch": 0.21250933318017345, + "grad_norm": 0.35681119561195374, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 1850 + }, + { + "epoch": 0.21262420309000057, + "grad_norm": 0.3934498429298401, + "learning_rate": 0.0001, + "loss": 1.5628, + "step": 1851 + }, + { + "epoch": 0.2127390729998277, + "grad_norm": 0.3668583035469055, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 1852 + }, + { + "epoch": 0.21285394290965481, + "grad_norm": 0.36090970039367676, + "learning_rate": 0.0001, + "loss": 1.8151, + "step": 1853 + }, + { + "epoch": 0.21296881281948193, + "grad_norm": 0.3945942521095276, + "learning_rate": 0.0001, + "loss": 1.7807, + "step": 1854 + }, + { + "epoch": 0.21308368272930905, + "grad_norm": 0.34316137433052063, + "learning_rate": 0.0001, + "loss": 1.5175, + "step": 1855 + }, + { + "epoch": 0.21319855263913617, + "grad_norm": 0.3736970126628876, + "learning_rate": 0.0001, + "loss": 1.66, + "step": 1856 + }, + { + "epoch": 0.2133134225489633, + "grad_norm": 0.36034896969795227, + "learning_rate": 0.0001, + "loss": 1.6949, + "step": 1857 + }, + { + "epoch": 0.21342829245879041, + "grad_norm": 0.38381823897361755, + "learning_rate": 0.0001, + "loss": 1.7625, + "step": 1858 + }, + { + "epoch": 0.21354316236861753, + "grad_norm": 0.39188024401664734, + "learning_rate": 0.0001, + "loss": 1.6711, + "step": 1859 + }, + { + "epoch": 0.21365803227844465, + "grad_norm": 0.3275800049304962, + "learning_rate": 0.0001, + "loss": 1.5081, + "step": 1860 + }, + { + "epoch": 0.21377290218827177, + "grad_norm": 0.38419318199157715, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 1861 + }, + { + "epoch": 0.2138877720980989, + "grad_norm": 0.3843541443347931, + "learning_rate": 0.0001, + "loss": 1.7979, + "step": 1862 + }, + { + "epoch": 0.21400264200792601, + "grad_norm": 0.4217662513256073, + "learning_rate": 0.0001, + "loss": 1.7142, + "step": 1863 + }, + { + "epoch": 0.21411751191775313, + "grad_norm": 0.37826740741729736, + "learning_rate": 0.0001, + "loss": 1.7926, + "step": 1864 + }, + { + "epoch": 0.21423238182758025, + "grad_norm": 0.38108256459236145, + "learning_rate": 0.0001, + "loss": 1.672, + "step": 1865 + }, + { + "epoch": 0.21434725173740737, + "grad_norm": 0.3516540229320526, + "learning_rate": 0.0001, + "loss": 1.571, + "step": 1866 + }, + { + "epoch": 0.2144621216472345, + "grad_norm": 0.3580091595649719, + "learning_rate": 0.0001, + "loss": 1.7167, + "step": 1867 + }, + { + "epoch": 0.21457699155706164, + "grad_norm": 0.3416488468647003, + "learning_rate": 0.0001, + "loss": 1.4516, + "step": 1868 + }, + { + "epoch": 0.21469186146688876, + "grad_norm": 0.3605569303035736, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 1869 + }, + { + "epoch": 0.21480673137671588, + "grad_norm": 0.3645571768283844, + "learning_rate": 0.0001, + "loss": 1.6764, + "step": 1870 + }, + { + "epoch": 0.214921601286543, + "grad_norm": 0.3980044424533844, + "learning_rate": 0.0001, + "loss": 1.7082, + "step": 1871 + }, + { + "epoch": 0.21503647119637012, + "grad_norm": 0.3569796681404114, + "learning_rate": 0.0001, + "loss": 1.572, + "step": 1872 + }, + { + "epoch": 0.21515134110619724, + "grad_norm": 0.37738198041915894, + "learning_rate": 0.0001, + "loss": 1.6913, + "step": 1873 + }, + { + "epoch": 0.21526621101602436, + "grad_norm": 0.34949570894241333, + "learning_rate": 0.0001, + "loss": 1.6524, + "step": 1874 + }, + { + "epoch": 0.21538108092585148, + "grad_norm": 0.3858441114425659, + "learning_rate": 0.0001, + "loss": 1.7056, + "step": 1875 + }, + { + "epoch": 0.2154959508356786, + "grad_norm": 0.33417800068855286, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 1876 + }, + { + "epoch": 0.21561082074550572, + "grad_norm": 0.363370418548584, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 1877 + }, + { + "epoch": 0.21572569065533284, + "grad_norm": 0.37960943579673767, + "learning_rate": 0.0001, + "loss": 1.8329, + "step": 1878 + }, + { + "epoch": 0.21584056056515996, + "grad_norm": 0.34242677688598633, + "learning_rate": 0.0001, + "loss": 1.6365, + "step": 1879 + }, + { + "epoch": 0.21595543047498708, + "grad_norm": 0.34529823064804077, + "learning_rate": 0.0001, + "loss": 1.7954, + "step": 1880 + }, + { + "epoch": 0.2160703003848142, + "grad_norm": 0.3592895567417145, + "learning_rate": 0.0001, + "loss": 1.7834, + "step": 1881 + }, + { + "epoch": 0.21618517029464132, + "grad_norm": 0.3785233795642853, + "learning_rate": 0.0001, + "loss": 1.585, + "step": 1882 + }, + { + "epoch": 0.21630004020446844, + "grad_norm": 0.3563402593135834, + "learning_rate": 0.0001, + "loss": 1.7312, + "step": 1883 + }, + { + "epoch": 0.21641491011429556, + "grad_norm": 0.4084146320819855, + "learning_rate": 0.0001, + "loss": 1.8166, + "step": 1884 + }, + { + "epoch": 0.21652978002412268, + "grad_norm": 0.3653600811958313, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 1885 + }, + { + "epoch": 0.2166446499339498, + "grad_norm": 0.34954777359962463, + "learning_rate": 0.0001, + "loss": 1.7595, + "step": 1886 + }, + { + "epoch": 0.21675951984377692, + "grad_norm": 0.34437569975852966, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 1887 + }, + { + "epoch": 0.21687438975360404, + "grad_norm": 0.37032750248908997, + "learning_rate": 0.0001, + "loss": 1.6742, + "step": 1888 + }, + { + "epoch": 0.21698925966343116, + "grad_norm": 0.36316627264022827, + "learning_rate": 0.0001, + "loss": 1.448, + "step": 1889 + }, + { + "epoch": 0.21710412957325828, + "grad_norm": 0.3642198145389557, + "learning_rate": 0.0001, + "loss": 1.7953, + "step": 1890 + }, + { + "epoch": 0.2172189994830854, + "grad_norm": 0.36583060026168823, + "learning_rate": 0.0001, + "loss": 1.5196, + "step": 1891 + }, + { + "epoch": 0.21733386939291252, + "grad_norm": 0.35113057494163513, + "learning_rate": 0.0001, + "loss": 1.4707, + "step": 1892 + }, + { + "epoch": 0.21744873930273964, + "grad_norm": 0.4065643846988678, + "learning_rate": 0.0001, + "loss": 1.8893, + "step": 1893 + }, + { + "epoch": 0.21756360921256676, + "grad_norm": 0.36826783418655396, + "learning_rate": 0.0001, + "loss": 1.8333, + "step": 1894 + }, + { + "epoch": 0.21767847912239388, + "grad_norm": 0.3617238998413086, + "learning_rate": 0.0001, + "loss": 1.7314, + "step": 1895 + }, + { + "epoch": 0.217793349032221, + "grad_norm": 0.40174373984336853, + "learning_rate": 0.0001, + "loss": 1.8332, + "step": 1896 + }, + { + "epoch": 0.21790821894204812, + "grad_norm": 0.3792894184589386, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 1897 + }, + { + "epoch": 0.21802308885187524, + "grad_norm": 0.35536929965019226, + "learning_rate": 0.0001, + "loss": 1.69, + "step": 1898 + }, + { + "epoch": 0.21813795876170236, + "grad_norm": 0.36117202043533325, + "learning_rate": 0.0001, + "loss": 1.6592, + "step": 1899 + }, + { + "epoch": 0.21825282867152948, + "grad_norm": 0.3893747627735138, + "learning_rate": 0.0001, + "loss": 1.7219, + "step": 1900 + }, + { + "epoch": 0.2183676985813566, + "grad_norm": 0.40482643246650696, + "learning_rate": 0.0001, + "loss": 1.8143, + "step": 1901 + }, + { + "epoch": 0.21848256849118372, + "grad_norm": 0.35201117396354675, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 1902 + }, + { + "epoch": 0.21859743840101087, + "grad_norm": 0.359512060880661, + "learning_rate": 0.0001, + "loss": 1.7391, + "step": 1903 + }, + { + "epoch": 0.218712308310838, + "grad_norm": 0.4181397259235382, + "learning_rate": 0.0001, + "loss": 1.9794, + "step": 1904 + }, + { + "epoch": 0.2188271782206651, + "grad_norm": 0.3568892478942871, + "learning_rate": 0.0001, + "loss": 1.5442, + "step": 1905 + }, + { + "epoch": 0.21894204813049223, + "grad_norm": 0.3569229245185852, + "learning_rate": 0.0001, + "loss": 1.8519, + "step": 1906 + }, + { + "epoch": 0.21905691804031935, + "grad_norm": 0.3385428488254547, + "learning_rate": 0.0001, + "loss": 1.5492, + "step": 1907 + }, + { + "epoch": 0.21917178795014647, + "grad_norm": 0.35773056745529175, + "learning_rate": 0.0001, + "loss": 1.7635, + "step": 1908 + }, + { + "epoch": 0.2192866578599736, + "grad_norm": 0.37452182173728943, + "learning_rate": 0.0001, + "loss": 1.8033, + "step": 1909 + }, + { + "epoch": 0.2194015277698007, + "grad_norm": 0.38838356733322144, + "learning_rate": 0.0001, + "loss": 1.6672, + "step": 1910 + }, + { + "epoch": 0.21951639767962783, + "grad_norm": 0.3544471263885498, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 1911 + }, + { + "epoch": 0.21963126758945495, + "grad_norm": 0.3904416561126709, + "learning_rate": 0.0001, + "loss": 1.8598, + "step": 1912 + }, + { + "epoch": 0.21974613749928207, + "grad_norm": 0.3736543357372284, + "learning_rate": 0.0001, + "loss": 1.6786, + "step": 1913 + }, + { + "epoch": 0.2198610074091092, + "grad_norm": 0.3756701648235321, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 1914 + }, + { + "epoch": 0.2199758773189363, + "grad_norm": 0.334176629781723, + "learning_rate": 0.0001, + "loss": 1.3314, + "step": 1915 + }, + { + "epoch": 0.22009074722876343, + "grad_norm": 0.35989829897880554, + "learning_rate": 0.0001, + "loss": 1.6578, + "step": 1916 + }, + { + "epoch": 0.22020561713859055, + "grad_norm": 0.3670518100261688, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 1917 + }, + { + "epoch": 0.22032048704841767, + "grad_norm": 0.3405352234840393, + "learning_rate": 0.0001, + "loss": 1.4737, + "step": 1918 + }, + { + "epoch": 0.2204353569582448, + "grad_norm": 0.3269241154193878, + "learning_rate": 0.0001, + "loss": 1.5191, + "step": 1919 + }, + { + "epoch": 0.2205502268680719, + "grad_norm": 0.34716111421585083, + "learning_rate": 0.0001, + "loss": 1.6567, + "step": 1920 + }, + { + "epoch": 0.22066509677789903, + "grad_norm": 0.3655507564544678, + "learning_rate": 0.0001, + "loss": 1.7015, + "step": 1921 + }, + { + "epoch": 0.22077996668772615, + "grad_norm": 0.37460073828697205, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 1922 + }, + { + "epoch": 0.22089483659755327, + "grad_norm": 0.3556302785873413, + "learning_rate": 0.0001, + "loss": 1.672, + "step": 1923 + }, + { + "epoch": 0.2210097065073804, + "grad_norm": 0.30222126841545105, + "learning_rate": 0.0001, + "loss": 1.3695, + "step": 1924 + }, + { + "epoch": 0.2211245764172075, + "grad_norm": 0.3618417978286743, + "learning_rate": 0.0001, + "loss": 1.6877, + "step": 1925 + }, + { + "epoch": 0.22123944632703463, + "grad_norm": 0.3906730115413666, + "learning_rate": 0.0001, + "loss": 1.7139, + "step": 1926 + }, + { + "epoch": 0.22135431623686175, + "grad_norm": 0.3969448506832123, + "learning_rate": 0.0001, + "loss": 1.5555, + "step": 1927 + }, + { + "epoch": 0.22146918614668887, + "grad_norm": 0.4062451124191284, + "learning_rate": 0.0001, + "loss": 1.8545, + "step": 1928 + }, + { + "epoch": 0.221584056056516, + "grad_norm": 0.3763776421546936, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 1929 + }, + { + "epoch": 0.2216989259663431, + "grad_norm": 0.3991961181163788, + "learning_rate": 0.0001, + "loss": 1.776, + "step": 1930 + }, + { + "epoch": 0.22181379587617023, + "grad_norm": 0.37733355164527893, + "learning_rate": 0.0001, + "loss": 1.6528, + "step": 1931 + }, + { + "epoch": 0.22192866578599735, + "grad_norm": 0.383766233921051, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 1932 + }, + { + "epoch": 0.22204353569582447, + "grad_norm": 0.35736414790153503, + "learning_rate": 0.0001, + "loss": 1.7631, + "step": 1933 + }, + { + "epoch": 0.2221584056056516, + "grad_norm": 0.3915473222732544, + "learning_rate": 0.0001, + "loss": 1.8461, + "step": 1934 + }, + { + "epoch": 0.2222732755154787, + "grad_norm": 0.4120464622974396, + "learning_rate": 0.0001, + "loss": 1.9815, + "step": 1935 + }, + { + "epoch": 0.22238814542530583, + "grad_norm": 0.36867755651474, + "learning_rate": 0.0001, + "loss": 1.6904, + "step": 1936 + }, + { + "epoch": 0.22250301533513298, + "grad_norm": 0.3585013151168823, + "learning_rate": 0.0001, + "loss": 1.5512, + "step": 1937 + }, + { + "epoch": 0.2226178852449601, + "grad_norm": 0.36579304933547974, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 1938 + }, + { + "epoch": 0.22273275515478722, + "grad_norm": 0.43999186158180237, + "learning_rate": 0.0001, + "loss": 1.9172, + "step": 1939 + }, + { + "epoch": 0.22284762506461434, + "grad_norm": 0.340197890996933, + "learning_rate": 0.0001, + "loss": 1.6752, + "step": 1940 + }, + { + "epoch": 0.22296249497444146, + "grad_norm": 0.3636626601219177, + "learning_rate": 0.0001, + "loss": 1.849, + "step": 1941 + }, + { + "epoch": 0.22307736488426858, + "grad_norm": 0.33925965428352356, + "learning_rate": 0.0001, + "loss": 1.6694, + "step": 1942 + }, + { + "epoch": 0.2231922347940957, + "grad_norm": 0.3498164713382721, + "learning_rate": 0.0001, + "loss": 1.5879, + "step": 1943 + }, + { + "epoch": 0.22330710470392282, + "grad_norm": 0.35146066546440125, + "learning_rate": 0.0001, + "loss": 1.6627, + "step": 1944 + }, + { + "epoch": 0.22342197461374994, + "grad_norm": 0.40588539838790894, + "learning_rate": 0.0001, + "loss": 2.0319, + "step": 1945 + }, + { + "epoch": 0.22353684452357706, + "grad_norm": 0.35790249705314636, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 1946 + }, + { + "epoch": 0.22365171443340418, + "grad_norm": 0.3292383849620819, + "learning_rate": 0.0001, + "loss": 1.372, + "step": 1947 + }, + { + "epoch": 0.2237665843432313, + "grad_norm": 0.3615871071815491, + "learning_rate": 0.0001, + "loss": 1.7117, + "step": 1948 + }, + { + "epoch": 0.22388145425305842, + "grad_norm": 0.3719077408313751, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 1949 + }, + { + "epoch": 0.22399632416288554, + "grad_norm": 0.4070468544960022, + "learning_rate": 0.0001, + "loss": 1.9156, + "step": 1950 + }, + { + "epoch": 0.22411119407271266, + "grad_norm": 0.39306938648223877, + "learning_rate": 0.0001, + "loss": 1.8247, + "step": 1951 + }, + { + "epoch": 0.22422606398253977, + "grad_norm": 0.380942165851593, + "learning_rate": 0.0001, + "loss": 1.8662, + "step": 1952 + }, + { + "epoch": 0.2243409338923669, + "grad_norm": 0.3670596480369568, + "learning_rate": 0.0001, + "loss": 1.7237, + "step": 1953 + }, + { + "epoch": 0.22445580380219401, + "grad_norm": 0.3629164695739746, + "learning_rate": 0.0001, + "loss": 1.6835, + "step": 1954 + }, + { + "epoch": 0.22457067371202113, + "grad_norm": 0.3525116443634033, + "learning_rate": 0.0001, + "loss": 1.5414, + "step": 1955 + }, + { + "epoch": 0.22468554362184825, + "grad_norm": 0.362248957157135, + "learning_rate": 0.0001, + "loss": 1.7471, + "step": 1956 + }, + { + "epoch": 0.22480041353167537, + "grad_norm": 0.3838635981082916, + "learning_rate": 0.0001, + "loss": 1.845, + "step": 1957 + }, + { + "epoch": 0.2249152834415025, + "grad_norm": 0.3582911193370819, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 1958 + }, + { + "epoch": 0.22503015335132961, + "grad_norm": 0.3515222668647766, + "learning_rate": 0.0001, + "loss": 1.7908, + "step": 1959 + }, + { + "epoch": 0.22514502326115673, + "grad_norm": 0.38744720816612244, + "learning_rate": 0.0001, + "loss": 1.6675, + "step": 1960 + }, + { + "epoch": 0.22525989317098385, + "grad_norm": 0.356286883354187, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 1961 + }, + { + "epoch": 0.22537476308081097, + "grad_norm": 0.3699129819869995, + "learning_rate": 0.0001, + "loss": 1.7541, + "step": 1962 + }, + { + "epoch": 0.2254896329906381, + "grad_norm": 0.3715314269065857, + "learning_rate": 0.0001, + "loss": 1.7375, + "step": 1963 + }, + { + "epoch": 0.22560450290046521, + "grad_norm": 0.3639882206916809, + "learning_rate": 0.0001, + "loss": 1.6326, + "step": 1964 + }, + { + "epoch": 0.22571937281029233, + "grad_norm": 0.34050387144088745, + "learning_rate": 0.0001, + "loss": 1.5521, + "step": 1965 + }, + { + "epoch": 0.22583424272011945, + "grad_norm": 0.36640483140945435, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 1966 + }, + { + "epoch": 0.22594911262994657, + "grad_norm": 0.39753469824790955, + "learning_rate": 0.0001, + "loss": 1.8873, + "step": 1967 + }, + { + "epoch": 0.2260639825397737, + "grad_norm": 0.3590884208679199, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 1968 + }, + { + "epoch": 0.22617885244960081, + "grad_norm": 0.36375731229782104, + "learning_rate": 0.0001, + "loss": 1.6994, + "step": 1969 + }, + { + "epoch": 0.22629372235942793, + "grad_norm": 0.3400304317474365, + "learning_rate": 0.0001, + "loss": 1.5619, + "step": 1970 + }, + { + "epoch": 0.22640859226925508, + "grad_norm": 0.36975133419036865, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 1971 + }, + { + "epoch": 0.2265234621790822, + "grad_norm": 0.3886123299598694, + "learning_rate": 0.0001, + "loss": 1.6675, + "step": 1972 + }, + { + "epoch": 0.22663833208890932, + "grad_norm": 0.37827616930007935, + "learning_rate": 0.0001, + "loss": 1.8186, + "step": 1973 + }, + { + "epoch": 0.22675320199873644, + "grad_norm": 0.3893340229988098, + "learning_rate": 0.0001, + "loss": 1.7555, + "step": 1974 + }, + { + "epoch": 0.22686807190856356, + "grad_norm": 0.3668055534362793, + "learning_rate": 0.0001, + "loss": 1.585, + "step": 1975 + }, + { + "epoch": 0.22698294181839068, + "grad_norm": 0.37988904118537903, + "learning_rate": 0.0001, + "loss": 1.7357, + "step": 1976 + }, + { + "epoch": 0.2270978117282178, + "grad_norm": 0.3713071942329407, + "learning_rate": 0.0001, + "loss": 1.73, + "step": 1977 + }, + { + "epoch": 0.22721268163804492, + "grad_norm": 0.3730880916118622, + "learning_rate": 0.0001, + "loss": 1.8461, + "step": 1978 + }, + { + "epoch": 0.22732755154787204, + "grad_norm": 0.38077569007873535, + "learning_rate": 0.0001, + "loss": 1.8324, + "step": 1979 + }, + { + "epoch": 0.22744242145769916, + "grad_norm": 0.34529733657836914, + "learning_rate": 0.0001, + "loss": 1.5826, + "step": 1980 + }, + { + "epoch": 0.22755729136752628, + "grad_norm": 0.3712940812110901, + "learning_rate": 0.0001, + "loss": 1.8049, + "step": 1981 + }, + { + "epoch": 0.2276721612773534, + "grad_norm": 0.39372700452804565, + "learning_rate": 0.0001, + "loss": 1.7774, + "step": 1982 + }, + { + "epoch": 0.22778703118718052, + "grad_norm": 0.3780519664287567, + "learning_rate": 0.0001, + "loss": 1.8419, + "step": 1983 + }, + { + "epoch": 0.22790190109700764, + "grad_norm": 0.36118772625923157, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 1984 + }, + { + "epoch": 0.22801677100683476, + "grad_norm": 0.36958158016204834, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 1985 + }, + { + "epoch": 0.22813164091666188, + "grad_norm": 0.365408331155777, + "learning_rate": 0.0001, + "loss": 1.5612, + "step": 1986 + }, + { + "epoch": 0.228246510826489, + "grad_norm": 0.3450736999511719, + "learning_rate": 0.0001, + "loss": 1.6522, + "step": 1987 + }, + { + "epoch": 0.22836138073631612, + "grad_norm": 0.37709590792655945, + "learning_rate": 0.0001, + "loss": 1.7375, + "step": 1988 + }, + { + "epoch": 0.22847625064614324, + "grad_norm": 0.3818245232105255, + "learning_rate": 0.0001, + "loss": 1.7378, + "step": 1989 + }, + { + "epoch": 0.22859112055597036, + "grad_norm": 0.375186949968338, + "learning_rate": 0.0001, + "loss": 1.7919, + "step": 1990 + }, + { + "epoch": 0.22870599046579748, + "grad_norm": 0.3831838369369507, + "learning_rate": 0.0001, + "loss": 1.9628, + "step": 1991 + }, + { + "epoch": 0.2288208603756246, + "grad_norm": 0.3555734157562256, + "learning_rate": 0.0001, + "loss": 1.7085, + "step": 1992 + }, + { + "epoch": 0.22893573028545172, + "grad_norm": 0.36431822180747986, + "learning_rate": 0.0001, + "loss": 1.6481, + "step": 1993 + }, + { + "epoch": 0.22905060019527884, + "grad_norm": 0.36310869455337524, + "learning_rate": 0.0001, + "loss": 1.8057, + "step": 1994 + }, + { + "epoch": 0.22916547010510596, + "grad_norm": 0.3640761375427246, + "learning_rate": 0.0001, + "loss": 1.7278, + "step": 1995 + }, + { + "epoch": 0.22928034001493308, + "grad_norm": 0.3574616014957428, + "learning_rate": 0.0001, + "loss": 1.6583, + "step": 1996 + }, + { + "epoch": 0.2293952099247602, + "grad_norm": 0.36930689215660095, + "learning_rate": 0.0001, + "loss": 1.7673, + "step": 1997 + }, + { + "epoch": 0.22951007983458732, + "grad_norm": 0.3680112659931183, + "learning_rate": 0.0001, + "loss": 1.6705, + "step": 1998 + }, + { + "epoch": 0.22962494974441444, + "grad_norm": 0.3612775504589081, + "learning_rate": 0.0001, + "loss": 1.827, + "step": 1999 + }, + { + "epoch": 0.22973981965424156, + "grad_norm": 0.3716758191585541, + "learning_rate": 0.0001, + "loss": 1.7205, + "step": 2000 + }, + { + "epoch": 0.22985468956406868, + "grad_norm": 0.3520873785018921, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 2001 + }, + { + "epoch": 0.2299695594738958, + "grad_norm": 0.3716805875301361, + "learning_rate": 0.0001, + "loss": 1.9395, + "step": 2002 + }, + { + "epoch": 0.23008442938372292, + "grad_norm": 0.35050535202026367, + "learning_rate": 0.0001, + "loss": 1.7538, + "step": 2003 + }, + { + "epoch": 0.23019929929355004, + "grad_norm": 0.35754668712615967, + "learning_rate": 0.0001, + "loss": 1.6116, + "step": 2004 + }, + { + "epoch": 0.23031416920337716, + "grad_norm": 0.35769370198249817, + "learning_rate": 0.0001, + "loss": 1.6091, + "step": 2005 + }, + { + "epoch": 0.2304290391132043, + "grad_norm": 0.3884750008583069, + "learning_rate": 0.0001, + "loss": 1.73, + "step": 2006 + }, + { + "epoch": 0.23054390902303143, + "grad_norm": 0.35404741764068604, + "learning_rate": 0.0001, + "loss": 1.5435, + "step": 2007 + }, + { + "epoch": 0.23065877893285855, + "grad_norm": 0.37792080640792847, + "learning_rate": 0.0001, + "loss": 1.694, + "step": 2008 + }, + { + "epoch": 0.23077364884268567, + "grad_norm": 0.3727210462093353, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 2009 + }, + { + "epoch": 0.2308885187525128, + "grad_norm": 0.3520371615886688, + "learning_rate": 0.0001, + "loss": 1.6815, + "step": 2010 + }, + { + "epoch": 0.2310033886623399, + "grad_norm": 0.3894649147987366, + "learning_rate": 0.0001, + "loss": 1.6817, + "step": 2011 + }, + { + "epoch": 0.23111825857216703, + "grad_norm": 0.3612670302391052, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 2012 + }, + { + "epoch": 0.23123312848199415, + "grad_norm": 0.35834938287734985, + "learning_rate": 0.0001, + "loss": 1.6607, + "step": 2013 + }, + { + "epoch": 0.23134799839182127, + "grad_norm": 0.36019662022590637, + "learning_rate": 0.0001, + "loss": 1.8426, + "step": 2014 + }, + { + "epoch": 0.2314628683016484, + "grad_norm": 0.35688281059265137, + "learning_rate": 0.0001, + "loss": 1.6774, + "step": 2015 + }, + { + "epoch": 0.2315777382114755, + "grad_norm": 0.3944391906261444, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 2016 + }, + { + "epoch": 0.23169260812130263, + "grad_norm": 0.4213406443595886, + "learning_rate": 0.0001, + "loss": 1.5176, + "step": 2017 + }, + { + "epoch": 0.23180747803112975, + "grad_norm": 0.37755805253982544, + "learning_rate": 0.0001, + "loss": 1.7714, + "step": 2018 + }, + { + "epoch": 0.23192234794095687, + "grad_norm": 0.36677590012550354, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 2019 + }, + { + "epoch": 0.232037217850784, + "grad_norm": 0.40023931860923767, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 2020 + }, + { + "epoch": 0.2321520877606111, + "grad_norm": 0.36305657029151917, + "learning_rate": 0.0001, + "loss": 1.726, + "step": 2021 + }, + { + "epoch": 0.23226695767043823, + "grad_norm": 0.3816761076450348, + "learning_rate": 0.0001, + "loss": 1.8716, + "step": 2022 + }, + { + "epoch": 0.23238182758026535, + "grad_norm": 0.3882976770401001, + "learning_rate": 0.0001, + "loss": 1.8066, + "step": 2023 + }, + { + "epoch": 0.23249669749009247, + "grad_norm": 0.3663192093372345, + "learning_rate": 0.0001, + "loss": 1.5011, + "step": 2024 + }, + { + "epoch": 0.2326115673999196, + "grad_norm": 0.3655019998550415, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 2025 + }, + { + "epoch": 0.2327264373097467, + "grad_norm": 0.3717341423034668, + "learning_rate": 0.0001, + "loss": 1.766, + "step": 2026 + }, + { + "epoch": 0.23284130721957383, + "grad_norm": 0.38333335518836975, + "learning_rate": 0.0001, + "loss": 1.8287, + "step": 2027 + }, + { + "epoch": 0.23295617712940095, + "grad_norm": 0.3660372197628021, + "learning_rate": 0.0001, + "loss": 1.6404, + "step": 2028 + }, + { + "epoch": 0.23307104703922807, + "grad_norm": 0.4071906507015228, + "learning_rate": 0.0001, + "loss": 1.6666, + "step": 2029 + }, + { + "epoch": 0.2331859169490552, + "grad_norm": 0.33285489678382874, + "learning_rate": 0.0001, + "loss": 1.5495, + "step": 2030 + }, + { + "epoch": 0.2333007868588823, + "grad_norm": 0.36184608936309814, + "learning_rate": 0.0001, + "loss": 1.7848, + "step": 2031 + }, + { + "epoch": 0.23341565676870943, + "grad_norm": 0.3524230718612671, + "learning_rate": 0.0001, + "loss": 1.606, + "step": 2032 + }, + { + "epoch": 0.23353052667853655, + "grad_norm": 0.37483248114585876, + "learning_rate": 0.0001, + "loss": 1.6525, + "step": 2033 + }, + { + "epoch": 0.23364539658836367, + "grad_norm": 0.3705480992794037, + "learning_rate": 0.0001, + "loss": 1.5733, + "step": 2034 + }, + { + "epoch": 0.2337602664981908, + "grad_norm": 0.39686012268066406, + "learning_rate": 0.0001, + "loss": 1.7466, + "step": 2035 + }, + { + "epoch": 0.2338751364080179, + "grad_norm": 0.3723219633102417, + "learning_rate": 0.0001, + "loss": 1.6358, + "step": 2036 + }, + { + "epoch": 0.23399000631784503, + "grad_norm": 0.3748587965965271, + "learning_rate": 0.0001, + "loss": 1.7883, + "step": 2037 + }, + { + "epoch": 0.23410487622767215, + "grad_norm": 0.3536939024925232, + "learning_rate": 0.0001, + "loss": 1.8048, + "step": 2038 + }, + { + "epoch": 0.23421974613749927, + "grad_norm": 0.36391976475715637, + "learning_rate": 0.0001, + "loss": 1.7326, + "step": 2039 + }, + { + "epoch": 0.23433461604732642, + "grad_norm": 0.3579852879047394, + "learning_rate": 0.0001, + "loss": 1.7311, + "step": 2040 + }, + { + "epoch": 0.23444948595715354, + "grad_norm": 0.3592579960823059, + "learning_rate": 0.0001, + "loss": 1.7675, + "step": 2041 + }, + { + "epoch": 0.23456435586698066, + "grad_norm": 0.34196656942367554, + "learning_rate": 0.0001, + "loss": 1.4591, + "step": 2042 + }, + { + "epoch": 0.23467922577680778, + "grad_norm": 0.40393322706222534, + "learning_rate": 0.0001, + "loss": 1.8106, + "step": 2043 + }, + { + "epoch": 0.2347940956866349, + "grad_norm": 0.39818137884140015, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 2044 + }, + { + "epoch": 0.23490896559646202, + "grad_norm": 0.3436848521232605, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 2045 + }, + { + "epoch": 0.23502383550628914, + "grad_norm": 0.35754308104515076, + "learning_rate": 0.0001, + "loss": 1.7914, + "step": 2046 + }, + { + "epoch": 0.23513870541611626, + "grad_norm": 0.37163278460502625, + "learning_rate": 0.0001, + "loss": 1.7085, + "step": 2047 + }, + { + "epoch": 0.23525357532594338, + "grad_norm": 0.3725832402706146, + "learning_rate": 0.0001, + "loss": 1.7469, + "step": 2048 + }, + { + "epoch": 0.2353684452357705, + "grad_norm": 0.3440713584423065, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 2049 + }, + { + "epoch": 0.23548331514559762, + "grad_norm": 0.3776625096797943, + "learning_rate": 0.0001, + "loss": 1.6978, + "step": 2050 + }, + { + "epoch": 0.23559818505542474, + "grad_norm": 0.37252920866012573, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 2051 + }, + { + "epoch": 0.23571305496525186, + "grad_norm": 0.36468106508255005, + "learning_rate": 0.0001, + "loss": 1.7879, + "step": 2052 + }, + { + "epoch": 0.23582792487507898, + "grad_norm": 0.3846648037433624, + "learning_rate": 0.0001, + "loss": 1.7629, + "step": 2053 + }, + { + "epoch": 0.2359427947849061, + "grad_norm": 0.3676551580429077, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 2054 + }, + { + "epoch": 0.23605766469473322, + "grad_norm": 0.38455480337142944, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 2055 + }, + { + "epoch": 0.23617253460456034, + "grad_norm": 0.39479124546051025, + "learning_rate": 0.0001, + "loss": 1.8082, + "step": 2056 + }, + { + "epoch": 0.23628740451438746, + "grad_norm": 0.37550801038742065, + "learning_rate": 0.0001, + "loss": 1.7738, + "step": 2057 + }, + { + "epoch": 0.23640227442421458, + "grad_norm": 0.3539324104785919, + "learning_rate": 0.0001, + "loss": 1.4211, + "step": 2058 + }, + { + "epoch": 0.2365171443340417, + "grad_norm": 0.38804805278778076, + "learning_rate": 0.0001, + "loss": 1.4982, + "step": 2059 + }, + { + "epoch": 0.23663201424386882, + "grad_norm": 0.34223702549934387, + "learning_rate": 0.0001, + "loss": 1.6774, + "step": 2060 + }, + { + "epoch": 0.23674688415369594, + "grad_norm": 0.3723870515823364, + "learning_rate": 0.0001, + "loss": 1.7951, + "step": 2061 + }, + { + "epoch": 0.23686175406352306, + "grad_norm": 0.3813320994377136, + "learning_rate": 0.0001, + "loss": 1.7424, + "step": 2062 + }, + { + "epoch": 0.23697662397335018, + "grad_norm": 0.38882893323898315, + "learning_rate": 0.0001, + "loss": 1.7914, + "step": 2063 + }, + { + "epoch": 0.2370914938831773, + "grad_norm": 0.3598068058490753, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 2064 + }, + { + "epoch": 0.23720636379300442, + "grad_norm": 0.35870295763015747, + "learning_rate": 0.0001, + "loss": 1.7168, + "step": 2065 + }, + { + "epoch": 0.23732123370283154, + "grad_norm": 0.38279473781585693, + "learning_rate": 0.0001, + "loss": 1.7661, + "step": 2066 + }, + { + "epoch": 0.23743610361265866, + "grad_norm": 0.3647814393043518, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 2067 + }, + { + "epoch": 0.23755097352248578, + "grad_norm": 0.38286200165748596, + "learning_rate": 0.0001, + "loss": 1.7234, + "step": 2068 + }, + { + "epoch": 0.2376658434323129, + "grad_norm": 0.3622128367424011, + "learning_rate": 0.0001, + "loss": 1.8133, + "step": 2069 + }, + { + "epoch": 0.23778071334214002, + "grad_norm": 0.38336479663848877, + "learning_rate": 0.0001, + "loss": 1.8398, + "step": 2070 + }, + { + "epoch": 0.23789558325196714, + "grad_norm": 0.34429532289505005, + "learning_rate": 0.0001, + "loss": 1.5766, + "step": 2071 + }, + { + "epoch": 0.23801045316179426, + "grad_norm": 0.3539319932460785, + "learning_rate": 0.0001, + "loss": 1.8332, + "step": 2072 + }, + { + "epoch": 0.23812532307162138, + "grad_norm": 0.41278359293937683, + "learning_rate": 0.0001, + "loss": 1.7155, + "step": 2073 + }, + { + "epoch": 0.23824019298144852, + "grad_norm": 0.3446906507015228, + "learning_rate": 0.0001, + "loss": 1.6454, + "step": 2074 + }, + { + "epoch": 0.23835506289127564, + "grad_norm": 0.3742326498031616, + "learning_rate": 0.0001, + "loss": 1.7497, + "step": 2075 + }, + { + "epoch": 0.23846993280110276, + "grad_norm": 0.3751114308834076, + "learning_rate": 0.0001, + "loss": 1.766, + "step": 2076 + }, + { + "epoch": 0.23858480271092988, + "grad_norm": 0.358395516872406, + "learning_rate": 0.0001, + "loss": 1.8033, + "step": 2077 + }, + { + "epoch": 0.238699672620757, + "grad_norm": 0.3309650123119354, + "learning_rate": 0.0001, + "loss": 1.4627, + "step": 2078 + }, + { + "epoch": 0.23881454253058412, + "grad_norm": 0.3873356878757477, + "learning_rate": 0.0001, + "loss": 1.7555, + "step": 2079 + }, + { + "epoch": 0.23892941244041124, + "grad_norm": 0.3576614558696747, + "learning_rate": 0.0001, + "loss": 1.7634, + "step": 2080 + }, + { + "epoch": 0.23904428235023836, + "grad_norm": 0.4060707688331604, + "learning_rate": 0.0001, + "loss": 1.8486, + "step": 2081 + }, + { + "epoch": 0.23915915226006548, + "grad_norm": 0.3609447181224823, + "learning_rate": 0.0001, + "loss": 1.7257, + "step": 2082 + }, + { + "epoch": 0.2392740221698926, + "grad_norm": 0.39137205481529236, + "learning_rate": 0.0001, + "loss": 1.788, + "step": 2083 + }, + { + "epoch": 0.23938889207971972, + "grad_norm": 0.4108488857746124, + "learning_rate": 0.0001, + "loss": 1.5738, + "step": 2084 + }, + { + "epoch": 0.23950376198954684, + "grad_norm": 0.3932496905326843, + "learning_rate": 0.0001, + "loss": 1.9864, + "step": 2085 + }, + { + "epoch": 0.23961863189937396, + "grad_norm": 0.3582679331302643, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 2086 + }, + { + "epoch": 0.23973350180920108, + "grad_norm": 0.382107138633728, + "learning_rate": 0.0001, + "loss": 1.8809, + "step": 2087 + }, + { + "epoch": 0.2398483717190282, + "grad_norm": 0.37730252742767334, + "learning_rate": 0.0001, + "loss": 1.5719, + "step": 2088 + }, + { + "epoch": 0.23996324162885532, + "grad_norm": 0.39350810647010803, + "learning_rate": 0.0001, + "loss": 1.7602, + "step": 2089 + }, + { + "epoch": 0.24007811153868244, + "grad_norm": 0.3490624725818634, + "learning_rate": 0.0001, + "loss": 1.7111, + "step": 2090 + }, + { + "epoch": 0.24019298144850956, + "grad_norm": 0.39576220512390137, + "learning_rate": 0.0001, + "loss": 1.7265, + "step": 2091 + }, + { + "epoch": 0.24030785135833668, + "grad_norm": 0.34063565731048584, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 2092 + }, + { + "epoch": 0.2404227212681638, + "grad_norm": 0.36094650626182556, + "learning_rate": 0.0001, + "loss": 1.7337, + "step": 2093 + }, + { + "epoch": 0.24053759117799092, + "grad_norm": 0.39108943939208984, + "learning_rate": 0.0001, + "loss": 1.8195, + "step": 2094 + }, + { + "epoch": 0.24065246108781804, + "grad_norm": 0.3705921471118927, + "learning_rate": 0.0001, + "loss": 1.5288, + "step": 2095 + }, + { + "epoch": 0.24076733099764516, + "grad_norm": 0.38726896047592163, + "learning_rate": 0.0001, + "loss": 1.8124, + "step": 2096 + }, + { + "epoch": 0.24088220090747228, + "grad_norm": 0.3693424165248871, + "learning_rate": 0.0001, + "loss": 1.4665, + "step": 2097 + }, + { + "epoch": 0.2409970708172994, + "grad_norm": 0.37922948598861694, + "learning_rate": 0.0001, + "loss": 1.6475, + "step": 2098 + }, + { + "epoch": 0.24111194072712652, + "grad_norm": 0.3428456485271454, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 2099 + }, + { + "epoch": 0.24122681063695364, + "grad_norm": 0.38854971528053284, + "learning_rate": 0.0001, + "loss": 1.8648, + "step": 2100 + }, + { + "epoch": 0.24134168054678076, + "grad_norm": 0.3516068756580353, + "learning_rate": 0.0001, + "loss": 1.6517, + "step": 2101 + }, + { + "epoch": 0.24145655045660788, + "grad_norm": 0.3651220500469208, + "learning_rate": 0.0001, + "loss": 1.7118, + "step": 2102 + }, + { + "epoch": 0.241571420366435, + "grad_norm": 0.355566143989563, + "learning_rate": 0.0001, + "loss": 1.473, + "step": 2103 + }, + { + "epoch": 0.24168629027626212, + "grad_norm": 0.3593463599681854, + "learning_rate": 0.0001, + "loss": 1.6904, + "step": 2104 + }, + { + "epoch": 0.24180116018608924, + "grad_norm": 0.35307595133781433, + "learning_rate": 0.0001, + "loss": 1.6098, + "step": 2105 + }, + { + "epoch": 0.24191603009591636, + "grad_norm": 0.37137705087661743, + "learning_rate": 0.0001, + "loss": 1.6786, + "step": 2106 + }, + { + "epoch": 0.24203090000574348, + "grad_norm": 0.3728501498699188, + "learning_rate": 0.0001, + "loss": 1.8597, + "step": 2107 + }, + { + "epoch": 0.24214576991557063, + "grad_norm": 0.3600460886955261, + "learning_rate": 0.0001, + "loss": 1.769, + "step": 2108 + }, + { + "epoch": 0.24226063982539775, + "grad_norm": 0.3472753167152405, + "learning_rate": 0.0001, + "loss": 1.5751, + "step": 2109 + }, + { + "epoch": 0.24237550973522487, + "grad_norm": 0.36193597316741943, + "learning_rate": 0.0001, + "loss": 1.6062, + "step": 2110 + }, + { + "epoch": 0.242490379645052, + "grad_norm": 0.3618480861186981, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 2111 + }, + { + "epoch": 0.2426052495548791, + "grad_norm": 0.358378529548645, + "learning_rate": 0.0001, + "loss": 1.381, + "step": 2112 + }, + { + "epoch": 0.24272011946470623, + "grad_norm": 0.386943519115448, + "learning_rate": 0.0001, + "loss": 1.896, + "step": 2113 + }, + { + "epoch": 0.24283498937453335, + "grad_norm": 0.38075029850006104, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 2114 + }, + { + "epoch": 0.24294985928436047, + "grad_norm": 0.3709038197994232, + "learning_rate": 0.0001, + "loss": 1.7108, + "step": 2115 + }, + { + "epoch": 0.2430647291941876, + "grad_norm": 0.3460537791252136, + "learning_rate": 0.0001, + "loss": 1.5338, + "step": 2116 + }, + { + "epoch": 0.2431795991040147, + "grad_norm": 0.3539460599422455, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 2117 + }, + { + "epoch": 0.24329446901384183, + "grad_norm": 0.3654404580593109, + "learning_rate": 0.0001, + "loss": 1.7205, + "step": 2118 + }, + { + "epoch": 0.24340933892366895, + "grad_norm": 0.37309154868125916, + "learning_rate": 0.0001, + "loss": 1.7026, + "step": 2119 + }, + { + "epoch": 0.24352420883349607, + "grad_norm": 0.3884303569793701, + "learning_rate": 0.0001, + "loss": 1.8393, + "step": 2120 + }, + { + "epoch": 0.2436390787433232, + "grad_norm": 0.41834697127342224, + "learning_rate": 0.0001, + "loss": 1.6144, + "step": 2121 + }, + { + "epoch": 0.2437539486531503, + "grad_norm": 0.48797813057899475, + "learning_rate": 0.0001, + "loss": 1.7264, + "step": 2122 + }, + { + "epoch": 0.24386881856297743, + "grad_norm": 0.3766729533672333, + "learning_rate": 0.0001, + "loss": 1.7361, + "step": 2123 + }, + { + "epoch": 0.24398368847280455, + "grad_norm": 0.3885788023471832, + "learning_rate": 0.0001, + "loss": 1.9709, + "step": 2124 + }, + { + "epoch": 0.24409855838263167, + "grad_norm": 0.36236581206321716, + "learning_rate": 0.0001, + "loss": 1.5214, + "step": 2125 + }, + { + "epoch": 0.2442134282924588, + "grad_norm": 0.368348628282547, + "learning_rate": 0.0001, + "loss": 1.6698, + "step": 2126 + }, + { + "epoch": 0.2443282982022859, + "grad_norm": 0.3759993612766266, + "learning_rate": 0.0001, + "loss": 1.7122, + "step": 2127 + }, + { + "epoch": 0.24444316811211303, + "grad_norm": 0.3532876670360565, + "learning_rate": 0.0001, + "loss": 1.5642, + "step": 2128 + }, + { + "epoch": 0.24455803802194015, + "grad_norm": 0.3924901783466339, + "learning_rate": 0.0001, + "loss": 1.6596, + "step": 2129 + }, + { + "epoch": 0.24467290793176727, + "grad_norm": 0.36675912141799927, + "learning_rate": 0.0001, + "loss": 1.5667, + "step": 2130 + }, + { + "epoch": 0.2447877778415944, + "grad_norm": 0.3623403310775757, + "learning_rate": 0.0001, + "loss": 1.7219, + "step": 2131 + }, + { + "epoch": 0.2449026477514215, + "grad_norm": 0.4029271900653839, + "learning_rate": 0.0001, + "loss": 1.8573, + "step": 2132 + }, + { + "epoch": 0.24501751766124863, + "grad_norm": 0.38149070739746094, + "learning_rate": 0.0001, + "loss": 1.6958, + "step": 2133 + }, + { + "epoch": 0.24513238757107575, + "grad_norm": 0.36011314392089844, + "learning_rate": 0.0001, + "loss": 1.6092, + "step": 2134 + }, + { + "epoch": 0.24524725748090287, + "grad_norm": 0.35948145389556885, + "learning_rate": 0.0001, + "loss": 1.4656, + "step": 2135 + }, + { + "epoch": 0.24536212739073, + "grad_norm": 0.39184871315956116, + "learning_rate": 0.0001, + "loss": 1.6596, + "step": 2136 + }, + { + "epoch": 0.2454769973005571, + "grad_norm": 0.3509827256202698, + "learning_rate": 0.0001, + "loss": 1.6038, + "step": 2137 + }, + { + "epoch": 0.24559186721038423, + "grad_norm": 0.37394797801971436, + "learning_rate": 0.0001, + "loss": 1.7774, + "step": 2138 + }, + { + "epoch": 0.24570673712021135, + "grad_norm": 0.37039676308631897, + "learning_rate": 0.0001, + "loss": 1.6785, + "step": 2139 + }, + { + "epoch": 0.24582160703003847, + "grad_norm": 0.34239184856414795, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 2140 + }, + { + "epoch": 0.2459364769398656, + "grad_norm": 0.4217103123664856, + "learning_rate": 0.0001, + "loss": 2.0345, + "step": 2141 + }, + { + "epoch": 0.2460513468496927, + "grad_norm": 0.352780282497406, + "learning_rate": 0.0001, + "loss": 1.6952, + "step": 2142 + }, + { + "epoch": 0.24616621675951986, + "grad_norm": 0.34040459990501404, + "learning_rate": 0.0001, + "loss": 1.3958, + "step": 2143 + }, + { + "epoch": 0.24628108666934698, + "grad_norm": 0.3590191900730133, + "learning_rate": 0.0001, + "loss": 1.5738, + "step": 2144 + }, + { + "epoch": 0.2463959565791741, + "grad_norm": 0.3775222599506378, + "learning_rate": 0.0001, + "loss": 1.7631, + "step": 2145 + }, + { + "epoch": 0.24651082648900122, + "grad_norm": 0.3881835639476776, + "learning_rate": 0.0001, + "loss": 1.8367, + "step": 2146 + }, + { + "epoch": 0.24662569639882834, + "grad_norm": 0.35426759719848633, + "learning_rate": 0.0001, + "loss": 1.5748, + "step": 2147 + }, + { + "epoch": 0.24674056630865546, + "grad_norm": 0.43328168988227844, + "learning_rate": 0.0001, + "loss": 1.7385, + "step": 2148 + }, + { + "epoch": 0.24685543621848258, + "grad_norm": 0.38424152135849, + "learning_rate": 0.0001, + "loss": 1.8426, + "step": 2149 + }, + { + "epoch": 0.2469703061283097, + "grad_norm": 0.3645937144756317, + "learning_rate": 0.0001, + "loss": 1.7276, + "step": 2150 + }, + { + "epoch": 0.24708517603813682, + "grad_norm": 0.413036584854126, + "learning_rate": 0.0001, + "loss": 1.8982, + "step": 2151 + }, + { + "epoch": 0.24720004594796394, + "grad_norm": 0.35927438735961914, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 2152 + }, + { + "epoch": 0.24731491585779106, + "grad_norm": 0.3814374804496765, + "learning_rate": 0.0001, + "loss": 1.76, + "step": 2153 + }, + { + "epoch": 0.24742978576761818, + "grad_norm": 0.3744584321975708, + "learning_rate": 0.0001, + "loss": 1.7106, + "step": 2154 + }, + { + "epoch": 0.2475446556774453, + "grad_norm": 0.3815367817878723, + "learning_rate": 0.0001, + "loss": 1.8374, + "step": 2155 + }, + { + "epoch": 0.24765952558727242, + "grad_norm": 0.37428149580955505, + "learning_rate": 0.0001, + "loss": 1.7515, + "step": 2156 + }, + { + "epoch": 0.24777439549709954, + "grad_norm": 0.3383006751537323, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 2157 + }, + { + "epoch": 0.24788926540692666, + "grad_norm": 0.3787260055541992, + "learning_rate": 0.0001, + "loss": 1.7558, + "step": 2158 + }, + { + "epoch": 0.24800413531675378, + "grad_norm": 0.34863799810409546, + "learning_rate": 0.0001, + "loss": 1.7608, + "step": 2159 + }, + { + "epoch": 0.2481190052265809, + "grad_norm": 0.34521782398223877, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 2160 + }, + { + "epoch": 0.24823387513640802, + "grad_norm": 0.4506453275680542, + "learning_rate": 0.0001, + "loss": 1.5816, + "step": 2161 + }, + { + "epoch": 0.24834874504623514, + "grad_norm": 0.388887494802475, + "learning_rate": 0.0001, + "loss": 1.9017, + "step": 2162 + }, + { + "epoch": 0.24846361495606226, + "grad_norm": 0.3828551173210144, + "learning_rate": 0.0001, + "loss": 1.755, + "step": 2163 + }, + { + "epoch": 0.24857848486588938, + "grad_norm": 0.35569700598716736, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 2164 + }, + { + "epoch": 0.2486933547757165, + "grad_norm": 0.3651540279388428, + "learning_rate": 0.0001, + "loss": 1.7724, + "step": 2165 + }, + { + "epoch": 0.24880822468554362, + "grad_norm": 0.362402081489563, + "learning_rate": 0.0001, + "loss": 1.6725, + "step": 2166 + }, + { + "epoch": 0.24892309459537074, + "grad_norm": 0.36203300952911377, + "learning_rate": 0.0001, + "loss": 1.6146, + "step": 2167 + }, + { + "epoch": 0.24903796450519786, + "grad_norm": 0.37122076749801636, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 2168 + }, + { + "epoch": 0.24915283441502498, + "grad_norm": 0.357860803604126, + "learning_rate": 0.0001, + "loss": 1.6995, + "step": 2169 + }, + { + "epoch": 0.2492677043248521, + "grad_norm": 0.36631032824516296, + "learning_rate": 0.0001, + "loss": 1.7249, + "step": 2170 + }, + { + "epoch": 0.24938257423467922, + "grad_norm": 0.37046635150909424, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 2171 + }, + { + "epoch": 0.24949744414450634, + "grad_norm": 0.39707261323928833, + "learning_rate": 0.0001, + "loss": 1.9089, + "step": 2172 + }, + { + "epoch": 0.24961231405433346, + "grad_norm": 0.3699803650379181, + "learning_rate": 0.0001, + "loss": 1.6095, + "step": 2173 + }, + { + "epoch": 0.24972718396416058, + "grad_norm": 0.3902735710144043, + "learning_rate": 0.0001, + "loss": 1.7023, + "step": 2174 + }, + { + "epoch": 0.2498420538739877, + "grad_norm": 0.3696674406528473, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 2175 + }, + { + "epoch": 0.24995692378381482, + "grad_norm": 0.3827657699584961, + "learning_rate": 0.0001, + "loss": 1.7072, + "step": 2176 + }, + { + "epoch": 0.25007179369364196, + "grad_norm": 0.38356491923332214, + "learning_rate": 0.0001, + "loss": 1.7748, + "step": 2177 + }, + { + "epoch": 0.2501866636034691, + "grad_norm": 0.3400535583496094, + "learning_rate": 0.0001, + "loss": 1.5347, + "step": 2178 + }, + { + "epoch": 0.2503015335132962, + "grad_norm": 0.37260451912879944, + "learning_rate": 0.0001, + "loss": 1.7625, + "step": 2179 + }, + { + "epoch": 0.2504164034231233, + "grad_norm": 0.4127359688282013, + "learning_rate": 0.0001, + "loss": 1.4756, + "step": 2180 + }, + { + "epoch": 0.25053127333295044, + "grad_norm": 0.3528091609477997, + "learning_rate": 0.0001, + "loss": 1.6681, + "step": 2181 + }, + { + "epoch": 0.25064614324277756, + "grad_norm": 0.39472371339797974, + "learning_rate": 0.0001, + "loss": 1.8162, + "step": 2182 + }, + { + "epoch": 0.2507610131526047, + "grad_norm": 0.3909297585487366, + "learning_rate": 0.0001, + "loss": 1.7071, + "step": 2183 + }, + { + "epoch": 0.2508758830624318, + "grad_norm": 0.42550724744796753, + "learning_rate": 0.0001, + "loss": 1.5249, + "step": 2184 + }, + { + "epoch": 0.2509907529722589, + "grad_norm": 0.3795909881591797, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 2185 + }, + { + "epoch": 0.25110562288208604, + "grad_norm": 0.37554746866226196, + "learning_rate": 0.0001, + "loss": 1.8221, + "step": 2186 + }, + { + "epoch": 0.25122049279191316, + "grad_norm": 0.362114280462265, + "learning_rate": 0.0001, + "loss": 1.6699, + "step": 2187 + }, + { + "epoch": 0.2513353627017403, + "grad_norm": 0.36386096477508545, + "learning_rate": 0.0001, + "loss": 1.4788, + "step": 2188 + }, + { + "epoch": 0.2514502326115674, + "grad_norm": 0.3716920018196106, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 2189 + }, + { + "epoch": 0.2515651025213945, + "grad_norm": 0.3535931706428528, + "learning_rate": 0.0001, + "loss": 1.6721, + "step": 2190 + }, + { + "epoch": 0.25167997243122164, + "grad_norm": 0.3724220097064972, + "learning_rate": 0.0001, + "loss": 1.6522, + "step": 2191 + }, + { + "epoch": 0.25179484234104876, + "grad_norm": 0.40217527747154236, + "learning_rate": 0.0001, + "loss": 1.7181, + "step": 2192 + }, + { + "epoch": 0.2519097122508759, + "grad_norm": 0.383705198764801, + "learning_rate": 0.0001, + "loss": 1.6798, + "step": 2193 + }, + { + "epoch": 0.252024582160703, + "grad_norm": 0.35728463530540466, + "learning_rate": 0.0001, + "loss": 1.7787, + "step": 2194 + }, + { + "epoch": 0.2521394520705301, + "grad_norm": 0.34664642810821533, + "learning_rate": 0.0001, + "loss": 1.4345, + "step": 2195 + }, + { + "epoch": 0.25225432198035724, + "grad_norm": 0.3613569438457489, + "learning_rate": 0.0001, + "loss": 1.4541, + "step": 2196 + }, + { + "epoch": 0.25236919189018436, + "grad_norm": 0.36570632457733154, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 2197 + }, + { + "epoch": 0.2524840618000115, + "grad_norm": 0.36983218789100647, + "learning_rate": 0.0001, + "loss": 1.7606, + "step": 2198 + }, + { + "epoch": 0.2525989317098386, + "grad_norm": 0.35930711030960083, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 2199 + }, + { + "epoch": 0.2527138016196657, + "grad_norm": 0.39522892236709595, + "learning_rate": 0.0001, + "loss": 1.7063, + "step": 2200 + }, + { + "epoch": 0.25282867152949284, + "grad_norm": 0.3675948679447174, + "learning_rate": 0.0001, + "loss": 1.6295, + "step": 2201 + }, + { + "epoch": 0.25294354143931996, + "grad_norm": 0.3466283679008484, + "learning_rate": 0.0001, + "loss": 1.643, + "step": 2202 + }, + { + "epoch": 0.2530584113491471, + "grad_norm": 0.3820589482784271, + "learning_rate": 0.0001, + "loss": 1.8305, + "step": 2203 + }, + { + "epoch": 0.2531732812589742, + "grad_norm": 0.45334169268608093, + "learning_rate": 0.0001, + "loss": 1.8272, + "step": 2204 + }, + { + "epoch": 0.2532881511688013, + "grad_norm": 0.36322474479675293, + "learning_rate": 0.0001, + "loss": 1.5607, + "step": 2205 + }, + { + "epoch": 0.25340302107862844, + "grad_norm": 0.3699181377887726, + "learning_rate": 0.0001, + "loss": 1.7263, + "step": 2206 + }, + { + "epoch": 0.25351789098845556, + "grad_norm": 0.3594902455806732, + "learning_rate": 0.0001, + "loss": 1.6536, + "step": 2207 + }, + { + "epoch": 0.2536327608982827, + "grad_norm": 0.3868749141693115, + "learning_rate": 0.0001, + "loss": 1.7946, + "step": 2208 + }, + { + "epoch": 0.2537476308081098, + "grad_norm": 0.39426884055137634, + "learning_rate": 0.0001, + "loss": 1.898, + "step": 2209 + }, + { + "epoch": 0.2538625007179369, + "grad_norm": 0.41421404480934143, + "learning_rate": 0.0001, + "loss": 1.8456, + "step": 2210 + }, + { + "epoch": 0.25397737062776404, + "grad_norm": 0.35613730549812317, + "learning_rate": 0.0001, + "loss": 1.706, + "step": 2211 + }, + { + "epoch": 0.25409224053759116, + "grad_norm": 0.38455823063850403, + "learning_rate": 0.0001, + "loss": 1.6824, + "step": 2212 + }, + { + "epoch": 0.2542071104474183, + "grad_norm": 0.3799576759338379, + "learning_rate": 0.0001, + "loss": 1.714, + "step": 2213 + }, + { + "epoch": 0.2543219803572454, + "grad_norm": 0.43304580450057983, + "learning_rate": 0.0001, + "loss": 1.7829, + "step": 2214 + }, + { + "epoch": 0.2544368502670725, + "grad_norm": 0.39498084783554077, + "learning_rate": 0.0001, + "loss": 1.8003, + "step": 2215 + }, + { + "epoch": 0.25455172017689964, + "grad_norm": 0.3838403522968292, + "learning_rate": 0.0001, + "loss": 1.6831, + "step": 2216 + }, + { + "epoch": 0.25466659008672676, + "grad_norm": 0.38586917519569397, + "learning_rate": 0.0001, + "loss": 1.8446, + "step": 2217 + }, + { + "epoch": 0.2547814599965539, + "grad_norm": 0.37291255593299866, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 2218 + }, + { + "epoch": 0.254896329906381, + "grad_norm": 0.37416213750839233, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 2219 + }, + { + "epoch": 0.2550111998162081, + "grad_norm": 0.37467607855796814, + "learning_rate": 0.0001, + "loss": 1.6916, + "step": 2220 + }, + { + "epoch": 0.25512606972603524, + "grad_norm": 0.3643462061882019, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 2221 + }, + { + "epoch": 0.25524093963586236, + "grad_norm": 0.3733225464820862, + "learning_rate": 0.0001, + "loss": 1.7953, + "step": 2222 + }, + { + "epoch": 0.2553558095456895, + "grad_norm": 0.3754335045814514, + "learning_rate": 0.0001, + "loss": 1.8093, + "step": 2223 + }, + { + "epoch": 0.2554706794555166, + "grad_norm": 0.3865663707256317, + "learning_rate": 0.0001, + "loss": 1.7088, + "step": 2224 + }, + { + "epoch": 0.2555855493653437, + "grad_norm": 0.38617023825645447, + "learning_rate": 0.0001, + "loss": 1.8311, + "step": 2225 + }, + { + "epoch": 0.25570041927517084, + "grad_norm": 0.37515413761138916, + "learning_rate": 0.0001, + "loss": 1.5905, + "step": 2226 + }, + { + "epoch": 0.25581528918499796, + "grad_norm": 0.38885021209716797, + "learning_rate": 0.0001, + "loss": 1.8163, + "step": 2227 + }, + { + "epoch": 0.25593015909482514, + "grad_norm": 0.37118270993232727, + "learning_rate": 0.0001, + "loss": 1.8075, + "step": 2228 + }, + { + "epoch": 0.25604502900465226, + "grad_norm": 0.389565110206604, + "learning_rate": 0.0001, + "loss": 1.6765, + "step": 2229 + }, + { + "epoch": 0.2561598989144794, + "grad_norm": 0.3385695517063141, + "learning_rate": 0.0001, + "loss": 1.3302, + "step": 2230 + }, + { + "epoch": 0.2562747688243065, + "grad_norm": 0.37382227182388306, + "learning_rate": 0.0001, + "loss": 1.7651, + "step": 2231 + }, + { + "epoch": 0.2563896387341336, + "grad_norm": 0.39900514483451843, + "learning_rate": 0.0001, + "loss": 1.7818, + "step": 2232 + }, + { + "epoch": 0.25650450864396074, + "grad_norm": 0.3673021197319031, + "learning_rate": 0.0001, + "loss": 1.814, + "step": 2233 + }, + { + "epoch": 0.25661937855378786, + "grad_norm": 0.3759724199771881, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 2234 + }, + { + "epoch": 0.256734248463615, + "grad_norm": 0.37381884455680847, + "learning_rate": 0.0001, + "loss": 1.7092, + "step": 2235 + }, + { + "epoch": 0.2568491183734421, + "grad_norm": 0.3776073157787323, + "learning_rate": 0.0001, + "loss": 1.6729, + "step": 2236 + }, + { + "epoch": 0.2569639882832692, + "grad_norm": 0.4105524718761444, + "learning_rate": 0.0001, + "loss": 1.5777, + "step": 2237 + }, + { + "epoch": 0.25707885819309634, + "grad_norm": 0.41020524501800537, + "learning_rate": 0.0001, + "loss": 1.9763, + "step": 2238 + }, + { + "epoch": 0.25719372810292346, + "grad_norm": 0.4077359139919281, + "learning_rate": 0.0001, + "loss": 2.0014, + "step": 2239 + }, + { + "epoch": 0.2573085980127506, + "grad_norm": 0.41229790449142456, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 2240 + }, + { + "epoch": 0.2574234679225777, + "grad_norm": 0.380489319562912, + "learning_rate": 0.0001, + "loss": 1.8079, + "step": 2241 + }, + { + "epoch": 0.2575383378324048, + "grad_norm": 0.3971484899520874, + "learning_rate": 0.0001, + "loss": 1.8367, + "step": 2242 + }, + { + "epoch": 0.25765320774223194, + "grad_norm": 0.37627938389778137, + "learning_rate": 0.0001, + "loss": 1.6502, + "step": 2243 + }, + { + "epoch": 0.25776807765205906, + "grad_norm": 0.3823041319847107, + "learning_rate": 0.0001, + "loss": 1.7642, + "step": 2244 + }, + { + "epoch": 0.2578829475618862, + "grad_norm": 0.3671165704727173, + "learning_rate": 0.0001, + "loss": 1.7102, + "step": 2245 + }, + { + "epoch": 0.2579978174717133, + "grad_norm": 0.39813950657844543, + "learning_rate": 0.0001, + "loss": 1.8005, + "step": 2246 + }, + { + "epoch": 0.2581126873815404, + "grad_norm": 0.3677361309528351, + "learning_rate": 0.0001, + "loss": 1.658, + "step": 2247 + }, + { + "epoch": 0.25822755729136754, + "grad_norm": 0.3524981439113617, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 2248 + }, + { + "epoch": 0.25834242720119466, + "grad_norm": 0.37544408440589905, + "learning_rate": 0.0001, + "loss": 1.6862, + "step": 2249 + }, + { + "epoch": 0.2584572971110218, + "grad_norm": 0.35381603240966797, + "learning_rate": 0.0001, + "loss": 1.5272, + "step": 2250 + }, + { + "epoch": 0.2585721670208489, + "grad_norm": 0.39469513297080994, + "learning_rate": 0.0001, + "loss": 1.7863, + "step": 2251 + }, + { + "epoch": 0.258687036930676, + "grad_norm": 0.3802177906036377, + "learning_rate": 0.0001, + "loss": 1.7908, + "step": 2252 + }, + { + "epoch": 0.25880190684050314, + "grad_norm": 0.3482286036014557, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 2253 + }, + { + "epoch": 0.25891677675033026, + "grad_norm": 0.3742547035217285, + "learning_rate": 0.0001, + "loss": 1.6636, + "step": 2254 + }, + { + "epoch": 0.2590316466601574, + "grad_norm": 0.386465847492218, + "learning_rate": 0.0001, + "loss": 1.6659, + "step": 2255 + }, + { + "epoch": 0.2591465165699845, + "grad_norm": 0.365347295999527, + "learning_rate": 0.0001, + "loss": 1.7536, + "step": 2256 + }, + { + "epoch": 0.2592613864798116, + "grad_norm": 0.3640124797821045, + "learning_rate": 0.0001, + "loss": 1.7142, + "step": 2257 + }, + { + "epoch": 0.25937625638963874, + "grad_norm": 0.36901116371154785, + "learning_rate": 0.0001, + "loss": 1.477, + "step": 2258 + }, + { + "epoch": 0.25949112629946586, + "grad_norm": 0.36894455552101135, + "learning_rate": 0.0001, + "loss": 1.6933, + "step": 2259 + }, + { + "epoch": 0.259605996209293, + "grad_norm": 0.37590548396110535, + "learning_rate": 0.0001, + "loss": 1.7388, + "step": 2260 + }, + { + "epoch": 0.2597208661191201, + "grad_norm": 0.4689255654811859, + "learning_rate": 0.0001, + "loss": 2.1016, + "step": 2261 + }, + { + "epoch": 0.2598357360289472, + "grad_norm": 0.3811010420322418, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 2262 + }, + { + "epoch": 0.25995060593877434, + "grad_norm": 0.3560413420200348, + "learning_rate": 0.0001, + "loss": 1.6288, + "step": 2263 + }, + { + "epoch": 0.26006547584860146, + "grad_norm": 0.4040233790874481, + "learning_rate": 0.0001, + "loss": 1.8888, + "step": 2264 + }, + { + "epoch": 0.2601803457584286, + "grad_norm": 0.3741479516029358, + "learning_rate": 0.0001, + "loss": 1.6749, + "step": 2265 + }, + { + "epoch": 0.2602952156682557, + "grad_norm": 0.3752221465110779, + "learning_rate": 0.0001, + "loss": 1.8194, + "step": 2266 + }, + { + "epoch": 0.2604100855780828, + "grad_norm": 0.3529088497161865, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 2267 + }, + { + "epoch": 0.26052495548790994, + "grad_norm": 0.37346789240837097, + "learning_rate": 0.0001, + "loss": 1.694, + "step": 2268 + }, + { + "epoch": 0.26063982539773706, + "grad_norm": 0.38784459233283997, + "learning_rate": 0.0001, + "loss": 1.7968, + "step": 2269 + }, + { + "epoch": 0.2607546953075642, + "grad_norm": 0.3864971697330475, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 2270 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.3784656226634979, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 2271 + }, + { + "epoch": 0.2609844351272184, + "grad_norm": 0.3623911738395691, + "learning_rate": 0.0001, + "loss": 1.7436, + "step": 2272 + }, + { + "epoch": 0.26109930503704554, + "grad_norm": 0.3621329069137573, + "learning_rate": 0.0001, + "loss": 1.4809, + "step": 2273 + }, + { + "epoch": 0.26121417494687266, + "grad_norm": 0.40668338537216187, + "learning_rate": 0.0001, + "loss": 1.7219, + "step": 2274 + }, + { + "epoch": 0.2613290448566998, + "grad_norm": 0.3952760398387909, + "learning_rate": 0.0001, + "loss": 1.9019, + "step": 2275 + }, + { + "epoch": 0.2614439147665269, + "grad_norm": 0.39390838146209717, + "learning_rate": 0.0001, + "loss": 1.8203, + "step": 2276 + }, + { + "epoch": 0.261558784676354, + "grad_norm": 0.3634113073348999, + "learning_rate": 0.0001, + "loss": 1.5633, + "step": 2277 + }, + { + "epoch": 0.26167365458618114, + "grad_norm": 0.36678680777549744, + "learning_rate": 0.0001, + "loss": 1.7097, + "step": 2278 + }, + { + "epoch": 0.26178852449600826, + "grad_norm": 0.4072323143482208, + "learning_rate": 0.0001, + "loss": 1.7973, + "step": 2279 + }, + { + "epoch": 0.2619033944058354, + "grad_norm": 0.39810293912887573, + "learning_rate": 0.0001, + "loss": 1.8374, + "step": 2280 + }, + { + "epoch": 0.2620182643156625, + "grad_norm": 0.35911354422569275, + "learning_rate": 0.0001, + "loss": 1.7054, + "step": 2281 + }, + { + "epoch": 0.2621331342254896, + "grad_norm": 0.3710712492465973, + "learning_rate": 0.0001, + "loss": 1.7595, + "step": 2282 + }, + { + "epoch": 0.26224800413531674, + "grad_norm": 0.3721124231815338, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 2283 + }, + { + "epoch": 0.26236287404514386, + "grad_norm": 0.35447004437446594, + "learning_rate": 0.0001, + "loss": 1.6689, + "step": 2284 + }, + { + "epoch": 0.262477743954971, + "grad_norm": 0.37349700927734375, + "learning_rate": 0.0001, + "loss": 1.5631, + "step": 2285 + }, + { + "epoch": 0.2625926138647981, + "grad_norm": 0.3656753599643707, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 2286 + }, + { + "epoch": 0.2627074837746252, + "grad_norm": 0.35918521881103516, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 2287 + }, + { + "epoch": 0.26282235368445234, + "grad_norm": 0.36438190937042236, + "learning_rate": 0.0001, + "loss": 1.6488, + "step": 2288 + }, + { + "epoch": 0.26293722359427946, + "grad_norm": 0.365792453289032, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 2289 + }, + { + "epoch": 0.2630520935041066, + "grad_norm": 0.3362356424331665, + "learning_rate": 0.0001, + "loss": 1.3965, + "step": 2290 + }, + { + "epoch": 0.2631669634139337, + "grad_norm": 0.3698473274707794, + "learning_rate": 0.0001, + "loss": 1.6996, + "step": 2291 + }, + { + "epoch": 0.2632818333237608, + "grad_norm": 0.447664737701416, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 2292 + }, + { + "epoch": 0.26339670323358794, + "grad_norm": 0.4149281978607178, + "learning_rate": 0.0001, + "loss": 1.7173, + "step": 2293 + }, + { + "epoch": 0.26351157314341506, + "grad_norm": 0.37246763706207275, + "learning_rate": 0.0001, + "loss": 1.7145, + "step": 2294 + }, + { + "epoch": 0.2636264430532422, + "grad_norm": 0.37586966156959534, + "learning_rate": 0.0001, + "loss": 1.8224, + "step": 2295 + }, + { + "epoch": 0.2637413129630693, + "grad_norm": 0.36681029200553894, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 2296 + }, + { + "epoch": 0.26385618287289647, + "grad_norm": 0.3731790781021118, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 2297 + }, + { + "epoch": 0.2639710527827236, + "grad_norm": 0.4063315987586975, + "learning_rate": 0.0001, + "loss": 1.8513, + "step": 2298 + }, + { + "epoch": 0.2640859226925507, + "grad_norm": 0.36280304193496704, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 2299 + }, + { + "epoch": 0.26420079260237783, + "grad_norm": 0.41365888714790344, + "learning_rate": 0.0001, + "loss": 1.7712, + "step": 2300 + }, + { + "epoch": 0.26431566251220495, + "grad_norm": 0.4301823079586029, + "learning_rate": 0.0001, + "loss": 2.0047, + "step": 2301 + }, + { + "epoch": 0.26443053242203207, + "grad_norm": 0.3758213222026825, + "learning_rate": 0.0001, + "loss": 1.6871, + "step": 2302 + }, + { + "epoch": 0.2645454023318592, + "grad_norm": 0.34419310092926025, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 2303 + }, + { + "epoch": 0.2646602722416863, + "grad_norm": 0.38945549726486206, + "learning_rate": 0.0001, + "loss": 1.7238, + "step": 2304 + }, + { + "epoch": 0.26477514215151343, + "grad_norm": 0.3707123398780823, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 2305 + }, + { + "epoch": 0.26489001206134055, + "grad_norm": 0.35695189237594604, + "learning_rate": 0.0001, + "loss": 1.6122, + "step": 2306 + }, + { + "epoch": 0.26500488197116767, + "grad_norm": 0.36474674940109253, + "learning_rate": 0.0001, + "loss": 1.7198, + "step": 2307 + }, + { + "epoch": 0.2651197518809948, + "grad_norm": 0.35917285084724426, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 2308 + }, + { + "epoch": 0.2652346217908219, + "grad_norm": 0.38368695974349976, + "learning_rate": 0.0001, + "loss": 1.7404, + "step": 2309 + }, + { + "epoch": 0.26534949170064903, + "grad_norm": 0.4073870778083801, + "learning_rate": 0.0001, + "loss": 1.7424, + "step": 2310 + }, + { + "epoch": 0.26546436161047615, + "grad_norm": 0.35575923323631287, + "learning_rate": 0.0001, + "loss": 1.6298, + "step": 2311 + }, + { + "epoch": 0.26557923152030327, + "grad_norm": 0.3653978109359741, + "learning_rate": 0.0001, + "loss": 1.7346, + "step": 2312 + }, + { + "epoch": 0.2656941014301304, + "grad_norm": 0.35288769006729126, + "learning_rate": 0.0001, + "loss": 1.611, + "step": 2313 + }, + { + "epoch": 0.2658089713399575, + "grad_norm": 0.36546453833580017, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 2314 + }, + { + "epoch": 0.26592384124978463, + "grad_norm": 0.3479709029197693, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 2315 + }, + { + "epoch": 0.26603871115961175, + "grad_norm": 0.3809703290462494, + "learning_rate": 0.0001, + "loss": 1.7787, + "step": 2316 + }, + { + "epoch": 0.26615358106943887, + "grad_norm": 0.3938155472278595, + "learning_rate": 0.0001, + "loss": 1.7848, + "step": 2317 + }, + { + "epoch": 0.266268450979266, + "grad_norm": 0.3979399502277374, + "learning_rate": 0.0001, + "loss": 1.7428, + "step": 2318 + }, + { + "epoch": 0.2663833208890931, + "grad_norm": 0.34814873337745667, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 2319 + }, + { + "epoch": 0.26649819079892023, + "grad_norm": 0.3895953595638275, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 2320 + }, + { + "epoch": 0.26661306070874735, + "grad_norm": 0.3680303692817688, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 2321 + }, + { + "epoch": 0.26672793061857447, + "grad_norm": 0.3696908950805664, + "learning_rate": 0.0001, + "loss": 1.7016, + "step": 2322 + }, + { + "epoch": 0.2668428005284016, + "grad_norm": 0.3959087133407593, + "learning_rate": 0.0001, + "loss": 1.6687, + "step": 2323 + }, + { + "epoch": 0.2669576704382287, + "grad_norm": 0.41594138741493225, + "learning_rate": 0.0001, + "loss": 1.7522, + "step": 2324 + }, + { + "epoch": 0.26707254034805583, + "grad_norm": 0.3591321110725403, + "learning_rate": 0.0001, + "loss": 1.7126, + "step": 2325 + }, + { + "epoch": 0.26718741025788295, + "grad_norm": 0.3633732795715332, + "learning_rate": 0.0001, + "loss": 1.6606, + "step": 2326 + }, + { + "epoch": 0.26730228016771007, + "grad_norm": 0.3702927529811859, + "learning_rate": 0.0001, + "loss": 1.7004, + "step": 2327 + }, + { + "epoch": 0.2674171500775372, + "grad_norm": 0.38838890194892883, + "learning_rate": 0.0001, + "loss": 1.7151, + "step": 2328 + }, + { + "epoch": 0.2675320199873643, + "grad_norm": 0.3630795180797577, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 2329 + }, + { + "epoch": 0.26764688989719143, + "grad_norm": 0.3541039228439331, + "learning_rate": 0.0001, + "loss": 1.7219, + "step": 2330 + }, + { + "epoch": 0.26776175980701855, + "grad_norm": 0.35986846685409546, + "learning_rate": 0.0001, + "loss": 1.8925, + "step": 2331 + }, + { + "epoch": 0.26787662971684567, + "grad_norm": 0.3914209008216858, + "learning_rate": 0.0001, + "loss": 1.9096, + "step": 2332 + }, + { + "epoch": 0.2679914996266728, + "grad_norm": 0.3626435399055481, + "learning_rate": 0.0001, + "loss": 1.8039, + "step": 2333 + }, + { + "epoch": 0.2681063695364999, + "grad_norm": 0.33067116141319275, + "learning_rate": 0.0001, + "loss": 1.5963, + "step": 2334 + }, + { + "epoch": 0.26822123944632703, + "grad_norm": 0.38686808943748474, + "learning_rate": 0.0001, + "loss": 1.5786, + "step": 2335 + }, + { + "epoch": 0.26833610935615415, + "grad_norm": 0.3926886022090912, + "learning_rate": 0.0001, + "loss": 1.8575, + "step": 2336 + }, + { + "epoch": 0.26845097926598127, + "grad_norm": 0.3721444606781006, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 2337 + }, + { + "epoch": 0.2685658491758084, + "grad_norm": 0.39007246494293213, + "learning_rate": 0.0001, + "loss": 1.7679, + "step": 2338 + }, + { + "epoch": 0.2686807190856355, + "grad_norm": 0.3614482283592224, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 2339 + }, + { + "epoch": 0.26879558899546263, + "grad_norm": 0.39546316862106323, + "learning_rate": 0.0001, + "loss": 1.4712, + "step": 2340 + }, + { + "epoch": 0.26891045890528975, + "grad_norm": 0.36849477887153625, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 2341 + }, + { + "epoch": 0.26902532881511687, + "grad_norm": 0.3788404166698456, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 2342 + }, + { + "epoch": 0.269140198724944, + "grad_norm": 0.35742640495300293, + "learning_rate": 0.0001, + "loss": 1.5822, + "step": 2343 + }, + { + "epoch": 0.2692550686347711, + "grad_norm": 0.37428852915763855, + "learning_rate": 0.0001, + "loss": 1.7477, + "step": 2344 + }, + { + "epoch": 0.26936993854459823, + "grad_norm": 0.42374229431152344, + "learning_rate": 0.0001, + "loss": 1.8088, + "step": 2345 + }, + { + "epoch": 0.26948480845442535, + "grad_norm": 0.3708469867706299, + "learning_rate": 0.0001, + "loss": 1.8116, + "step": 2346 + }, + { + "epoch": 0.26959967836425247, + "grad_norm": 0.37047696113586426, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 2347 + }, + { + "epoch": 0.2697145482740796, + "grad_norm": 0.4440860450267792, + "learning_rate": 0.0001, + "loss": 1.7974, + "step": 2348 + }, + { + "epoch": 0.2698294181839067, + "grad_norm": 0.38337430357933044, + "learning_rate": 0.0001, + "loss": 1.7039, + "step": 2349 + }, + { + "epoch": 0.26994428809373383, + "grad_norm": 0.3824228048324585, + "learning_rate": 0.0001, + "loss": 1.7418, + "step": 2350 + }, + { + "epoch": 0.27005915800356095, + "grad_norm": 0.38111045956611633, + "learning_rate": 0.0001, + "loss": 1.7184, + "step": 2351 + }, + { + "epoch": 0.27017402791338807, + "grad_norm": 0.3806154727935791, + "learning_rate": 0.0001, + "loss": 1.8688, + "step": 2352 + }, + { + "epoch": 0.2702888978232152, + "grad_norm": 0.3737587332725525, + "learning_rate": 0.0001, + "loss": 1.7697, + "step": 2353 + }, + { + "epoch": 0.2704037677330423, + "grad_norm": 0.37875616550445557, + "learning_rate": 0.0001, + "loss": 1.7901, + "step": 2354 + }, + { + "epoch": 0.27051863764286943, + "grad_norm": 0.3805428445339203, + "learning_rate": 0.0001, + "loss": 1.8359, + "step": 2355 + }, + { + "epoch": 0.27063350755269655, + "grad_norm": 0.3564281761646271, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 2356 + }, + { + "epoch": 0.27074837746252367, + "grad_norm": 0.36830076575279236, + "learning_rate": 0.0001, + "loss": 1.7583, + "step": 2357 + }, + { + "epoch": 0.2708632473723508, + "grad_norm": 0.38345348834991455, + "learning_rate": 0.0001, + "loss": 1.8286, + "step": 2358 + }, + { + "epoch": 0.2709781172821779, + "grad_norm": 0.3683982789516449, + "learning_rate": 0.0001, + "loss": 1.6467, + "step": 2359 + }, + { + "epoch": 0.27109298719200503, + "grad_norm": 0.36231061816215515, + "learning_rate": 0.0001, + "loss": 1.6102, + "step": 2360 + }, + { + "epoch": 0.27120785710183215, + "grad_norm": 0.40081697702407837, + "learning_rate": 0.0001, + "loss": 1.7706, + "step": 2361 + }, + { + "epoch": 0.27132272701165927, + "grad_norm": 0.3795923590660095, + "learning_rate": 0.0001, + "loss": 1.8317, + "step": 2362 + }, + { + "epoch": 0.2714375969214864, + "grad_norm": 0.41488227248191833, + "learning_rate": 0.0001, + "loss": 1.8353, + "step": 2363 + }, + { + "epoch": 0.2715524668313135, + "grad_norm": 0.36446574330329895, + "learning_rate": 0.0001, + "loss": 1.7574, + "step": 2364 + }, + { + "epoch": 0.2716673367411407, + "grad_norm": 0.4068087935447693, + "learning_rate": 0.0001, + "loss": 1.6117, + "step": 2365 + }, + { + "epoch": 0.2717822066509678, + "grad_norm": 0.3724612295627594, + "learning_rate": 0.0001, + "loss": 1.8087, + "step": 2366 + }, + { + "epoch": 0.2718970765607949, + "grad_norm": 0.3523292541503906, + "learning_rate": 0.0001, + "loss": 1.6064, + "step": 2367 + }, + { + "epoch": 0.27201194647062205, + "grad_norm": 0.3559991717338562, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 2368 + }, + { + "epoch": 0.27212681638044917, + "grad_norm": 0.370811402797699, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 2369 + }, + { + "epoch": 0.2722416862902763, + "grad_norm": 0.350815087556839, + "learning_rate": 0.0001, + "loss": 1.5376, + "step": 2370 + }, + { + "epoch": 0.2723565562001034, + "grad_norm": 0.3826122283935547, + "learning_rate": 0.0001, + "loss": 1.7605, + "step": 2371 + }, + { + "epoch": 0.2724714261099305, + "grad_norm": 0.3690153956413269, + "learning_rate": 0.0001, + "loss": 1.5615, + "step": 2372 + }, + { + "epoch": 0.27258629601975765, + "grad_norm": 0.3891347050666809, + "learning_rate": 0.0001, + "loss": 1.7345, + "step": 2373 + }, + { + "epoch": 0.27270116592958477, + "grad_norm": 0.3778075873851776, + "learning_rate": 0.0001, + "loss": 1.7759, + "step": 2374 + }, + { + "epoch": 0.2728160358394119, + "grad_norm": 0.35634365677833557, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 2375 + }, + { + "epoch": 0.272930905749239, + "grad_norm": 0.3891456127166748, + "learning_rate": 0.0001, + "loss": 1.7749, + "step": 2376 + }, + { + "epoch": 0.2730457756590661, + "grad_norm": 0.3792819380760193, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 2377 + }, + { + "epoch": 0.27316064556889325, + "grad_norm": 0.37712422013282776, + "learning_rate": 0.0001, + "loss": 1.5787, + "step": 2378 + }, + { + "epoch": 0.27327551547872037, + "grad_norm": 0.38600102066993713, + "learning_rate": 0.0001, + "loss": 1.8743, + "step": 2379 + }, + { + "epoch": 0.2733903853885475, + "grad_norm": 0.3560759723186493, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 2380 + }, + { + "epoch": 0.2735052552983746, + "grad_norm": 0.3798205852508545, + "learning_rate": 0.0001, + "loss": 1.7504, + "step": 2381 + }, + { + "epoch": 0.2736201252082017, + "grad_norm": 0.40677618980407715, + "learning_rate": 0.0001, + "loss": 1.6794, + "step": 2382 + }, + { + "epoch": 0.27373499511802885, + "grad_norm": 0.3787870705127716, + "learning_rate": 0.0001, + "loss": 1.7416, + "step": 2383 + }, + { + "epoch": 0.27384986502785597, + "grad_norm": 0.39786311984062195, + "learning_rate": 0.0001, + "loss": 1.8658, + "step": 2384 + }, + { + "epoch": 0.2739647349376831, + "grad_norm": 0.39431723952293396, + "learning_rate": 0.0001, + "loss": 1.8427, + "step": 2385 + }, + { + "epoch": 0.2740796048475102, + "grad_norm": 0.40065842866897583, + "learning_rate": 0.0001, + "loss": 1.8135, + "step": 2386 + }, + { + "epoch": 0.2741944747573373, + "grad_norm": 0.3779159188270569, + "learning_rate": 0.0001, + "loss": 1.6793, + "step": 2387 + }, + { + "epoch": 0.27430934466716445, + "grad_norm": 0.35582754015922546, + "learning_rate": 0.0001, + "loss": 1.6984, + "step": 2388 + }, + { + "epoch": 0.27442421457699157, + "grad_norm": 0.3920300304889679, + "learning_rate": 0.0001, + "loss": 1.7694, + "step": 2389 + }, + { + "epoch": 0.2745390844868187, + "grad_norm": 0.394414484500885, + "learning_rate": 0.0001, + "loss": 1.6563, + "step": 2390 + }, + { + "epoch": 0.2746539543966458, + "grad_norm": 0.3930966258049011, + "learning_rate": 0.0001, + "loss": 1.5893, + "step": 2391 + }, + { + "epoch": 0.2747688243064729, + "grad_norm": 0.3484657406806946, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 2392 + }, + { + "epoch": 0.27488369421630005, + "grad_norm": 0.41878804564476013, + "learning_rate": 0.0001, + "loss": 1.7226, + "step": 2393 + }, + { + "epoch": 0.27499856412612717, + "grad_norm": 0.36988186836242676, + "learning_rate": 0.0001, + "loss": 1.7018, + "step": 2394 + }, + { + "epoch": 0.2751134340359543, + "grad_norm": 0.3710579574108124, + "learning_rate": 0.0001, + "loss": 1.5329, + "step": 2395 + }, + { + "epoch": 0.2752283039457814, + "grad_norm": 0.3970875144004822, + "learning_rate": 0.0001, + "loss": 1.5914, + "step": 2396 + }, + { + "epoch": 0.2753431738556085, + "grad_norm": 0.3790924847126007, + "learning_rate": 0.0001, + "loss": 1.7784, + "step": 2397 + }, + { + "epoch": 0.27545804376543565, + "grad_norm": 0.35328909754753113, + "learning_rate": 0.0001, + "loss": 1.8243, + "step": 2398 + }, + { + "epoch": 0.27557291367526277, + "grad_norm": 0.360989511013031, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 2399 + }, + { + "epoch": 0.2756877835850899, + "grad_norm": 0.4168824553489685, + "learning_rate": 0.0001, + "loss": 1.7192, + "step": 2400 + }, + { + "epoch": 0.275802653494917, + "grad_norm": 0.3885016143321991, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 2401 + }, + { + "epoch": 0.2759175234047441, + "grad_norm": 0.3799929916858673, + "learning_rate": 0.0001, + "loss": 1.7446, + "step": 2402 + }, + { + "epoch": 0.27603239331457124, + "grad_norm": 0.42033493518829346, + "learning_rate": 0.0001, + "loss": 1.8286, + "step": 2403 + }, + { + "epoch": 0.27614726322439836, + "grad_norm": 0.39111387729644775, + "learning_rate": 0.0001, + "loss": 1.7342, + "step": 2404 + }, + { + "epoch": 0.2762621331342255, + "grad_norm": 0.34956350922584534, + "learning_rate": 0.0001, + "loss": 1.6533, + "step": 2405 + }, + { + "epoch": 0.2763770030440526, + "grad_norm": 0.3685983419418335, + "learning_rate": 0.0001, + "loss": 1.8086, + "step": 2406 + }, + { + "epoch": 0.2764918729538797, + "grad_norm": 0.35792601108551025, + "learning_rate": 0.0001, + "loss": 1.7419, + "step": 2407 + }, + { + "epoch": 0.27660674286370684, + "grad_norm": 0.39823225140571594, + "learning_rate": 0.0001, + "loss": 1.7881, + "step": 2408 + }, + { + "epoch": 0.27672161277353396, + "grad_norm": 0.39737337827682495, + "learning_rate": 0.0001, + "loss": 1.7751, + "step": 2409 + }, + { + "epoch": 0.2768364826833611, + "grad_norm": 0.35919415950775146, + "learning_rate": 0.0001, + "loss": 1.6735, + "step": 2410 + }, + { + "epoch": 0.2769513525931882, + "grad_norm": 0.33741164207458496, + "learning_rate": 0.0001, + "loss": 1.5492, + "step": 2411 + }, + { + "epoch": 0.2770662225030153, + "grad_norm": 0.359222948551178, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 2412 + }, + { + "epoch": 0.27718109241284244, + "grad_norm": 0.3950967490673065, + "learning_rate": 0.0001, + "loss": 1.7063, + "step": 2413 + }, + { + "epoch": 0.27729596232266956, + "grad_norm": 0.3793290853500366, + "learning_rate": 0.0001, + "loss": 1.732, + "step": 2414 + }, + { + "epoch": 0.2774108322324967, + "grad_norm": 0.38844072818756104, + "learning_rate": 0.0001, + "loss": 1.6996, + "step": 2415 + }, + { + "epoch": 0.2775257021423238, + "grad_norm": 0.3837685286998749, + "learning_rate": 0.0001, + "loss": 1.7407, + "step": 2416 + }, + { + "epoch": 0.2776405720521509, + "grad_norm": 0.4028517007827759, + "learning_rate": 0.0001, + "loss": 1.8119, + "step": 2417 + }, + { + "epoch": 0.27775544196197804, + "grad_norm": 0.38334351778030396, + "learning_rate": 0.0001, + "loss": 1.7055, + "step": 2418 + }, + { + "epoch": 0.27787031187180516, + "grad_norm": 0.3632005453109741, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 2419 + }, + { + "epoch": 0.2779851817816323, + "grad_norm": 0.36728063225746155, + "learning_rate": 0.0001, + "loss": 1.7756, + "step": 2420 + }, + { + "epoch": 0.2781000516914594, + "grad_norm": 0.41414251923561096, + "learning_rate": 0.0001, + "loss": 1.9967, + "step": 2421 + }, + { + "epoch": 0.2782149216012865, + "grad_norm": 0.3955191969871521, + "learning_rate": 0.0001, + "loss": 1.7223, + "step": 2422 + }, + { + "epoch": 0.27832979151111364, + "grad_norm": 0.37816205620765686, + "learning_rate": 0.0001, + "loss": 1.4855, + "step": 2423 + }, + { + "epoch": 0.27844466142094076, + "grad_norm": 0.4215168058872223, + "learning_rate": 0.0001, + "loss": 1.6134, + "step": 2424 + }, + { + "epoch": 0.2785595313307679, + "grad_norm": 0.3857564926147461, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 2425 + }, + { + "epoch": 0.278674401240595, + "grad_norm": 0.3551070988178253, + "learning_rate": 0.0001, + "loss": 1.5696, + "step": 2426 + }, + { + "epoch": 0.2787892711504221, + "grad_norm": 0.3506256937980652, + "learning_rate": 0.0001, + "loss": 1.6087, + "step": 2427 + }, + { + "epoch": 0.27890414106024924, + "grad_norm": 0.3387966752052307, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 2428 + }, + { + "epoch": 0.27901901097007636, + "grad_norm": 0.3767859637737274, + "learning_rate": 0.0001, + "loss": 1.7688, + "step": 2429 + }, + { + "epoch": 0.2791338808799035, + "grad_norm": 0.38322755694389343, + "learning_rate": 0.0001, + "loss": 1.867, + "step": 2430 + }, + { + "epoch": 0.2792487507897306, + "grad_norm": 0.3900243043899536, + "learning_rate": 0.0001, + "loss": 1.9258, + "step": 2431 + }, + { + "epoch": 0.2793636206995577, + "grad_norm": 0.3808649182319641, + "learning_rate": 0.0001, + "loss": 1.53, + "step": 2432 + }, + { + "epoch": 0.27947849060938484, + "grad_norm": 0.36853742599487305, + "learning_rate": 0.0001, + "loss": 1.584, + "step": 2433 + }, + { + "epoch": 0.279593360519212, + "grad_norm": 0.36365145444869995, + "learning_rate": 0.0001, + "loss": 1.7215, + "step": 2434 + }, + { + "epoch": 0.27970823042903914, + "grad_norm": 0.361780047416687, + "learning_rate": 0.0001, + "loss": 1.6156, + "step": 2435 + }, + { + "epoch": 0.27982310033886626, + "grad_norm": 0.39141660928726196, + "learning_rate": 0.0001, + "loss": 1.8127, + "step": 2436 + }, + { + "epoch": 0.2799379702486934, + "grad_norm": 0.3739947974681854, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 2437 + }, + { + "epoch": 0.2800528401585205, + "grad_norm": 0.3897460103034973, + "learning_rate": 0.0001, + "loss": 1.869, + "step": 2438 + }, + { + "epoch": 0.2801677100683476, + "grad_norm": 0.3739219903945923, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 2439 + }, + { + "epoch": 0.28028257997817474, + "grad_norm": 0.3786073923110962, + "learning_rate": 0.0001, + "loss": 1.6744, + "step": 2440 + }, + { + "epoch": 0.28039744988800186, + "grad_norm": 0.37929031252861023, + "learning_rate": 0.0001, + "loss": 1.7422, + "step": 2441 + }, + { + "epoch": 0.280512319797829, + "grad_norm": 0.35074925422668457, + "learning_rate": 0.0001, + "loss": 1.6005, + "step": 2442 + }, + { + "epoch": 0.2806271897076561, + "grad_norm": 0.3912547528743744, + "learning_rate": 0.0001, + "loss": 1.8629, + "step": 2443 + }, + { + "epoch": 0.2807420596174832, + "grad_norm": 0.36757802963256836, + "learning_rate": 0.0001, + "loss": 1.557, + "step": 2444 + }, + { + "epoch": 0.28085692952731034, + "grad_norm": 0.3755451738834381, + "learning_rate": 0.0001, + "loss": 1.8249, + "step": 2445 + }, + { + "epoch": 0.28097179943713746, + "grad_norm": 0.3435642719268799, + "learning_rate": 0.0001, + "loss": 1.4253, + "step": 2446 + }, + { + "epoch": 0.2810866693469646, + "grad_norm": 0.3567301332950592, + "learning_rate": 0.0001, + "loss": 1.4934, + "step": 2447 + }, + { + "epoch": 0.2812015392567917, + "grad_norm": 0.3861426115036011, + "learning_rate": 0.0001, + "loss": 1.6306, + "step": 2448 + }, + { + "epoch": 0.2813164091666188, + "grad_norm": 0.38505983352661133, + "learning_rate": 0.0001, + "loss": 1.7413, + "step": 2449 + }, + { + "epoch": 0.28143127907644594, + "grad_norm": 0.3739560544490814, + "learning_rate": 0.0001, + "loss": 1.7037, + "step": 2450 + }, + { + "epoch": 0.28154614898627306, + "grad_norm": 0.359266996383667, + "learning_rate": 0.0001, + "loss": 1.6393, + "step": 2451 + }, + { + "epoch": 0.2816610188961002, + "grad_norm": 0.34818223118782043, + "learning_rate": 0.0001, + "loss": 1.4532, + "step": 2452 + }, + { + "epoch": 0.2817758888059273, + "grad_norm": 0.40676432847976685, + "learning_rate": 0.0001, + "loss": 1.8238, + "step": 2453 + }, + { + "epoch": 0.2818907587157544, + "grad_norm": 0.37692880630493164, + "learning_rate": 0.0001, + "loss": 1.7096, + "step": 2454 + }, + { + "epoch": 0.28200562862558154, + "grad_norm": 0.3589748442173004, + "learning_rate": 0.0001, + "loss": 1.5383, + "step": 2455 + }, + { + "epoch": 0.28212049853540866, + "grad_norm": 0.3927457928657532, + "learning_rate": 0.0001, + "loss": 1.7663, + "step": 2456 + }, + { + "epoch": 0.2822353684452358, + "grad_norm": 0.37036100029945374, + "learning_rate": 0.0001, + "loss": 1.6362, + "step": 2457 + }, + { + "epoch": 0.2823502383550629, + "grad_norm": 0.3649539649486542, + "learning_rate": 0.0001, + "loss": 1.4174, + "step": 2458 + }, + { + "epoch": 0.28246510826489, + "grad_norm": 0.3977210521697998, + "learning_rate": 0.0001, + "loss": 1.7361, + "step": 2459 + }, + { + "epoch": 0.28257997817471714, + "grad_norm": 0.37352606654167175, + "learning_rate": 0.0001, + "loss": 1.7877, + "step": 2460 + }, + { + "epoch": 0.28269484808454426, + "grad_norm": 0.37431058287620544, + "learning_rate": 0.0001, + "loss": 1.7231, + "step": 2461 + }, + { + "epoch": 0.2828097179943714, + "grad_norm": 0.3667674660682678, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 2462 + }, + { + "epoch": 0.2829245879041985, + "grad_norm": 0.4098829925060272, + "learning_rate": 0.0001, + "loss": 1.7942, + "step": 2463 + }, + { + "epoch": 0.2830394578140256, + "grad_norm": 0.35849472880363464, + "learning_rate": 0.0001, + "loss": 1.7135, + "step": 2464 + }, + { + "epoch": 0.28315432772385274, + "grad_norm": 0.40055009722709656, + "learning_rate": 0.0001, + "loss": 1.7541, + "step": 2465 + }, + { + "epoch": 0.28326919763367986, + "grad_norm": 0.3810610771179199, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 2466 + }, + { + "epoch": 0.283384067543507, + "grad_norm": 0.3662252724170685, + "learning_rate": 0.0001, + "loss": 1.5483, + "step": 2467 + }, + { + "epoch": 0.2834989374533341, + "grad_norm": 0.36314326524734497, + "learning_rate": 0.0001, + "loss": 1.5905, + "step": 2468 + }, + { + "epoch": 0.2836138073631612, + "grad_norm": 0.3729235529899597, + "learning_rate": 0.0001, + "loss": 1.6139, + "step": 2469 + }, + { + "epoch": 0.28372867727298834, + "grad_norm": 0.40618225932121277, + "learning_rate": 0.0001, + "loss": 1.7267, + "step": 2470 + }, + { + "epoch": 0.28384354718281546, + "grad_norm": 0.3927193284034729, + "learning_rate": 0.0001, + "loss": 1.8012, + "step": 2471 + }, + { + "epoch": 0.2839584170926426, + "grad_norm": 0.3536290228366852, + "learning_rate": 0.0001, + "loss": 1.7102, + "step": 2472 + }, + { + "epoch": 0.2840732870024697, + "grad_norm": 0.36351481080055237, + "learning_rate": 0.0001, + "loss": 1.5794, + "step": 2473 + }, + { + "epoch": 0.2841881569122968, + "grad_norm": 0.3685545027256012, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 2474 + }, + { + "epoch": 0.28430302682212394, + "grad_norm": 0.37130284309387207, + "learning_rate": 0.0001, + "loss": 1.7307, + "step": 2475 + }, + { + "epoch": 0.28441789673195106, + "grad_norm": 0.3823324143886566, + "learning_rate": 0.0001, + "loss": 1.7253, + "step": 2476 + }, + { + "epoch": 0.2845327666417782, + "grad_norm": 0.3819986879825592, + "learning_rate": 0.0001, + "loss": 1.5769, + "step": 2477 + }, + { + "epoch": 0.2846476365516053, + "grad_norm": 0.4364182651042938, + "learning_rate": 0.0001, + "loss": 1.7868, + "step": 2478 + }, + { + "epoch": 0.2847625064614324, + "grad_norm": 0.3844752907752991, + "learning_rate": 0.0001, + "loss": 1.6089, + "step": 2479 + }, + { + "epoch": 0.28487737637125954, + "grad_norm": 0.38411590456962585, + "learning_rate": 0.0001, + "loss": 1.5398, + "step": 2480 + }, + { + "epoch": 0.28499224628108666, + "grad_norm": 0.40151742100715637, + "learning_rate": 0.0001, + "loss": 1.8084, + "step": 2481 + }, + { + "epoch": 0.2851071161909138, + "grad_norm": 0.41466403007507324, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 2482 + }, + { + "epoch": 0.2852219861007409, + "grad_norm": 0.37958747148513794, + "learning_rate": 0.0001, + "loss": 1.516, + "step": 2483 + }, + { + "epoch": 0.285336856010568, + "grad_norm": 0.3950232267379761, + "learning_rate": 0.0001, + "loss": 1.955, + "step": 2484 + }, + { + "epoch": 0.28545172592039514, + "grad_norm": 0.3635809123516083, + "learning_rate": 0.0001, + "loss": 1.7445, + "step": 2485 + }, + { + "epoch": 0.28556659583022226, + "grad_norm": 0.38769015669822693, + "learning_rate": 0.0001, + "loss": 1.6845, + "step": 2486 + }, + { + "epoch": 0.2856814657400494, + "grad_norm": 0.37288132309913635, + "learning_rate": 0.0001, + "loss": 1.5452, + "step": 2487 + }, + { + "epoch": 0.2857963356498765, + "grad_norm": 0.4224679172039032, + "learning_rate": 0.0001, + "loss": 1.8745, + "step": 2488 + }, + { + "epoch": 0.2859112055597036, + "grad_norm": 0.4068784713745117, + "learning_rate": 0.0001, + "loss": 1.9496, + "step": 2489 + }, + { + "epoch": 0.28602607546953074, + "grad_norm": 0.39766570925712585, + "learning_rate": 0.0001, + "loss": 1.7433, + "step": 2490 + }, + { + "epoch": 0.28614094537935786, + "grad_norm": 0.4218822717666626, + "learning_rate": 0.0001, + "loss": 1.7239, + "step": 2491 + }, + { + "epoch": 0.286255815289185, + "grad_norm": 0.3917092978954315, + "learning_rate": 0.0001, + "loss": 1.5885, + "step": 2492 + }, + { + "epoch": 0.2863706851990121, + "grad_norm": 0.44831544160842896, + "learning_rate": 0.0001, + "loss": 1.7862, + "step": 2493 + }, + { + "epoch": 0.2864855551088392, + "grad_norm": 0.39573273062705994, + "learning_rate": 0.0001, + "loss": 1.8192, + "step": 2494 + }, + { + "epoch": 0.28660042501866634, + "grad_norm": 0.37129390239715576, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 2495 + }, + { + "epoch": 0.28671529492849346, + "grad_norm": 0.3685044050216675, + "learning_rate": 0.0001, + "loss": 1.7772, + "step": 2496 + }, + { + "epoch": 0.2868301648383206, + "grad_norm": 0.41524022817611694, + "learning_rate": 0.0001, + "loss": 1.825, + "step": 2497 + }, + { + "epoch": 0.2869450347481477, + "grad_norm": 0.39948517084121704, + "learning_rate": 0.0001, + "loss": 1.6756, + "step": 2498 + }, + { + "epoch": 0.2870599046579748, + "grad_norm": 0.3788050413131714, + "learning_rate": 0.0001, + "loss": 1.8223, + "step": 2499 + }, + { + "epoch": 0.28717477456780194, + "grad_norm": 0.3545449674129486, + "learning_rate": 0.0001, + "loss": 1.4563, + "step": 2500 + }, + { + "epoch": 0.28728964447762906, + "grad_norm": 0.37108996510505676, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 2501 + }, + { + "epoch": 0.28740451438745623, + "grad_norm": 0.37973251938819885, + "learning_rate": 0.0001, + "loss": 1.9102, + "step": 2502 + }, + { + "epoch": 0.28751938429728335, + "grad_norm": 0.3596639931201935, + "learning_rate": 0.0001, + "loss": 1.6966, + "step": 2503 + }, + { + "epoch": 0.2876342542071105, + "grad_norm": 0.39931946992874146, + "learning_rate": 0.0001, + "loss": 1.664, + "step": 2504 + }, + { + "epoch": 0.2877491241169376, + "grad_norm": 0.3801995515823364, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 2505 + }, + { + "epoch": 0.2878639940267647, + "grad_norm": 0.3975699245929718, + "learning_rate": 0.0001, + "loss": 1.8263, + "step": 2506 + }, + { + "epoch": 0.28797886393659183, + "grad_norm": 0.41587021946907043, + "learning_rate": 0.0001, + "loss": 1.8763, + "step": 2507 + }, + { + "epoch": 0.28809373384641895, + "grad_norm": 0.3884534239768982, + "learning_rate": 0.0001, + "loss": 1.8323, + "step": 2508 + }, + { + "epoch": 0.2882086037562461, + "grad_norm": 0.376265287399292, + "learning_rate": 0.0001, + "loss": 1.735, + "step": 2509 + }, + { + "epoch": 0.2883234736660732, + "grad_norm": 0.349988728761673, + "learning_rate": 0.0001, + "loss": 1.5764, + "step": 2510 + }, + { + "epoch": 0.2884383435759003, + "grad_norm": 0.3495781123638153, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 2511 + }, + { + "epoch": 0.28855321348572743, + "grad_norm": 0.40810123085975647, + "learning_rate": 0.0001, + "loss": 1.7881, + "step": 2512 + }, + { + "epoch": 0.28866808339555455, + "grad_norm": 0.37921836972236633, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 2513 + }, + { + "epoch": 0.2887829533053817, + "grad_norm": 0.3909505009651184, + "learning_rate": 0.0001, + "loss": 1.7327, + "step": 2514 + }, + { + "epoch": 0.2888978232152088, + "grad_norm": 0.3837600648403168, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 2515 + }, + { + "epoch": 0.2890126931250359, + "grad_norm": 0.4128781855106354, + "learning_rate": 0.0001, + "loss": 1.8047, + "step": 2516 + }, + { + "epoch": 0.28912756303486303, + "grad_norm": 0.3738429546356201, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 2517 + }, + { + "epoch": 0.28924243294469015, + "grad_norm": 0.40288591384887695, + "learning_rate": 0.0001, + "loss": 1.8381, + "step": 2518 + }, + { + "epoch": 0.2893573028545173, + "grad_norm": 0.37133219838142395, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 2519 + }, + { + "epoch": 0.2894721727643444, + "grad_norm": 0.3895220160484314, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 2520 + }, + { + "epoch": 0.2895870426741715, + "grad_norm": 0.42748066782951355, + "learning_rate": 0.0001, + "loss": 1.7745, + "step": 2521 + }, + { + "epoch": 0.28970191258399863, + "grad_norm": 0.3681427836418152, + "learning_rate": 0.0001, + "loss": 1.5979, + "step": 2522 + }, + { + "epoch": 0.28981678249382575, + "grad_norm": 0.3755578100681305, + "learning_rate": 0.0001, + "loss": 1.7104, + "step": 2523 + }, + { + "epoch": 0.2899316524036529, + "grad_norm": 0.3791626989841461, + "learning_rate": 0.0001, + "loss": 1.6502, + "step": 2524 + }, + { + "epoch": 0.29004652231348, + "grad_norm": 0.4189550578594208, + "learning_rate": 0.0001, + "loss": 1.9463, + "step": 2525 + }, + { + "epoch": 0.2901613922233071, + "grad_norm": 0.4706687033176422, + "learning_rate": 0.0001, + "loss": 1.8105, + "step": 2526 + }, + { + "epoch": 0.29027626213313423, + "grad_norm": 0.38283270597457886, + "learning_rate": 0.0001, + "loss": 1.7055, + "step": 2527 + }, + { + "epoch": 0.29039113204296135, + "grad_norm": 0.38680872321128845, + "learning_rate": 0.0001, + "loss": 1.7457, + "step": 2528 + }, + { + "epoch": 0.2905060019527885, + "grad_norm": 0.34589383006095886, + "learning_rate": 0.0001, + "loss": 1.5529, + "step": 2529 + }, + { + "epoch": 0.2906208718626156, + "grad_norm": 0.33116042613983154, + "learning_rate": 0.0001, + "loss": 1.3595, + "step": 2530 + }, + { + "epoch": 0.2907357417724427, + "grad_norm": 0.37206852436065674, + "learning_rate": 0.0001, + "loss": 1.8634, + "step": 2531 + }, + { + "epoch": 0.29085061168226983, + "grad_norm": 0.39483073353767395, + "learning_rate": 0.0001, + "loss": 1.7221, + "step": 2532 + }, + { + "epoch": 0.29096548159209695, + "grad_norm": 0.3861776888370514, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 2533 + }, + { + "epoch": 0.2910803515019241, + "grad_norm": 0.3621794283390045, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 2534 + }, + { + "epoch": 0.2911952214117512, + "grad_norm": 0.40973809361457825, + "learning_rate": 0.0001, + "loss": 1.7886, + "step": 2535 + }, + { + "epoch": 0.2913100913215783, + "grad_norm": 0.36391711235046387, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 2536 + }, + { + "epoch": 0.29142496123140543, + "grad_norm": 0.38137203454971313, + "learning_rate": 0.0001, + "loss": 1.4499, + "step": 2537 + }, + { + "epoch": 0.29153983114123255, + "grad_norm": 0.3694712817668915, + "learning_rate": 0.0001, + "loss": 1.5863, + "step": 2538 + }, + { + "epoch": 0.2916547010510597, + "grad_norm": 0.38381004333496094, + "learning_rate": 0.0001, + "loss": 1.8341, + "step": 2539 + }, + { + "epoch": 0.2917695709608868, + "grad_norm": 0.39511749148368835, + "learning_rate": 0.0001, + "loss": 1.8543, + "step": 2540 + }, + { + "epoch": 0.2918844408707139, + "grad_norm": 0.41761019825935364, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 2541 + }, + { + "epoch": 0.29199931078054103, + "grad_norm": 0.4057486355304718, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 2542 + }, + { + "epoch": 0.29211418069036815, + "grad_norm": 0.35340383648872375, + "learning_rate": 0.0001, + "loss": 1.4367, + "step": 2543 + }, + { + "epoch": 0.2922290506001953, + "grad_norm": 0.3808727264404297, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 2544 + }, + { + "epoch": 0.2923439205100224, + "grad_norm": 0.38112786412239075, + "learning_rate": 0.0001, + "loss": 1.6942, + "step": 2545 + }, + { + "epoch": 0.2924587904198495, + "grad_norm": 0.41311514377593994, + "learning_rate": 0.0001, + "loss": 1.8936, + "step": 2546 + }, + { + "epoch": 0.29257366032967663, + "grad_norm": 0.37865912914276123, + "learning_rate": 0.0001, + "loss": 1.7468, + "step": 2547 + }, + { + "epoch": 0.29268853023950375, + "grad_norm": 0.379802942276001, + "learning_rate": 0.0001, + "loss": 1.6966, + "step": 2548 + }, + { + "epoch": 0.2928034001493309, + "grad_norm": 0.4265817105770111, + "learning_rate": 0.0001, + "loss": 1.6835, + "step": 2549 + }, + { + "epoch": 0.292918270059158, + "grad_norm": 0.38082998991012573, + "learning_rate": 0.0001, + "loss": 1.6804, + "step": 2550 + }, + { + "epoch": 0.2930331399689851, + "grad_norm": 0.38684189319610596, + "learning_rate": 0.0001, + "loss": 1.9011, + "step": 2551 + }, + { + "epoch": 0.29314800987881223, + "grad_norm": 0.38656875491142273, + "learning_rate": 0.0001, + "loss": 1.7264, + "step": 2552 + }, + { + "epoch": 0.29326287978863935, + "grad_norm": 0.365608274936676, + "learning_rate": 0.0001, + "loss": 1.4035, + "step": 2553 + }, + { + "epoch": 0.2933777496984665, + "grad_norm": 0.3454169034957886, + "learning_rate": 0.0001, + "loss": 1.5401, + "step": 2554 + }, + { + "epoch": 0.2934926196082936, + "grad_norm": 0.3723081052303314, + "learning_rate": 0.0001, + "loss": 1.5431, + "step": 2555 + }, + { + "epoch": 0.2936074895181207, + "grad_norm": 0.35232463479042053, + "learning_rate": 0.0001, + "loss": 1.5584, + "step": 2556 + }, + { + "epoch": 0.29372235942794783, + "grad_norm": 0.36890724301338196, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 2557 + }, + { + "epoch": 0.29383722933777495, + "grad_norm": 0.39997896552085876, + "learning_rate": 0.0001, + "loss": 1.9178, + "step": 2558 + }, + { + "epoch": 0.29395209924760207, + "grad_norm": 0.3915958106517792, + "learning_rate": 0.0001, + "loss": 1.6637, + "step": 2559 + }, + { + "epoch": 0.2940669691574292, + "grad_norm": 0.3597055971622467, + "learning_rate": 0.0001, + "loss": 1.5527, + "step": 2560 + }, + { + "epoch": 0.2941818390672563, + "grad_norm": 0.38754889369010925, + "learning_rate": 0.0001, + "loss": 1.68, + "step": 2561 + }, + { + "epoch": 0.29429670897708343, + "grad_norm": 0.37158629298210144, + "learning_rate": 0.0001, + "loss": 1.6678, + "step": 2562 + }, + { + "epoch": 0.29441157888691055, + "grad_norm": 0.393255352973938, + "learning_rate": 0.0001, + "loss": 1.5541, + "step": 2563 + }, + { + "epoch": 0.29452644879673767, + "grad_norm": 0.4069103002548218, + "learning_rate": 0.0001, + "loss": 1.6978, + "step": 2564 + }, + { + "epoch": 0.2946413187065648, + "grad_norm": 0.3824077844619751, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 2565 + }, + { + "epoch": 0.2947561886163919, + "grad_norm": 0.3937399387359619, + "learning_rate": 0.0001, + "loss": 1.7788, + "step": 2566 + }, + { + "epoch": 0.29487105852621903, + "grad_norm": 0.4155014753341675, + "learning_rate": 0.0001, + "loss": 1.8913, + "step": 2567 + }, + { + "epoch": 0.29498592843604615, + "grad_norm": 0.3631512224674225, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 2568 + }, + { + "epoch": 0.29510079834587327, + "grad_norm": 0.37146568298339844, + "learning_rate": 0.0001, + "loss": 1.5236, + "step": 2569 + }, + { + "epoch": 0.2952156682557004, + "grad_norm": 0.3793732523918152, + "learning_rate": 0.0001, + "loss": 1.77, + "step": 2570 + }, + { + "epoch": 0.29533053816552757, + "grad_norm": 0.39151185750961304, + "learning_rate": 0.0001, + "loss": 1.7726, + "step": 2571 + }, + { + "epoch": 0.2954454080753547, + "grad_norm": 0.38116058707237244, + "learning_rate": 0.0001, + "loss": 1.8428, + "step": 2572 + }, + { + "epoch": 0.2955602779851818, + "grad_norm": 0.3952963650226593, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 2573 + }, + { + "epoch": 0.2956751478950089, + "grad_norm": 0.3993338942527771, + "learning_rate": 0.0001, + "loss": 1.9019, + "step": 2574 + }, + { + "epoch": 0.29579001780483605, + "grad_norm": 0.37218764424324036, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 2575 + }, + { + "epoch": 0.29590488771466317, + "grad_norm": 0.3602159023284912, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 2576 + }, + { + "epoch": 0.2960197576244903, + "grad_norm": 0.4018074572086334, + "learning_rate": 0.0001, + "loss": 1.7806, + "step": 2577 + }, + { + "epoch": 0.2961346275343174, + "grad_norm": 0.38347718119621277, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 2578 + }, + { + "epoch": 0.2962494974441445, + "grad_norm": 0.3954737186431885, + "learning_rate": 0.0001, + "loss": 1.8642, + "step": 2579 + }, + { + "epoch": 0.29636436735397165, + "grad_norm": 0.4196760952472687, + "learning_rate": 0.0001, + "loss": 1.6487, + "step": 2580 + }, + { + "epoch": 0.29647923726379877, + "grad_norm": 0.3532737195491791, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 2581 + }, + { + "epoch": 0.2965941071736259, + "grad_norm": 0.36641091108322144, + "learning_rate": 0.0001, + "loss": 1.4252, + "step": 2582 + }, + { + "epoch": 0.296708977083453, + "grad_norm": 0.37007638812065125, + "learning_rate": 0.0001, + "loss": 1.6963, + "step": 2583 + }, + { + "epoch": 0.2968238469932801, + "grad_norm": 0.3760312795639038, + "learning_rate": 0.0001, + "loss": 1.613, + "step": 2584 + }, + { + "epoch": 0.29693871690310725, + "grad_norm": 0.392160564661026, + "learning_rate": 0.0001, + "loss": 1.759, + "step": 2585 + }, + { + "epoch": 0.29705358681293437, + "grad_norm": 0.3935618996620178, + "learning_rate": 0.0001, + "loss": 1.787, + "step": 2586 + }, + { + "epoch": 0.2971684567227615, + "grad_norm": 0.3662956655025482, + "learning_rate": 0.0001, + "loss": 1.6899, + "step": 2587 + }, + { + "epoch": 0.2972833266325886, + "grad_norm": 0.4194296598434448, + "learning_rate": 0.0001, + "loss": 1.7168, + "step": 2588 + }, + { + "epoch": 0.2973981965424157, + "grad_norm": 0.37482190132141113, + "learning_rate": 0.0001, + "loss": 1.8223, + "step": 2589 + }, + { + "epoch": 0.29751306645224285, + "grad_norm": 0.38104763627052307, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 2590 + }, + { + "epoch": 0.29762793636206997, + "grad_norm": 0.37571361660957336, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 2591 + }, + { + "epoch": 0.2977428062718971, + "grad_norm": 0.4095185697078705, + "learning_rate": 0.0001, + "loss": 1.5866, + "step": 2592 + }, + { + "epoch": 0.2978576761817242, + "grad_norm": 0.4440751075744629, + "learning_rate": 0.0001, + "loss": 1.646, + "step": 2593 + }, + { + "epoch": 0.2979725460915513, + "grad_norm": 0.3924051821231842, + "learning_rate": 0.0001, + "loss": 1.6179, + "step": 2594 + }, + { + "epoch": 0.29808741600137845, + "grad_norm": 0.3902948796749115, + "learning_rate": 0.0001, + "loss": 1.7332, + "step": 2595 + }, + { + "epoch": 0.29820228591120557, + "grad_norm": 0.3790493607521057, + "learning_rate": 0.0001, + "loss": 1.7428, + "step": 2596 + }, + { + "epoch": 0.2983171558210327, + "grad_norm": 0.38171616196632385, + "learning_rate": 0.0001, + "loss": 1.6307, + "step": 2597 + }, + { + "epoch": 0.2984320257308598, + "grad_norm": 0.40960413217544556, + "learning_rate": 0.0001, + "loss": 1.9166, + "step": 2598 + }, + { + "epoch": 0.2985468956406869, + "grad_norm": 0.3882502019405365, + "learning_rate": 0.0001, + "loss": 1.7118, + "step": 2599 + }, + { + "epoch": 0.29866176555051405, + "grad_norm": 0.3626171350479126, + "learning_rate": 0.0001, + "loss": 1.6102, + "step": 2600 + }, + { + "epoch": 0.29877663546034117, + "grad_norm": 0.37243038415908813, + "learning_rate": 0.0001, + "loss": 1.6959, + "step": 2601 + }, + { + "epoch": 0.2988915053701683, + "grad_norm": 0.40221109986305237, + "learning_rate": 0.0001, + "loss": 1.6998, + "step": 2602 + }, + { + "epoch": 0.2990063752799954, + "grad_norm": 0.3771398663520813, + "learning_rate": 0.0001, + "loss": 1.6593, + "step": 2603 + }, + { + "epoch": 0.2991212451898225, + "grad_norm": 0.3790495991706848, + "learning_rate": 0.0001, + "loss": 1.6981, + "step": 2604 + }, + { + "epoch": 0.29923611509964965, + "grad_norm": 0.3962880074977875, + "learning_rate": 0.0001, + "loss": 1.7409, + "step": 2605 + }, + { + "epoch": 0.29935098500947677, + "grad_norm": 0.3856731057167053, + "learning_rate": 0.0001, + "loss": 1.8138, + "step": 2606 + }, + { + "epoch": 0.2994658549193039, + "grad_norm": 0.37686559557914734, + "learning_rate": 0.0001, + "loss": 1.7306, + "step": 2607 + }, + { + "epoch": 0.299580724829131, + "grad_norm": 0.4329466223716736, + "learning_rate": 0.0001, + "loss": 2.0436, + "step": 2608 + }, + { + "epoch": 0.2996955947389581, + "grad_norm": 0.3704317808151245, + "learning_rate": 0.0001, + "loss": 1.6667, + "step": 2609 + }, + { + "epoch": 0.29981046464878525, + "grad_norm": 0.42832422256469727, + "learning_rate": 0.0001, + "loss": 1.7882, + "step": 2610 + }, + { + "epoch": 0.29992533455861237, + "grad_norm": 0.416471928358078, + "learning_rate": 0.0001, + "loss": 1.8709, + "step": 2611 + }, + { + "epoch": 0.3000402044684395, + "grad_norm": 0.35787421464920044, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 2612 + }, + { + "epoch": 0.3001550743782666, + "grad_norm": 0.37828290462493896, + "learning_rate": 0.0001, + "loss": 1.5254, + "step": 2613 + }, + { + "epoch": 0.3002699442880937, + "grad_norm": 0.3505316972732544, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 2614 + }, + { + "epoch": 0.30038481419792085, + "grad_norm": 0.3572443127632141, + "learning_rate": 0.0001, + "loss": 1.6333, + "step": 2615 + }, + { + "epoch": 0.30049968410774797, + "grad_norm": 0.3744872808456421, + "learning_rate": 0.0001, + "loss": 1.7276, + "step": 2616 + }, + { + "epoch": 0.3006145540175751, + "grad_norm": 0.38314029574394226, + "learning_rate": 0.0001, + "loss": 1.6795, + "step": 2617 + }, + { + "epoch": 0.3007294239274022, + "grad_norm": 0.38417288661003113, + "learning_rate": 0.0001, + "loss": 1.7798, + "step": 2618 + }, + { + "epoch": 0.3008442938372293, + "grad_norm": 0.39451834559440613, + "learning_rate": 0.0001, + "loss": 1.7956, + "step": 2619 + }, + { + "epoch": 0.30095916374705645, + "grad_norm": 0.3972351849079132, + "learning_rate": 0.0001, + "loss": 1.654, + "step": 2620 + }, + { + "epoch": 0.30107403365688357, + "grad_norm": 0.4091535806655884, + "learning_rate": 0.0001, + "loss": 1.9206, + "step": 2621 + }, + { + "epoch": 0.3011889035667107, + "grad_norm": 0.3716078996658325, + "learning_rate": 0.0001, + "loss": 1.432, + "step": 2622 + }, + { + "epoch": 0.3013037734765378, + "grad_norm": 0.34581923484802246, + "learning_rate": 0.0001, + "loss": 1.3475, + "step": 2623 + }, + { + "epoch": 0.3014186433863649, + "grad_norm": 0.3731369376182556, + "learning_rate": 0.0001, + "loss": 1.613, + "step": 2624 + }, + { + "epoch": 0.30153351329619205, + "grad_norm": 0.36073705554008484, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 2625 + }, + { + "epoch": 0.30164838320601917, + "grad_norm": 0.3785097002983093, + "learning_rate": 0.0001, + "loss": 1.7506, + "step": 2626 + }, + { + "epoch": 0.3017632531158463, + "grad_norm": 0.346171498298645, + "learning_rate": 0.0001, + "loss": 1.4672, + "step": 2627 + }, + { + "epoch": 0.3018781230256734, + "grad_norm": 0.353345662355423, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 2628 + }, + { + "epoch": 0.3019929929355005, + "grad_norm": 0.3921557366847992, + "learning_rate": 0.0001, + "loss": 1.8869, + "step": 2629 + }, + { + "epoch": 0.30210786284532765, + "grad_norm": 0.377298504114151, + "learning_rate": 0.0001, + "loss": 1.8358, + "step": 2630 + }, + { + "epoch": 0.30222273275515477, + "grad_norm": 0.3824778199195862, + "learning_rate": 0.0001, + "loss": 1.705, + "step": 2631 + }, + { + "epoch": 0.3023376026649819, + "grad_norm": 0.371186763048172, + "learning_rate": 0.0001, + "loss": 1.5894, + "step": 2632 + }, + { + "epoch": 0.302452472574809, + "grad_norm": 0.3520771563053131, + "learning_rate": 0.0001, + "loss": 1.4233, + "step": 2633 + }, + { + "epoch": 0.3025673424846361, + "grad_norm": 0.3969862461090088, + "learning_rate": 0.0001, + "loss": 1.7946, + "step": 2634 + }, + { + "epoch": 0.30268221239446325, + "grad_norm": 0.39209333062171936, + "learning_rate": 0.0001, + "loss": 1.5675, + "step": 2635 + }, + { + "epoch": 0.30279708230429037, + "grad_norm": 0.4001356065273285, + "learning_rate": 0.0001, + "loss": 1.5882, + "step": 2636 + }, + { + "epoch": 0.3029119522141175, + "grad_norm": 0.3858399987220764, + "learning_rate": 0.0001, + "loss": 1.7268, + "step": 2637 + }, + { + "epoch": 0.3030268221239446, + "grad_norm": 0.3735487163066864, + "learning_rate": 0.0001, + "loss": 1.5653, + "step": 2638 + }, + { + "epoch": 0.3031416920337717, + "grad_norm": 0.39118990302085876, + "learning_rate": 0.0001, + "loss": 1.6705, + "step": 2639 + }, + { + "epoch": 0.3032565619435989, + "grad_norm": 0.42595022916793823, + "learning_rate": 0.0001, + "loss": 1.7263, + "step": 2640 + }, + { + "epoch": 0.303371431853426, + "grad_norm": 0.41437122225761414, + "learning_rate": 0.0001, + "loss": 1.8724, + "step": 2641 + }, + { + "epoch": 0.30348630176325314, + "grad_norm": 0.3900952637195587, + "learning_rate": 0.0001, + "loss": 1.673, + "step": 2642 + }, + { + "epoch": 0.30360117167308026, + "grad_norm": 0.38442274928092957, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 2643 + }, + { + "epoch": 0.3037160415829074, + "grad_norm": 0.39064112305641174, + "learning_rate": 0.0001, + "loss": 1.7353, + "step": 2644 + }, + { + "epoch": 0.3038309114927345, + "grad_norm": 0.3769366443157196, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 2645 + }, + { + "epoch": 0.3039457814025616, + "grad_norm": 0.39268919825553894, + "learning_rate": 0.0001, + "loss": 1.6863, + "step": 2646 + }, + { + "epoch": 0.30406065131238874, + "grad_norm": 0.3635852038860321, + "learning_rate": 0.0001, + "loss": 1.6138, + "step": 2647 + }, + { + "epoch": 0.30417552122221586, + "grad_norm": 0.36260101199150085, + "learning_rate": 0.0001, + "loss": 1.5824, + "step": 2648 + }, + { + "epoch": 0.304290391132043, + "grad_norm": 0.37813135981559753, + "learning_rate": 0.0001, + "loss": 1.7067, + "step": 2649 + }, + { + "epoch": 0.3044052610418701, + "grad_norm": 0.39040499925613403, + "learning_rate": 0.0001, + "loss": 1.6515, + "step": 2650 + }, + { + "epoch": 0.3045201309516972, + "grad_norm": 0.36264294385910034, + "learning_rate": 0.0001, + "loss": 1.589, + "step": 2651 + }, + { + "epoch": 0.30463500086152434, + "grad_norm": 0.3571374714374542, + "learning_rate": 0.0001, + "loss": 1.5062, + "step": 2652 + }, + { + "epoch": 0.30474987077135146, + "grad_norm": 0.3698153793811798, + "learning_rate": 0.0001, + "loss": 1.5825, + "step": 2653 + }, + { + "epoch": 0.3048647406811786, + "grad_norm": 0.37908482551574707, + "learning_rate": 0.0001, + "loss": 1.7519, + "step": 2654 + }, + { + "epoch": 0.3049796105910057, + "grad_norm": 0.4145006835460663, + "learning_rate": 0.0001, + "loss": 2.0428, + "step": 2655 + }, + { + "epoch": 0.3050944805008328, + "grad_norm": 0.37752121686935425, + "learning_rate": 0.0001, + "loss": 1.7289, + "step": 2656 + }, + { + "epoch": 0.30520935041065994, + "grad_norm": 0.41512149572372437, + "learning_rate": 0.0001, + "loss": 2.0011, + "step": 2657 + }, + { + "epoch": 0.30532422032048706, + "grad_norm": 0.3637721836566925, + "learning_rate": 0.0001, + "loss": 1.6781, + "step": 2658 + }, + { + "epoch": 0.3054390902303142, + "grad_norm": 0.3608452081680298, + "learning_rate": 0.0001, + "loss": 1.5406, + "step": 2659 + }, + { + "epoch": 0.3055539601401413, + "grad_norm": 0.4033581018447876, + "learning_rate": 0.0001, + "loss": 1.8266, + "step": 2660 + }, + { + "epoch": 0.3056688300499684, + "grad_norm": 0.371520459651947, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 2661 + }, + { + "epoch": 0.30578369995979554, + "grad_norm": 0.36730000376701355, + "learning_rate": 0.0001, + "loss": 1.5817, + "step": 2662 + }, + { + "epoch": 0.30589856986962266, + "grad_norm": 0.376396119594574, + "learning_rate": 0.0001, + "loss": 1.7396, + "step": 2663 + }, + { + "epoch": 0.3060134397794498, + "grad_norm": 0.35428953170776367, + "learning_rate": 0.0001, + "loss": 1.589, + "step": 2664 + }, + { + "epoch": 0.3061283096892769, + "grad_norm": 0.4117322266101837, + "learning_rate": 0.0001, + "loss": 1.7213, + "step": 2665 + }, + { + "epoch": 0.306243179599104, + "grad_norm": 0.39247551560401917, + "learning_rate": 0.0001, + "loss": 1.6146, + "step": 2666 + }, + { + "epoch": 0.30635804950893114, + "grad_norm": 0.39449337124824524, + "learning_rate": 0.0001, + "loss": 1.8517, + "step": 2667 + }, + { + "epoch": 0.30647291941875826, + "grad_norm": 0.3695959150791168, + "learning_rate": 0.0001, + "loss": 1.5881, + "step": 2668 + }, + { + "epoch": 0.3065877893285854, + "grad_norm": 0.3867664635181427, + "learning_rate": 0.0001, + "loss": 1.7339, + "step": 2669 + }, + { + "epoch": 0.3067026592384125, + "grad_norm": 0.4185912311077118, + "learning_rate": 0.0001, + "loss": 1.9492, + "step": 2670 + }, + { + "epoch": 0.3068175291482396, + "grad_norm": 0.365018755197525, + "learning_rate": 0.0001, + "loss": 1.6135, + "step": 2671 + }, + { + "epoch": 0.30693239905806674, + "grad_norm": 0.4161297380924225, + "learning_rate": 0.0001, + "loss": 1.7588, + "step": 2672 + }, + { + "epoch": 0.30704726896789386, + "grad_norm": 0.4104420840740204, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 2673 + }, + { + "epoch": 0.307162138877721, + "grad_norm": 0.393228143453598, + "learning_rate": 0.0001, + "loss": 1.6481, + "step": 2674 + }, + { + "epoch": 0.3072770087875481, + "grad_norm": 0.37595561146736145, + "learning_rate": 0.0001, + "loss": 1.7663, + "step": 2675 + }, + { + "epoch": 0.3073918786973752, + "grad_norm": 0.3587210178375244, + "learning_rate": 0.0001, + "loss": 1.505, + "step": 2676 + }, + { + "epoch": 0.30750674860720234, + "grad_norm": 0.37775328755378723, + "learning_rate": 0.0001, + "loss": 1.6793, + "step": 2677 + }, + { + "epoch": 0.30762161851702946, + "grad_norm": 0.3870543837547302, + "learning_rate": 0.0001, + "loss": 1.73, + "step": 2678 + }, + { + "epoch": 0.3077364884268566, + "grad_norm": 0.3819582760334015, + "learning_rate": 0.0001, + "loss": 1.6968, + "step": 2679 + }, + { + "epoch": 0.3078513583366837, + "grad_norm": 0.3661853075027466, + "learning_rate": 0.0001, + "loss": 1.6436, + "step": 2680 + }, + { + "epoch": 0.3079662282465108, + "grad_norm": 0.356086790561676, + "learning_rate": 0.0001, + "loss": 1.7138, + "step": 2681 + }, + { + "epoch": 0.30808109815633794, + "grad_norm": 0.34564852714538574, + "learning_rate": 0.0001, + "loss": 1.4758, + "step": 2682 + }, + { + "epoch": 0.30819596806616506, + "grad_norm": 0.37625306844711304, + "learning_rate": 0.0001, + "loss": 1.8159, + "step": 2683 + }, + { + "epoch": 0.3083108379759922, + "grad_norm": 0.4367530345916748, + "learning_rate": 0.0001, + "loss": 1.9309, + "step": 2684 + }, + { + "epoch": 0.3084257078858193, + "grad_norm": 0.41043978929519653, + "learning_rate": 0.0001, + "loss": 1.9249, + "step": 2685 + }, + { + "epoch": 0.3085405777956464, + "grad_norm": 0.36539000272750854, + "learning_rate": 0.0001, + "loss": 1.689, + "step": 2686 + }, + { + "epoch": 0.30865544770547354, + "grad_norm": 0.3621140718460083, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 2687 + }, + { + "epoch": 0.30877031761530066, + "grad_norm": 0.34792810678482056, + "learning_rate": 0.0001, + "loss": 1.5316, + "step": 2688 + }, + { + "epoch": 0.3088851875251278, + "grad_norm": 0.37253010272979736, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 2689 + }, + { + "epoch": 0.3090000574349549, + "grad_norm": 0.3895919919013977, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 2690 + }, + { + "epoch": 0.309114927344782, + "grad_norm": 0.3458951711654663, + "learning_rate": 0.0001, + "loss": 1.5419, + "step": 2691 + }, + { + "epoch": 0.30922979725460914, + "grad_norm": 0.34888923168182373, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 2692 + }, + { + "epoch": 0.30934466716443626, + "grad_norm": 0.3860279321670532, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 2693 + }, + { + "epoch": 0.3094595370742634, + "grad_norm": 0.3899478018283844, + "learning_rate": 0.0001, + "loss": 1.8091, + "step": 2694 + }, + { + "epoch": 0.3095744069840905, + "grad_norm": 0.3502478003501892, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 2695 + }, + { + "epoch": 0.3096892768939176, + "grad_norm": 0.3814723789691925, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 2696 + }, + { + "epoch": 0.30980414680374474, + "grad_norm": 0.4145774245262146, + "learning_rate": 0.0001, + "loss": 1.6666, + "step": 2697 + }, + { + "epoch": 0.30991901671357186, + "grad_norm": 0.3777678906917572, + "learning_rate": 0.0001, + "loss": 1.7871, + "step": 2698 + }, + { + "epoch": 0.310033886623399, + "grad_norm": 0.39043325185775757, + "learning_rate": 0.0001, + "loss": 1.8022, + "step": 2699 + }, + { + "epoch": 0.3101487565332261, + "grad_norm": 0.39468830823898315, + "learning_rate": 0.0001, + "loss": 1.6472, + "step": 2700 + }, + { + "epoch": 0.3102636264430532, + "grad_norm": 0.41475868225097656, + "learning_rate": 0.0001, + "loss": 1.8264, + "step": 2701 + }, + { + "epoch": 0.31037849635288034, + "grad_norm": 0.387824147939682, + "learning_rate": 0.0001, + "loss": 1.7159, + "step": 2702 + }, + { + "epoch": 0.31049336626270746, + "grad_norm": 0.38441115617752075, + "learning_rate": 0.0001, + "loss": 1.6584, + "step": 2703 + }, + { + "epoch": 0.3106082361725346, + "grad_norm": 0.374197781085968, + "learning_rate": 0.0001, + "loss": 1.6529, + "step": 2704 + }, + { + "epoch": 0.3107231060823617, + "grad_norm": 0.3562909960746765, + "learning_rate": 0.0001, + "loss": 1.4979, + "step": 2705 + }, + { + "epoch": 0.3108379759921888, + "grad_norm": 0.38204044103622437, + "learning_rate": 0.0001, + "loss": 1.8704, + "step": 2706 + }, + { + "epoch": 0.31095284590201594, + "grad_norm": 0.3814204931259155, + "learning_rate": 0.0001, + "loss": 1.5465, + "step": 2707 + }, + { + "epoch": 0.3110677158118431, + "grad_norm": 0.4282824993133545, + "learning_rate": 0.0001, + "loss": 1.6085, + "step": 2708 + }, + { + "epoch": 0.31118258572167024, + "grad_norm": 0.3564637005329132, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 2709 + }, + { + "epoch": 0.31129745563149736, + "grad_norm": 0.4151432514190674, + "learning_rate": 0.0001, + "loss": 1.9175, + "step": 2710 + }, + { + "epoch": 0.3114123255413245, + "grad_norm": 0.37644943594932556, + "learning_rate": 0.0001, + "loss": 1.5751, + "step": 2711 + }, + { + "epoch": 0.3115271954511516, + "grad_norm": 0.39474377036094666, + "learning_rate": 0.0001, + "loss": 1.6295, + "step": 2712 + }, + { + "epoch": 0.3116420653609787, + "grad_norm": 0.3828750550746918, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 2713 + }, + { + "epoch": 0.31175693527080584, + "grad_norm": 0.38936948776245117, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 2714 + }, + { + "epoch": 0.31187180518063295, + "grad_norm": 0.40668943524360657, + "learning_rate": 0.0001, + "loss": 1.708, + "step": 2715 + }, + { + "epoch": 0.3119866750904601, + "grad_norm": 0.36367443203926086, + "learning_rate": 0.0001, + "loss": 1.6902, + "step": 2716 + }, + { + "epoch": 0.3121015450002872, + "grad_norm": 0.4468287229537964, + "learning_rate": 0.0001, + "loss": 1.5658, + "step": 2717 + }, + { + "epoch": 0.3122164149101143, + "grad_norm": 0.3429298400878906, + "learning_rate": 0.0001, + "loss": 1.4857, + "step": 2718 + }, + { + "epoch": 0.31233128481994143, + "grad_norm": 0.4072478711605072, + "learning_rate": 0.0001, + "loss": 1.7749, + "step": 2719 + }, + { + "epoch": 0.31244615472976855, + "grad_norm": 0.37284785509109497, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 2720 + }, + { + "epoch": 0.3125610246395957, + "grad_norm": 0.41400986909866333, + "learning_rate": 0.0001, + "loss": 1.936, + "step": 2721 + }, + { + "epoch": 0.3126758945494228, + "grad_norm": 0.3585307002067566, + "learning_rate": 0.0001, + "loss": 1.3443, + "step": 2722 + }, + { + "epoch": 0.3127907644592499, + "grad_norm": 0.41940388083457947, + "learning_rate": 0.0001, + "loss": 1.8787, + "step": 2723 + }, + { + "epoch": 0.31290563436907703, + "grad_norm": 0.3974437713623047, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 2724 + }, + { + "epoch": 0.31302050427890415, + "grad_norm": 0.39561134576797485, + "learning_rate": 0.0001, + "loss": 1.7365, + "step": 2725 + }, + { + "epoch": 0.3131353741887313, + "grad_norm": 0.36751341819763184, + "learning_rate": 0.0001, + "loss": 1.6052, + "step": 2726 + }, + { + "epoch": 0.3132502440985584, + "grad_norm": 0.41230806708335876, + "learning_rate": 0.0001, + "loss": 1.5398, + "step": 2727 + }, + { + "epoch": 0.3133651140083855, + "grad_norm": 0.3846902847290039, + "learning_rate": 0.0001, + "loss": 1.6922, + "step": 2728 + }, + { + "epoch": 0.31347998391821263, + "grad_norm": 0.40803879499435425, + "learning_rate": 0.0001, + "loss": 1.7361, + "step": 2729 + }, + { + "epoch": 0.31359485382803975, + "grad_norm": 0.35604923963546753, + "learning_rate": 0.0001, + "loss": 1.5149, + "step": 2730 + }, + { + "epoch": 0.3137097237378669, + "grad_norm": 0.38761159777641296, + "learning_rate": 0.0001, + "loss": 1.7401, + "step": 2731 + }, + { + "epoch": 0.313824593647694, + "grad_norm": 0.42147189378738403, + "learning_rate": 0.0001, + "loss": 1.7959, + "step": 2732 + }, + { + "epoch": 0.3139394635575211, + "grad_norm": 0.3850533366203308, + "learning_rate": 0.0001, + "loss": 1.8302, + "step": 2733 + }, + { + "epoch": 0.31405433346734823, + "grad_norm": 0.3670084476470947, + "learning_rate": 0.0001, + "loss": 1.546, + "step": 2734 + }, + { + "epoch": 0.31416920337717535, + "grad_norm": 0.3647415041923523, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 2735 + }, + { + "epoch": 0.3142840732870025, + "grad_norm": 0.38492029905319214, + "learning_rate": 0.0001, + "loss": 1.5857, + "step": 2736 + }, + { + "epoch": 0.3143989431968296, + "grad_norm": 0.4054207503795624, + "learning_rate": 0.0001, + "loss": 1.8419, + "step": 2737 + }, + { + "epoch": 0.3145138131066567, + "grad_norm": 0.36467689275741577, + "learning_rate": 0.0001, + "loss": 1.5701, + "step": 2738 + }, + { + "epoch": 0.31462868301648383, + "grad_norm": 0.3815039098262787, + "learning_rate": 0.0001, + "loss": 1.6893, + "step": 2739 + }, + { + "epoch": 0.31474355292631095, + "grad_norm": 0.3768649697303772, + "learning_rate": 0.0001, + "loss": 1.5021, + "step": 2740 + }, + { + "epoch": 0.3148584228361381, + "grad_norm": 0.36210617423057556, + "learning_rate": 0.0001, + "loss": 1.7335, + "step": 2741 + }, + { + "epoch": 0.3149732927459652, + "grad_norm": 0.41380202770233154, + "learning_rate": 0.0001, + "loss": 1.799, + "step": 2742 + }, + { + "epoch": 0.3150881626557923, + "grad_norm": 0.38201257586479187, + "learning_rate": 0.0001, + "loss": 1.669, + "step": 2743 + }, + { + "epoch": 0.31520303256561943, + "grad_norm": 0.383025586605072, + "learning_rate": 0.0001, + "loss": 1.5881, + "step": 2744 + }, + { + "epoch": 0.31531790247544655, + "grad_norm": 0.3883838653564453, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 2745 + }, + { + "epoch": 0.3154327723852737, + "grad_norm": 0.35281357169151306, + "learning_rate": 0.0001, + "loss": 1.4192, + "step": 2746 + }, + { + "epoch": 0.3155476422951008, + "grad_norm": 0.40777984261512756, + "learning_rate": 0.0001, + "loss": 1.6675, + "step": 2747 + }, + { + "epoch": 0.3156625122049279, + "grad_norm": 0.36390420794487, + "learning_rate": 0.0001, + "loss": 1.5117, + "step": 2748 + }, + { + "epoch": 0.31577738211475503, + "grad_norm": 0.38784828782081604, + "learning_rate": 0.0001, + "loss": 1.7435, + "step": 2749 + }, + { + "epoch": 0.31589225202458215, + "grad_norm": 0.4247525632381439, + "learning_rate": 0.0001, + "loss": 1.7565, + "step": 2750 + }, + { + "epoch": 0.3160071219344093, + "grad_norm": 0.38927143812179565, + "learning_rate": 0.0001, + "loss": 1.6956, + "step": 2751 + }, + { + "epoch": 0.3161219918442364, + "grad_norm": 0.34700581431388855, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 2752 + }, + { + "epoch": 0.3162368617540635, + "grad_norm": 0.3716479241847992, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 2753 + }, + { + "epoch": 0.31635173166389063, + "grad_norm": 0.4048490822315216, + "learning_rate": 0.0001, + "loss": 1.7723, + "step": 2754 + }, + { + "epoch": 0.31646660157371775, + "grad_norm": 0.36082956194877625, + "learning_rate": 0.0001, + "loss": 1.6133, + "step": 2755 + }, + { + "epoch": 0.3165814714835449, + "grad_norm": 0.40957361459732056, + "learning_rate": 0.0001, + "loss": 1.845, + "step": 2756 + }, + { + "epoch": 0.316696341393372, + "grad_norm": 0.3455057442188263, + "learning_rate": 0.0001, + "loss": 1.5038, + "step": 2757 + }, + { + "epoch": 0.3168112113031991, + "grad_norm": 0.3513345718383789, + "learning_rate": 0.0001, + "loss": 1.4897, + "step": 2758 + }, + { + "epoch": 0.31692608121302623, + "grad_norm": 0.3828970491886139, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 2759 + }, + { + "epoch": 0.31704095112285335, + "grad_norm": 0.370225191116333, + "learning_rate": 0.0001, + "loss": 1.6051, + "step": 2760 + }, + { + "epoch": 0.3171558210326805, + "grad_norm": 0.4304163157939911, + "learning_rate": 0.0001, + "loss": 1.9163, + "step": 2761 + }, + { + "epoch": 0.3172706909425076, + "grad_norm": 0.3837917745113373, + "learning_rate": 0.0001, + "loss": 1.5688, + "step": 2762 + }, + { + "epoch": 0.3173855608523347, + "grad_norm": 0.3771938383579254, + "learning_rate": 0.0001, + "loss": 1.7106, + "step": 2763 + }, + { + "epoch": 0.31750043076216183, + "grad_norm": 0.3861342668533325, + "learning_rate": 0.0001, + "loss": 1.6719, + "step": 2764 + }, + { + "epoch": 0.31761530067198895, + "grad_norm": 0.36387091875076294, + "learning_rate": 0.0001, + "loss": 1.6573, + "step": 2765 + }, + { + "epoch": 0.3177301705818161, + "grad_norm": 0.3894106447696686, + "learning_rate": 0.0001, + "loss": 1.4844, + "step": 2766 + }, + { + "epoch": 0.3178450404916432, + "grad_norm": 0.37301284074783325, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 2767 + }, + { + "epoch": 0.3179599104014703, + "grad_norm": 0.38504043221473694, + "learning_rate": 0.0001, + "loss": 1.6545, + "step": 2768 + }, + { + "epoch": 0.31807478031129743, + "grad_norm": 0.37516283988952637, + "learning_rate": 0.0001, + "loss": 1.6915, + "step": 2769 + }, + { + "epoch": 0.31818965022112455, + "grad_norm": 0.3629774749279022, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 2770 + }, + { + "epoch": 0.3183045201309517, + "grad_norm": 0.3853417634963989, + "learning_rate": 0.0001, + "loss": 1.7261, + "step": 2771 + }, + { + "epoch": 0.3184193900407788, + "grad_norm": 0.36345425248146057, + "learning_rate": 0.0001, + "loss": 1.4298, + "step": 2772 + }, + { + "epoch": 0.3185342599506059, + "grad_norm": 0.3774106204509735, + "learning_rate": 0.0001, + "loss": 1.6075, + "step": 2773 + }, + { + "epoch": 0.31864912986043303, + "grad_norm": 0.40592819452285767, + "learning_rate": 0.0001, + "loss": 1.8993, + "step": 2774 + }, + { + "epoch": 0.31876399977026015, + "grad_norm": 0.38778918981552124, + "learning_rate": 0.0001, + "loss": 1.5641, + "step": 2775 + }, + { + "epoch": 0.3188788696800873, + "grad_norm": 0.39623749256134033, + "learning_rate": 0.0001, + "loss": 1.7249, + "step": 2776 + }, + { + "epoch": 0.31899373958991445, + "grad_norm": 0.3900299072265625, + "learning_rate": 0.0001, + "loss": 1.7291, + "step": 2777 + }, + { + "epoch": 0.31910860949974157, + "grad_norm": 0.3717004358768463, + "learning_rate": 0.0001, + "loss": 1.5562, + "step": 2778 + }, + { + "epoch": 0.3192234794095687, + "grad_norm": 0.38834843039512634, + "learning_rate": 0.0001, + "loss": 1.7519, + "step": 2779 + }, + { + "epoch": 0.3193383493193958, + "grad_norm": 0.3893420994281769, + "learning_rate": 0.0001, + "loss": 1.7821, + "step": 2780 + }, + { + "epoch": 0.31945321922922293, + "grad_norm": 0.4312572479248047, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 2781 + }, + { + "epoch": 0.31956808913905005, + "grad_norm": 0.3759611248970032, + "learning_rate": 0.0001, + "loss": 1.5577, + "step": 2782 + }, + { + "epoch": 0.31968295904887717, + "grad_norm": 0.37230929732322693, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 2783 + }, + { + "epoch": 0.3197978289587043, + "grad_norm": 0.3799343407154083, + "learning_rate": 0.0001, + "loss": 1.5809, + "step": 2784 + }, + { + "epoch": 0.3199126988685314, + "grad_norm": 0.38527607917785645, + "learning_rate": 0.0001, + "loss": 1.7334, + "step": 2785 + }, + { + "epoch": 0.32002756877835853, + "grad_norm": 0.3624141812324524, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 2786 + }, + { + "epoch": 0.32014243868818565, + "grad_norm": 0.36637428402900696, + "learning_rate": 0.0001, + "loss": 1.5811, + "step": 2787 + }, + { + "epoch": 0.32025730859801277, + "grad_norm": 0.4166329503059387, + "learning_rate": 0.0001, + "loss": 1.8372, + "step": 2788 + }, + { + "epoch": 0.3203721785078399, + "grad_norm": 0.38629505038261414, + "learning_rate": 0.0001, + "loss": 1.726, + "step": 2789 + }, + { + "epoch": 0.320487048417667, + "grad_norm": 0.3948490023612976, + "learning_rate": 0.0001, + "loss": 1.7453, + "step": 2790 + }, + { + "epoch": 0.32060191832749413, + "grad_norm": 0.3907056450843811, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 2791 + }, + { + "epoch": 0.32071678823732125, + "grad_norm": 0.36693593859672546, + "learning_rate": 0.0001, + "loss": 1.5415, + "step": 2792 + }, + { + "epoch": 0.32083165814714837, + "grad_norm": 0.36896297335624695, + "learning_rate": 0.0001, + "loss": 1.4488, + "step": 2793 + }, + { + "epoch": 0.3209465280569755, + "grad_norm": 0.38584834337234497, + "learning_rate": 0.0001, + "loss": 1.8108, + "step": 2794 + }, + { + "epoch": 0.3210613979668026, + "grad_norm": 0.3919477164745331, + "learning_rate": 0.0001, + "loss": 1.4851, + "step": 2795 + }, + { + "epoch": 0.32117626787662973, + "grad_norm": 0.41220781207084656, + "learning_rate": 0.0001, + "loss": 1.6354, + "step": 2796 + }, + { + "epoch": 0.32129113778645685, + "grad_norm": 0.3902750015258789, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 2797 + }, + { + "epoch": 0.32140600769628397, + "grad_norm": 0.3956315219402313, + "learning_rate": 0.0001, + "loss": 1.7504, + "step": 2798 + }, + { + "epoch": 0.3215208776061111, + "grad_norm": 0.35562974214553833, + "learning_rate": 0.0001, + "loss": 1.6145, + "step": 2799 + }, + { + "epoch": 0.3216357475159382, + "grad_norm": 0.39182206988334656, + "learning_rate": 0.0001, + "loss": 1.5574, + "step": 2800 + }, + { + "epoch": 0.32175061742576533, + "grad_norm": 0.41521987318992615, + "learning_rate": 0.0001, + "loss": 1.7274, + "step": 2801 + }, + { + "epoch": 0.32186548733559245, + "grad_norm": 0.38131776452064514, + "learning_rate": 0.0001, + "loss": 1.5591, + "step": 2802 + }, + { + "epoch": 0.32198035724541957, + "grad_norm": 0.4228755533695221, + "learning_rate": 0.0001, + "loss": 1.8683, + "step": 2803 + }, + { + "epoch": 0.3220952271552467, + "grad_norm": 0.3989662230014801, + "learning_rate": 0.0001, + "loss": 1.8067, + "step": 2804 + }, + { + "epoch": 0.3222100970650738, + "grad_norm": 0.3961024284362793, + "learning_rate": 0.0001, + "loss": 1.8281, + "step": 2805 + }, + { + "epoch": 0.32232496697490093, + "grad_norm": 0.4035508930683136, + "learning_rate": 0.0001, + "loss": 1.7393, + "step": 2806 + }, + { + "epoch": 0.32243983688472805, + "grad_norm": 0.38078513741493225, + "learning_rate": 0.0001, + "loss": 1.7108, + "step": 2807 + }, + { + "epoch": 0.32255470679455517, + "grad_norm": 0.367631196975708, + "learning_rate": 0.0001, + "loss": 1.7122, + "step": 2808 + }, + { + "epoch": 0.3226695767043823, + "grad_norm": 0.3607901632785797, + "learning_rate": 0.0001, + "loss": 1.5505, + "step": 2809 + }, + { + "epoch": 0.3227844466142094, + "grad_norm": 0.3930343687534332, + "learning_rate": 0.0001, + "loss": 1.4403, + "step": 2810 + }, + { + "epoch": 0.32289931652403653, + "grad_norm": 0.37640708684921265, + "learning_rate": 0.0001, + "loss": 1.798, + "step": 2811 + }, + { + "epoch": 0.32301418643386365, + "grad_norm": 0.36390334367752075, + "learning_rate": 0.0001, + "loss": 1.5824, + "step": 2812 + }, + { + "epoch": 0.32312905634369077, + "grad_norm": 0.3854324519634247, + "learning_rate": 0.0001, + "loss": 1.6426, + "step": 2813 + }, + { + "epoch": 0.3232439262535179, + "grad_norm": 0.37264391779899597, + "learning_rate": 0.0001, + "loss": 1.7246, + "step": 2814 + }, + { + "epoch": 0.323358796163345, + "grad_norm": 0.40931811928749084, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 2815 + }, + { + "epoch": 0.32347366607317213, + "grad_norm": 0.36656174063682556, + "learning_rate": 0.0001, + "loss": 1.618, + "step": 2816 + }, + { + "epoch": 0.32358853598299925, + "grad_norm": 0.3858596682548523, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 2817 + }, + { + "epoch": 0.32370340589282637, + "grad_norm": 0.3784213066101074, + "learning_rate": 0.0001, + "loss": 1.8733, + "step": 2818 + }, + { + "epoch": 0.3238182758026535, + "grad_norm": 0.3836335241794586, + "learning_rate": 0.0001, + "loss": 1.6904, + "step": 2819 + }, + { + "epoch": 0.3239331457124806, + "grad_norm": 0.3633041977882385, + "learning_rate": 0.0001, + "loss": 1.7444, + "step": 2820 + }, + { + "epoch": 0.32404801562230773, + "grad_norm": 0.3903411626815796, + "learning_rate": 0.0001, + "loss": 1.7132, + "step": 2821 + }, + { + "epoch": 0.32416288553213485, + "grad_norm": 0.4233011305332184, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 2822 + }, + { + "epoch": 0.32427775544196197, + "grad_norm": 0.4304129481315613, + "learning_rate": 0.0001, + "loss": 1.859, + "step": 2823 + }, + { + "epoch": 0.3243926253517891, + "grad_norm": 0.37114959955215454, + "learning_rate": 0.0001, + "loss": 1.5548, + "step": 2824 + }, + { + "epoch": 0.3245074952616162, + "grad_norm": 0.3852083683013916, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 2825 + }, + { + "epoch": 0.32462236517144333, + "grad_norm": 0.3652872145175934, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 2826 + }, + { + "epoch": 0.32473723508127045, + "grad_norm": 0.3811475932598114, + "learning_rate": 0.0001, + "loss": 1.6123, + "step": 2827 + }, + { + "epoch": 0.32485210499109757, + "grad_norm": 0.3886179029941559, + "learning_rate": 0.0001, + "loss": 1.7562, + "step": 2828 + }, + { + "epoch": 0.3249669749009247, + "grad_norm": 0.39535319805145264, + "learning_rate": 0.0001, + "loss": 1.5934, + "step": 2829 + }, + { + "epoch": 0.3250818448107518, + "grad_norm": 0.40873974561691284, + "learning_rate": 0.0001, + "loss": 1.8607, + "step": 2830 + }, + { + "epoch": 0.3251967147205789, + "grad_norm": 0.38622164726257324, + "learning_rate": 0.0001, + "loss": 1.6962, + "step": 2831 + }, + { + "epoch": 0.32531158463040605, + "grad_norm": 0.42052188515663147, + "learning_rate": 0.0001, + "loss": 1.7905, + "step": 2832 + }, + { + "epoch": 0.32542645454023317, + "grad_norm": 0.3537195026874542, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 2833 + }, + { + "epoch": 0.3255413244500603, + "grad_norm": 0.3691607415676117, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 2834 + }, + { + "epoch": 0.3256561943598874, + "grad_norm": 0.40789857506752014, + "learning_rate": 0.0001, + "loss": 1.9685, + "step": 2835 + }, + { + "epoch": 0.3257710642697145, + "grad_norm": 0.3981241285800934, + "learning_rate": 0.0001, + "loss": 1.493, + "step": 2836 + }, + { + "epoch": 0.32588593417954165, + "grad_norm": 0.36344999074935913, + "learning_rate": 0.0001, + "loss": 1.6628, + "step": 2837 + }, + { + "epoch": 0.32600080408936877, + "grad_norm": 0.3493889272212982, + "learning_rate": 0.0001, + "loss": 1.4989, + "step": 2838 + }, + { + "epoch": 0.3261156739991959, + "grad_norm": 0.39113399386405945, + "learning_rate": 0.0001, + "loss": 1.7783, + "step": 2839 + }, + { + "epoch": 0.326230543909023, + "grad_norm": 0.3922522962093353, + "learning_rate": 0.0001, + "loss": 1.8986, + "step": 2840 + }, + { + "epoch": 0.3263454138188501, + "grad_norm": 0.35591554641723633, + "learning_rate": 0.0001, + "loss": 1.6045, + "step": 2841 + }, + { + "epoch": 0.32646028372867725, + "grad_norm": 0.38669490814208984, + "learning_rate": 0.0001, + "loss": 1.8257, + "step": 2842 + }, + { + "epoch": 0.32657515363850437, + "grad_norm": 0.37732958793640137, + "learning_rate": 0.0001, + "loss": 1.7277, + "step": 2843 + }, + { + "epoch": 0.3266900235483315, + "grad_norm": 0.3958292603492737, + "learning_rate": 0.0001, + "loss": 1.8254, + "step": 2844 + }, + { + "epoch": 0.32680489345815866, + "grad_norm": 0.368966668844223, + "learning_rate": 0.0001, + "loss": 1.728, + "step": 2845 + }, + { + "epoch": 0.3269197633679858, + "grad_norm": 0.3690721094608307, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 2846 + }, + { + "epoch": 0.3270346332778129, + "grad_norm": 0.37405163049697876, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 2847 + }, + { + "epoch": 0.32714950318764, + "grad_norm": 0.43797191977500916, + "learning_rate": 0.0001, + "loss": 1.9247, + "step": 2848 + }, + { + "epoch": 0.32726437309746714, + "grad_norm": 0.38513773679733276, + "learning_rate": 0.0001, + "loss": 1.7059, + "step": 2849 + }, + { + "epoch": 0.32737924300729426, + "grad_norm": 0.36473625898361206, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 2850 + }, + { + "epoch": 0.3274941129171214, + "grad_norm": 0.38011401891708374, + "learning_rate": 0.0001, + "loss": 1.6667, + "step": 2851 + }, + { + "epoch": 0.3276089828269485, + "grad_norm": 0.3875674605369568, + "learning_rate": 0.0001, + "loss": 1.8539, + "step": 2852 + }, + { + "epoch": 0.3277238527367756, + "grad_norm": 0.4060609042644501, + "learning_rate": 0.0001, + "loss": 1.6604, + "step": 2853 + }, + { + "epoch": 0.32783872264660274, + "grad_norm": 0.38478556275367737, + "learning_rate": 0.0001, + "loss": 1.8449, + "step": 2854 + }, + { + "epoch": 0.32795359255642986, + "grad_norm": 0.414813756942749, + "learning_rate": 0.0001, + "loss": 1.7037, + "step": 2855 + }, + { + "epoch": 0.328068462466257, + "grad_norm": 0.38957443833351135, + "learning_rate": 0.0001, + "loss": 1.8987, + "step": 2856 + }, + { + "epoch": 0.3281833323760841, + "grad_norm": 0.3751903772354126, + "learning_rate": 0.0001, + "loss": 1.7977, + "step": 2857 + }, + { + "epoch": 0.3282982022859112, + "grad_norm": 0.3669251799583435, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 2858 + }, + { + "epoch": 0.32841307219573834, + "grad_norm": 0.36348956823349, + "learning_rate": 0.0001, + "loss": 1.7146, + "step": 2859 + }, + { + "epoch": 0.32852794210556546, + "grad_norm": 0.35992637276649475, + "learning_rate": 0.0001, + "loss": 1.4548, + "step": 2860 + }, + { + "epoch": 0.3286428120153926, + "grad_norm": 0.3643839955329895, + "learning_rate": 0.0001, + "loss": 1.7187, + "step": 2861 + }, + { + "epoch": 0.3287576819252197, + "grad_norm": 0.40732714533805847, + "learning_rate": 0.0001, + "loss": 1.8751, + "step": 2862 + }, + { + "epoch": 0.3288725518350468, + "grad_norm": 0.38161808252334595, + "learning_rate": 0.0001, + "loss": 1.7558, + "step": 2863 + }, + { + "epoch": 0.32898742174487394, + "grad_norm": 0.40559515357017517, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 2864 + }, + { + "epoch": 0.32910229165470106, + "grad_norm": 0.3841257393360138, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 2865 + }, + { + "epoch": 0.3292171615645282, + "grad_norm": 0.36284148693084717, + "learning_rate": 0.0001, + "loss": 1.4342, + "step": 2866 + }, + { + "epoch": 0.3293320314743553, + "grad_norm": 0.3603561818599701, + "learning_rate": 0.0001, + "loss": 1.6746, + "step": 2867 + }, + { + "epoch": 0.3294469013841824, + "grad_norm": 0.39351412653923035, + "learning_rate": 0.0001, + "loss": 1.7173, + "step": 2868 + }, + { + "epoch": 0.32956177129400954, + "grad_norm": 0.408401757478714, + "learning_rate": 0.0001, + "loss": 1.7423, + "step": 2869 + }, + { + "epoch": 0.32967664120383666, + "grad_norm": 0.37314414978027344, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 2870 + }, + { + "epoch": 0.3297915111136638, + "grad_norm": 0.369933545589447, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 2871 + }, + { + "epoch": 0.3299063810234909, + "grad_norm": 0.4098500907421112, + "learning_rate": 0.0001, + "loss": 1.8109, + "step": 2872 + }, + { + "epoch": 0.330021250933318, + "grad_norm": 0.40561625361442566, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 2873 + }, + { + "epoch": 0.33013612084314514, + "grad_norm": 0.3818763792514801, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 2874 + }, + { + "epoch": 0.33025099075297226, + "grad_norm": 0.4518624544143677, + "learning_rate": 0.0001, + "loss": 1.6921, + "step": 2875 + }, + { + "epoch": 0.3303658606627994, + "grad_norm": 0.3883214294910431, + "learning_rate": 0.0001, + "loss": 1.7215, + "step": 2876 + }, + { + "epoch": 0.3304807305726265, + "grad_norm": 0.4087526500225067, + "learning_rate": 0.0001, + "loss": 1.919, + "step": 2877 + }, + { + "epoch": 0.3305956004824536, + "grad_norm": 0.37639445066452026, + "learning_rate": 0.0001, + "loss": 1.7867, + "step": 2878 + }, + { + "epoch": 0.33071047039228074, + "grad_norm": 0.42621272802352905, + "learning_rate": 0.0001, + "loss": 1.5689, + "step": 2879 + }, + { + "epoch": 0.33082534030210786, + "grad_norm": 0.36311256885528564, + "learning_rate": 0.0001, + "loss": 1.7286, + "step": 2880 + }, + { + "epoch": 0.330940210211935, + "grad_norm": 0.35729482769966125, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 2881 + }, + { + "epoch": 0.3310550801217621, + "grad_norm": 0.3519277274608612, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 2882 + }, + { + "epoch": 0.3311699500315892, + "grad_norm": 0.40030479431152344, + "learning_rate": 0.0001, + "loss": 1.8654, + "step": 2883 + }, + { + "epoch": 0.33128481994141634, + "grad_norm": 0.3615962862968445, + "learning_rate": 0.0001, + "loss": 1.5445, + "step": 2884 + }, + { + "epoch": 0.33139968985124346, + "grad_norm": 0.38331496715545654, + "learning_rate": 0.0001, + "loss": 1.7308, + "step": 2885 + }, + { + "epoch": 0.3315145597610706, + "grad_norm": 0.3735135793685913, + "learning_rate": 0.0001, + "loss": 1.4669, + "step": 2886 + }, + { + "epoch": 0.3316294296708977, + "grad_norm": 0.39380258321762085, + "learning_rate": 0.0001, + "loss": 1.5611, + "step": 2887 + }, + { + "epoch": 0.3317442995807248, + "grad_norm": 0.4285888373851776, + "learning_rate": 0.0001, + "loss": 1.5816, + "step": 2888 + }, + { + "epoch": 0.33185916949055194, + "grad_norm": 0.36404624581336975, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 2889 + }, + { + "epoch": 0.33197403940037906, + "grad_norm": 0.42275553941726685, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 2890 + }, + { + "epoch": 0.3320889093102062, + "grad_norm": 0.37147074937820435, + "learning_rate": 0.0001, + "loss": 1.5029, + "step": 2891 + }, + { + "epoch": 0.3322037792200333, + "grad_norm": 0.3962612450122833, + "learning_rate": 0.0001, + "loss": 1.8306, + "step": 2892 + }, + { + "epoch": 0.3323186491298604, + "grad_norm": 0.36973974108695984, + "learning_rate": 0.0001, + "loss": 1.6596, + "step": 2893 + }, + { + "epoch": 0.33243351903968754, + "grad_norm": 0.3744032680988312, + "learning_rate": 0.0001, + "loss": 1.6306, + "step": 2894 + }, + { + "epoch": 0.33254838894951466, + "grad_norm": 0.3863425552845001, + "learning_rate": 0.0001, + "loss": 1.5361, + "step": 2895 + }, + { + "epoch": 0.3326632588593418, + "grad_norm": 0.3814421594142914, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 2896 + }, + { + "epoch": 0.3327781287691689, + "grad_norm": 0.3841194808483124, + "learning_rate": 0.0001, + "loss": 1.7178, + "step": 2897 + }, + { + "epoch": 0.332892998678996, + "grad_norm": 0.3847556412220001, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 2898 + }, + { + "epoch": 0.33300786858882314, + "grad_norm": 0.38841572403907776, + "learning_rate": 0.0001, + "loss": 1.8005, + "step": 2899 + }, + { + "epoch": 0.33312273849865026, + "grad_norm": 0.41473743319511414, + "learning_rate": 0.0001, + "loss": 1.6702, + "step": 2900 + }, + { + "epoch": 0.3332376084084774, + "grad_norm": 0.37773972749710083, + "learning_rate": 0.0001, + "loss": 1.5637, + "step": 2901 + }, + { + "epoch": 0.3333524783183045, + "grad_norm": 0.38093826174736023, + "learning_rate": 0.0001, + "loss": 1.7681, + "step": 2902 + }, + { + "epoch": 0.3334673482281316, + "grad_norm": 0.39234668016433716, + "learning_rate": 0.0001, + "loss": 1.7385, + "step": 2903 + }, + { + "epoch": 0.33358221813795874, + "grad_norm": 0.4004804790019989, + "learning_rate": 0.0001, + "loss": 1.7519, + "step": 2904 + }, + { + "epoch": 0.33369708804778586, + "grad_norm": 0.35611221194267273, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 2905 + }, + { + "epoch": 0.333811957957613, + "grad_norm": 0.383696585893631, + "learning_rate": 0.0001, + "loss": 1.7001, + "step": 2906 + }, + { + "epoch": 0.3339268278674401, + "grad_norm": 0.3913770020008087, + "learning_rate": 0.0001, + "loss": 1.7143, + "step": 2907 + }, + { + "epoch": 0.3340416977772672, + "grad_norm": 0.38301557302474976, + "learning_rate": 0.0001, + "loss": 1.649, + "step": 2908 + }, + { + "epoch": 0.33415656768709434, + "grad_norm": 0.3856869041919708, + "learning_rate": 0.0001, + "loss": 1.7601, + "step": 2909 + }, + { + "epoch": 0.33427143759692146, + "grad_norm": 0.3839951753616333, + "learning_rate": 0.0001, + "loss": 1.8057, + "step": 2910 + }, + { + "epoch": 0.3343863075067486, + "grad_norm": 0.38269492983818054, + "learning_rate": 0.0001, + "loss": 1.5389, + "step": 2911 + }, + { + "epoch": 0.3345011774165757, + "grad_norm": 0.3791959583759308, + "learning_rate": 0.0001, + "loss": 1.7275, + "step": 2912 + }, + { + "epoch": 0.3346160473264028, + "grad_norm": 0.36112257838249207, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 2913 + }, + { + "epoch": 0.33473091723623, + "grad_norm": 0.3655812740325928, + "learning_rate": 0.0001, + "loss": 1.4784, + "step": 2914 + }, + { + "epoch": 0.3348457871460571, + "grad_norm": 0.3751130998134613, + "learning_rate": 0.0001, + "loss": 1.7072, + "step": 2915 + }, + { + "epoch": 0.33496065705588424, + "grad_norm": 0.3733077049255371, + "learning_rate": 0.0001, + "loss": 1.8166, + "step": 2916 + }, + { + "epoch": 0.33507552696571136, + "grad_norm": 0.3870159983634949, + "learning_rate": 0.0001, + "loss": 1.6662, + "step": 2917 + }, + { + "epoch": 0.3351903968755385, + "grad_norm": 0.3635254204273224, + "learning_rate": 0.0001, + "loss": 1.4773, + "step": 2918 + }, + { + "epoch": 0.3353052667853656, + "grad_norm": 0.3966655135154724, + "learning_rate": 0.0001, + "loss": 1.4219, + "step": 2919 + }, + { + "epoch": 0.3354201366951927, + "grad_norm": 0.3748622536659241, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 2920 + }, + { + "epoch": 0.33553500660501984, + "grad_norm": 0.38931792974472046, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 2921 + }, + { + "epoch": 0.33564987651484696, + "grad_norm": 0.4402804970741272, + "learning_rate": 0.0001, + "loss": 1.5002, + "step": 2922 + }, + { + "epoch": 0.3357647464246741, + "grad_norm": 0.3441646099090576, + "learning_rate": 0.0001, + "loss": 1.5717, + "step": 2923 + }, + { + "epoch": 0.3358796163345012, + "grad_norm": 0.39570891857147217, + "learning_rate": 0.0001, + "loss": 1.714, + "step": 2924 + }, + { + "epoch": 0.3359944862443283, + "grad_norm": 0.39201679825782776, + "learning_rate": 0.0001, + "loss": 1.7896, + "step": 2925 + }, + { + "epoch": 0.33610935615415544, + "grad_norm": 0.3926868140697479, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 2926 + }, + { + "epoch": 0.33622422606398256, + "grad_norm": 0.3830588757991791, + "learning_rate": 0.0001, + "loss": 1.6365, + "step": 2927 + }, + { + "epoch": 0.3363390959738097, + "grad_norm": 0.3714669346809387, + "learning_rate": 0.0001, + "loss": 1.6564, + "step": 2928 + }, + { + "epoch": 0.3364539658836368, + "grad_norm": 0.4032626748085022, + "learning_rate": 0.0001, + "loss": 1.7987, + "step": 2929 + }, + { + "epoch": 0.3365688357934639, + "grad_norm": 0.40316057205200195, + "learning_rate": 0.0001, + "loss": 1.6721, + "step": 2930 + }, + { + "epoch": 0.33668370570329104, + "grad_norm": 0.4364853799343109, + "learning_rate": 0.0001, + "loss": 1.5832, + "step": 2931 + }, + { + "epoch": 0.33679857561311816, + "grad_norm": 0.3844012916088104, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 2932 + }, + { + "epoch": 0.3369134455229453, + "grad_norm": 0.3774738311767578, + "learning_rate": 0.0001, + "loss": 1.7877, + "step": 2933 + }, + { + "epoch": 0.3370283154327724, + "grad_norm": 0.4184546172618866, + "learning_rate": 0.0001, + "loss": 1.8513, + "step": 2934 + }, + { + "epoch": 0.3371431853425995, + "grad_norm": 0.3983631432056427, + "learning_rate": 0.0001, + "loss": 1.7087, + "step": 2935 + }, + { + "epoch": 0.33725805525242664, + "grad_norm": 0.356240451335907, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 2936 + }, + { + "epoch": 0.33737292516225376, + "grad_norm": 0.3877936601638794, + "learning_rate": 0.0001, + "loss": 1.6252, + "step": 2937 + }, + { + "epoch": 0.3374877950720809, + "grad_norm": 0.3945756256580353, + "learning_rate": 0.0001, + "loss": 1.7137, + "step": 2938 + }, + { + "epoch": 0.337602664981908, + "grad_norm": 0.3544231355190277, + "learning_rate": 0.0001, + "loss": 1.3307, + "step": 2939 + }, + { + "epoch": 0.3377175348917351, + "grad_norm": 0.3833335041999817, + "learning_rate": 0.0001, + "loss": 1.7065, + "step": 2940 + }, + { + "epoch": 0.33783240480156224, + "grad_norm": 0.3731600046157837, + "learning_rate": 0.0001, + "loss": 1.7736, + "step": 2941 + }, + { + "epoch": 0.33794727471138936, + "grad_norm": 0.4063700735569, + "learning_rate": 0.0001, + "loss": 1.6827, + "step": 2942 + }, + { + "epoch": 0.3380621446212165, + "grad_norm": 0.4960021674633026, + "learning_rate": 0.0001, + "loss": 1.9989, + "step": 2943 + }, + { + "epoch": 0.3381770145310436, + "grad_norm": 0.4238811433315277, + "learning_rate": 0.0001, + "loss": 1.7855, + "step": 2944 + }, + { + "epoch": 0.3382918844408707, + "grad_norm": 0.4114185571670532, + "learning_rate": 0.0001, + "loss": 1.8296, + "step": 2945 + }, + { + "epoch": 0.33840675435069784, + "grad_norm": 0.40994930267333984, + "learning_rate": 0.0001, + "loss": 1.8185, + "step": 2946 + }, + { + "epoch": 0.33852162426052496, + "grad_norm": 0.39755189418792725, + "learning_rate": 0.0001, + "loss": 1.7911, + "step": 2947 + }, + { + "epoch": 0.3386364941703521, + "grad_norm": 0.39836958050727844, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 2948 + }, + { + "epoch": 0.3387513640801792, + "grad_norm": 0.3699915409088135, + "learning_rate": 0.0001, + "loss": 1.603, + "step": 2949 + }, + { + "epoch": 0.3388662339900063, + "grad_norm": 0.42995521426200867, + "learning_rate": 0.0001, + "loss": 1.852, + "step": 2950 + }, + { + "epoch": 0.33898110389983344, + "grad_norm": 0.398151695728302, + "learning_rate": 0.0001, + "loss": 1.8414, + "step": 2951 + }, + { + "epoch": 0.33909597380966056, + "grad_norm": 0.36557191610336304, + "learning_rate": 0.0001, + "loss": 1.5987, + "step": 2952 + }, + { + "epoch": 0.3392108437194877, + "grad_norm": 0.3784855008125305, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 2953 + }, + { + "epoch": 0.3393257136293148, + "grad_norm": 0.40273427963256836, + "learning_rate": 0.0001, + "loss": 1.7353, + "step": 2954 + }, + { + "epoch": 0.3394405835391419, + "grad_norm": 0.37005481123924255, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 2955 + }, + { + "epoch": 0.33955545344896904, + "grad_norm": 0.3850763142108917, + "learning_rate": 0.0001, + "loss": 1.7957, + "step": 2956 + }, + { + "epoch": 0.33967032335879616, + "grad_norm": 0.3557315170764923, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 2957 + }, + { + "epoch": 0.3397851932686233, + "grad_norm": 0.44396260380744934, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 2958 + }, + { + "epoch": 0.3399000631784504, + "grad_norm": 0.3347325325012207, + "learning_rate": 0.0001, + "loss": 1.3493, + "step": 2959 + }, + { + "epoch": 0.3400149330882775, + "grad_norm": 0.3799315094947815, + "learning_rate": 0.0001, + "loss": 1.698, + "step": 2960 + }, + { + "epoch": 0.34012980299810464, + "grad_norm": 0.3979965150356293, + "learning_rate": 0.0001, + "loss": 1.6106, + "step": 2961 + }, + { + "epoch": 0.34024467290793176, + "grad_norm": 0.3687105178833008, + "learning_rate": 0.0001, + "loss": 1.6298, + "step": 2962 + }, + { + "epoch": 0.3403595428177589, + "grad_norm": 0.3896116018295288, + "learning_rate": 0.0001, + "loss": 1.6547, + "step": 2963 + }, + { + "epoch": 0.340474412727586, + "grad_norm": 0.38803455233573914, + "learning_rate": 0.0001, + "loss": 1.7099, + "step": 2964 + }, + { + "epoch": 0.3405892826374131, + "grad_norm": 0.37791207432746887, + "learning_rate": 0.0001, + "loss": 1.4235, + "step": 2965 + }, + { + "epoch": 0.34070415254724024, + "grad_norm": 0.3701097071170807, + "learning_rate": 0.0001, + "loss": 1.7074, + "step": 2966 + }, + { + "epoch": 0.34081902245706736, + "grad_norm": 0.3952276408672333, + "learning_rate": 0.0001, + "loss": 1.7784, + "step": 2967 + }, + { + "epoch": 0.3409338923668945, + "grad_norm": 0.6198942065238953, + "learning_rate": 0.0001, + "loss": 1.4752, + "step": 2968 + }, + { + "epoch": 0.3410487622767216, + "grad_norm": 0.39042800664901733, + "learning_rate": 0.0001, + "loss": 1.715, + "step": 2969 + }, + { + "epoch": 0.3411636321865487, + "grad_norm": 0.39784181118011475, + "learning_rate": 0.0001, + "loss": 1.7825, + "step": 2970 + }, + { + "epoch": 0.34127850209637584, + "grad_norm": 0.42527255415916443, + "learning_rate": 0.0001, + "loss": 1.6846, + "step": 2971 + }, + { + "epoch": 0.34139337200620296, + "grad_norm": 0.35908040404319763, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 2972 + }, + { + "epoch": 0.3415082419160301, + "grad_norm": 0.3641200661659241, + "learning_rate": 0.0001, + "loss": 1.6271, + "step": 2973 + }, + { + "epoch": 0.3416231118258572, + "grad_norm": 0.38720008730888367, + "learning_rate": 0.0001, + "loss": 1.6965, + "step": 2974 + }, + { + "epoch": 0.3417379817356843, + "grad_norm": 0.40444648265838623, + "learning_rate": 0.0001, + "loss": 1.9605, + "step": 2975 + }, + { + "epoch": 0.34185285164551144, + "grad_norm": 0.3607354462146759, + "learning_rate": 0.0001, + "loss": 1.537, + "step": 2976 + }, + { + "epoch": 0.34196772155533856, + "grad_norm": 0.3962652087211609, + "learning_rate": 0.0001, + "loss": 1.85, + "step": 2977 + }, + { + "epoch": 0.3420825914651657, + "grad_norm": 0.36956319212913513, + "learning_rate": 0.0001, + "loss": 1.6285, + "step": 2978 + }, + { + "epoch": 0.3421974613749928, + "grad_norm": 0.39066120982170105, + "learning_rate": 0.0001, + "loss": 1.7077, + "step": 2979 + }, + { + "epoch": 0.3423123312848199, + "grad_norm": 0.3526730537414551, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 2980 + }, + { + "epoch": 0.34242720119464704, + "grad_norm": 0.39420273900032043, + "learning_rate": 0.0001, + "loss": 1.7418, + "step": 2981 + }, + { + "epoch": 0.3425420711044742, + "grad_norm": 0.3418049216270447, + "learning_rate": 0.0001, + "loss": 1.3557, + "step": 2982 + }, + { + "epoch": 0.34265694101430133, + "grad_norm": 0.4102267920970917, + "learning_rate": 0.0001, + "loss": 1.8232, + "step": 2983 + }, + { + "epoch": 0.34277181092412845, + "grad_norm": 0.3934805691242218, + "learning_rate": 0.0001, + "loss": 1.7044, + "step": 2984 + }, + { + "epoch": 0.34288668083395557, + "grad_norm": 0.3770129978656769, + "learning_rate": 0.0001, + "loss": 1.4916, + "step": 2985 + }, + { + "epoch": 0.3430015507437827, + "grad_norm": 0.39099806547164917, + "learning_rate": 0.0001, + "loss": 1.6679, + "step": 2986 + }, + { + "epoch": 0.3431164206536098, + "grad_norm": 0.36751532554626465, + "learning_rate": 0.0001, + "loss": 1.7394, + "step": 2987 + }, + { + "epoch": 0.34323129056343693, + "grad_norm": 0.3834240734577179, + "learning_rate": 0.0001, + "loss": 1.5544, + "step": 2988 + }, + { + "epoch": 0.34334616047326405, + "grad_norm": 0.3922926187515259, + "learning_rate": 0.0001, + "loss": 1.6312, + "step": 2989 + }, + { + "epoch": 0.34346103038309117, + "grad_norm": 0.38694506883621216, + "learning_rate": 0.0001, + "loss": 1.81, + "step": 2990 + }, + { + "epoch": 0.3435759002929183, + "grad_norm": 0.3988105356693268, + "learning_rate": 0.0001, + "loss": 1.6714, + "step": 2991 + }, + { + "epoch": 0.3436907702027454, + "grad_norm": 0.42653176188468933, + "learning_rate": 0.0001, + "loss": 1.8591, + "step": 2992 + }, + { + "epoch": 0.34380564011257253, + "grad_norm": 0.4091017246246338, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 2993 + }, + { + "epoch": 0.34392051002239965, + "grad_norm": 0.4520750641822815, + "learning_rate": 0.0001, + "loss": 1.8694, + "step": 2994 + }, + { + "epoch": 0.34403537993222677, + "grad_norm": 0.38830214738845825, + "learning_rate": 0.0001, + "loss": 1.7041, + "step": 2995 + }, + { + "epoch": 0.3441502498420539, + "grad_norm": 0.40523776412010193, + "learning_rate": 0.0001, + "loss": 1.7478, + "step": 2996 + }, + { + "epoch": 0.344265119751881, + "grad_norm": 0.3667933940887451, + "learning_rate": 0.0001, + "loss": 1.6113, + "step": 2997 + }, + { + "epoch": 0.34437998966170813, + "grad_norm": 0.43613407015800476, + "learning_rate": 0.0001, + "loss": 1.7168, + "step": 2998 + }, + { + "epoch": 0.34449485957153525, + "grad_norm": 0.38262253999710083, + "learning_rate": 0.0001, + "loss": 1.5915, + "step": 2999 + }, + { + "epoch": 0.34460972948136237, + "grad_norm": 0.39579108357429504, + "learning_rate": 0.0001, + "loss": 1.658, + "step": 3000 + }, + { + "epoch": 0.3447245993911895, + "grad_norm": 0.38654452562332153, + "learning_rate": 0.0001, + "loss": 1.3892, + "step": 3001 + }, + { + "epoch": 0.3448394693010166, + "grad_norm": 0.35805824398994446, + "learning_rate": 0.0001, + "loss": 1.4611, + "step": 3002 + }, + { + "epoch": 0.34495433921084373, + "grad_norm": 0.3665701448917389, + "learning_rate": 0.0001, + "loss": 1.5227, + "step": 3003 + }, + { + "epoch": 0.34506920912067085, + "grad_norm": 0.40592774748802185, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 3004 + }, + { + "epoch": 0.34518407903049797, + "grad_norm": 0.3932124376296997, + "learning_rate": 0.0001, + "loss": 1.6414, + "step": 3005 + }, + { + "epoch": 0.3452989489403251, + "grad_norm": 0.42479029297828674, + "learning_rate": 0.0001, + "loss": 1.6137, + "step": 3006 + }, + { + "epoch": 0.3454138188501522, + "grad_norm": 0.43539106845855713, + "learning_rate": 0.0001, + "loss": 1.7239, + "step": 3007 + }, + { + "epoch": 0.34552868875997933, + "grad_norm": 0.40625861287117004, + "learning_rate": 0.0001, + "loss": 1.9526, + "step": 3008 + }, + { + "epoch": 0.34564355866980645, + "grad_norm": 0.3962743282318115, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 3009 + }, + { + "epoch": 0.34575842857963357, + "grad_norm": 0.37623900175094604, + "learning_rate": 0.0001, + "loss": 1.5136, + "step": 3010 + }, + { + "epoch": 0.3458732984894607, + "grad_norm": 0.3827407658100128, + "learning_rate": 0.0001, + "loss": 1.6511, + "step": 3011 + }, + { + "epoch": 0.3459881683992878, + "grad_norm": 0.3819064795970917, + "learning_rate": 0.0001, + "loss": 1.6127, + "step": 3012 + }, + { + "epoch": 0.34610303830911493, + "grad_norm": 0.38156652450561523, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 3013 + }, + { + "epoch": 0.34621790821894205, + "grad_norm": 0.42006298899650574, + "learning_rate": 0.0001, + "loss": 1.8345, + "step": 3014 + }, + { + "epoch": 0.34633277812876917, + "grad_norm": 0.3623389005661011, + "learning_rate": 0.0001, + "loss": 1.6146, + "step": 3015 + }, + { + "epoch": 0.3464476480385963, + "grad_norm": 0.39460188150405884, + "learning_rate": 0.0001, + "loss": 1.8264, + "step": 3016 + }, + { + "epoch": 0.3465625179484234, + "grad_norm": 0.40314847230911255, + "learning_rate": 0.0001, + "loss": 1.646, + "step": 3017 + }, + { + "epoch": 0.34667738785825053, + "grad_norm": 0.37284019589424133, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 3018 + }, + { + "epoch": 0.34679225776807765, + "grad_norm": 0.4431118071079254, + "learning_rate": 0.0001, + "loss": 1.9377, + "step": 3019 + }, + { + "epoch": 0.34690712767790477, + "grad_norm": 0.3849484622478485, + "learning_rate": 0.0001, + "loss": 1.6559, + "step": 3020 + }, + { + "epoch": 0.3470219975877319, + "grad_norm": 0.3909108340740204, + "learning_rate": 0.0001, + "loss": 1.8541, + "step": 3021 + }, + { + "epoch": 0.347136867497559, + "grad_norm": 0.38009119033813477, + "learning_rate": 0.0001, + "loss": 1.7868, + "step": 3022 + }, + { + "epoch": 0.34725173740738613, + "grad_norm": 0.3676866292953491, + "learning_rate": 0.0001, + "loss": 1.5971, + "step": 3023 + }, + { + "epoch": 0.34736660731721325, + "grad_norm": 0.3783824145793915, + "learning_rate": 0.0001, + "loss": 1.6714, + "step": 3024 + }, + { + "epoch": 0.34748147722704037, + "grad_norm": 0.3574279546737671, + "learning_rate": 0.0001, + "loss": 1.4742, + "step": 3025 + }, + { + "epoch": 0.3475963471368675, + "grad_norm": 0.3630661070346832, + "learning_rate": 0.0001, + "loss": 1.3814, + "step": 3026 + }, + { + "epoch": 0.3477112170466946, + "grad_norm": 0.37314149737358093, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 3027 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.38733503222465515, + "learning_rate": 0.0001, + "loss": 1.5787, + "step": 3028 + }, + { + "epoch": 0.34794095686634885, + "grad_norm": 0.42135870456695557, + "learning_rate": 0.0001, + "loss": 1.9024, + "step": 3029 + }, + { + "epoch": 0.34805582677617597, + "grad_norm": 0.40700820088386536, + "learning_rate": 0.0001, + "loss": 1.5771, + "step": 3030 + }, + { + "epoch": 0.3481706966860031, + "grad_norm": 0.389607310295105, + "learning_rate": 0.0001, + "loss": 1.751, + "step": 3031 + }, + { + "epoch": 0.3482855665958302, + "grad_norm": 0.39443865418434143, + "learning_rate": 0.0001, + "loss": 1.4554, + "step": 3032 + }, + { + "epoch": 0.34840043650565733, + "grad_norm": 0.4073096513748169, + "learning_rate": 0.0001, + "loss": 1.7826, + "step": 3033 + }, + { + "epoch": 0.34851530641548445, + "grad_norm": 0.38107818365097046, + "learning_rate": 0.0001, + "loss": 1.7504, + "step": 3034 + }, + { + "epoch": 0.34863017632531157, + "grad_norm": 0.39980536699295044, + "learning_rate": 0.0001, + "loss": 1.7422, + "step": 3035 + }, + { + "epoch": 0.3487450462351387, + "grad_norm": 0.3910979628562927, + "learning_rate": 0.0001, + "loss": 1.7604, + "step": 3036 + }, + { + "epoch": 0.3488599161449658, + "grad_norm": 0.3955102562904358, + "learning_rate": 0.0001, + "loss": 1.5553, + "step": 3037 + }, + { + "epoch": 0.34897478605479293, + "grad_norm": 0.3812708258628845, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 3038 + }, + { + "epoch": 0.34908965596462005, + "grad_norm": 0.402920126914978, + "learning_rate": 0.0001, + "loss": 1.6048, + "step": 3039 + }, + { + "epoch": 0.34920452587444717, + "grad_norm": 0.3769501745700836, + "learning_rate": 0.0001, + "loss": 1.6784, + "step": 3040 + }, + { + "epoch": 0.3493193957842743, + "grad_norm": 0.3954136371612549, + "learning_rate": 0.0001, + "loss": 1.7905, + "step": 3041 + }, + { + "epoch": 0.3494342656941014, + "grad_norm": 0.36639055609703064, + "learning_rate": 0.0001, + "loss": 1.625, + "step": 3042 + }, + { + "epoch": 0.34954913560392853, + "grad_norm": 0.42278578877449036, + "learning_rate": 0.0001, + "loss": 1.9002, + "step": 3043 + }, + { + "epoch": 0.34966400551375565, + "grad_norm": 0.37817153334617615, + "learning_rate": 0.0001, + "loss": 1.4365, + "step": 3044 + }, + { + "epoch": 0.34977887542358277, + "grad_norm": 0.3958953320980072, + "learning_rate": 0.0001, + "loss": 1.8305, + "step": 3045 + }, + { + "epoch": 0.3498937453334099, + "grad_norm": 0.3798516094684601, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 3046 + }, + { + "epoch": 0.350008615243237, + "grad_norm": 0.37097060680389404, + "learning_rate": 0.0001, + "loss": 1.4808, + "step": 3047 + }, + { + "epoch": 0.35012348515306413, + "grad_norm": 0.3970964848995209, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 3048 + }, + { + "epoch": 0.35023835506289125, + "grad_norm": 0.3985350728034973, + "learning_rate": 0.0001, + "loss": 1.9372, + "step": 3049 + }, + { + "epoch": 0.35035322497271837, + "grad_norm": 0.3665081262588501, + "learning_rate": 0.0001, + "loss": 1.4447, + "step": 3050 + }, + { + "epoch": 0.35046809488254554, + "grad_norm": 0.3866146504878998, + "learning_rate": 0.0001, + "loss": 1.6509, + "step": 3051 + }, + { + "epoch": 0.35058296479237266, + "grad_norm": 0.37819328904151917, + "learning_rate": 0.0001, + "loss": 1.6386, + "step": 3052 + }, + { + "epoch": 0.3506978347021998, + "grad_norm": 0.3822932243347168, + "learning_rate": 0.0001, + "loss": 1.6757, + "step": 3053 + }, + { + "epoch": 0.3508127046120269, + "grad_norm": 0.3891104757785797, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 3054 + }, + { + "epoch": 0.350927574521854, + "grad_norm": 0.40191301703453064, + "learning_rate": 0.0001, + "loss": 1.8016, + "step": 3055 + }, + { + "epoch": 0.35104244443168114, + "grad_norm": 0.40188851952552795, + "learning_rate": 0.0001, + "loss": 1.7198, + "step": 3056 + }, + { + "epoch": 0.35115731434150826, + "grad_norm": 0.3913547694683075, + "learning_rate": 0.0001, + "loss": 1.5879, + "step": 3057 + }, + { + "epoch": 0.3512721842513354, + "grad_norm": 0.410559743642807, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 3058 + }, + { + "epoch": 0.3513870541611625, + "grad_norm": 0.3729799687862396, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 3059 + }, + { + "epoch": 0.3515019240709896, + "grad_norm": 0.39681947231292725, + "learning_rate": 0.0001, + "loss": 1.7177, + "step": 3060 + }, + { + "epoch": 0.35161679398081674, + "grad_norm": 0.38647258281707764, + "learning_rate": 0.0001, + "loss": 1.8458, + "step": 3061 + }, + { + "epoch": 0.35173166389064386, + "grad_norm": 0.3754447102546692, + "learning_rate": 0.0001, + "loss": 1.78, + "step": 3062 + }, + { + "epoch": 0.351846533800471, + "grad_norm": 0.35854676365852356, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 3063 + }, + { + "epoch": 0.3519614037102981, + "grad_norm": 0.3889663815498352, + "learning_rate": 0.0001, + "loss": 1.8046, + "step": 3064 + }, + { + "epoch": 0.3520762736201252, + "grad_norm": 0.36157429218292236, + "learning_rate": 0.0001, + "loss": 1.3026, + "step": 3065 + }, + { + "epoch": 0.35219114352995234, + "grad_norm": 0.44542935490608215, + "learning_rate": 0.0001, + "loss": 1.6889, + "step": 3066 + }, + { + "epoch": 0.35230601343977946, + "grad_norm": 0.3814290463924408, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 3067 + }, + { + "epoch": 0.3524208833496066, + "grad_norm": 0.36910781264305115, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 3068 + }, + { + "epoch": 0.3525357532594337, + "grad_norm": 0.4063052535057068, + "learning_rate": 0.0001, + "loss": 1.6937, + "step": 3069 + }, + { + "epoch": 0.3526506231692608, + "grad_norm": 0.3729820251464844, + "learning_rate": 0.0001, + "loss": 1.5685, + "step": 3070 + }, + { + "epoch": 0.35276549307908794, + "grad_norm": 0.3980967700481415, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 3071 + }, + { + "epoch": 0.35288036298891506, + "grad_norm": 0.3677471876144409, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 3072 + }, + { + "epoch": 0.3529952328987422, + "grad_norm": 0.38200077414512634, + "learning_rate": 0.0001, + "loss": 1.6235, + "step": 3073 + }, + { + "epoch": 0.3531101028085693, + "grad_norm": 0.37423157691955566, + "learning_rate": 0.0001, + "loss": 1.5642, + "step": 3074 + }, + { + "epoch": 0.3532249727183964, + "grad_norm": 0.37253043055534363, + "learning_rate": 0.0001, + "loss": 1.6977, + "step": 3075 + }, + { + "epoch": 0.35333984262822354, + "grad_norm": 0.3927091360092163, + "learning_rate": 0.0001, + "loss": 1.7231, + "step": 3076 + }, + { + "epoch": 0.35345471253805066, + "grad_norm": 0.36988648772239685, + "learning_rate": 0.0001, + "loss": 1.506, + "step": 3077 + }, + { + "epoch": 0.3535695824478778, + "grad_norm": 0.38558465242385864, + "learning_rate": 0.0001, + "loss": 1.7394, + "step": 3078 + }, + { + "epoch": 0.3536844523577049, + "grad_norm": 0.44665199518203735, + "learning_rate": 0.0001, + "loss": 1.6499, + "step": 3079 + }, + { + "epoch": 0.353799322267532, + "grad_norm": 0.3623279929161072, + "learning_rate": 0.0001, + "loss": 1.4641, + "step": 3080 + }, + { + "epoch": 0.35391419217735914, + "grad_norm": 0.3726842701435089, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 3081 + }, + { + "epoch": 0.35402906208718626, + "grad_norm": 0.3852822184562683, + "learning_rate": 0.0001, + "loss": 1.7239, + "step": 3082 + }, + { + "epoch": 0.3541439319970134, + "grad_norm": 0.3527339994907379, + "learning_rate": 0.0001, + "loss": 1.5354, + "step": 3083 + }, + { + "epoch": 0.3542588019068405, + "grad_norm": 0.3964180052280426, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 3084 + }, + { + "epoch": 0.3543736718166676, + "grad_norm": 0.37189149856567383, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 3085 + }, + { + "epoch": 0.35448854172649474, + "grad_norm": 0.3664330840110779, + "learning_rate": 0.0001, + "loss": 1.4749, + "step": 3086 + }, + { + "epoch": 0.35460341163632186, + "grad_norm": 0.43963682651519775, + "learning_rate": 0.0001, + "loss": 1.8784, + "step": 3087 + }, + { + "epoch": 0.354718281546149, + "grad_norm": 0.41229376196861267, + "learning_rate": 0.0001, + "loss": 2.0028, + "step": 3088 + }, + { + "epoch": 0.3548331514559761, + "grad_norm": 0.37985897064208984, + "learning_rate": 0.0001, + "loss": 1.522, + "step": 3089 + }, + { + "epoch": 0.3549480213658032, + "grad_norm": 0.4061013162136078, + "learning_rate": 0.0001, + "loss": 1.767, + "step": 3090 + }, + { + "epoch": 0.35506289127563034, + "grad_norm": 0.4098246395587921, + "learning_rate": 0.0001, + "loss": 1.6884, + "step": 3091 + }, + { + "epoch": 0.35517776118545746, + "grad_norm": 0.4013693630695343, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 3092 + }, + { + "epoch": 0.3552926310952846, + "grad_norm": 0.3519565463066101, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 3093 + }, + { + "epoch": 0.3554075010051117, + "grad_norm": 0.3677949011325836, + "learning_rate": 0.0001, + "loss": 1.4798, + "step": 3094 + }, + { + "epoch": 0.3555223709149388, + "grad_norm": 0.38344481587409973, + "learning_rate": 0.0001, + "loss": 1.7043, + "step": 3095 + }, + { + "epoch": 0.35563724082476594, + "grad_norm": 0.3987760543823242, + "learning_rate": 0.0001, + "loss": 1.5873, + "step": 3096 + }, + { + "epoch": 0.35575211073459306, + "grad_norm": 0.37182289361953735, + "learning_rate": 0.0001, + "loss": 1.5454, + "step": 3097 + }, + { + "epoch": 0.3558669806444202, + "grad_norm": 0.36989521980285645, + "learning_rate": 0.0001, + "loss": 1.6257, + "step": 3098 + }, + { + "epoch": 0.3559818505542473, + "grad_norm": 0.38265496492385864, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 3099 + }, + { + "epoch": 0.3560967204640744, + "grad_norm": 0.4006288945674896, + "learning_rate": 0.0001, + "loss": 1.787, + "step": 3100 + }, + { + "epoch": 0.35621159037390154, + "grad_norm": 0.38467901945114136, + "learning_rate": 0.0001, + "loss": 1.639, + "step": 3101 + }, + { + "epoch": 0.35632646028372866, + "grad_norm": 0.3911599814891815, + "learning_rate": 0.0001, + "loss": 1.6307, + "step": 3102 + }, + { + "epoch": 0.3564413301935558, + "grad_norm": 0.36751917004585266, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 3103 + }, + { + "epoch": 0.3565562001033829, + "grad_norm": 0.3881228566169739, + "learning_rate": 0.0001, + "loss": 1.7045, + "step": 3104 + }, + { + "epoch": 0.35667107001321, + "grad_norm": 0.36318087577819824, + "learning_rate": 0.0001, + "loss": 1.4927, + "step": 3105 + }, + { + "epoch": 0.35678593992303714, + "grad_norm": 0.3672310709953308, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 3106 + }, + { + "epoch": 0.35690080983286426, + "grad_norm": 0.35635697841644287, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 3107 + }, + { + "epoch": 0.3570156797426914, + "grad_norm": 0.38991066813468933, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 3108 + }, + { + "epoch": 0.3571305496525185, + "grad_norm": 0.3975917100906372, + "learning_rate": 0.0001, + "loss": 1.6502, + "step": 3109 + }, + { + "epoch": 0.3572454195623456, + "grad_norm": 0.39171603322029114, + "learning_rate": 0.0001, + "loss": 1.7254, + "step": 3110 + }, + { + "epoch": 0.35736028947217274, + "grad_norm": 0.39555254578590393, + "learning_rate": 0.0001, + "loss": 1.5358, + "step": 3111 + }, + { + "epoch": 0.35747515938199986, + "grad_norm": 0.36104774475097656, + "learning_rate": 0.0001, + "loss": 1.4724, + "step": 3112 + }, + { + "epoch": 0.357590029291827, + "grad_norm": 0.38759103417396545, + "learning_rate": 0.0001, + "loss": 1.6787, + "step": 3113 + }, + { + "epoch": 0.3577048992016541, + "grad_norm": 0.4347061216831207, + "learning_rate": 0.0001, + "loss": 1.8388, + "step": 3114 + }, + { + "epoch": 0.3578197691114812, + "grad_norm": 0.36423662304878235, + "learning_rate": 0.0001, + "loss": 1.7236, + "step": 3115 + }, + { + "epoch": 0.35793463902130834, + "grad_norm": 0.4022355377674103, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 3116 + }, + { + "epoch": 0.35804950893113546, + "grad_norm": 0.3823007047176361, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 3117 + }, + { + "epoch": 0.3581643788409626, + "grad_norm": 0.3590918481349945, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 3118 + }, + { + "epoch": 0.35827924875078976, + "grad_norm": 0.3881797790527344, + "learning_rate": 0.0001, + "loss": 1.7983, + "step": 3119 + }, + { + "epoch": 0.3583941186606169, + "grad_norm": 0.41121959686279297, + "learning_rate": 0.0001, + "loss": 1.6413, + "step": 3120 + }, + { + "epoch": 0.358508988570444, + "grad_norm": 0.37650343775749207, + "learning_rate": 0.0001, + "loss": 1.553, + "step": 3121 + }, + { + "epoch": 0.3586238584802711, + "grad_norm": 0.4044298827648163, + "learning_rate": 0.0001, + "loss": 1.715, + "step": 3122 + }, + { + "epoch": 0.35873872839009824, + "grad_norm": 0.3791009485721588, + "learning_rate": 0.0001, + "loss": 1.6326, + "step": 3123 + }, + { + "epoch": 0.35885359829992536, + "grad_norm": 0.425060898065567, + "learning_rate": 0.0001, + "loss": 1.8154, + "step": 3124 + }, + { + "epoch": 0.3589684682097525, + "grad_norm": 0.3991539776325226, + "learning_rate": 0.0001, + "loss": 1.7491, + "step": 3125 + }, + { + "epoch": 0.3590833381195796, + "grad_norm": 0.40489476919174194, + "learning_rate": 0.0001, + "loss": 1.8376, + "step": 3126 + }, + { + "epoch": 0.3591982080294067, + "grad_norm": 0.40158843994140625, + "learning_rate": 0.0001, + "loss": 1.818, + "step": 3127 + }, + { + "epoch": 0.35931307793923384, + "grad_norm": 0.4292317032814026, + "learning_rate": 0.0001, + "loss": 1.8346, + "step": 3128 + }, + { + "epoch": 0.35942794784906096, + "grad_norm": 0.4303164780139923, + "learning_rate": 0.0001, + "loss": 1.9145, + "step": 3129 + }, + { + "epoch": 0.3595428177588881, + "grad_norm": 0.406921923160553, + "learning_rate": 0.0001, + "loss": 1.7952, + "step": 3130 + }, + { + "epoch": 0.3596576876687152, + "grad_norm": 0.39321935176849365, + "learning_rate": 0.0001, + "loss": 1.8008, + "step": 3131 + }, + { + "epoch": 0.3597725575785423, + "grad_norm": 0.36769285798072815, + "learning_rate": 0.0001, + "loss": 1.7165, + "step": 3132 + }, + { + "epoch": 0.35988742748836944, + "grad_norm": 0.37966418266296387, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 3133 + }, + { + "epoch": 0.36000229739819656, + "grad_norm": 0.4449335038661957, + "learning_rate": 0.0001, + "loss": 1.8674, + "step": 3134 + }, + { + "epoch": 0.3601171673080237, + "grad_norm": 0.37845826148986816, + "learning_rate": 0.0001, + "loss": 1.6453, + "step": 3135 + }, + { + "epoch": 0.3602320372178508, + "grad_norm": 0.43711939454078674, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 3136 + }, + { + "epoch": 0.3603469071276779, + "grad_norm": 0.3823258876800537, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 3137 + }, + { + "epoch": 0.36046177703750504, + "grad_norm": 0.38525834679603577, + "learning_rate": 0.0001, + "loss": 1.611, + "step": 3138 + }, + { + "epoch": 0.36057664694733216, + "grad_norm": 0.38904234766960144, + "learning_rate": 0.0001, + "loss": 1.5454, + "step": 3139 + }, + { + "epoch": 0.3606915168571593, + "grad_norm": 0.39216116070747375, + "learning_rate": 0.0001, + "loss": 1.7976, + "step": 3140 + }, + { + "epoch": 0.3608063867669864, + "grad_norm": 0.3699231743812561, + "learning_rate": 0.0001, + "loss": 1.5785, + "step": 3141 + }, + { + "epoch": 0.3609212566768135, + "grad_norm": 0.3960736393928528, + "learning_rate": 0.0001, + "loss": 1.6743, + "step": 3142 + }, + { + "epoch": 0.36103612658664064, + "grad_norm": 0.37303784489631653, + "learning_rate": 0.0001, + "loss": 1.415, + "step": 3143 + }, + { + "epoch": 0.36115099649646776, + "grad_norm": 0.4160480797290802, + "learning_rate": 0.0001, + "loss": 1.7466, + "step": 3144 + }, + { + "epoch": 0.3612658664062949, + "grad_norm": 0.4043230712413788, + "learning_rate": 0.0001, + "loss": 1.7383, + "step": 3145 + }, + { + "epoch": 0.361380736316122, + "grad_norm": 0.3847825825214386, + "learning_rate": 0.0001, + "loss": 1.6484, + "step": 3146 + }, + { + "epoch": 0.3614956062259491, + "grad_norm": 0.40463775396347046, + "learning_rate": 0.0001, + "loss": 1.6482, + "step": 3147 + }, + { + "epoch": 0.36161047613577624, + "grad_norm": 0.3750319480895996, + "learning_rate": 0.0001, + "loss": 1.6921, + "step": 3148 + }, + { + "epoch": 0.36172534604560336, + "grad_norm": 0.37009164690971375, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 3149 + }, + { + "epoch": 0.3618402159554305, + "grad_norm": 0.34993571043014526, + "learning_rate": 0.0001, + "loss": 1.5076, + "step": 3150 + }, + { + "epoch": 0.3619550858652576, + "grad_norm": 0.3800502419471741, + "learning_rate": 0.0001, + "loss": 1.6974, + "step": 3151 + }, + { + "epoch": 0.3620699557750847, + "grad_norm": 0.3825697600841522, + "learning_rate": 0.0001, + "loss": 1.4917, + "step": 3152 + }, + { + "epoch": 0.36218482568491184, + "grad_norm": 0.384375661611557, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 3153 + }, + { + "epoch": 0.36229969559473896, + "grad_norm": 0.4005104899406433, + "learning_rate": 0.0001, + "loss": 1.7287, + "step": 3154 + }, + { + "epoch": 0.3624145655045661, + "grad_norm": 0.3781236708164215, + "learning_rate": 0.0001, + "loss": 1.4272, + "step": 3155 + }, + { + "epoch": 0.3625294354143932, + "grad_norm": 0.37591099739074707, + "learning_rate": 0.0001, + "loss": 1.5723, + "step": 3156 + }, + { + "epoch": 0.3626443053242203, + "grad_norm": 0.3829016089439392, + "learning_rate": 0.0001, + "loss": 1.4127, + "step": 3157 + }, + { + "epoch": 0.36275917523404744, + "grad_norm": 0.4181736707687378, + "learning_rate": 0.0001, + "loss": 1.8398, + "step": 3158 + }, + { + "epoch": 0.36287404514387456, + "grad_norm": 0.3961413502693176, + "learning_rate": 0.0001, + "loss": 1.651, + "step": 3159 + }, + { + "epoch": 0.3629889150537017, + "grad_norm": 0.3490610420703888, + "learning_rate": 0.0001, + "loss": 1.6495, + "step": 3160 + }, + { + "epoch": 0.3631037849635288, + "grad_norm": 0.45399048924446106, + "learning_rate": 0.0001, + "loss": 1.7701, + "step": 3161 + }, + { + "epoch": 0.3632186548733559, + "grad_norm": 0.3794902563095093, + "learning_rate": 0.0001, + "loss": 1.6005, + "step": 3162 + }, + { + "epoch": 0.36333352478318304, + "grad_norm": 0.41195976734161377, + "learning_rate": 0.0001, + "loss": 1.7688, + "step": 3163 + }, + { + "epoch": 0.36344839469301016, + "grad_norm": 0.4826469123363495, + "learning_rate": 0.0001, + "loss": 1.5523, + "step": 3164 + }, + { + "epoch": 0.3635632646028373, + "grad_norm": 0.39594000577926636, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 3165 + }, + { + "epoch": 0.3636781345126644, + "grad_norm": 0.41155776381492615, + "learning_rate": 0.0001, + "loss": 1.7261, + "step": 3166 + }, + { + "epoch": 0.3637930044224915, + "grad_norm": 0.413484126329422, + "learning_rate": 0.0001, + "loss": 1.7269, + "step": 3167 + }, + { + "epoch": 0.36390787433231864, + "grad_norm": 0.41094526648521423, + "learning_rate": 0.0001, + "loss": 1.5909, + "step": 3168 + }, + { + "epoch": 0.36402274424214576, + "grad_norm": 0.4104525148868561, + "learning_rate": 0.0001, + "loss": 1.7229, + "step": 3169 + }, + { + "epoch": 0.3641376141519729, + "grad_norm": 0.40125396847724915, + "learning_rate": 0.0001, + "loss": 1.7518, + "step": 3170 + }, + { + "epoch": 0.3642524840618, + "grad_norm": 0.37167468667030334, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 3171 + }, + { + "epoch": 0.3643673539716271, + "grad_norm": 0.4034722149372101, + "learning_rate": 0.0001, + "loss": 1.6484, + "step": 3172 + }, + { + "epoch": 0.36448222388145424, + "grad_norm": 0.3839128613471985, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 3173 + }, + { + "epoch": 0.36459709379128136, + "grad_norm": 0.36040592193603516, + "learning_rate": 0.0001, + "loss": 1.5386, + "step": 3174 + }, + { + "epoch": 0.3647119637011085, + "grad_norm": 0.3928232192993164, + "learning_rate": 0.0001, + "loss": 1.6535, + "step": 3175 + }, + { + "epoch": 0.3648268336109356, + "grad_norm": 0.3935447931289673, + "learning_rate": 0.0001, + "loss": 1.6686, + "step": 3176 + }, + { + "epoch": 0.3649417035207627, + "grad_norm": 0.37371543049812317, + "learning_rate": 0.0001, + "loss": 1.5849, + "step": 3177 + }, + { + "epoch": 0.36505657343058984, + "grad_norm": 0.40319526195526123, + "learning_rate": 0.0001, + "loss": 1.5654, + "step": 3178 + }, + { + "epoch": 0.36517144334041696, + "grad_norm": 0.42261865735054016, + "learning_rate": 0.0001, + "loss": 1.5549, + "step": 3179 + }, + { + "epoch": 0.3652863132502441, + "grad_norm": 0.4181409180164337, + "learning_rate": 0.0001, + "loss": 1.6968, + "step": 3180 + }, + { + "epoch": 0.3654011831600712, + "grad_norm": 0.420463889837265, + "learning_rate": 0.0001, + "loss": 1.8145, + "step": 3181 + }, + { + "epoch": 0.3655160530698983, + "grad_norm": 0.3919375538825989, + "learning_rate": 0.0001, + "loss": 1.5424, + "step": 3182 + }, + { + "epoch": 0.36563092297972544, + "grad_norm": 0.6190922260284424, + "learning_rate": 0.0001, + "loss": 1.6232, + "step": 3183 + }, + { + "epoch": 0.36574579288955256, + "grad_norm": 0.3965461254119873, + "learning_rate": 0.0001, + "loss": 1.6968, + "step": 3184 + }, + { + "epoch": 0.3658606627993797, + "grad_norm": 0.3818904757499695, + "learning_rate": 0.0001, + "loss": 1.7041, + "step": 3185 + }, + { + "epoch": 0.3659755327092068, + "grad_norm": 0.38142475485801697, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 3186 + }, + { + "epoch": 0.3660904026190339, + "grad_norm": 0.4176258444786072, + "learning_rate": 0.0001, + "loss": 1.678, + "step": 3187 + }, + { + "epoch": 0.3662052725288611, + "grad_norm": 0.3882141709327698, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 3188 + }, + { + "epoch": 0.3663201424386882, + "grad_norm": 0.3725062608718872, + "learning_rate": 0.0001, + "loss": 1.5094, + "step": 3189 + }, + { + "epoch": 0.36643501234851533, + "grad_norm": 0.4642198979854584, + "learning_rate": 0.0001, + "loss": 2.0719, + "step": 3190 + }, + { + "epoch": 0.36654988225834245, + "grad_norm": 0.4120834171772003, + "learning_rate": 0.0001, + "loss": 1.475, + "step": 3191 + }, + { + "epoch": 0.36666475216816957, + "grad_norm": 0.44245001673698425, + "learning_rate": 0.0001, + "loss": 1.818, + "step": 3192 + }, + { + "epoch": 0.3667796220779967, + "grad_norm": 0.3757948875427246, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 3193 + }, + { + "epoch": 0.3668944919878238, + "grad_norm": 0.386857271194458, + "learning_rate": 0.0001, + "loss": 1.7447, + "step": 3194 + }, + { + "epoch": 0.36700936189765093, + "grad_norm": 0.37118127942085266, + "learning_rate": 0.0001, + "loss": 1.7593, + "step": 3195 + }, + { + "epoch": 0.36712423180747805, + "grad_norm": 0.39029768109321594, + "learning_rate": 0.0001, + "loss": 1.6951, + "step": 3196 + }, + { + "epoch": 0.36723910171730517, + "grad_norm": 0.4252242147922516, + "learning_rate": 0.0001, + "loss": 1.817, + "step": 3197 + }, + { + "epoch": 0.3673539716271323, + "grad_norm": 0.39361050724983215, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 3198 + }, + { + "epoch": 0.3674688415369594, + "grad_norm": 0.4176224172115326, + "learning_rate": 0.0001, + "loss": 1.8173, + "step": 3199 + }, + { + "epoch": 0.36758371144678653, + "grad_norm": 0.42337673902511597, + "learning_rate": 0.0001, + "loss": 1.6874, + "step": 3200 + }, + { + "epoch": 0.36769858135661365, + "grad_norm": 0.3670978546142578, + "learning_rate": 0.0001, + "loss": 1.6705, + "step": 3201 + }, + { + "epoch": 0.36781345126644077, + "grad_norm": 0.3871794044971466, + "learning_rate": 0.0001, + "loss": 1.7955, + "step": 3202 + }, + { + "epoch": 0.3679283211762679, + "grad_norm": 0.39479079842567444, + "learning_rate": 0.0001, + "loss": 1.8078, + "step": 3203 + }, + { + "epoch": 0.368043191086095, + "grad_norm": 0.405738890171051, + "learning_rate": 0.0001, + "loss": 1.8309, + "step": 3204 + }, + { + "epoch": 0.36815806099592213, + "grad_norm": 0.37141597270965576, + "learning_rate": 0.0001, + "loss": 1.5382, + "step": 3205 + }, + { + "epoch": 0.36827293090574925, + "grad_norm": 0.4640069007873535, + "learning_rate": 0.0001, + "loss": 1.5492, + "step": 3206 + }, + { + "epoch": 0.36838780081557637, + "grad_norm": 0.39117664098739624, + "learning_rate": 0.0001, + "loss": 1.8055, + "step": 3207 + }, + { + "epoch": 0.3685026707254035, + "grad_norm": 0.3931315839290619, + "learning_rate": 0.0001, + "loss": 1.6636, + "step": 3208 + }, + { + "epoch": 0.3686175406352306, + "grad_norm": 0.406044065952301, + "learning_rate": 0.0001, + "loss": 1.7652, + "step": 3209 + }, + { + "epoch": 0.36873241054505773, + "grad_norm": 0.37992948293685913, + "learning_rate": 0.0001, + "loss": 1.5028, + "step": 3210 + }, + { + "epoch": 0.36884728045488485, + "grad_norm": 0.4231666326522827, + "learning_rate": 0.0001, + "loss": 1.7238, + "step": 3211 + }, + { + "epoch": 0.36896215036471197, + "grad_norm": 0.4150424599647522, + "learning_rate": 0.0001, + "loss": 1.8108, + "step": 3212 + }, + { + "epoch": 0.3690770202745391, + "grad_norm": 0.3594525158405304, + "learning_rate": 0.0001, + "loss": 1.4806, + "step": 3213 + }, + { + "epoch": 0.3691918901843662, + "grad_norm": 0.381798654794693, + "learning_rate": 0.0001, + "loss": 1.6687, + "step": 3214 + }, + { + "epoch": 0.36930676009419333, + "grad_norm": 0.38499388098716736, + "learning_rate": 0.0001, + "loss": 1.7469, + "step": 3215 + }, + { + "epoch": 0.36942163000402045, + "grad_norm": 0.4006686210632324, + "learning_rate": 0.0001, + "loss": 1.7326, + "step": 3216 + }, + { + "epoch": 0.36953649991384757, + "grad_norm": 0.396440327167511, + "learning_rate": 0.0001, + "loss": 1.6461, + "step": 3217 + }, + { + "epoch": 0.3696513698236747, + "grad_norm": 0.3961047828197479, + "learning_rate": 0.0001, + "loss": 1.762, + "step": 3218 + }, + { + "epoch": 0.3697662397335018, + "grad_norm": 0.4057855010032654, + "learning_rate": 0.0001, + "loss": 1.5334, + "step": 3219 + }, + { + "epoch": 0.36988110964332893, + "grad_norm": 0.4407478868961334, + "learning_rate": 0.0001, + "loss": 1.6928, + "step": 3220 + }, + { + "epoch": 0.36999597955315605, + "grad_norm": 0.37952542304992676, + "learning_rate": 0.0001, + "loss": 1.7687, + "step": 3221 + }, + { + "epoch": 0.37011084946298317, + "grad_norm": 0.3981403708457947, + "learning_rate": 0.0001, + "loss": 1.6591, + "step": 3222 + }, + { + "epoch": 0.3702257193728103, + "grad_norm": 0.3874553143978119, + "learning_rate": 0.0001, + "loss": 1.6144, + "step": 3223 + }, + { + "epoch": 0.3703405892826374, + "grad_norm": 0.40902000665664673, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 3224 + }, + { + "epoch": 0.37045545919246453, + "grad_norm": 0.42551088333129883, + "learning_rate": 0.0001, + "loss": 1.7355, + "step": 3225 + }, + { + "epoch": 0.37057032910229165, + "grad_norm": 0.3739902079105377, + "learning_rate": 0.0001, + "loss": 1.4641, + "step": 3226 + }, + { + "epoch": 0.37068519901211877, + "grad_norm": 0.41387036442756653, + "learning_rate": 0.0001, + "loss": 1.7716, + "step": 3227 + }, + { + "epoch": 0.3708000689219459, + "grad_norm": 0.393655002117157, + "learning_rate": 0.0001, + "loss": 1.5306, + "step": 3228 + }, + { + "epoch": 0.370914938831773, + "grad_norm": 0.39244723320007324, + "learning_rate": 0.0001, + "loss": 1.7408, + "step": 3229 + }, + { + "epoch": 0.37102980874160013, + "grad_norm": 0.3959055542945862, + "learning_rate": 0.0001, + "loss": 1.8051, + "step": 3230 + }, + { + "epoch": 0.37114467865142725, + "grad_norm": 0.39641687273979187, + "learning_rate": 0.0001, + "loss": 1.5969, + "step": 3231 + }, + { + "epoch": 0.37125954856125437, + "grad_norm": 0.37749791145324707, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 3232 + }, + { + "epoch": 0.3713744184710815, + "grad_norm": 0.40520602464675903, + "learning_rate": 0.0001, + "loss": 1.6052, + "step": 3233 + }, + { + "epoch": 0.3714892883809086, + "grad_norm": 0.37299278378486633, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 3234 + }, + { + "epoch": 0.37160415829073573, + "grad_norm": 0.41653332114219666, + "learning_rate": 0.0001, + "loss": 1.5829, + "step": 3235 + }, + { + "epoch": 0.37171902820056285, + "grad_norm": 0.4918358623981476, + "learning_rate": 0.0001, + "loss": 1.894, + "step": 3236 + }, + { + "epoch": 0.37183389811038997, + "grad_norm": 0.40961939096450806, + "learning_rate": 0.0001, + "loss": 1.6471, + "step": 3237 + }, + { + "epoch": 0.3719487680202171, + "grad_norm": 0.39822736382484436, + "learning_rate": 0.0001, + "loss": 1.6876, + "step": 3238 + }, + { + "epoch": 0.3720636379300442, + "grad_norm": 0.42709147930145264, + "learning_rate": 0.0001, + "loss": 1.6064, + "step": 3239 + }, + { + "epoch": 0.37217850783987133, + "grad_norm": 0.3828563690185547, + "learning_rate": 0.0001, + "loss": 1.5153, + "step": 3240 + }, + { + "epoch": 0.37229337774969845, + "grad_norm": 0.3979793190956116, + "learning_rate": 0.0001, + "loss": 1.7632, + "step": 3241 + }, + { + "epoch": 0.37240824765952557, + "grad_norm": 0.4186950623989105, + "learning_rate": 0.0001, + "loss": 1.6498, + "step": 3242 + }, + { + "epoch": 0.3725231175693527, + "grad_norm": 0.3847867548465729, + "learning_rate": 0.0001, + "loss": 1.6185, + "step": 3243 + }, + { + "epoch": 0.3726379874791798, + "grad_norm": 0.4237745404243469, + "learning_rate": 0.0001, + "loss": 1.6776, + "step": 3244 + }, + { + "epoch": 0.37275285738900693, + "grad_norm": 0.41460272669792175, + "learning_rate": 0.0001, + "loss": 1.8457, + "step": 3245 + }, + { + "epoch": 0.37286772729883405, + "grad_norm": 0.37694281339645386, + "learning_rate": 0.0001, + "loss": 1.5774, + "step": 3246 + }, + { + "epoch": 0.37298259720866117, + "grad_norm": 0.3860875368118286, + "learning_rate": 0.0001, + "loss": 1.7283, + "step": 3247 + }, + { + "epoch": 0.3730974671184883, + "grad_norm": 0.4228290319442749, + "learning_rate": 0.0001, + "loss": 1.9065, + "step": 3248 + }, + { + "epoch": 0.3732123370283154, + "grad_norm": 0.4396858811378479, + "learning_rate": 0.0001, + "loss": 1.8495, + "step": 3249 + }, + { + "epoch": 0.37332720693814253, + "grad_norm": 0.44928887486457825, + "learning_rate": 0.0001, + "loss": 1.9191, + "step": 3250 + }, + { + "epoch": 0.37344207684796965, + "grad_norm": 0.4096646010875702, + "learning_rate": 0.0001, + "loss": 1.7892, + "step": 3251 + }, + { + "epoch": 0.37355694675779677, + "grad_norm": 0.36828523874282837, + "learning_rate": 0.0001, + "loss": 1.396, + "step": 3252 + }, + { + "epoch": 0.3736718166676239, + "grad_norm": 0.3781738877296448, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 3253 + }, + { + "epoch": 0.373786686577451, + "grad_norm": 0.3941371738910675, + "learning_rate": 0.0001, + "loss": 1.7005, + "step": 3254 + }, + { + "epoch": 0.37390155648727813, + "grad_norm": 0.3932330310344696, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 3255 + }, + { + "epoch": 0.3740164263971053, + "grad_norm": 0.40510842204093933, + "learning_rate": 0.0001, + "loss": 1.7448, + "step": 3256 + }, + { + "epoch": 0.3741312963069324, + "grad_norm": 0.40375015139579773, + "learning_rate": 0.0001, + "loss": 1.7276, + "step": 3257 + }, + { + "epoch": 0.37424616621675955, + "grad_norm": 0.4124782085418701, + "learning_rate": 0.0001, + "loss": 1.7071, + "step": 3258 + }, + { + "epoch": 0.37436103612658667, + "grad_norm": 0.3871046006679535, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 3259 + }, + { + "epoch": 0.3744759060364138, + "grad_norm": 0.40944960713386536, + "learning_rate": 0.0001, + "loss": 1.5999, + "step": 3260 + }, + { + "epoch": 0.3745907759462409, + "grad_norm": 0.39792561531066895, + "learning_rate": 0.0001, + "loss": 1.6789, + "step": 3261 + }, + { + "epoch": 0.374705645856068, + "grad_norm": 0.38910964131355286, + "learning_rate": 0.0001, + "loss": 1.6719, + "step": 3262 + }, + { + "epoch": 0.37482051576589515, + "grad_norm": 0.37065139412879944, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 3263 + }, + { + "epoch": 0.37493538567572227, + "grad_norm": 0.3843998908996582, + "learning_rate": 0.0001, + "loss": 1.7721, + "step": 3264 + }, + { + "epoch": 0.3750502555855494, + "grad_norm": 0.4312693178653717, + "learning_rate": 0.0001, + "loss": 1.9565, + "step": 3265 + }, + { + "epoch": 0.3751651254953765, + "grad_norm": 0.3939458727836609, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 3266 + }, + { + "epoch": 0.3752799954052036, + "grad_norm": 0.3727447986602783, + "learning_rate": 0.0001, + "loss": 1.5771, + "step": 3267 + }, + { + "epoch": 0.37539486531503075, + "grad_norm": 0.3730446994304657, + "learning_rate": 0.0001, + "loss": 1.5062, + "step": 3268 + }, + { + "epoch": 0.37550973522485787, + "grad_norm": 0.4053596258163452, + "learning_rate": 0.0001, + "loss": 1.7005, + "step": 3269 + }, + { + "epoch": 0.375624605134685, + "grad_norm": 0.4294143617153168, + "learning_rate": 0.0001, + "loss": 1.5863, + "step": 3270 + }, + { + "epoch": 0.3757394750445121, + "grad_norm": 0.3998027443885803, + "learning_rate": 0.0001, + "loss": 1.6598, + "step": 3271 + }, + { + "epoch": 0.3758543449543392, + "grad_norm": 0.39624300599098206, + "learning_rate": 0.0001, + "loss": 1.6187, + "step": 3272 + }, + { + "epoch": 0.37596921486416635, + "grad_norm": 0.38765525817871094, + "learning_rate": 0.0001, + "loss": 1.7249, + "step": 3273 + }, + { + "epoch": 0.37608408477399347, + "grad_norm": 0.38132765889167786, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 3274 + }, + { + "epoch": 0.3761989546838206, + "grad_norm": 0.386642187833786, + "learning_rate": 0.0001, + "loss": 1.6102, + "step": 3275 + }, + { + "epoch": 0.3763138245936477, + "grad_norm": 0.3864053189754486, + "learning_rate": 0.0001, + "loss": 1.5038, + "step": 3276 + }, + { + "epoch": 0.3764286945034748, + "grad_norm": 0.40680447220802307, + "learning_rate": 0.0001, + "loss": 1.6453, + "step": 3277 + }, + { + "epoch": 0.37654356441330195, + "grad_norm": 0.38861724734306335, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 3278 + }, + { + "epoch": 0.37665843432312907, + "grad_norm": 0.3894346058368683, + "learning_rate": 0.0001, + "loss": 1.7633, + "step": 3279 + }, + { + "epoch": 0.3767733042329562, + "grad_norm": 0.4006745517253876, + "learning_rate": 0.0001, + "loss": 1.6574, + "step": 3280 + }, + { + "epoch": 0.3768881741427833, + "grad_norm": 0.4053511917591095, + "learning_rate": 0.0001, + "loss": 1.5958, + "step": 3281 + }, + { + "epoch": 0.3770030440526104, + "grad_norm": 0.44916823506355286, + "learning_rate": 0.0001, + "loss": 1.5274, + "step": 3282 + }, + { + "epoch": 0.37711791396243755, + "grad_norm": 0.4060499668121338, + "learning_rate": 0.0001, + "loss": 1.6009, + "step": 3283 + }, + { + "epoch": 0.37723278387226467, + "grad_norm": 0.43672630190849304, + "learning_rate": 0.0001, + "loss": 1.7468, + "step": 3284 + }, + { + "epoch": 0.3773476537820918, + "grad_norm": 0.4144275486469269, + "learning_rate": 0.0001, + "loss": 1.7347, + "step": 3285 + }, + { + "epoch": 0.3774625236919189, + "grad_norm": 0.3954853415489197, + "learning_rate": 0.0001, + "loss": 1.7747, + "step": 3286 + }, + { + "epoch": 0.377577393601746, + "grad_norm": 0.4159530699253082, + "learning_rate": 0.0001, + "loss": 1.868, + "step": 3287 + }, + { + "epoch": 0.37769226351157315, + "grad_norm": 0.4597531855106354, + "learning_rate": 0.0001, + "loss": 1.6728, + "step": 3288 + }, + { + "epoch": 0.37780713342140027, + "grad_norm": 0.3839264214038849, + "learning_rate": 0.0001, + "loss": 1.4713, + "step": 3289 + }, + { + "epoch": 0.3779220033312274, + "grad_norm": 0.40157079696655273, + "learning_rate": 0.0001, + "loss": 1.7359, + "step": 3290 + }, + { + "epoch": 0.3780368732410545, + "grad_norm": 0.38555535674095154, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 3291 + }, + { + "epoch": 0.3781517431508816, + "grad_norm": 0.41350314021110535, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 3292 + }, + { + "epoch": 0.37826661306070875, + "grad_norm": 0.4091131389141083, + "learning_rate": 0.0001, + "loss": 1.7817, + "step": 3293 + }, + { + "epoch": 0.37838148297053587, + "grad_norm": 0.39197850227355957, + "learning_rate": 0.0001, + "loss": 1.7373, + "step": 3294 + }, + { + "epoch": 0.378496352880363, + "grad_norm": 0.3582981824874878, + "learning_rate": 0.0001, + "loss": 1.5578, + "step": 3295 + }, + { + "epoch": 0.3786112227901901, + "grad_norm": 0.4097733497619629, + "learning_rate": 0.0001, + "loss": 1.815, + "step": 3296 + }, + { + "epoch": 0.3787260927000172, + "grad_norm": 0.405089408159256, + "learning_rate": 0.0001, + "loss": 1.7199, + "step": 3297 + }, + { + "epoch": 0.37884096260984435, + "grad_norm": 0.3839695155620575, + "learning_rate": 0.0001, + "loss": 1.7156, + "step": 3298 + }, + { + "epoch": 0.37895583251967146, + "grad_norm": 0.4171733260154724, + "learning_rate": 0.0001, + "loss": 1.4943, + "step": 3299 + }, + { + "epoch": 0.3790707024294986, + "grad_norm": 0.39771202206611633, + "learning_rate": 0.0001, + "loss": 1.4806, + "step": 3300 + }, + { + "epoch": 0.3791855723393257, + "grad_norm": 0.4176054894924164, + "learning_rate": 0.0001, + "loss": 1.6608, + "step": 3301 + }, + { + "epoch": 0.3793004422491528, + "grad_norm": 0.3709241449832916, + "learning_rate": 0.0001, + "loss": 1.4327, + "step": 3302 + }, + { + "epoch": 0.37941531215897994, + "grad_norm": 0.4202117919921875, + "learning_rate": 0.0001, + "loss": 1.7664, + "step": 3303 + }, + { + "epoch": 0.37953018206880706, + "grad_norm": 0.4221265912055969, + "learning_rate": 0.0001, + "loss": 1.8107, + "step": 3304 + }, + { + "epoch": 0.3796450519786342, + "grad_norm": 0.3954116404056549, + "learning_rate": 0.0001, + "loss": 1.5563, + "step": 3305 + }, + { + "epoch": 0.3797599218884613, + "grad_norm": 0.3891046643257141, + "learning_rate": 0.0001, + "loss": 1.4281, + "step": 3306 + }, + { + "epoch": 0.3798747917982884, + "grad_norm": 0.4024218022823334, + "learning_rate": 0.0001, + "loss": 1.7119, + "step": 3307 + }, + { + "epoch": 0.37998966170811554, + "grad_norm": 0.45850181579589844, + "learning_rate": 0.0001, + "loss": 1.7876, + "step": 3308 + }, + { + "epoch": 0.38010453161794266, + "grad_norm": 0.4152509868144989, + "learning_rate": 0.0001, + "loss": 1.847, + "step": 3309 + }, + { + "epoch": 0.3802194015277698, + "grad_norm": 0.4059930443763733, + "learning_rate": 0.0001, + "loss": 1.6807, + "step": 3310 + }, + { + "epoch": 0.3803342714375969, + "grad_norm": 0.4039838910102844, + "learning_rate": 0.0001, + "loss": 1.7222, + "step": 3311 + }, + { + "epoch": 0.380449141347424, + "grad_norm": 0.40002793073654175, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 3312 + }, + { + "epoch": 0.38056401125725114, + "grad_norm": 0.3877595067024231, + "learning_rate": 0.0001, + "loss": 1.6028, + "step": 3313 + }, + { + "epoch": 0.38067888116707826, + "grad_norm": 0.41167452931404114, + "learning_rate": 0.0001, + "loss": 1.7559, + "step": 3314 + }, + { + "epoch": 0.3807937510769054, + "grad_norm": 0.3951496481895447, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 3315 + }, + { + "epoch": 0.3809086209867325, + "grad_norm": 0.4057624936103821, + "learning_rate": 0.0001, + "loss": 1.7384, + "step": 3316 + }, + { + "epoch": 0.3810234908965596, + "grad_norm": 0.41710686683654785, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 3317 + }, + { + "epoch": 0.38113836080638674, + "grad_norm": 0.38492968678474426, + "learning_rate": 0.0001, + "loss": 1.5527, + "step": 3318 + }, + { + "epoch": 0.38125323071621386, + "grad_norm": 0.3713145852088928, + "learning_rate": 0.0001, + "loss": 1.4979, + "step": 3319 + }, + { + "epoch": 0.381368100626041, + "grad_norm": 0.3968343734741211, + "learning_rate": 0.0001, + "loss": 1.4566, + "step": 3320 + }, + { + "epoch": 0.3814829705358681, + "grad_norm": 0.3992241621017456, + "learning_rate": 0.0001, + "loss": 1.6667, + "step": 3321 + }, + { + "epoch": 0.3815978404456952, + "grad_norm": 0.3984096050262451, + "learning_rate": 0.0001, + "loss": 1.6411, + "step": 3322 + }, + { + "epoch": 0.38171271035552234, + "grad_norm": 0.4242939352989197, + "learning_rate": 0.0001, + "loss": 1.8618, + "step": 3323 + }, + { + "epoch": 0.38182758026534946, + "grad_norm": 0.4698657691478729, + "learning_rate": 0.0001, + "loss": 1.7804, + "step": 3324 + }, + { + "epoch": 0.38194245017517664, + "grad_norm": 0.3952822685241699, + "learning_rate": 0.0001, + "loss": 1.762, + "step": 3325 + }, + { + "epoch": 0.38205732008500376, + "grad_norm": 0.3968496322631836, + "learning_rate": 0.0001, + "loss": 1.7494, + "step": 3326 + }, + { + "epoch": 0.3821721899948309, + "grad_norm": 0.35102754831314087, + "learning_rate": 0.0001, + "loss": 1.5076, + "step": 3327 + }, + { + "epoch": 0.382287059904658, + "grad_norm": 0.3999139070510864, + "learning_rate": 0.0001, + "loss": 1.422, + "step": 3328 + }, + { + "epoch": 0.3824019298144851, + "grad_norm": 0.3803260922431946, + "learning_rate": 0.0001, + "loss": 1.5255, + "step": 3329 + }, + { + "epoch": 0.38251679972431224, + "grad_norm": 0.3881869614124298, + "learning_rate": 0.0001, + "loss": 1.6423, + "step": 3330 + }, + { + "epoch": 0.38263166963413936, + "grad_norm": 0.3832140266895294, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 3331 + }, + { + "epoch": 0.3827465395439665, + "grad_norm": 0.4469684064388275, + "learning_rate": 0.0001, + "loss": 1.7267, + "step": 3332 + }, + { + "epoch": 0.3828614094537936, + "grad_norm": 0.3962031602859497, + "learning_rate": 0.0001, + "loss": 1.6285, + "step": 3333 + }, + { + "epoch": 0.3829762793636207, + "grad_norm": 0.408916175365448, + "learning_rate": 0.0001, + "loss": 1.6637, + "step": 3334 + }, + { + "epoch": 0.38309114927344784, + "grad_norm": 0.4158702790737152, + "learning_rate": 0.0001, + "loss": 1.6769, + "step": 3335 + }, + { + "epoch": 0.38320601918327496, + "grad_norm": 0.3798253536224365, + "learning_rate": 0.0001, + "loss": 1.5571, + "step": 3336 + }, + { + "epoch": 0.3833208890931021, + "grad_norm": 0.39568817615509033, + "learning_rate": 0.0001, + "loss": 1.6156, + "step": 3337 + }, + { + "epoch": 0.3834357590029292, + "grad_norm": 0.38767796754837036, + "learning_rate": 0.0001, + "loss": 1.6802, + "step": 3338 + }, + { + "epoch": 0.3835506289127563, + "grad_norm": 0.3866712749004364, + "learning_rate": 0.0001, + "loss": 1.7018, + "step": 3339 + }, + { + "epoch": 0.38366549882258344, + "grad_norm": 0.41397809982299805, + "learning_rate": 0.0001, + "loss": 1.8039, + "step": 3340 + }, + { + "epoch": 0.38378036873241056, + "grad_norm": 0.39398789405822754, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 3341 + }, + { + "epoch": 0.3838952386422377, + "grad_norm": 0.4054300785064697, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 3342 + }, + { + "epoch": 0.3840101085520648, + "grad_norm": 0.4255220890045166, + "learning_rate": 0.0001, + "loss": 1.9429, + "step": 3343 + }, + { + "epoch": 0.3841249784618919, + "grad_norm": 0.4655556082725525, + "learning_rate": 0.0001, + "loss": 1.7391, + "step": 3344 + }, + { + "epoch": 0.38423984837171904, + "grad_norm": 0.43720874190330505, + "learning_rate": 0.0001, + "loss": 1.7068, + "step": 3345 + }, + { + "epoch": 0.38435471828154616, + "grad_norm": 0.443059504032135, + "learning_rate": 0.0001, + "loss": 1.6133, + "step": 3346 + }, + { + "epoch": 0.3844695881913733, + "grad_norm": 0.4180503189563751, + "learning_rate": 0.0001, + "loss": 1.7699, + "step": 3347 + }, + { + "epoch": 0.3845844581012004, + "grad_norm": 0.3981085419654846, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 3348 + }, + { + "epoch": 0.3846993280110275, + "grad_norm": 0.38549384474754333, + "learning_rate": 0.0001, + "loss": 1.5092, + "step": 3349 + }, + { + "epoch": 0.38481419792085464, + "grad_norm": 0.4354805052280426, + "learning_rate": 0.0001, + "loss": 1.7426, + "step": 3350 + }, + { + "epoch": 0.38492906783068176, + "grad_norm": 0.39129069447517395, + "learning_rate": 0.0001, + "loss": 1.6652, + "step": 3351 + }, + { + "epoch": 0.3850439377405089, + "grad_norm": 0.40617823600769043, + "learning_rate": 0.0001, + "loss": 1.7721, + "step": 3352 + }, + { + "epoch": 0.385158807650336, + "grad_norm": 0.3931029438972473, + "learning_rate": 0.0001, + "loss": 1.4904, + "step": 3353 + }, + { + "epoch": 0.3852736775601631, + "grad_norm": 0.37746840715408325, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 3354 + }, + { + "epoch": 0.38538854746999024, + "grad_norm": 0.4153432250022888, + "learning_rate": 0.0001, + "loss": 1.8866, + "step": 3355 + }, + { + "epoch": 0.38550341737981736, + "grad_norm": 0.3631424605846405, + "learning_rate": 0.0001, + "loss": 1.6112, + "step": 3356 + }, + { + "epoch": 0.3856182872896445, + "grad_norm": 0.42953309416770935, + "learning_rate": 0.0001, + "loss": 1.9191, + "step": 3357 + }, + { + "epoch": 0.3857331571994716, + "grad_norm": 0.4635213613510132, + "learning_rate": 0.0001, + "loss": 1.9103, + "step": 3358 + }, + { + "epoch": 0.3858480271092987, + "grad_norm": 0.40578657388687134, + "learning_rate": 0.0001, + "loss": 1.7202, + "step": 3359 + }, + { + "epoch": 0.38596289701912584, + "grad_norm": 0.3969693183898926, + "learning_rate": 0.0001, + "loss": 1.7364, + "step": 3360 + }, + { + "epoch": 0.38607776692895296, + "grad_norm": 0.39727887511253357, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 3361 + }, + { + "epoch": 0.3861926368387801, + "grad_norm": 0.3983653485774994, + "learning_rate": 0.0001, + "loss": 1.5409, + "step": 3362 + }, + { + "epoch": 0.3863075067486072, + "grad_norm": 0.3816327452659607, + "learning_rate": 0.0001, + "loss": 1.7286, + "step": 3363 + }, + { + "epoch": 0.3864223766584343, + "grad_norm": 0.38533225655555725, + "learning_rate": 0.0001, + "loss": 1.6898, + "step": 3364 + }, + { + "epoch": 0.38653724656826144, + "grad_norm": 0.40804487466812134, + "learning_rate": 0.0001, + "loss": 1.7621, + "step": 3365 + }, + { + "epoch": 0.38665211647808856, + "grad_norm": 0.3823128044605255, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 3366 + }, + { + "epoch": 0.3867669863879157, + "grad_norm": 0.39397284388542175, + "learning_rate": 0.0001, + "loss": 1.4919, + "step": 3367 + }, + { + "epoch": 0.3868818562977428, + "grad_norm": 0.38799893856048584, + "learning_rate": 0.0001, + "loss": 1.5236, + "step": 3368 + }, + { + "epoch": 0.3869967262075699, + "grad_norm": 0.3756542205810547, + "learning_rate": 0.0001, + "loss": 1.5963, + "step": 3369 + }, + { + "epoch": 0.38711159611739704, + "grad_norm": 0.40256842970848083, + "learning_rate": 0.0001, + "loss": 1.606, + "step": 3370 + }, + { + "epoch": 0.38722646602722416, + "grad_norm": 0.43613606691360474, + "learning_rate": 0.0001, + "loss": 1.8034, + "step": 3371 + }, + { + "epoch": 0.3873413359370513, + "grad_norm": 0.38475316762924194, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 3372 + }, + { + "epoch": 0.3874562058468784, + "grad_norm": 0.40157008171081543, + "learning_rate": 0.0001, + "loss": 1.6925, + "step": 3373 + }, + { + "epoch": 0.3875710757567055, + "grad_norm": 0.39951568841934204, + "learning_rate": 0.0001, + "loss": 1.5993, + "step": 3374 + }, + { + "epoch": 0.38768594566653264, + "grad_norm": 0.42573562264442444, + "learning_rate": 0.0001, + "loss": 1.7148, + "step": 3375 + }, + { + "epoch": 0.38780081557635976, + "grad_norm": 0.3692642152309418, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 3376 + }, + { + "epoch": 0.3879156854861869, + "grad_norm": 0.3984464108943939, + "learning_rate": 0.0001, + "loss": 1.8419, + "step": 3377 + }, + { + "epoch": 0.388030555396014, + "grad_norm": 0.40837594866752625, + "learning_rate": 0.0001, + "loss": 1.8327, + "step": 3378 + }, + { + "epoch": 0.3881454253058411, + "grad_norm": 0.4153634011745453, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 3379 + }, + { + "epoch": 0.38826029521566824, + "grad_norm": 0.40997132658958435, + "learning_rate": 0.0001, + "loss": 1.7067, + "step": 3380 + }, + { + "epoch": 0.38837516512549536, + "grad_norm": 0.43204736709594727, + "learning_rate": 0.0001, + "loss": 1.7734, + "step": 3381 + }, + { + "epoch": 0.3884900350353225, + "grad_norm": 0.4231989085674286, + "learning_rate": 0.0001, + "loss": 1.7414, + "step": 3382 + }, + { + "epoch": 0.3886049049451496, + "grad_norm": 0.3816749155521393, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 3383 + }, + { + "epoch": 0.3887197748549767, + "grad_norm": 0.4512353539466858, + "learning_rate": 0.0001, + "loss": 1.9125, + "step": 3384 + }, + { + "epoch": 0.38883464476480384, + "grad_norm": 0.3650393486022949, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 3385 + }, + { + "epoch": 0.38894951467463096, + "grad_norm": 0.3751547038555145, + "learning_rate": 0.0001, + "loss": 1.7122, + "step": 3386 + }, + { + "epoch": 0.3890643845844581, + "grad_norm": 0.36982572078704834, + "learning_rate": 0.0001, + "loss": 1.6857, + "step": 3387 + }, + { + "epoch": 0.3891792544942852, + "grad_norm": 0.43625834584236145, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 3388 + }, + { + "epoch": 0.3892941244041123, + "grad_norm": 0.40629592537879944, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 3389 + }, + { + "epoch": 0.38940899431393944, + "grad_norm": 0.41246914863586426, + "learning_rate": 0.0001, + "loss": 1.8241, + "step": 3390 + }, + { + "epoch": 0.38952386422376656, + "grad_norm": 0.40135088562965393, + "learning_rate": 0.0001, + "loss": 1.5663, + "step": 3391 + }, + { + "epoch": 0.3896387341335937, + "grad_norm": 0.4104592502117157, + "learning_rate": 0.0001, + "loss": 1.8265, + "step": 3392 + }, + { + "epoch": 0.3897536040434208, + "grad_norm": 0.35968706011772156, + "learning_rate": 0.0001, + "loss": 1.5164, + "step": 3393 + }, + { + "epoch": 0.389868473953248, + "grad_norm": 0.4108087420463562, + "learning_rate": 0.0001, + "loss": 1.6904, + "step": 3394 + }, + { + "epoch": 0.3899833438630751, + "grad_norm": 0.39649486541748047, + "learning_rate": 0.0001, + "loss": 1.6845, + "step": 3395 + }, + { + "epoch": 0.3900982137729022, + "grad_norm": 0.448559045791626, + "learning_rate": 0.0001, + "loss": 1.8202, + "step": 3396 + }, + { + "epoch": 0.39021308368272933, + "grad_norm": 0.40180444717407227, + "learning_rate": 0.0001, + "loss": 1.7422, + "step": 3397 + }, + { + "epoch": 0.39032795359255645, + "grad_norm": 0.4011480510234833, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 3398 + }, + { + "epoch": 0.3904428235023836, + "grad_norm": 0.41441354155540466, + "learning_rate": 0.0001, + "loss": 1.5995, + "step": 3399 + }, + { + "epoch": 0.3905576934122107, + "grad_norm": 0.41887158155441284, + "learning_rate": 0.0001, + "loss": 1.7279, + "step": 3400 + }, + { + "epoch": 0.3906725633220378, + "grad_norm": 0.4123765826225281, + "learning_rate": 0.0001, + "loss": 1.7621, + "step": 3401 + }, + { + "epoch": 0.39078743323186493, + "grad_norm": 0.3798231780529022, + "learning_rate": 0.0001, + "loss": 1.4537, + "step": 3402 + }, + { + "epoch": 0.39090230314169205, + "grad_norm": 0.40583983063697815, + "learning_rate": 0.0001, + "loss": 1.609, + "step": 3403 + }, + { + "epoch": 0.3910171730515192, + "grad_norm": 0.41971534490585327, + "learning_rate": 0.0001, + "loss": 1.6041, + "step": 3404 + }, + { + "epoch": 0.3911320429613463, + "grad_norm": 0.4053299129009247, + "learning_rate": 0.0001, + "loss": 1.7071, + "step": 3405 + }, + { + "epoch": 0.3912469128711734, + "grad_norm": 0.4222469925880432, + "learning_rate": 0.0001, + "loss": 1.9102, + "step": 3406 + }, + { + "epoch": 0.39136178278100053, + "grad_norm": 0.3722541630268097, + "learning_rate": 0.0001, + "loss": 1.6368, + "step": 3407 + }, + { + "epoch": 0.39147665269082765, + "grad_norm": 0.3922461271286011, + "learning_rate": 0.0001, + "loss": 1.5646, + "step": 3408 + }, + { + "epoch": 0.3915915226006548, + "grad_norm": 0.43620502948760986, + "learning_rate": 0.0001, + "loss": 1.9101, + "step": 3409 + }, + { + "epoch": 0.3917063925104819, + "grad_norm": 0.38486000895500183, + "learning_rate": 0.0001, + "loss": 1.625, + "step": 3410 + }, + { + "epoch": 0.391821262420309, + "grad_norm": 0.417163223028183, + "learning_rate": 0.0001, + "loss": 1.4222, + "step": 3411 + }, + { + "epoch": 0.39193613233013613, + "grad_norm": 0.37510520219802856, + "learning_rate": 0.0001, + "loss": 1.2839, + "step": 3412 + }, + { + "epoch": 0.39205100223996325, + "grad_norm": 0.4312410056591034, + "learning_rate": 0.0001, + "loss": 1.7017, + "step": 3413 + }, + { + "epoch": 0.3921658721497904, + "grad_norm": 0.4008169174194336, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 3414 + }, + { + "epoch": 0.3922807420596175, + "grad_norm": 0.3868180811405182, + "learning_rate": 0.0001, + "loss": 1.6447, + "step": 3415 + }, + { + "epoch": 0.3923956119694446, + "grad_norm": 0.38877058029174805, + "learning_rate": 0.0001, + "loss": 1.8274, + "step": 3416 + }, + { + "epoch": 0.39251048187927173, + "grad_norm": 0.39784759283065796, + "learning_rate": 0.0001, + "loss": 1.7979, + "step": 3417 + }, + { + "epoch": 0.39262535178909885, + "grad_norm": 0.3924922049045563, + "learning_rate": 0.0001, + "loss": 1.713, + "step": 3418 + }, + { + "epoch": 0.392740221698926, + "grad_norm": 0.40661346912384033, + "learning_rate": 0.0001, + "loss": 1.723, + "step": 3419 + }, + { + "epoch": 0.3928550916087531, + "grad_norm": 0.4197205901145935, + "learning_rate": 0.0001, + "loss": 1.7735, + "step": 3420 + }, + { + "epoch": 0.3929699615185802, + "grad_norm": 0.3987165689468384, + "learning_rate": 0.0001, + "loss": 1.8069, + "step": 3421 + }, + { + "epoch": 0.39308483142840733, + "grad_norm": 0.38276979327201843, + "learning_rate": 0.0001, + "loss": 1.6656, + "step": 3422 + }, + { + "epoch": 0.39319970133823445, + "grad_norm": 0.3716558814048767, + "learning_rate": 0.0001, + "loss": 1.5835, + "step": 3423 + }, + { + "epoch": 0.3933145712480616, + "grad_norm": 0.3954230844974518, + "learning_rate": 0.0001, + "loss": 1.7698, + "step": 3424 + }, + { + "epoch": 0.3934294411578887, + "grad_norm": 0.39034202694892883, + "learning_rate": 0.0001, + "loss": 1.783, + "step": 3425 + }, + { + "epoch": 0.3935443110677158, + "grad_norm": 0.36752232909202576, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 3426 + }, + { + "epoch": 0.39365918097754293, + "grad_norm": 0.3931824564933777, + "learning_rate": 0.0001, + "loss": 1.7198, + "step": 3427 + }, + { + "epoch": 0.39377405088737005, + "grad_norm": 0.3895040452480316, + "learning_rate": 0.0001, + "loss": 1.717, + "step": 3428 + }, + { + "epoch": 0.3938889207971972, + "grad_norm": 0.4126931428909302, + "learning_rate": 0.0001, + "loss": 1.5665, + "step": 3429 + }, + { + "epoch": 0.3940037907070243, + "grad_norm": 0.39863380789756775, + "learning_rate": 0.0001, + "loss": 1.8072, + "step": 3430 + }, + { + "epoch": 0.3941186606168514, + "grad_norm": 0.409013032913208, + "learning_rate": 0.0001, + "loss": 1.5039, + "step": 3431 + }, + { + "epoch": 0.39423353052667853, + "grad_norm": 0.39997661113739014, + "learning_rate": 0.0001, + "loss": 1.4622, + "step": 3432 + }, + { + "epoch": 0.39434840043650565, + "grad_norm": 0.38587263226509094, + "learning_rate": 0.0001, + "loss": 1.7621, + "step": 3433 + }, + { + "epoch": 0.3944632703463328, + "grad_norm": 0.41442370414733887, + "learning_rate": 0.0001, + "loss": 1.5584, + "step": 3434 + }, + { + "epoch": 0.3945781402561599, + "grad_norm": 0.387526273727417, + "learning_rate": 0.0001, + "loss": 1.6732, + "step": 3435 + }, + { + "epoch": 0.394693010165987, + "grad_norm": 0.4018903970718384, + "learning_rate": 0.0001, + "loss": 1.7994, + "step": 3436 + }, + { + "epoch": 0.39480788007581413, + "grad_norm": 0.38275036215782166, + "learning_rate": 0.0001, + "loss": 1.6699, + "step": 3437 + }, + { + "epoch": 0.39492274998564125, + "grad_norm": 0.392865926027298, + "learning_rate": 0.0001, + "loss": 1.699, + "step": 3438 + }, + { + "epoch": 0.3950376198954684, + "grad_norm": 0.3883517384529114, + "learning_rate": 0.0001, + "loss": 1.7361, + "step": 3439 + }, + { + "epoch": 0.3951524898052955, + "grad_norm": 0.43552684783935547, + "learning_rate": 0.0001, + "loss": 1.835, + "step": 3440 + }, + { + "epoch": 0.3952673597151226, + "grad_norm": 0.38498955965042114, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 3441 + }, + { + "epoch": 0.39538222962494973, + "grad_norm": 0.4317042827606201, + "learning_rate": 0.0001, + "loss": 1.9773, + "step": 3442 + }, + { + "epoch": 0.39549709953477685, + "grad_norm": 0.4095926880836487, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 3443 + }, + { + "epoch": 0.395611969444604, + "grad_norm": 0.4393211007118225, + "learning_rate": 0.0001, + "loss": 1.8934, + "step": 3444 + }, + { + "epoch": 0.3957268393544311, + "grad_norm": 0.3967788517475128, + "learning_rate": 0.0001, + "loss": 1.6129, + "step": 3445 + }, + { + "epoch": 0.3958417092642582, + "grad_norm": 0.3683384656906128, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 3446 + }, + { + "epoch": 0.39595657917408533, + "grad_norm": 0.37010353803634644, + "learning_rate": 0.0001, + "loss": 1.6381, + "step": 3447 + }, + { + "epoch": 0.39607144908391245, + "grad_norm": 0.4637109637260437, + "learning_rate": 0.0001, + "loss": 1.7736, + "step": 3448 + }, + { + "epoch": 0.3961863189937396, + "grad_norm": 0.3716694116592407, + "learning_rate": 0.0001, + "loss": 1.5737, + "step": 3449 + }, + { + "epoch": 0.3963011889035667, + "grad_norm": 0.3940329849720001, + "learning_rate": 0.0001, + "loss": 1.8883, + "step": 3450 + }, + { + "epoch": 0.3964160588133938, + "grad_norm": 0.3837946653366089, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 3451 + }, + { + "epoch": 0.39653092872322093, + "grad_norm": 0.4419655203819275, + "learning_rate": 0.0001, + "loss": 1.7845, + "step": 3452 + }, + { + "epoch": 0.39664579863304805, + "grad_norm": 0.39042928814888, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 3453 + }, + { + "epoch": 0.3967606685428752, + "grad_norm": 0.43041858077049255, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 3454 + }, + { + "epoch": 0.3968755384527023, + "grad_norm": 0.36053743958473206, + "learning_rate": 0.0001, + "loss": 1.4182, + "step": 3455 + }, + { + "epoch": 0.3969904083625294, + "grad_norm": 0.3963378965854645, + "learning_rate": 0.0001, + "loss": 1.8062, + "step": 3456 + }, + { + "epoch": 0.39710527827235653, + "grad_norm": 0.391775906085968, + "learning_rate": 0.0001, + "loss": 1.6042, + "step": 3457 + }, + { + "epoch": 0.39722014818218365, + "grad_norm": 0.39680591225624084, + "learning_rate": 0.0001, + "loss": 1.7523, + "step": 3458 + }, + { + "epoch": 0.39733501809201077, + "grad_norm": 0.4133915901184082, + "learning_rate": 0.0001, + "loss": 1.6781, + "step": 3459 + }, + { + "epoch": 0.3974498880018379, + "grad_norm": 0.41504111886024475, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 3460 + }, + { + "epoch": 0.397564757911665, + "grad_norm": 0.39782634377479553, + "learning_rate": 0.0001, + "loss": 1.5943, + "step": 3461 + }, + { + "epoch": 0.3976796278214922, + "grad_norm": 0.4147406220436096, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 3462 + }, + { + "epoch": 0.3977944977313193, + "grad_norm": 0.3900047838687897, + "learning_rate": 0.0001, + "loss": 1.5578, + "step": 3463 + }, + { + "epoch": 0.3979093676411464, + "grad_norm": 0.3884030282497406, + "learning_rate": 0.0001, + "loss": 1.695, + "step": 3464 + }, + { + "epoch": 0.39802423755097355, + "grad_norm": 0.422282874584198, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 3465 + }, + { + "epoch": 0.39813910746080067, + "grad_norm": 0.39325493574142456, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 3466 + }, + { + "epoch": 0.3982539773706278, + "grad_norm": 0.3940114974975586, + "learning_rate": 0.0001, + "loss": 1.6235, + "step": 3467 + }, + { + "epoch": 0.3983688472804549, + "grad_norm": 0.4041246175765991, + "learning_rate": 0.0001, + "loss": 1.6107, + "step": 3468 + }, + { + "epoch": 0.398483717190282, + "grad_norm": 0.37534579634666443, + "learning_rate": 0.0001, + "loss": 1.737, + "step": 3469 + }, + { + "epoch": 0.39859858710010915, + "grad_norm": 0.40846285223960876, + "learning_rate": 0.0001, + "loss": 1.7085, + "step": 3470 + }, + { + "epoch": 0.39871345700993627, + "grad_norm": 0.43047085404396057, + "learning_rate": 0.0001, + "loss": 1.7536, + "step": 3471 + }, + { + "epoch": 0.3988283269197634, + "grad_norm": 0.3942980170249939, + "learning_rate": 0.0001, + "loss": 1.7882, + "step": 3472 + }, + { + "epoch": 0.3989431968295905, + "grad_norm": 0.408866822719574, + "learning_rate": 0.0001, + "loss": 1.6989, + "step": 3473 + }, + { + "epoch": 0.3990580667394176, + "grad_norm": 0.3862482011318207, + "learning_rate": 0.0001, + "loss": 1.6176, + "step": 3474 + }, + { + "epoch": 0.39917293664924475, + "grad_norm": 0.3940208852291107, + "learning_rate": 0.0001, + "loss": 1.6806, + "step": 3475 + }, + { + "epoch": 0.39928780655907187, + "grad_norm": 0.42412546277046204, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 3476 + }, + { + "epoch": 0.399402676468899, + "grad_norm": 0.4334600567817688, + "learning_rate": 0.0001, + "loss": 1.5058, + "step": 3477 + }, + { + "epoch": 0.3995175463787261, + "grad_norm": 0.3741202652454376, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 3478 + }, + { + "epoch": 0.3996324162885532, + "grad_norm": 0.37549999356269836, + "learning_rate": 0.0001, + "loss": 1.4891, + "step": 3479 + }, + { + "epoch": 0.39974728619838035, + "grad_norm": 0.41318750381469727, + "learning_rate": 0.0001, + "loss": 1.7634, + "step": 3480 + }, + { + "epoch": 0.39986215610820747, + "grad_norm": 0.39875760674476624, + "learning_rate": 0.0001, + "loss": 1.74, + "step": 3481 + }, + { + "epoch": 0.3999770260180346, + "grad_norm": 0.4000686705112457, + "learning_rate": 0.0001, + "loss": 1.5764, + "step": 3482 + }, + { + "epoch": 0.4000918959278617, + "grad_norm": 0.4229472577571869, + "learning_rate": 0.0001, + "loss": 1.5092, + "step": 3483 + }, + { + "epoch": 0.4002067658376888, + "grad_norm": 0.4480687975883484, + "learning_rate": 0.0001, + "loss": 1.8381, + "step": 3484 + }, + { + "epoch": 0.40032163574751595, + "grad_norm": 0.3803386688232422, + "learning_rate": 0.0001, + "loss": 1.6061, + "step": 3485 + }, + { + "epoch": 0.40043650565734307, + "grad_norm": 0.40477797389030457, + "learning_rate": 0.0001, + "loss": 1.7031, + "step": 3486 + }, + { + "epoch": 0.4005513755671702, + "grad_norm": 0.4002171456813812, + "learning_rate": 0.0001, + "loss": 1.7292, + "step": 3487 + }, + { + "epoch": 0.4006662454769973, + "grad_norm": 0.4127141833305359, + "learning_rate": 0.0001, + "loss": 1.8455, + "step": 3488 + }, + { + "epoch": 0.4007811153868244, + "grad_norm": 0.4116250276565552, + "learning_rate": 0.0001, + "loss": 1.5637, + "step": 3489 + }, + { + "epoch": 0.40089598529665155, + "grad_norm": 0.3903186619281769, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 3490 + }, + { + "epoch": 0.40101085520647867, + "grad_norm": 0.3886655867099762, + "learning_rate": 0.0001, + "loss": 1.512, + "step": 3491 + }, + { + "epoch": 0.4011257251163058, + "grad_norm": 0.40146976709365845, + "learning_rate": 0.0001, + "loss": 1.6052, + "step": 3492 + }, + { + "epoch": 0.4012405950261329, + "grad_norm": 0.4304850697517395, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 3493 + }, + { + "epoch": 0.40135546493596, + "grad_norm": 0.41273269057273865, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 3494 + }, + { + "epoch": 0.40147033484578715, + "grad_norm": 0.42954984307289124, + "learning_rate": 0.0001, + "loss": 1.7387, + "step": 3495 + }, + { + "epoch": 0.40158520475561427, + "grad_norm": 0.3892943859100342, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 3496 + }, + { + "epoch": 0.4017000746654414, + "grad_norm": 0.43084821105003357, + "learning_rate": 0.0001, + "loss": 1.8832, + "step": 3497 + }, + { + "epoch": 0.4018149445752685, + "grad_norm": 0.3853071331977844, + "learning_rate": 0.0001, + "loss": 1.4343, + "step": 3498 + }, + { + "epoch": 0.4019298144850956, + "grad_norm": 0.3769509196281433, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 3499 + }, + { + "epoch": 0.40204468439492275, + "grad_norm": 0.41047728061676025, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 3500 + }, + { + "epoch": 0.40215955430474987, + "grad_norm": 0.3837498128414154, + "learning_rate": 0.0001, + "loss": 1.6573, + "step": 3501 + }, + { + "epoch": 0.402274424214577, + "grad_norm": 0.4407394826412201, + "learning_rate": 0.0001, + "loss": 1.6908, + "step": 3502 + }, + { + "epoch": 0.4023892941244041, + "grad_norm": 0.3970791697502136, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 3503 + }, + { + "epoch": 0.4025041640342312, + "grad_norm": 0.402908593416214, + "learning_rate": 0.0001, + "loss": 1.7421, + "step": 3504 + }, + { + "epoch": 0.40261903394405835, + "grad_norm": 0.3898850977420807, + "learning_rate": 0.0001, + "loss": 1.5148, + "step": 3505 + }, + { + "epoch": 0.40273390385388547, + "grad_norm": 0.4609782099723816, + "learning_rate": 0.0001, + "loss": 1.9721, + "step": 3506 + }, + { + "epoch": 0.4028487737637126, + "grad_norm": 0.4256317615509033, + "learning_rate": 0.0001, + "loss": 1.8592, + "step": 3507 + }, + { + "epoch": 0.4029636436735397, + "grad_norm": 0.38240641355514526, + "learning_rate": 0.0001, + "loss": 1.5619, + "step": 3508 + }, + { + "epoch": 0.4030785135833668, + "grad_norm": 0.40074291825294495, + "learning_rate": 0.0001, + "loss": 1.6603, + "step": 3509 + }, + { + "epoch": 0.40319338349319395, + "grad_norm": 0.38915419578552246, + "learning_rate": 0.0001, + "loss": 1.4588, + "step": 3510 + }, + { + "epoch": 0.40330825340302107, + "grad_norm": 0.39490392804145813, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 3511 + }, + { + "epoch": 0.4034231233128482, + "grad_norm": 0.37011227011680603, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 3512 + }, + { + "epoch": 0.4035379932226753, + "grad_norm": 0.41320136189460754, + "learning_rate": 0.0001, + "loss": 1.6905, + "step": 3513 + }, + { + "epoch": 0.4036528631325024, + "grad_norm": 0.43924593925476074, + "learning_rate": 0.0001, + "loss": 1.7871, + "step": 3514 + }, + { + "epoch": 0.40376773304232955, + "grad_norm": 0.3919188380241394, + "learning_rate": 0.0001, + "loss": 1.5806, + "step": 3515 + }, + { + "epoch": 0.40388260295215667, + "grad_norm": 0.40619519352912903, + "learning_rate": 0.0001, + "loss": 1.5834, + "step": 3516 + }, + { + "epoch": 0.4039974728619838, + "grad_norm": 0.4137546420097351, + "learning_rate": 0.0001, + "loss": 1.7776, + "step": 3517 + }, + { + "epoch": 0.4041123427718109, + "grad_norm": 0.4008006453514099, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 3518 + }, + { + "epoch": 0.404227212681638, + "grad_norm": 0.43822675943374634, + "learning_rate": 0.0001, + "loss": 1.8259, + "step": 3519 + }, + { + "epoch": 0.40434208259146515, + "grad_norm": 0.41821324825286865, + "learning_rate": 0.0001, + "loss": 1.7496, + "step": 3520 + }, + { + "epoch": 0.40445695250129227, + "grad_norm": 0.45367196202278137, + "learning_rate": 0.0001, + "loss": 1.9165, + "step": 3521 + }, + { + "epoch": 0.4045718224111194, + "grad_norm": 0.38345637917518616, + "learning_rate": 0.0001, + "loss": 1.7299, + "step": 3522 + }, + { + "epoch": 0.4046866923209465, + "grad_norm": 0.3878767490386963, + "learning_rate": 0.0001, + "loss": 1.683, + "step": 3523 + }, + { + "epoch": 0.4048015622307736, + "grad_norm": 0.43922069668769836, + "learning_rate": 0.0001, + "loss": 1.8087, + "step": 3524 + }, + { + "epoch": 0.40491643214060075, + "grad_norm": 0.38794466853141785, + "learning_rate": 0.0001, + "loss": 1.6888, + "step": 3525 + }, + { + "epoch": 0.40503130205042787, + "grad_norm": 0.39647185802459717, + "learning_rate": 0.0001, + "loss": 1.7157, + "step": 3526 + }, + { + "epoch": 0.405146171960255, + "grad_norm": 0.38950473070144653, + "learning_rate": 0.0001, + "loss": 1.7733, + "step": 3527 + }, + { + "epoch": 0.4052610418700821, + "grad_norm": 0.43412071466445923, + "learning_rate": 0.0001, + "loss": 1.8643, + "step": 3528 + }, + { + "epoch": 0.4053759117799092, + "grad_norm": 0.4561465382575989, + "learning_rate": 0.0001, + "loss": 1.9004, + "step": 3529 + }, + { + "epoch": 0.40549078168973635, + "grad_norm": 0.3911570906639099, + "learning_rate": 0.0001, + "loss": 1.7185, + "step": 3530 + }, + { + "epoch": 0.4056056515995635, + "grad_norm": 0.3808756172657013, + "learning_rate": 0.0001, + "loss": 1.6488, + "step": 3531 + }, + { + "epoch": 0.40572052150939064, + "grad_norm": 0.41304758191108704, + "learning_rate": 0.0001, + "loss": 1.704, + "step": 3532 + }, + { + "epoch": 0.40583539141921776, + "grad_norm": 0.3868066072463989, + "learning_rate": 0.0001, + "loss": 1.6564, + "step": 3533 + }, + { + "epoch": 0.4059502613290449, + "grad_norm": 0.47195518016815186, + "learning_rate": 0.0001, + "loss": 1.85, + "step": 3534 + }, + { + "epoch": 0.406065131238872, + "grad_norm": 0.40682026743888855, + "learning_rate": 0.0001, + "loss": 1.7334, + "step": 3535 + }, + { + "epoch": 0.4061800011486991, + "grad_norm": 0.4269334375858307, + "learning_rate": 0.0001, + "loss": 1.743, + "step": 3536 + }, + { + "epoch": 0.40629487105852624, + "grad_norm": 0.40638071298599243, + "learning_rate": 0.0001, + "loss": 1.734, + "step": 3537 + }, + { + "epoch": 0.40640974096835336, + "grad_norm": 0.3894198536872864, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 3538 + }, + { + "epoch": 0.4065246108781805, + "grad_norm": 0.4187344014644623, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 3539 + }, + { + "epoch": 0.4066394807880076, + "grad_norm": 0.3857315182685852, + "learning_rate": 0.0001, + "loss": 1.5714, + "step": 3540 + }, + { + "epoch": 0.4067543506978347, + "grad_norm": 0.3764125108718872, + "learning_rate": 0.0001, + "loss": 1.6134, + "step": 3541 + }, + { + "epoch": 0.40686922060766184, + "grad_norm": 0.38731610774993896, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 3542 + }, + { + "epoch": 0.40698409051748896, + "grad_norm": 0.38628220558166504, + "learning_rate": 0.0001, + "loss": 1.599, + "step": 3543 + }, + { + "epoch": 0.4070989604273161, + "grad_norm": 0.39660200476646423, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 3544 + }, + { + "epoch": 0.4072138303371432, + "grad_norm": 0.4349413812160492, + "learning_rate": 0.0001, + "loss": 1.9247, + "step": 3545 + }, + { + "epoch": 0.4073287002469703, + "grad_norm": 0.41799604892730713, + "learning_rate": 0.0001, + "loss": 1.7725, + "step": 3546 + }, + { + "epoch": 0.40744357015679744, + "grad_norm": 0.4117201566696167, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 3547 + }, + { + "epoch": 0.40755844006662456, + "grad_norm": 0.3982154130935669, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 3548 + }, + { + "epoch": 0.4076733099764517, + "grad_norm": 0.386783242225647, + "learning_rate": 0.0001, + "loss": 1.4822, + "step": 3549 + }, + { + "epoch": 0.4077881798862788, + "grad_norm": 0.3949221670627594, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 3550 + }, + { + "epoch": 0.4079030497961059, + "grad_norm": 0.38319775462150574, + "learning_rate": 0.0001, + "loss": 1.6634, + "step": 3551 + }, + { + "epoch": 0.40801791970593304, + "grad_norm": 0.4165116250514984, + "learning_rate": 0.0001, + "loss": 1.7474, + "step": 3552 + }, + { + "epoch": 0.40813278961576016, + "grad_norm": 0.4256540536880493, + "learning_rate": 0.0001, + "loss": 1.8328, + "step": 3553 + }, + { + "epoch": 0.4082476595255873, + "grad_norm": 0.4176916182041168, + "learning_rate": 0.0001, + "loss": 1.7, + "step": 3554 + }, + { + "epoch": 0.4083625294354144, + "grad_norm": 0.40990862250328064, + "learning_rate": 0.0001, + "loss": 1.3584, + "step": 3555 + }, + { + "epoch": 0.4084773993452415, + "grad_norm": 0.39748623967170715, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 3556 + }, + { + "epoch": 0.40859226925506864, + "grad_norm": 0.3950015604496002, + "learning_rate": 0.0001, + "loss": 1.5983, + "step": 3557 + }, + { + "epoch": 0.40870713916489576, + "grad_norm": 0.3774421215057373, + "learning_rate": 0.0001, + "loss": 1.4646, + "step": 3558 + }, + { + "epoch": 0.4088220090747229, + "grad_norm": 0.3629385530948639, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 3559 + }, + { + "epoch": 0.40893687898455, + "grad_norm": 0.3893386125564575, + "learning_rate": 0.0001, + "loss": 1.6006, + "step": 3560 + }, + { + "epoch": 0.4090517488943771, + "grad_norm": 0.3888409435749054, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 3561 + }, + { + "epoch": 0.40916661880420424, + "grad_norm": 0.415077805519104, + "learning_rate": 0.0001, + "loss": 1.6517, + "step": 3562 + }, + { + "epoch": 0.40928148871403136, + "grad_norm": 0.420685350894928, + "learning_rate": 0.0001, + "loss": 1.7946, + "step": 3563 + }, + { + "epoch": 0.4093963586238585, + "grad_norm": 0.4362063407897949, + "learning_rate": 0.0001, + "loss": 1.7735, + "step": 3564 + }, + { + "epoch": 0.4095112285336856, + "grad_norm": 0.42826390266418457, + "learning_rate": 0.0001, + "loss": 1.8189, + "step": 3565 + }, + { + "epoch": 0.4096260984435127, + "grad_norm": 0.3822219669818878, + "learning_rate": 0.0001, + "loss": 1.6092, + "step": 3566 + }, + { + "epoch": 0.40974096835333984, + "grad_norm": 0.4273683428764343, + "learning_rate": 0.0001, + "loss": 1.8192, + "step": 3567 + }, + { + "epoch": 0.40985583826316696, + "grad_norm": 0.401048481464386, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 3568 + }, + { + "epoch": 0.4099707081729941, + "grad_norm": 0.4532688558101654, + "learning_rate": 0.0001, + "loss": 1.6273, + "step": 3569 + }, + { + "epoch": 0.4100855780828212, + "grad_norm": 0.40526339411735535, + "learning_rate": 0.0001, + "loss": 1.8208, + "step": 3570 + }, + { + "epoch": 0.4102004479926483, + "grad_norm": 0.3863803446292877, + "learning_rate": 0.0001, + "loss": 1.6683, + "step": 3571 + }, + { + "epoch": 0.41031531790247544, + "grad_norm": 0.38212668895721436, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 3572 + }, + { + "epoch": 0.41043018781230256, + "grad_norm": 0.38503432273864746, + "learning_rate": 0.0001, + "loss": 1.5401, + "step": 3573 + }, + { + "epoch": 0.4105450577221297, + "grad_norm": 0.4347050189971924, + "learning_rate": 0.0001, + "loss": 1.7337, + "step": 3574 + }, + { + "epoch": 0.4106599276319568, + "grad_norm": 0.4009620249271393, + "learning_rate": 0.0001, + "loss": 1.4972, + "step": 3575 + }, + { + "epoch": 0.4107747975417839, + "grad_norm": 0.40209561586380005, + "learning_rate": 0.0001, + "loss": 1.716, + "step": 3576 + }, + { + "epoch": 0.41088966745161104, + "grad_norm": 0.42424091696739197, + "learning_rate": 0.0001, + "loss": 1.6551, + "step": 3577 + }, + { + "epoch": 0.41100453736143816, + "grad_norm": 0.405749648809433, + "learning_rate": 0.0001, + "loss": 1.3783, + "step": 3578 + }, + { + "epoch": 0.4111194072712653, + "grad_norm": 0.4051080346107483, + "learning_rate": 0.0001, + "loss": 1.7914, + "step": 3579 + }, + { + "epoch": 0.4112342771810924, + "grad_norm": 0.3940957188606262, + "learning_rate": 0.0001, + "loss": 1.4819, + "step": 3580 + }, + { + "epoch": 0.4113491470909195, + "grad_norm": 0.4148561358451843, + "learning_rate": 0.0001, + "loss": 1.815, + "step": 3581 + }, + { + "epoch": 0.41146401700074664, + "grad_norm": 0.37967970967292786, + "learning_rate": 0.0001, + "loss": 1.5627, + "step": 3582 + }, + { + "epoch": 0.41157888691057376, + "grad_norm": 0.4085034728050232, + "learning_rate": 0.0001, + "loss": 1.7586, + "step": 3583 + }, + { + "epoch": 0.4116937568204009, + "grad_norm": 0.43780767917633057, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 3584 + }, + { + "epoch": 0.411808626730228, + "grad_norm": 0.4053542912006378, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 3585 + }, + { + "epoch": 0.4119234966400551, + "grad_norm": 0.42248353362083435, + "learning_rate": 0.0001, + "loss": 1.7921, + "step": 3586 + }, + { + "epoch": 0.41203836654988224, + "grad_norm": 0.38272780179977417, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 3587 + }, + { + "epoch": 0.41215323645970936, + "grad_norm": 0.3969421088695526, + "learning_rate": 0.0001, + "loss": 1.7657, + "step": 3588 + }, + { + "epoch": 0.4122681063695365, + "grad_norm": 0.3919724225997925, + "learning_rate": 0.0001, + "loss": 1.5506, + "step": 3589 + }, + { + "epoch": 0.4123829762793636, + "grad_norm": 0.37912601232528687, + "learning_rate": 0.0001, + "loss": 1.6657, + "step": 3590 + }, + { + "epoch": 0.4124978461891907, + "grad_norm": 0.4223651885986328, + "learning_rate": 0.0001, + "loss": 1.7099, + "step": 3591 + }, + { + "epoch": 0.41261271609901784, + "grad_norm": 0.39578935503959656, + "learning_rate": 0.0001, + "loss": 1.5249, + "step": 3592 + }, + { + "epoch": 0.41272758600884496, + "grad_norm": 0.40267515182495117, + "learning_rate": 0.0001, + "loss": 1.5518, + "step": 3593 + }, + { + "epoch": 0.4128424559186721, + "grad_norm": 0.4238963723182678, + "learning_rate": 0.0001, + "loss": 1.9176, + "step": 3594 + }, + { + "epoch": 0.4129573258284992, + "grad_norm": 0.43114468455314636, + "learning_rate": 0.0001, + "loss": 1.7819, + "step": 3595 + }, + { + "epoch": 0.4130721957383263, + "grad_norm": 0.36643150448799133, + "learning_rate": 0.0001, + "loss": 1.5617, + "step": 3596 + }, + { + "epoch": 0.41318706564815344, + "grad_norm": 0.37914469838142395, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 3597 + }, + { + "epoch": 0.41330193555798056, + "grad_norm": 0.4015408158302307, + "learning_rate": 0.0001, + "loss": 1.6765, + "step": 3598 + }, + { + "epoch": 0.41341680546780774, + "grad_norm": 0.40977799892425537, + "learning_rate": 0.0001, + "loss": 1.445, + "step": 3599 + }, + { + "epoch": 0.41353167537763486, + "grad_norm": 0.4055366814136505, + "learning_rate": 0.0001, + "loss": 1.5245, + "step": 3600 + }, + { + "epoch": 0.413646545287462, + "grad_norm": 0.4310015141963959, + "learning_rate": 0.0001, + "loss": 1.6015, + "step": 3601 + }, + { + "epoch": 0.4137614151972891, + "grad_norm": 0.4342171847820282, + "learning_rate": 0.0001, + "loss": 1.7804, + "step": 3602 + }, + { + "epoch": 0.4138762851071162, + "grad_norm": 0.3718089759349823, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 3603 + }, + { + "epoch": 0.41399115501694334, + "grad_norm": 0.38151004910469055, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 3604 + }, + { + "epoch": 0.41410602492677046, + "grad_norm": 0.4037642776966095, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 3605 + }, + { + "epoch": 0.4142208948365976, + "grad_norm": 0.40361806750297546, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 3606 + }, + { + "epoch": 0.4143357647464247, + "grad_norm": 0.4126417934894562, + "learning_rate": 0.0001, + "loss": 1.703, + "step": 3607 + }, + { + "epoch": 0.4144506346562518, + "grad_norm": 0.409773051738739, + "learning_rate": 0.0001, + "loss": 1.6464, + "step": 3608 + }, + { + "epoch": 0.41456550456607894, + "grad_norm": 0.44128933548927307, + "learning_rate": 0.0001, + "loss": 1.7266, + "step": 3609 + }, + { + "epoch": 0.41468037447590606, + "grad_norm": 0.3855801820755005, + "learning_rate": 0.0001, + "loss": 1.5346, + "step": 3610 + }, + { + "epoch": 0.4147952443857332, + "grad_norm": 0.3980732262134552, + "learning_rate": 0.0001, + "loss": 1.7958, + "step": 3611 + }, + { + "epoch": 0.4149101142955603, + "grad_norm": 0.4035704433917999, + "learning_rate": 0.0001, + "loss": 1.5173, + "step": 3612 + }, + { + "epoch": 0.4150249842053874, + "grad_norm": 0.3865931034088135, + "learning_rate": 0.0001, + "loss": 1.778, + "step": 3613 + }, + { + "epoch": 0.41513985411521453, + "grad_norm": 0.4053201377391815, + "learning_rate": 0.0001, + "loss": 1.7013, + "step": 3614 + }, + { + "epoch": 0.41525472402504165, + "grad_norm": 0.3935595452785492, + "learning_rate": 0.0001, + "loss": 1.6111, + "step": 3615 + }, + { + "epoch": 0.4153695939348688, + "grad_norm": 0.3803655803203583, + "learning_rate": 0.0001, + "loss": 1.5889, + "step": 3616 + }, + { + "epoch": 0.4154844638446959, + "grad_norm": 0.4078107476234436, + "learning_rate": 0.0001, + "loss": 1.6897, + "step": 3617 + }, + { + "epoch": 0.415599333754523, + "grad_norm": 0.40983328223228455, + "learning_rate": 0.0001, + "loss": 1.8803, + "step": 3618 + }, + { + "epoch": 0.41571420366435013, + "grad_norm": 0.3894909918308258, + "learning_rate": 0.0001, + "loss": 1.6771, + "step": 3619 + }, + { + "epoch": 0.41582907357417725, + "grad_norm": 0.41159242391586304, + "learning_rate": 0.0001, + "loss": 1.7309, + "step": 3620 + }, + { + "epoch": 0.4159439434840044, + "grad_norm": 0.4065288305282593, + "learning_rate": 0.0001, + "loss": 1.7908, + "step": 3621 + }, + { + "epoch": 0.4160588133938315, + "grad_norm": 0.4101758599281311, + "learning_rate": 0.0001, + "loss": 1.7292, + "step": 3622 + }, + { + "epoch": 0.4161736833036586, + "grad_norm": 0.41453102231025696, + "learning_rate": 0.0001, + "loss": 1.8015, + "step": 3623 + }, + { + "epoch": 0.41628855321348573, + "grad_norm": 0.39582541584968567, + "learning_rate": 0.0001, + "loss": 1.7353, + "step": 3624 + }, + { + "epoch": 0.41640342312331285, + "grad_norm": 0.4312124252319336, + "learning_rate": 0.0001, + "loss": 1.7743, + "step": 3625 + }, + { + "epoch": 0.41651829303314, + "grad_norm": 0.40501224994659424, + "learning_rate": 0.0001, + "loss": 1.8335, + "step": 3626 + }, + { + "epoch": 0.4166331629429671, + "grad_norm": 0.39315342903137207, + "learning_rate": 0.0001, + "loss": 1.5686, + "step": 3627 + }, + { + "epoch": 0.4167480328527942, + "grad_norm": 0.4335770606994629, + "learning_rate": 0.0001, + "loss": 1.6204, + "step": 3628 + }, + { + "epoch": 0.41686290276262133, + "grad_norm": 0.3941115736961365, + "learning_rate": 0.0001, + "loss": 1.6904, + "step": 3629 + }, + { + "epoch": 0.41697777267244845, + "grad_norm": 0.40810754895210266, + "learning_rate": 0.0001, + "loss": 1.6955, + "step": 3630 + }, + { + "epoch": 0.4170926425822756, + "grad_norm": 0.3764353394508362, + "learning_rate": 0.0001, + "loss": 1.496, + "step": 3631 + }, + { + "epoch": 0.4172075124921027, + "grad_norm": 0.41539716720581055, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 3632 + }, + { + "epoch": 0.4173223824019298, + "grad_norm": 0.3807416260242462, + "learning_rate": 0.0001, + "loss": 1.6627, + "step": 3633 + }, + { + "epoch": 0.41743725231175693, + "grad_norm": 0.38666248321533203, + "learning_rate": 0.0001, + "loss": 1.5018, + "step": 3634 + }, + { + "epoch": 0.41755212222158405, + "grad_norm": 0.40492692589759827, + "learning_rate": 0.0001, + "loss": 1.5823, + "step": 3635 + }, + { + "epoch": 0.4176669921314112, + "grad_norm": 0.38428235054016113, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 3636 + }, + { + "epoch": 0.4177818620412383, + "grad_norm": 0.3912535607814789, + "learning_rate": 0.0001, + "loss": 1.6182, + "step": 3637 + }, + { + "epoch": 0.4178967319510654, + "grad_norm": 0.3759515881538391, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 3638 + }, + { + "epoch": 0.41801160186089253, + "grad_norm": 0.38995474576950073, + "learning_rate": 0.0001, + "loss": 1.6871, + "step": 3639 + }, + { + "epoch": 0.41812647177071965, + "grad_norm": 0.38808655738830566, + "learning_rate": 0.0001, + "loss": 1.3153, + "step": 3640 + }, + { + "epoch": 0.4182413416805468, + "grad_norm": 0.3956000804901123, + "learning_rate": 0.0001, + "loss": 1.575, + "step": 3641 + }, + { + "epoch": 0.4183562115903739, + "grad_norm": 0.43580135703086853, + "learning_rate": 0.0001, + "loss": 1.6672, + "step": 3642 + }, + { + "epoch": 0.418471081500201, + "grad_norm": 0.4119028151035309, + "learning_rate": 0.0001, + "loss": 1.7918, + "step": 3643 + }, + { + "epoch": 0.41858595141002813, + "grad_norm": 0.44993263483047485, + "learning_rate": 0.0001, + "loss": 1.88, + "step": 3644 + }, + { + "epoch": 0.41870082131985525, + "grad_norm": 0.4024428129196167, + "learning_rate": 0.0001, + "loss": 1.5243, + "step": 3645 + }, + { + "epoch": 0.4188156912296824, + "grad_norm": 0.4061081111431122, + "learning_rate": 0.0001, + "loss": 1.7381, + "step": 3646 + }, + { + "epoch": 0.4189305611395095, + "grad_norm": 0.3835539221763611, + "learning_rate": 0.0001, + "loss": 1.4161, + "step": 3647 + }, + { + "epoch": 0.4190454310493366, + "grad_norm": 0.41590166091918945, + "learning_rate": 0.0001, + "loss": 1.7132, + "step": 3648 + }, + { + "epoch": 0.41916030095916373, + "grad_norm": 0.40571367740631104, + "learning_rate": 0.0001, + "loss": 1.5744, + "step": 3649 + }, + { + "epoch": 0.41927517086899085, + "grad_norm": 0.43440738320350647, + "learning_rate": 0.0001, + "loss": 1.7795, + "step": 3650 + }, + { + "epoch": 0.419390040778818, + "grad_norm": 0.40848132967948914, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 3651 + }, + { + "epoch": 0.4195049106886451, + "grad_norm": 0.3971754014492035, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 3652 + }, + { + "epoch": 0.4196197805984722, + "grad_norm": 0.3942772448062897, + "learning_rate": 0.0001, + "loss": 1.59, + "step": 3653 + }, + { + "epoch": 0.41973465050829933, + "grad_norm": 0.4283560812473297, + "learning_rate": 0.0001, + "loss": 1.7446, + "step": 3654 + }, + { + "epoch": 0.41984952041812645, + "grad_norm": 0.39290323853492737, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 3655 + }, + { + "epoch": 0.4199643903279536, + "grad_norm": 0.3974771797657013, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 3656 + }, + { + "epoch": 0.4200792602377807, + "grad_norm": 0.40750113129615784, + "learning_rate": 0.0001, + "loss": 1.8154, + "step": 3657 + }, + { + "epoch": 0.4201941301476078, + "grad_norm": 0.4657525420188904, + "learning_rate": 0.0001, + "loss": 1.8862, + "step": 3658 + }, + { + "epoch": 0.42030900005743493, + "grad_norm": 0.40977463126182556, + "learning_rate": 0.0001, + "loss": 1.7574, + "step": 3659 + }, + { + "epoch": 0.42042386996726205, + "grad_norm": 0.4081574082374573, + "learning_rate": 0.0001, + "loss": 1.7681, + "step": 3660 + }, + { + "epoch": 0.4205387398770892, + "grad_norm": 0.46353742480278015, + "learning_rate": 0.0001, + "loss": 1.9854, + "step": 3661 + }, + { + "epoch": 0.4206536097869163, + "grad_norm": 0.3851860463619232, + "learning_rate": 0.0001, + "loss": 1.5924, + "step": 3662 + }, + { + "epoch": 0.4207684796967434, + "grad_norm": 0.38905036449432373, + "learning_rate": 0.0001, + "loss": 1.5134, + "step": 3663 + }, + { + "epoch": 0.42088334960657053, + "grad_norm": 0.3887808620929718, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 3664 + }, + { + "epoch": 0.42099821951639765, + "grad_norm": 0.39309823513031006, + "learning_rate": 0.0001, + "loss": 1.7099, + "step": 3665 + }, + { + "epoch": 0.4211130894262248, + "grad_norm": 0.389574259519577, + "learning_rate": 0.0001, + "loss": 1.7133, + "step": 3666 + }, + { + "epoch": 0.4212279593360519, + "grad_norm": 0.3817690312862396, + "learning_rate": 0.0001, + "loss": 1.4487, + "step": 3667 + }, + { + "epoch": 0.42134282924587907, + "grad_norm": 0.41336965560913086, + "learning_rate": 0.0001, + "loss": 1.7784, + "step": 3668 + }, + { + "epoch": 0.4214576991557062, + "grad_norm": 0.42329856753349304, + "learning_rate": 0.0001, + "loss": 1.7214, + "step": 3669 + }, + { + "epoch": 0.4215725690655333, + "grad_norm": 0.3827388882637024, + "learning_rate": 0.0001, + "loss": 1.5998, + "step": 3670 + }, + { + "epoch": 0.42168743897536043, + "grad_norm": 0.37781426310539246, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 3671 + }, + { + "epoch": 0.42180230888518755, + "grad_norm": 0.40145260095596313, + "learning_rate": 0.0001, + "loss": 1.6156, + "step": 3672 + }, + { + "epoch": 0.42191717879501467, + "grad_norm": 0.37764087319374084, + "learning_rate": 0.0001, + "loss": 1.5918, + "step": 3673 + }, + { + "epoch": 0.4220320487048418, + "grad_norm": 0.40427038073539734, + "learning_rate": 0.0001, + "loss": 1.8588, + "step": 3674 + }, + { + "epoch": 0.4221469186146689, + "grad_norm": 0.3810051679611206, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 3675 + }, + { + "epoch": 0.42226178852449603, + "grad_norm": 0.4347747266292572, + "learning_rate": 0.0001, + "loss": 1.81, + "step": 3676 + }, + { + "epoch": 0.42237665843432315, + "grad_norm": 0.4019508361816406, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 3677 + }, + { + "epoch": 0.42249152834415027, + "grad_norm": 0.40587884187698364, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 3678 + }, + { + "epoch": 0.4226063982539774, + "grad_norm": 0.3974643051624298, + "learning_rate": 0.0001, + "loss": 1.646, + "step": 3679 + }, + { + "epoch": 0.4227212681638045, + "grad_norm": 0.443687379360199, + "learning_rate": 0.0001, + "loss": 1.9096, + "step": 3680 + }, + { + "epoch": 0.42283613807363163, + "grad_norm": 0.4022139608860016, + "learning_rate": 0.0001, + "loss": 1.8164, + "step": 3681 + }, + { + "epoch": 0.42295100798345875, + "grad_norm": 0.46298283338546753, + "learning_rate": 0.0001, + "loss": 1.9288, + "step": 3682 + }, + { + "epoch": 0.42306587789328587, + "grad_norm": 0.3819207549095154, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 3683 + }, + { + "epoch": 0.423180747803113, + "grad_norm": 0.39071542024612427, + "learning_rate": 0.0001, + "loss": 1.4206, + "step": 3684 + }, + { + "epoch": 0.4232956177129401, + "grad_norm": 0.39119791984558105, + "learning_rate": 0.0001, + "loss": 1.5128, + "step": 3685 + }, + { + "epoch": 0.42341048762276723, + "grad_norm": 0.41179823875427246, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 3686 + }, + { + "epoch": 0.42352535753259435, + "grad_norm": 0.40467724204063416, + "learning_rate": 0.0001, + "loss": 1.6625, + "step": 3687 + }, + { + "epoch": 0.42364022744242147, + "grad_norm": 0.4044126570224762, + "learning_rate": 0.0001, + "loss": 1.7322, + "step": 3688 + }, + { + "epoch": 0.4237550973522486, + "grad_norm": 0.39904049038887024, + "learning_rate": 0.0001, + "loss": 1.4788, + "step": 3689 + }, + { + "epoch": 0.4238699672620757, + "grad_norm": 0.4178762137889862, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 3690 + }, + { + "epoch": 0.42398483717190283, + "grad_norm": 0.4076288640499115, + "learning_rate": 0.0001, + "loss": 1.6828, + "step": 3691 + }, + { + "epoch": 0.42409970708172995, + "grad_norm": 0.4018821716308594, + "learning_rate": 0.0001, + "loss": 1.5477, + "step": 3692 + }, + { + "epoch": 0.42421457699155707, + "grad_norm": 0.40769195556640625, + "learning_rate": 0.0001, + "loss": 1.6397, + "step": 3693 + }, + { + "epoch": 0.4243294469013842, + "grad_norm": 0.38684284687042236, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 3694 + }, + { + "epoch": 0.4244443168112113, + "grad_norm": 0.41944825649261475, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 3695 + }, + { + "epoch": 0.42455918672103843, + "grad_norm": 0.38351449370384216, + "learning_rate": 0.0001, + "loss": 1.6596, + "step": 3696 + }, + { + "epoch": 0.42467405663086555, + "grad_norm": 0.429304838180542, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 3697 + }, + { + "epoch": 0.42478892654069267, + "grad_norm": 0.42793336510658264, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 3698 + }, + { + "epoch": 0.4249037964505198, + "grad_norm": 0.41888338327407837, + "learning_rate": 0.0001, + "loss": 1.7443, + "step": 3699 + }, + { + "epoch": 0.4250186663603469, + "grad_norm": 0.3938450217247009, + "learning_rate": 0.0001, + "loss": 1.4583, + "step": 3700 + }, + { + "epoch": 0.42513353627017403, + "grad_norm": 0.3846457004547119, + "learning_rate": 0.0001, + "loss": 1.4163, + "step": 3701 + }, + { + "epoch": 0.42524840618000115, + "grad_norm": 0.37341800332069397, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 3702 + }, + { + "epoch": 0.42536327608982827, + "grad_norm": 0.39832285046577454, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 3703 + }, + { + "epoch": 0.4254781459996554, + "grad_norm": 0.374226838350296, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 3704 + }, + { + "epoch": 0.4255930159094825, + "grad_norm": 0.4002193212509155, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 3705 + }, + { + "epoch": 0.42570788581930963, + "grad_norm": 0.3768205940723419, + "learning_rate": 0.0001, + "loss": 1.5192, + "step": 3706 + }, + { + "epoch": 0.42582275572913675, + "grad_norm": 0.3951648771762848, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 3707 + }, + { + "epoch": 0.42593762563896387, + "grad_norm": 0.4215008616447449, + "learning_rate": 0.0001, + "loss": 1.8617, + "step": 3708 + }, + { + "epoch": 0.426052495548791, + "grad_norm": 0.4458306133747101, + "learning_rate": 0.0001, + "loss": 1.7462, + "step": 3709 + }, + { + "epoch": 0.4261673654586181, + "grad_norm": 0.40523743629455566, + "learning_rate": 0.0001, + "loss": 1.7303, + "step": 3710 + }, + { + "epoch": 0.42628223536844523, + "grad_norm": 0.412463515996933, + "learning_rate": 0.0001, + "loss": 1.6925, + "step": 3711 + }, + { + "epoch": 0.42639710527827235, + "grad_norm": 0.4172394871711731, + "learning_rate": 0.0001, + "loss": 1.7788, + "step": 3712 + }, + { + "epoch": 0.42651197518809947, + "grad_norm": 0.38772034645080566, + "learning_rate": 0.0001, + "loss": 1.6891, + "step": 3713 + }, + { + "epoch": 0.4266268450979266, + "grad_norm": 0.43306443095207214, + "learning_rate": 0.0001, + "loss": 1.7567, + "step": 3714 + }, + { + "epoch": 0.4267417150077537, + "grad_norm": 0.4059670865535736, + "learning_rate": 0.0001, + "loss": 1.8066, + "step": 3715 + }, + { + "epoch": 0.42685658491758083, + "grad_norm": 0.392241507768631, + "learning_rate": 0.0001, + "loss": 1.6895, + "step": 3716 + }, + { + "epoch": 0.42697145482740795, + "grad_norm": 0.42141708731651306, + "learning_rate": 0.0001, + "loss": 1.7378, + "step": 3717 + }, + { + "epoch": 0.42708632473723507, + "grad_norm": 0.42226916551589966, + "learning_rate": 0.0001, + "loss": 1.7086, + "step": 3718 + }, + { + "epoch": 0.4272011946470622, + "grad_norm": 0.4137056767940521, + "learning_rate": 0.0001, + "loss": 1.5845, + "step": 3719 + }, + { + "epoch": 0.4273160645568893, + "grad_norm": 0.4272863566875458, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 3720 + }, + { + "epoch": 0.42743093446671643, + "grad_norm": 0.4106510579586029, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 3721 + }, + { + "epoch": 0.42754580437654355, + "grad_norm": 0.4005192518234253, + "learning_rate": 0.0001, + "loss": 1.6362, + "step": 3722 + }, + { + "epoch": 0.42766067428637067, + "grad_norm": 0.4278923273086548, + "learning_rate": 0.0001, + "loss": 1.7448, + "step": 3723 + }, + { + "epoch": 0.4277755441961978, + "grad_norm": 0.4044369161128998, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 3724 + }, + { + "epoch": 0.4278904141060249, + "grad_norm": 0.40069693326950073, + "learning_rate": 0.0001, + "loss": 1.7594, + "step": 3725 + }, + { + "epoch": 0.42800528401585203, + "grad_norm": 0.4300965964794159, + "learning_rate": 0.0001, + "loss": 1.5522, + "step": 3726 + }, + { + "epoch": 0.42812015392567915, + "grad_norm": 0.39623257517814636, + "learning_rate": 0.0001, + "loss": 1.4162, + "step": 3727 + }, + { + "epoch": 0.42823502383550627, + "grad_norm": 0.39578771591186523, + "learning_rate": 0.0001, + "loss": 1.5182, + "step": 3728 + }, + { + "epoch": 0.4283498937453334, + "grad_norm": 0.4229432940483093, + "learning_rate": 0.0001, + "loss": 1.7019, + "step": 3729 + }, + { + "epoch": 0.4284647636551605, + "grad_norm": 0.40856388211250305, + "learning_rate": 0.0001, + "loss": 1.8467, + "step": 3730 + }, + { + "epoch": 0.4285796335649876, + "grad_norm": 0.4008654057979584, + "learning_rate": 0.0001, + "loss": 1.7282, + "step": 3731 + }, + { + "epoch": 0.42869450347481475, + "grad_norm": 0.4332294464111328, + "learning_rate": 0.0001, + "loss": 1.8846, + "step": 3732 + }, + { + "epoch": 0.42880937338464187, + "grad_norm": 0.39115771651268005, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 3733 + }, + { + "epoch": 0.428924243294469, + "grad_norm": 0.42474624514579773, + "learning_rate": 0.0001, + "loss": 1.7635, + "step": 3734 + }, + { + "epoch": 0.4290391132042961, + "grad_norm": 0.3834139406681061, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 3735 + }, + { + "epoch": 0.4291539831141233, + "grad_norm": 0.42314034700393677, + "learning_rate": 0.0001, + "loss": 1.7384, + "step": 3736 + }, + { + "epoch": 0.4292688530239504, + "grad_norm": 0.41689667105674744, + "learning_rate": 0.0001, + "loss": 1.5733, + "step": 3737 + }, + { + "epoch": 0.4293837229337775, + "grad_norm": 0.38845548033714294, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 3738 + }, + { + "epoch": 0.42949859284360464, + "grad_norm": 0.4228481352329254, + "learning_rate": 0.0001, + "loss": 1.7028, + "step": 3739 + }, + { + "epoch": 0.42961346275343176, + "grad_norm": 0.38769879937171936, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 3740 + }, + { + "epoch": 0.4297283326632589, + "grad_norm": 0.43911242485046387, + "learning_rate": 0.0001, + "loss": 1.7073, + "step": 3741 + }, + { + "epoch": 0.429843202573086, + "grad_norm": 0.40347546339035034, + "learning_rate": 0.0001, + "loss": 1.5931, + "step": 3742 + }, + { + "epoch": 0.4299580724829131, + "grad_norm": 0.43745940923690796, + "learning_rate": 0.0001, + "loss": 1.6856, + "step": 3743 + }, + { + "epoch": 0.43007294239274024, + "grad_norm": 0.41324344277381897, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 3744 + }, + { + "epoch": 0.43018781230256736, + "grad_norm": 0.43059617280960083, + "learning_rate": 0.0001, + "loss": 1.7581, + "step": 3745 + }, + { + "epoch": 0.4303026822123945, + "grad_norm": 0.40253952145576477, + "learning_rate": 0.0001, + "loss": 1.5059, + "step": 3746 + }, + { + "epoch": 0.4304175521222216, + "grad_norm": 0.3712945580482483, + "learning_rate": 0.0001, + "loss": 1.4402, + "step": 3747 + }, + { + "epoch": 0.4305324220320487, + "grad_norm": 0.41729727387428284, + "learning_rate": 0.0001, + "loss": 1.6481, + "step": 3748 + }, + { + "epoch": 0.43064729194187584, + "grad_norm": 0.40478816628456116, + "learning_rate": 0.0001, + "loss": 1.7401, + "step": 3749 + }, + { + "epoch": 0.43076216185170296, + "grad_norm": 0.37169450521469116, + "learning_rate": 0.0001, + "loss": 1.45, + "step": 3750 + }, + { + "epoch": 0.4308770317615301, + "grad_norm": 0.4352746903896332, + "learning_rate": 0.0001, + "loss": 1.711, + "step": 3751 + }, + { + "epoch": 0.4309919016713572, + "grad_norm": 0.41806161403656006, + "learning_rate": 0.0001, + "loss": 1.7793, + "step": 3752 + }, + { + "epoch": 0.4311067715811843, + "grad_norm": 0.41790369153022766, + "learning_rate": 0.0001, + "loss": 1.5924, + "step": 3753 + }, + { + "epoch": 0.43122164149101144, + "grad_norm": 0.40323588252067566, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 3754 + }, + { + "epoch": 0.43133651140083856, + "grad_norm": 0.4263021945953369, + "learning_rate": 0.0001, + "loss": 1.795, + "step": 3755 + }, + { + "epoch": 0.4314513813106657, + "grad_norm": 0.4722081124782562, + "learning_rate": 0.0001, + "loss": 1.8166, + "step": 3756 + }, + { + "epoch": 0.4315662512204928, + "grad_norm": 0.4170565605163574, + "learning_rate": 0.0001, + "loss": 1.7488, + "step": 3757 + }, + { + "epoch": 0.4316811211303199, + "grad_norm": 0.3915204703807831, + "learning_rate": 0.0001, + "loss": 1.673, + "step": 3758 + }, + { + "epoch": 0.43179599104014704, + "grad_norm": 0.38846898078918457, + "learning_rate": 0.0001, + "loss": 1.6482, + "step": 3759 + }, + { + "epoch": 0.43191086094997416, + "grad_norm": 0.3822706639766693, + "learning_rate": 0.0001, + "loss": 1.7103, + "step": 3760 + }, + { + "epoch": 0.4320257308598013, + "grad_norm": 0.3950004279613495, + "learning_rate": 0.0001, + "loss": 1.5763, + "step": 3761 + }, + { + "epoch": 0.4321406007696284, + "grad_norm": 0.3973482549190521, + "learning_rate": 0.0001, + "loss": 1.561, + "step": 3762 + }, + { + "epoch": 0.4322554706794555, + "grad_norm": 0.40038934350013733, + "learning_rate": 0.0001, + "loss": 1.6894, + "step": 3763 + }, + { + "epoch": 0.43237034058928264, + "grad_norm": 0.3878767490386963, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 3764 + }, + { + "epoch": 0.43248521049910976, + "grad_norm": 0.40222638845443726, + "learning_rate": 0.0001, + "loss": 1.7212, + "step": 3765 + }, + { + "epoch": 0.4326000804089369, + "grad_norm": 0.38121557235717773, + "learning_rate": 0.0001, + "loss": 1.5263, + "step": 3766 + }, + { + "epoch": 0.432714950318764, + "grad_norm": 0.4091407060623169, + "learning_rate": 0.0001, + "loss": 1.8233, + "step": 3767 + }, + { + "epoch": 0.4328298202285911, + "grad_norm": 0.39854979515075684, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 3768 + }, + { + "epoch": 0.43294469013841824, + "grad_norm": 0.41306614875793457, + "learning_rate": 0.0001, + "loss": 1.8204, + "step": 3769 + }, + { + "epoch": 0.43305956004824536, + "grad_norm": 0.4026402235031128, + "learning_rate": 0.0001, + "loss": 1.6817, + "step": 3770 + }, + { + "epoch": 0.4331744299580725, + "grad_norm": 0.3648960292339325, + "learning_rate": 0.0001, + "loss": 1.3872, + "step": 3771 + }, + { + "epoch": 0.4332892998678996, + "grad_norm": 0.3978753983974457, + "learning_rate": 0.0001, + "loss": 1.6007, + "step": 3772 + }, + { + "epoch": 0.4334041697777267, + "grad_norm": 0.3977111876010895, + "learning_rate": 0.0001, + "loss": 1.7074, + "step": 3773 + }, + { + "epoch": 0.43351903968755384, + "grad_norm": 0.4535907506942749, + "learning_rate": 0.0001, + "loss": 1.8803, + "step": 3774 + }, + { + "epoch": 0.43363390959738096, + "grad_norm": 0.4159374535083771, + "learning_rate": 0.0001, + "loss": 1.6929, + "step": 3775 + }, + { + "epoch": 0.4337487795072081, + "grad_norm": 0.40071192383766174, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 3776 + }, + { + "epoch": 0.4338636494170352, + "grad_norm": 0.40635019540786743, + "learning_rate": 0.0001, + "loss": 1.6612, + "step": 3777 + }, + { + "epoch": 0.4339785193268623, + "grad_norm": 0.4317454993724823, + "learning_rate": 0.0001, + "loss": 1.7709, + "step": 3778 + }, + { + "epoch": 0.43409338923668944, + "grad_norm": 0.3959444761276245, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 3779 + }, + { + "epoch": 0.43420825914651656, + "grad_norm": 0.3976995050907135, + "learning_rate": 0.0001, + "loss": 1.671, + "step": 3780 + }, + { + "epoch": 0.4343231290563437, + "grad_norm": 0.4593839943408966, + "learning_rate": 0.0001, + "loss": 2.0386, + "step": 3781 + }, + { + "epoch": 0.4344379989661708, + "grad_norm": 0.4245525598526001, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 3782 + }, + { + "epoch": 0.4345528688759979, + "grad_norm": 0.3964807689189911, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 3783 + }, + { + "epoch": 0.43466773878582504, + "grad_norm": 0.3919978439807892, + "learning_rate": 0.0001, + "loss": 1.598, + "step": 3784 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.41812998056411743, + "learning_rate": 0.0001, + "loss": 1.7921, + "step": 3785 + }, + { + "epoch": 0.4348974786054793, + "grad_norm": 0.3976697325706482, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 3786 + }, + { + "epoch": 0.4350123485153064, + "grad_norm": 0.39994823932647705, + "learning_rate": 0.0001, + "loss": 1.7524, + "step": 3787 + }, + { + "epoch": 0.4351272184251335, + "grad_norm": 0.41444066166877747, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 3788 + }, + { + "epoch": 0.43524208833496064, + "grad_norm": 0.41670259833335876, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 3789 + }, + { + "epoch": 0.43535695824478776, + "grad_norm": 0.4110081195831299, + "learning_rate": 0.0001, + "loss": 1.4517, + "step": 3790 + }, + { + "epoch": 0.4354718281546149, + "grad_norm": 0.41741257905960083, + "learning_rate": 0.0001, + "loss": 1.7861, + "step": 3791 + }, + { + "epoch": 0.435586698064442, + "grad_norm": 0.40086445212364197, + "learning_rate": 0.0001, + "loss": 1.6162, + "step": 3792 + }, + { + "epoch": 0.4357015679742691, + "grad_norm": 0.38740214705467224, + "learning_rate": 0.0001, + "loss": 1.5713, + "step": 3793 + }, + { + "epoch": 0.43581643788409624, + "grad_norm": 0.43951183557510376, + "learning_rate": 0.0001, + "loss": 1.795, + "step": 3794 + }, + { + "epoch": 0.43593130779392336, + "grad_norm": 0.4288491904735565, + "learning_rate": 0.0001, + "loss": 1.7329, + "step": 3795 + }, + { + "epoch": 0.4360461777037505, + "grad_norm": 0.4051986038684845, + "learning_rate": 0.0001, + "loss": 1.6826, + "step": 3796 + }, + { + "epoch": 0.4361610476135776, + "grad_norm": 0.36968207359313965, + "learning_rate": 0.0001, + "loss": 1.4493, + "step": 3797 + }, + { + "epoch": 0.4362759175234047, + "grad_norm": 0.39007097482681274, + "learning_rate": 0.0001, + "loss": 1.4471, + "step": 3798 + }, + { + "epoch": 0.43639078743323184, + "grad_norm": 0.40956395864486694, + "learning_rate": 0.0001, + "loss": 1.5975, + "step": 3799 + }, + { + "epoch": 0.43650565734305896, + "grad_norm": 0.4181691110134125, + "learning_rate": 0.0001, + "loss": 1.7006, + "step": 3800 + }, + { + "epoch": 0.4366205272528861, + "grad_norm": 0.4084911644458771, + "learning_rate": 0.0001, + "loss": 1.7662, + "step": 3801 + }, + { + "epoch": 0.4367353971627132, + "grad_norm": 0.4404086470603943, + "learning_rate": 0.0001, + "loss": 1.7702, + "step": 3802 + }, + { + "epoch": 0.4368502670725403, + "grad_norm": 0.43644893169403076, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 3803 + }, + { + "epoch": 0.43696513698236744, + "grad_norm": 0.42488721013069153, + "learning_rate": 0.0001, + "loss": 1.795, + "step": 3804 + }, + { + "epoch": 0.4370800068921946, + "grad_norm": 0.4115133583545685, + "learning_rate": 0.0001, + "loss": 1.7059, + "step": 3805 + }, + { + "epoch": 0.43719487680202174, + "grad_norm": 0.4241204857826233, + "learning_rate": 0.0001, + "loss": 1.7201, + "step": 3806 + }, + { + "epoch": 0.43730974671184886, + "grad_norm": 0.3907410204410553, + "learning_rate": 0.0001, + "loss": 1.6551, + "step": 3807 + }, + { + "epoch": 0.437424616621676, + "grad_norm": 0.42004403471946716, + "learning_rate": 0.0001, + "loss": 1.7367, + "step": 3808 + }, + { + "epoch": 0.4375394865315031, + "grad_norm": 0.39130643010139465, + "learning_rate": 0.0001, + "loss": 1.5863, + "step": 3809 + }, + { + "epoch": 0.4376543564413302, + "grad_norm": 0.420646071434021, + "learning_rate": 0.0001, + "loss": 1.6687, + "step": 3810 + }, + { + "epoch": 0.43776922635115734, + "grad_norm": 0.43482112884521484, + "learning_rate": 0.0001, + "loss": 1.7814, + "step": 3811 + }, + { + "epoch": 0.43788409626098446, + "grad_norm": 0.4364151656627655, + "learning_rate": 0.0001, + "loss": 1.7378, + "step": 3812 + }, + { + "epoch": 0.4379989661708116, + "grad_norm": 0.44241318106651306, + "learning_rate": 0.0001, + "loss": 1.8158, + "step": 3813 + }, + { + "epoch": 0.4381138360806387, + "grad_norm": 0.3973522186279297, + "learning_rate": 0.0001, + "loss": 1.6807, + "step": 3814 + }, + { + "epoch": 0.4382287059904658, + "grad_norm": 0.42336922883987427, + "learning_rate": 0.0001, + "loss": 1.8086, + "step": 3815 + }, + { + "epoch": 0.43834357590029294, + "grad_norm": 0.40706026554107666, + "learning_rate": 0.0001, + "loss": 1.7805, + "step": 3816 + }, + { + "epoch": 0.43845844581012006, + "grad_norm": 0.4009002149105072, + "learning_rate": 0.0001, + "loss": 1.7439, + "step": 3817 + }, + { + "epoch": 0.4385733157199472, + "grad_norm": 0.38294774293899536, + "learning_rate": 0.0001, + "loss": 1.6179, + "step": 3818 + }, + { + "epoch": 0.4386881856297743, + "grad_norm": 0.40694519877433777, + "learning_rate": 0.0001, + "loss": 1.5189, + "step": 3819 + }, + { + "epoch": 0.4388030555396014, + "grad_norm": 0.3984982371330261, + "learning_rate": 0.0001, + "loss": 1.6288, + "step": 3820 + }, + { + "epoch": 0.43891792544942854, + "grad_norm": 0.44056373834609985, + "learning_rate": 0.0001, + "loss": 1.7716, + "step": 3821 + }, + { + "epoch": 0.43903279535925566, + "grad_norm": 0.41726037859916687, + "learning_rate": 0.0001, + "loss": 1.8127, + "step": 3822 + }, + { + "epoch": 0.4391476652690828, + "grad_norm": 0.4185958802700043, + "learning_rate": 0.0001, + "loss": 1.6932, + "step": 3823 + }, + { + "epoch": 0.4392625351789099, + "grad_norm": 0.3969833254814148, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 3824 + }, + { + "epoch": 0.439377405088737, + "grad_norm": 0.42625898122787476, + "learning_rate": 0.0001, + "loss": 1.7313, + "step": 3825 + }, + { + "epoch": 0.43949227499856414, + "grad_norm": 0.3792615532875061, + "learning_rate": 0.0001, + "loss": 1.5973, + "step": 3826 + }, + { + "epoch": 0.43960714490839126, + "grad_norm": 0.3736385107040405, + "learning_rate": 0.0001, + "loss": 1.3924, + "step": 3827 + }, + { + "epoch": 0.4397220148182184, + "grad_norm": 0.4105335772037506, + "learning_rate": 0.0001, + "loss": 1.6946, + "step": 3828 + }, + { + "epoch": 0.4398368847280455, + "grad_norm": 0.4096393585205078, + "learning_rate": 0.0001, + "loss": 1.722, + "step": 3829 + }, + { + "epoch": 0.4399517546378726, + "grad_norm": 0.3897015154361725, + "learning_rate": 0.0001, + "loss": 1.597, + "step": 3830 + }, + { + "epoch": 0.44006662454769974, + "grad_norm": 0.4688592851161957, + "learning_rate": 0.0001, + "loss": 1.7616, + "step": 3831 + }, + { + "epoch": 0.44018149445752686, + "grad_norm": 0.4732673168182373, + "learning_rate": 0.0001, + "loss": 1.5295, + "step": 3832 + }, + { + "epoch": 0.440296364367354, + "grad_norm": 0.40647318959236145, + "learning_rate": 0.0001, + "loss": 1.745, + "step": 3833 + }, + { + "epoch": 0.4404112342771811, + "grad_norm": 0.38484612107276917, + "learning_rate": 0.0001, + "loss": 1.4694, + "step": 3834 + }, + { + "epoch": 0.4405261041870082, + "grad_norm": 0.39425164461135864, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 3835 + }, + { + "epoch": 0.44064097409683534, + "grad_norm": 0.38018524646759033, + "learning_rate": 0.0001, + "loss": 1.5564, + "step": 3836 + }, + { + "epoch": 0.44075584400666246, + "grad_norm": 0.40597987174987793, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 3837 + }, + { + "epoch": 0.4408707139164896, + "grad_norm": 0.4145064949989319, + "learning_rate": 0.0001, + "loss": 1.699, + "step": 3838 + }, + { + "epoch": 0.4409855838263167, + "grad_norm": 0.41554126143455505, + "learning_rate": 0.0001, + "loss": 1.7004, + "step": 3839 + }, + { + "epoch": 0.4411004537361438, + "grad_norm": 0.3642127811908722, + "learning_rate": 0.0001, + "loss": 1.5098, + "step": 3840 + }, + { + "epoch": 0.44121532364597094, + "grad_norm": 0.40661707520484924, + "learning_rate": 0.0001, + "loss": 1.7808, + "step": 3841 + }, + { + "epoch": 0.44133019355579806, + "grad_norm": 0.42845696210861206, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 3842 + }, + { + "epoch": 0.4414450634656252, + "grad_norm": 0.43189069628715515, + "learning_rate": 0.0001, + "loss": 1.7699, + "step": 3843 + }, + { + "epoch": 0.4415599333754523, + "grad_norm": 0.42555564641952515, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 3844 + }, + { + "epoch": 0.4416748032852794, + "grad_norm": 0.4227694571018219, + "learning_rate": 0.0001, + "loss": 1.7315, + "step": 3845 + }, + { + "epoch": 0.44178967319510654, + "grad_norm": 0.4147578477859497, + "learning_rate": 0.0001, + "loss": 1.7124, + "step": 3846 + }, + { + "epoch": 0.44190454310493366, + "grad_norm": 0.39049941301345825, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 3847 + }, + { + "epoch": 0.4420194130147608, + "grad_norm": 0.4264180362224579, + "learning_rate": 0.0001, + "loss": 1.7393, + "step": 3848 + }, + { + "epoch": 0.4421342829245879, + "grad_norm": 0.416720449924469, + "learning_rate": 0.0001, + "loss": 1.6101, + "step": 3849 + }, + { + "epoch": 0.442249152834415, + "grad_norm": 0.4004881680011749, + "learning_rate": 0.0001, + "loss": 1.6228, + "step": 3850 + }, + { + "epoch": 0.44236402274424214, + "grad_norm": 0.3840608298778534, + "learning_rate": 0.0001, + "loss": 1.4561, + "step": 3851 + }, + { + "epoch": 0.44247889265406926, + "grad_norm": 0.398914098739624, + "learning_rate": 0.0001, + "loss": 1.6199, + "step": 3852 + }, + { + "epoch": 0.4425937625638964, + "grad_norm": 0.4018940329551697, + "learning_rate": 0.0001, + "loss": 1.6785, + "step": 3853 + }, + { + "epoch": 0.4427086324737235, + "grad_norm": 0.4263051450252533, + "learning_rate": 0.0001, + "loss": 1.6775, + "step": 3854 + }, + { + "epoch": 0.4428235023835506, + "grad_norm": 0.4021928608417511, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 3855 + }, + { + "epoch": 0.44293837229337774, + "grad_norm": 0.3968657851219177, + "learning_rate": 0.0001, + "loss": 1.4729, + "step": 3856 + }, + { + "epoch": 0.44305324220320486, + "grad_norm": 0.3775864541530609, + "learning_rate": 0.0001, + "loss": 1.6014, + "step": 3857 + }, + { + "epoch": 0.443168112113032, + "grad_norm": 0.3850625157356262, + "learning_rate": 0.0001, + "loss": 1.6046, + "step": 3858 + }, + { + "epoch": 0.4432829820228591, + "grad_norm": 0.4146713614463806, + "learning_rate": 0.0001, + "loss": 1.6827, + "step": 3859 + }, + { + "epoch": 0.4433978519326862, + "grad_norm": 0.3781043291091919, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 3860 + }, + { + "epoch": 0.44351272184251334, + "grad_norm": 0.38465172052383423, + "learning_rate": 0.0001, + "loss": 1.6866, + "step": 3861 + }, + { + "epoch": 0.44362759175234046, + "grad_norm": 0.41216936707496643, + "learning_rate": 0.0001, + "loss": 1.7803, + "step": 3862 + }, + { + "epoch": 0.4437424616621676, + "grad_norm": 0.4183763563632965, + "learning_rate": 0.0001, + "loss": 1.6628, + "step": 3863 + }, + { + "epoch": 0.4438573315719947, + "grad_norm": 0.39858147501945496, + "learning_rate": 0.0001, + "loss": 1.641, + "step": 3864 + }, + { + "epoch": 0.4439722014818218, + "grad_norm": 0.4076845347881317, + "learning_rate": 0.0001, + "loss": 1.7546, + "step": 3865 + }, + { + "epoch": 0.44408707139164894, + "grad_norm": 0.4265919625759125, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 3866 + }, + { + "epoch": 0.44420194130147606, + "grad_norm": 0.4252798855304718, + "learning_rate": 0.0001, + "loss": 1.698, + "step": 3867 + }, + { + "epoch": 0.4443168112113032, + "grad_norm": 0.3964008688926697, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 3868 + }, + { + "epoch": 0.4444316811211303, + "grad_norm": 0.3971204459667206, + "learning_rate": 0.0001, + "loss": 1.5393, + "step": 3869 + }, + { + "epoch": 0.4445465510309574, + "grad_norm": 0.39955177903175354, + "learning_rate": 0.0001, + "loss": 1.7192, + "step": 3870 + }, + { + "epoch": 0.44466142094078454, + "grad_norm": 0.40053296089172363, + "learning_rate": 0.0001, + "loss": 1.5665, + "step": 3871 + }, + { + "epoch": 0.44477629085061166, + "grad_norm": 0.3930230438709259, + "learning_rate": 0.0001, + "loss": 1.6522, + "step": 3872 + }, + { + "epoch": 0.44489116076043883, + "grad_norm": 0.41546955704689026, + "learning_rate": 0.0001, + "loss": 1.7292, + "step": 3873 + }, + { + "epoch": 0.44500603067026595, + "grad_norm": 0.40702396631240845, + "learning_rate": 0.0001, + "loss": 1.6555, + "step": 3874 + }, + { + "epoch": 0.44512090058009307, + "grad_norm": 0.451388955116272, + "learning_rate": 0.0001, + "loss": 1.8062, + "step": 3875 + }, + { + "epoch": 0.4452357704899202, + "grad_norm": 0.40768948197364807, + "learning_rate": 0.0001, + "loss": 1.6942, + "step": 3876 + }, + { + "epoch": 0.4453506403997473, + "grad_norm": 0.4409973919391632, + "learning_rate": 0.0001, + "loss": 1.7809, + "step": 3877 + }, + { + "epoch": 0.44546551030957443, + "grad_norm": 0.4067426323890686, + "learning_rate": 0.0001, + "loss": 1.6392, + "step": 3878 + }, + { + "epoch": 0.44558038021940155, + "grad_norm": 0.37034404277801514, + "learning_rate": 0.0001, + "loss": 1.6065, + "step": 3879 + }, + { + "epoch": 0.44569525012922867, + "grad_norm": 0.41960740089416504, + "learning_rate": 0.0001, + "loss": 1.9168, + "step": 3880 + }, + { + "epoch": 0.4458101200390558, + "grad_norm": 0.40706804394721985, + "learning_rate": 0.0001, + "loss": 1.5405, + "step": 3881 + }, + { + "epoch": 0.4459249899488829, + "grad_norm": 0.38883423805236816, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 3882 + }, + { + "epoch": 0.44603985985871003, + "grad_norm": 0.3777417540550232, + "learning_rate": 0.0001, + "loss": 1.4643, + "step": 3883 + }, + { + "epoch": 0.44615472976853715, + "grad_norm": 0.3927326798439026, + "learning_rate": 0.0001, + "loss": 1.6023, + "step": 3884 + }, + { + "epoch": 0.44626959967836427, + "grad_norm": 0.39336463809013367, + "learning_rate": 0.0001, + "loss": 1.6599, + "step": 3885 + }, + { + "epoch": 0.4463844695881914, + "grad_norm": 0.38999754190444946, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 3886 + }, + { + "epoch": 0.4464993394980185, + "grad_norm": 0.37875038385391235, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 3887 + }, + { + "epoch": 0.44661420940784563, + "grad_norm": 0.4335520267486572, + "learning_rate": 0.0001, + "loss": 1.9779, + "step": 3888 + }, + { + "epoch": 0.44672907931767275, + "grad_norm": 0.461556613445282, + "learning_rate": 0.0001, + "loss": 1.9997, + "step": 3889 + }, + { + "epoch": 0.44684394922749987, + "grad_norm": 0.4445992410182953, + "learning_rate": 0.0001, + "loss": 2.0335, + "step": 3890 + }, + { + "epoch": 0.446958819137327, + "grad_norm": 0.39221543073654175, + "learning_rate": 0.0001, + "loss": 1.5501, + "step": 3891 + }, + { + "epoch": 0.4470736890471541, + "grad_norm": 0.3761284351348877, + "learning_rate": 0.0001, + "loss": 1.6046, + "step": 3892 + }, + { + "epoch": 0.44718855895698123, + "grad_norm": 0.473550409078598, + "learning_rate": 0.0001, + "loss": 1.9831, + "step": 3893 + }, + { + "epoch": 0.44730342886680835, + "grad_norm": 0.39917153120040894, + "learning_rate": 0.0001, + "loss": 1.6847, + "step": 3894 + }, + { + "epoch": 0.44741829877663547, + "grad_norm": 0.3896121680736542, + "learning_rate": 0.0001, + "loss": 1.5956, + "step": 3895 + }, + { + "epoch": 0.4475331686864626, + "grad_norm": 0.39092063903808594, + "learning_rate": 0.0001, + "loss": 1.5873, + "step": 3896 + }, + { + "epoch": 0.4476480385962897, + "grad_norm": 0.427628755569458, + "learning_rate": 0.0001, + "loss": 1.7593, + "step": 3897 + }, + { + "epoch": 0.44776290850611683, + "grad_norm": 0.4042544364929199, + "learning_rate": 0.0001, + "loss": 1.7025, + "step": 3898 + }, + { + "epoch": 0.44787777841594395, + "grad_norm": 0.3957765996456146, + "learning_rate": 0.0001, + "loss": 1.6169, + "step": 3899 + }, + { + "epoch": 0.44799264832577107, + "grad_norm": 0.41617143154144287, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 3900 + }, + { + "epoch": 0.4481075182355982, + "grad_norm": 0.4180884063243866, + "learning_rate": 0.0001, + "loss": 1.4391, + "step": 3901 + }, + { + "epoch": 0.4482223881454253, + "grad_norm": 0.38561975955963135, + "learning_rate": 0.0001, + "loss": 1.4084, + "step": 3902 + }, + { + "epoch": 0.44833725805525243, + "grad_norm": 0.4009731709957123, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 3903 + }, + { + "epoch": 0.44845212796507955, + "grad_norm": 0.4165489077568054, + "learning_rate": 0.0001, + "loss": 1.6112, + "step": 3904 + }, + { + "epoch": 0.44856699787490667, + "grad_norm": 0.41844233870506287, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 3905 + }, + { + "epoch": 0.4486818677847338, + "grad_norm": 0.4142603576183319, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 3906 + }, + { + "epoch": 0.4487967376945609, + "grad_norm": 0.41738462448120117, + "learning_rate": 0.0001, + "loss": 1.7553, + "step": 3907 + }, + { + "epoch": 0.44891160760438803, + "grad_norm": 0.40905338525772095, + "learning_rate": 0.0001, + "loss": 1.5517, + "step": 3908 + }, + { + "epoch": 0.44902647751421515, + "grad_norm": 0.4092770516872406, + "learning_rate": 0.0001, + "loss": 1.7373, + "step": 3909 + }, + { + "epoch": 0.44914134742404227, + "grad_norm": 0.3912902772426605, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 3910 + }, + { + "epoch": 0.4492562173338694, + "grad_norm": 0.42974889278411865, + "learning_rate": 0.0001, + "loss": 1.5082, + "step": 3911 + }, + { + "epoch": 0.4493710872436965, + "grad_norm": 0.4151856601238251, + "learning_rate": 0.0001, + "loss": 1.511, + "step": 3912 + }, + { + "epoch": 0.44948595715352363, + "grad_norm": 0.4048108458518982, + "learning_rate": 0.0001, + "loss": 1.6821, + "step": 3913 + }, + { + "epoch": 0.44960082706335075, + "grad_norm": 0.3995741307735443, + "learning_rate": 0.0001, + "loss": 1.5816, + "step": 3914 + }, + { + "epoch": 0.44971569697317787, + "grad_norm": 0.41176357865333557, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 3915 + }, + { + "epoch": 0.449830566883005, + "grad_norm": 0.4261864721775055, + "learning_rate": 0.0001, + "loss": 1.719, + "step": 3916 + }, + { + "epoch": 0.4499454367928321, + "grad_norm": 0.41965851187705994, + "learning_rate": 0.0001, + "loss": 1.8569, + "step": 3917 + }, + { + "epoch": 0.45006030670265923, + "grad_norm": 0.4024384617805481, + "learning_rate": 0.0001, + "loss": 1.682, + "step": 3918 + }, + { + "epoch": 0.45017517661248635, + "grad_norm": 0.42263585329055786, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 3919 + }, + { + "epoch": 0.45029004652231347, + "grad_norm": 0.37755391001701355, + "learning_rate": 0.0001, + "loss": 1.6863, + "step": 3920 + }, + { + "epoch": 0.4504049164321406, + "grad_norm": 0.4409322440624237, + "learning_rate": 0.0001, + "loss": 1.9805, + "step": 3921 + }, + { + "epoch": 0.4505197863419677, + "grad_norm": 0.4285867512226105, + "learning_rate": 0.0001, + "loss": 1.7172, + "step": 3922 + }, + { + "epoch": 0.45063465625179483, + "grad_norm": 0.4555914103984833, + "learning_rate": 0.0001, + "loss": 1.7151, + "step": 3923 + }, + { + "epoch": 0.45074952616162195, + "grad_norm": 0.399330198764801, + "learning_rate": 0.0001, + "loss": 1.5435, + "step": 3924 + }, + { + "epoch": 0.45086439607144907, + "grad_norm": 0.4097477197647095, + "learning_rate": 0.0001, + "loss": 1.7046, + "step": 3925 + }, + { + "epoch": 0.4509792659812762, + "grad_norm": 0.4111149311065674, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 3926 + }, + { + "epoch": 0.4510941358911033, + "grad_norm": 0.4010746479034424, + "learning_rate": 0.0001, + "loss": 1.2525, + "step": 3927 + }, + { + "epoch": 0.45120900580093043, + "grad_norm": 0.39972639083862305, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 3928 + }, + { + "epoch": 0.45132387571075755, + "grad_norm": 0.4085247218608856, + "learning_rate": 0.0001, + "loss": 1.7221, + "step": 3929 + }, + { + "epoch": 0.45143874562058467, + "grad_norm": 0.3858582377433777, + "learning_rate": 0.0001, + "loss": 1.4941, + "step": 3930 + }, + { + "epoch": 0.4515536155304118, + "grad_norm": 0.4039546251296997, + "learning_rate": 0.0001, + "loss": 1.4808, + "step": 3931 + }, + { + "epoch": 0.4516684854402389, + "grad_norm": 0.42480626702308655, + "learning_rate": 0.0001, + "loss": 1.6769, + "step": 3932 + }, + { + "epoch": 0.45178335535006603, + "grad_norm": 0.4410436451435089, + "learning_rate": 0.0001, + "loss": 1.7151, + "step": 3933 + }, + { + "epoch": 0.45189822525989315, + "grad_norm": 0.3930107057094574, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 3934 + }, + { + "epoch": 0.45201309516972027, + "grad_norm": 0.3937776982784271, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 3935 + }, + { + "epoch": 0.4521279650795474, + "grad_norm": 0.41633737087249756, + "learning_rate": 0.0001, + "loss": 1.9046, + "step": 3936 + }, + { + "epoch": 0.4522428349893745, + "grad_norm": 0.41131532192230225, + "learning_rate": 0.0001, + "loss": 1.7626, + "step": 3937 + }, + { + "epoch": 0.45235770489920163, + "grad_norm": 0.4197579026222229, + "learning_rate": 0.0001, + "loss": 1.686, + "step": 3938 + }, + { + "epoch": 0.45247257480902875, + "grad_norm": 0.4465826749801636, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 3939 + }, + { + "epoch": 0.45258744471885587, + "grad_norm": 0.3683403730392456, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 3940 + }, + { + "epoch": 0.452702314628683, + "grad_norm": 0.4257335066795349, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 3941 + }, + { + "epoch": 0.45281718453851016, + "grad_norm": 0.42209717631340027, + "learning_rate": 0.0001, + "loss": 1.5877, + "step": 3942 + }, + { + "epoch": 0.4529320544483373, + "grad_norm": 0.3728243112564087, + "learning_rate": 0.0001, + "loss": 1.3088, + "step": 3943 + }, + { + "epoch": 0.4530469243581644, + "grad_norm": 0.44413092732429504, + "learning_rate": 0.0001, + "loss": 1.7914, + "step": 3944 + }, + { + "epoch": 0.4531617942679915, + "grad_norm": 0.40472888946533203, + "learning_rate": 0.0001, + "loss": 1.4247, + "step": 3945 + }, + { + "epoch": 0.45327666417781864, + "grad_norm": 0.44987305998802185, + "learning_rate": 0.0001, + "loss": 1.8835, + "step": 3946 + }, + { + "epoch": 0.45339153408764576, + "grad_norm": 0.4118070602416992, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 3947 + }, + { + "epoch": 0.4535064039974729, + "grad_norm": 0.3929295241832733, + "learning_rate": 0.0001, + "loss": 1.6814, + "step": 3948 + }, + { + "epoch": 0.4536212739073, + "grad_norm": 0.43117576837539673, + "learning_rate": 0.0001, + "loss": 1.6748, + "step": 3949 + }, + { + "epoch": 0.4537361438171271, + "grad_norm": 0.434177041053772, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 3950 + }, + { + "epoch": 0.45385101372695424, + "grad_norm": 0.4338892698287964, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 3951 + }, + { + "epoch": 0.45396588363678136, + "grad_norm": 0.4370008409023285, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 3952 + }, + { + "epoch": 0.4540807535466085, + "grad_norm": 0.42377498745918274, + "learning_rate": 0.0001, + "loss": 1.6392, + "step": 3953 + }, + { + "epoch": 0.4541956234564356, + "grad_norm": 0.3817870318889618, + "learning_rate": 0.0001, + "loss": 1.5003, + "step": 3954 + }, + { + "epoch": 0.4543104933662627, + "grad_norm": 0.37805989384651184, + "learning_rate": 0.0001, + "loss": 1.6596, + "step": 3955 + }, + { + "epoch": 0.45442536327608984, + "grad_norm": 0.4458475708961487, + "learning_rate": 0.0001, + "loss": 1.8117, + "step": 3956 + }, + { + "epoch": 0.45454023318591696, + "grad_norm": 0.44833672046661377, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 3957 + }, + { + "epoch": 0.4546551030957441, + "grad_norm": 0.40606266260147095, + "learning_rate": 0.0001, + "loss": 1.7301, + "step": 3958 + }, + { + "epoch": 0.4547699730055712, + "grad_norm": 0.4011266529560089, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 3959 + }, + { + "epoch": 0.4548848429153983, + "grad_norm": 0.4372859299182892, + "learning_rate": 0.0001, + "loss": 1.7811, + "step": 3960 + }, + { + "epoch": 0.45499971282522544, + "grad_norm": 0.41698816418647766, + "learning_rate": 0.0001, + "loss": 1.6018, + "step": 3961 + }, + { + "epoch": 0.45511458273505256, + "grad_norm": 0.41979551315307617, + "learning_rate": 0.0001, + "loss": 1.8385, + "step": 3962 + }, + { + "epoch": 0.4552294526448797, + "grad_norm": 0.4077071249485016, + "learning_rate": 0.0001, + "loss": 1.6945, + "step": 3963 + }, + { + "epoch": 0.4553443225547068, + "grad_norm": 0.3627765476703644, + "learning_rate": 0.0001, + "loss": 1.4897, + "step": 3964 + }, + { + "epoch": 0.4554591924645339, + "grad_norm": 0.3952348828315735, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 3965 + }, + { + "epoch": 0.45557406237436104, + "grad_norm": 0.44601067900657654, + "learning_rate": 0.0001, + "loss": 1.8478, + "step": 3966 + }, + { + "epoch": 0.45568893228418816, + "grad_norm": 0.3901880085468292, + "learning_rate": 0.0001, + "loss": 1.5955, + "step": 3967 + }, + { + "epoch": 0.4558038021940153, + "grad_norm": 0.4016532599925995, + "learning_rate": 0.0001, + "loss": 1.4559, + "step": 3968 + }, + { + "epoch": 0.4559186721038424, + "grad_norm": 0.4067118763923645, + "learning_rate": 0.0001, + "loss": 1.4878, + "step": 3969 + }, + { + "epoch": 0.4560335420136695, + "grad_norm": 0.49588122963905334, + "learning_rate": 0.0001, + "loss": 1.8831, + "step": 3970 + }, + { + "epoch": 0.45614841192349664, + "grad_norm": 0.4378882348537445, + "learning_rate": 0.0001, + "loss": 1.7759, + "step": 3971 + }, + { + "epoch": 0.45626328183332376, + "grad_norm": 0.3988548517227173, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 3972 + }, + { + "epoch": 0.4563781517431509, + "grad_norm": 0.4102991819381714, + "learning_rate": 0.0001, + "loss": 1.6752, + "step": 3973 + }, + { + "epoch": 0.456493021652978, + "grad_norm": 0.41767358779907227, + "learning_rate": 0.0001, + "loss": 1.5417, + "step": 3974 + }, + { + "epoch": 0.4566078915628051, + "grad_norm": 0.40786105394363403, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 3975 + }, + { + "epoch": 0.45672276147263224, + "grad_norm": 0.4227571487426758, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 3976 + }, + { + "epoch": 0.45683763138245936, + "grad_norm": 0.4906644821166992, + "learning_rate": 0.0001, + "loss": 1.8612, + "step": 3977 + }, + { + "epoch": 0.4569525012922865, + "grad_norm": 0.4171657860279083, + "learning_rate": 0.0001, + "loss": 1.5353, + "step": 3978 + }, + { + "epoch": 0.4570673712021136, + "grad_norm": 0.5092725157737732, + "learning_rate": 0.0001, + "loss": 1.9038, + "step": 3979 + }, + { + "epoch": 0.4571822411119407, + "grad_norm": 0.3923867642879486, + "learning_rate": 0.0001, + "loss": 1.5821, + "step": 3980 + }, + { + "epoch": 0.45729711102176784, + "grad_norm": 0.41276079416275024, + "learning_rate": 0.0001, + "loss": 1.7424, + "step": 3981 + }, + { + "epoch": 0.45741198093159496, + "grad_norm": 0.41370689868927, + "learning_rate": 0.0001, + "loss": 1.6337, + "step": 3982 + }, + { + "epoch": 0.4575268508414221, + "grad_norm": 0.40904295444488525, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 3983 + }, + { + "epoch": 0.4576417207512492, + "grad_norm": 0.4055366516113281, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 3984 + }, + { + "epoch": 0.4577565906610763, + "grad_norm": 0.40704345703125, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 3985 + }, + { + "epoch": 0.45787146057090344, + "grad_norm": 0.395580530166626, + "learning_rate": 0.0001, + "loss": 1.5129, + "step": 3986 + }, + { + "epoch": 0.45798633048073056, + "grad_norm": 0.5622934103012085, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 3987 + }, + { + "epoch": 0.4581012003905577, + "grad_norm": 0.4147200584411621, + "learning_rate": 0.0001, + "loss": 1.5065, + "step": 3988 + }, + { + "epoch": 0.4582160703003848, + "grad_norm": 0.4014433026313782, + "learning_rate": 0.0001, + "loss": 1.7181, + "step": 3989 + }, + { + "epoch": 0.4583309402102119, + "grad_norm": 0.438319593667984, + "learning_rate": 0.0001, + "loss": 1.5005, + "step": 3990 + }, + { + "epoch": 0.45844581012003904, + "grad_norm": 0.4093727767467499, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 3991 + }, + { + "epoch": 0.45856068002986616, + "grad_norm": 0.40968188643455505, + "learning_rate": 0.0001, + "loss": 1.6271, + "step": 3992 + }, + { + "epoch": 0.4586755499396933, + "grad_norm": 0.43956807255744934, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 3993 + }, + { + "epoch": 0.4587904198495204, + "grad_norm": 0.40947967767715454, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 3994 + }, + { + "epoch": 0.4589052897593475, + "grad_norm": 0.40524375438690186, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 3995 + }, + { + "epoch": 0.45902015966917464, + "grad_norm": 0.4079066812992096, + "learning_rate": 0.0001, + "loss": 1.7688, + "step": 3996 + }, + { + "epoch": 0.45913502957900176, + "grad_norm": 0.40221965312957764, + "learning_rate": 0.0001, + "loss": 1.8436, + "step": 3997 + }, + { + "epoch": 0.4592498994888289, + "grad_norm": 0.4226916432380676, + "learning_rate": 0.0001, + "loss": 1.8167, + "step": 3998 + }, + { + "epoch": 0.459364769398656, + "grad_norm": 0.3903824985027313, + "learning_rate": 0.0001, + "loss": 1.7388, + "step": 3999 + }, + { + "epoch": 0.4594796393084831, + "grad_norm": 0.4148130416870117, + "learning_rate": 0.0001, + "loss": 1.8559, + "step": 4000 + }, + { + "epoch": 0.45959450921831024, + "grad_norm": 0.39708560705184937, + "learning_rate": 0.0001, + "loss": 1.6761, + "step": 4001 + }, + { + "epoch": 0.45970937912813736, + "grad_norm": 0.4335632920265198, + "learning_rate": 0.0001, + "loss": 1.7744, + "step": 4002 + }, + { + "epoch": 0.4598242490379645, + "grad_norm": 0.4105009436607361, + "learning_rate": 0.0001, + "loss": 1.7787, + "step": 4003 + }, + { + "epoch": 0.4599391189477916, + "grad_norm": 0.4119328558444977, + "learning_rate": 0.0001, + "loss": 1.569, + "step": 4004 + }, + { + "epoch": 0.4600539888576187, + "grad_norm": 0.4249887466430664, + "learning_rate": 0.0001, + "loss": 1.6975, + "step": 4005 + }, + { + "epoch": 0.46016885876744584, + "grad_norm": 0.4140334129333496, + "learning_rate": 0.0001, + "loss": 1.6846, + "step": 4006 + }, + { + "epoch": 0.46028372867727296, + "grad_norm": 0.41865649819374084, + "learning_rate": 0.0001, + "loss": 1.6627, + "step": 4007 + }, + { + "epoch": 0.4603985985871001, + "grad_norm": 0.3861866295337677, + "learning_rate": 0.0001, + "loss": 1.3934, + "step": 4008 + }, + { + "epoch": 0.4605134684969272, + "grad_norm": 0.3830776512622833, + "learning_rate": 0.0001, + "loss": 1.6086, + "step": 4009 + }, + { + "epoch": 0.4606283384067543, + "grad_norm": 0.4259714186191559, + "learning_rate": 0.0001, + "loss": 1.7963, + "step": 4010 + }, + { + "epoch": 0.4607432083165815, + "grad_norm": 0.3694056570529938, + "learning_rate": 0.0001, + "loss": 1.5173, + "step": 4011 + }, + { + "epoch": 0.4608580782264086, + "grad_norm": 0.41248619556427, + "learning_rate": 0.0001, + "loss": 1.7967, + "step": 4012 + }, + { + "epoch": 0.46097294813623574, + "grad_norm": 0.40641582012176514, + "learning_rate": 0.0001, + "loss": 1.7708, + "step": 4013 + }, + { + "epoch": 0.46108781804606286, + "grad_norm": 0.43671518564224243, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 4014 + }, + { + "epoch": 0.46120268795589, + "grad_norm": 0.4056967496871948, + "learning_rate": 0.0001, + "loss": 1.7083, + "step": 4015 + }, + { + "epoch": 0.4613175578657171, + "grad_norm": 0.4197401702404022, + "learning_rate": 0.0001, + "loss": 1.763, + "step": 4016 + }, + { + "epoch": 0.4614324277755442, + "grad_norm": 0.38381877541542053, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 4017 + }, + { + "epoch": 0.46154729768537134, + "grad_norm": 0.39994266629219055, + "learning_rate": 0.0001, + "loss": 1.6528, + "step": 4018 + }, + { + "epoch": 0.46166216759519846, + "grad_norm": 0.40306952595710754, + "learning_rate": 0.0001, + "loss": 1.7447, + "step": 4019 + }, + { + "epoch": 0.4617770375050256, + "grad_norm": 0.4068452715873718, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 4020 + }, + { + "epoch": 0.4618919074148527, + "grad_norm": 0.3850820064544678, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 4021 + }, + { + "epoch": 0.4620067773246798, + "grad_norm": 0.420269250869751, + "learning_rate": 0.0001, + "loss": 1.814, + "step": 4022 + }, + { + "epoch": 0.46212164723450694, + "grad_norm": 0.404812216758728, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 4023 + }, + { + "epoch": 0.46223651714433406, + "grad_norm": 0.40154874324798584, + "learning_rate": 0.0001, + "loss": 1.7349, + "step": 4024 + }, + { + "epoch": 0.4623513870541612, + "grad_norm": 0.40221109986305237, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 4025 + }, + { + "epoch": 0.4624662569639883, + "grad_norm": 0.4642849564552307, + "learning_rate": 0.0001, + "loss": 1.6938, + "step": 4026 + }, + { + "epoch": 0.4625811268738154, + "grad_norm": 0.4198199212551117, + "learning_rate": 0.0001, + "loss": 1.6603, + "step": 4027 + }, + { + "epoch": 0.46269599678364254, + "grad_norm": 0.40827077627182007, + "learning_rate": 0.0001, + "loss": 1.7611, + "step": 4028 + }, + { + "epoch": 0.46281086669346966, + "grad_norm": 0.41351550817489624, + "learning_rate": 0.0001, + "loss": 1.6947, + "step": 4029 + }, + { + "epoch": 0.4629257366032968, + "grad_norm": 0.41555505990982056, + "learning_rate": 0.0001, + "loss": 1.7832, + "step": 4030 + }, + { + "epoch": 0.4630406065131239, + "grad_norm": 0.4276737570762634, + "learning_rate": 0.0001, + "loss": 1.7332, + "step": 4031 + }, + { + "epoch": 0.463155476422951, + "grad_norm": 0.39254918694496155, + "learning_rate": 0.0001, + "loss": 1.7283, + "step": 4032 + }, + { + "epoch": 0.46327034633277814, + "grad_norm": 0.40760576725006104, + "learning_rate": 0.0001, + "loss": 1.6464, + "step": 4033 + }, + { + "epoch": 0.46338521624260526, + "grad_norm": 0.3900972902774811, + "learning_rate": 0.0001, + "loss": 1.6067, + "step": 4034 + }, + { + "epoch": 0.4635000861524324, + "grad_norm": 0.42098426818847656, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 4035 + }, + { + "epoch": 0.4636149560622595, + "grad_norm": 0.41556769609451294, + "learning_rate": 0.0001, + "loss": 1.545, + "step": 4036 + }, + { + "epoch": 0.4637298259720866, + "grad_norm": 0.4194001853466034, + "learning_rate": 0.0001, + "loss": 1.6771, + "step": 4037 + }, + { + "epoch": 0.46384469588191374, + "grad_norm": 0.4148116111755371, + "learning_rate": 0.0001, + "loss": 1.7292, + "step": 4038 + }, + { + "epoch": 0.46395956579174086, + "grad_norm": 0.39787450432777405, + "learning_rate": 0.0001, + "loss": 1.4533, + "step": 4039 + }, + { + "epoch": 0.464074435701568, + "grad_norm": 0.4328993856906891, + "learning_rate": 0.0001, + "loss": 1.74, + "step": 4040 + }, + { + "epoch": 0.4641893056113951, + "grad_norm": 0.45193320512771606, + "learning_rate": 0.0001, + "loss": 1.9133, + "step": 4041 + }, + { + "epoch": 0.4643041755212222, + "grad_norm": 0.4238536059856415, + "learning_rate": 0.0001, + "loss": 1.7697, + "step": 4042 + }, + { + "epoch": 0.46441904543104934, + "grad_norm": 0.4080374836921692, + "learning_rate": 0.0001, + "loss": 1.5563, + "step": 4043 + }, + { + "epoch": 0.46453391534087646, + "grad_norm": 0.40985575318336487, + "learning_rate": 0.0001, + "loss": 1.742, + "step": 4044 + }, + { + "epoch": 0.4646487852507036, + "grad_norm": 0.40973904728889465, + "learning_rate": 0.0001, + "loss": 1.6101, + "step": 4045 + }, + { + "epoch": 0.4647636551605307, + "grad_norm": 0.4287373423576355, + "learning_rate": 0.0001, + "loss": 1.6476, + "step": 4046 + }, + { + "epoch": 0.4648785250703578, + "grad_norm": 0.41029754281044006, + "learning_rate": 0.0001, + "loss": 1.7405, + "step": 4047 + }, + { + "epoch": 0.46499339498018494, + "grad_norm": 0.4067857563495636, + "learning_rate": 0.0001, + "loss": 1.5641, + "step": 4048 + }, + { + "epoch": 0.46510826489001206, + "grad_norm": 0.42624571919441223, + "learning_rate": 0.0001, + "loss": 1.7049, + "step": 4049 + }, + { + "epoch": 0.4652231347998392, + "grad_norm": 0.3797012269496918, + "learning_rate": 0.0001, + "loss": 1.3009, + "step": 4050 + }, + { + "epoch": 0.4653380047096663, + "grad_norm": 0.37980806827545166, + "learning_rate": 0.0001, + "loss": 1.7286, + "step": 4051 + }, + { + "epoch": 0.4654528746194934, + "grad_norm": 0.42795923352241516, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 4052 + }, + { + "epoch": 0.46556774452932054, + "grad_norm": 0.4233475625514984, + "learning_rate": 0.0001, + "loss": 1.6725, + "step": 4053 + }, + { + "epoch": 0.46568261443914766, + "grad_norm": 0.3736623525619507, + "learning_rate": 0.0001, + "loss": 1.5268, + "step": 4054 + }, + { + "epoch": 0.4657974843489748, + "grad_norm": 0.43289387226104736, + "learning_rate": 0.0001, + "loss": 1.7833, + "step": 4055 + }, + { + "epoch": 0.4659123542588019, + "grad_norm": 0.39889997243881226, + "learning_rate": 0.0001, + "loss": 1.6901, + "step": 4056 + }, + { + "epoch": 0.466027224168629, + "grad_norm": 0.40544313192367554, + "learning_rate": 0.0001, + "loss": 1.7284, + "step": 4057 + }, + { + "epoch": 0.46614209407845614, + "grad_norm": 0.3909522294998169, + "learning_rate": 0.0001, + "loss": 1.5248, + "step": 4058 + }, + { + "epoch": 0.46625696398828326, + "grad_norm": 0.4280645549297333, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 4059 + }, + { + "epoch": 0.4663718338981104, + "grad_norm": 0.40522295236587524, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 4060 + }, + { + "epoch": 0.4664867038079375, + "grad_norm": 0.43785059452056885, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 4061 + }, + { + "epoch": 0.4666015737177646, + "grad_norm": 0.39573609828948975, + "learning_rate": 0.0001, + "loss": 1.6039, + "step": 4062 + }, + { + "epoch": 0.46671644362759174, + "grad_norm": 0.39389288425445557, + "learning_rate": 0.0001, + "loss": 1.5866, + "step": 4063 + }, + { + "epoch": 0.46683131353741886, + "grad_norm": 0.43171268701553345, + "learning_rate": 0.0001, + "loss": 1.8756, + "step": 4064 + }, + { + "epoch": 0.466946183447246, + "grad_norm": 0.4238601624965668, + "learning_rate": 0.0001, + "loss": 1.9024, + "step": 4065 + }, + { + "epoch": 0.4670610533570731, + "grad_norm": 0.4299545884132385, + "learning_rate": 0.0001, + "loss": 1.7863, + "step": 4066 + }, + { + "epoch": 0.4671759232669002, + "grad_norm": 0.38363876938819885, + "learning_rate": 0.0001, + "loss": 1.5237, + "step": 4067 + }, + { + "epoch": 0.46729079317672734, + "grad_norm": 0.39695143699645996, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 4068 + }, + { + "epoch": 0.46740566308655446, + "grad_norm": 0.4232119023799896, + "learning_rate": 0.0001, + "loss": 1.7294, + "step": 4069 + }, + { + "epoch": 0.4675205329963816, + "grad_norm": 0.39182764291763306, + "learning_rate": 0.0001, + "loss": 1.4701, + "step": 4070 + }, + { + "epoch": 0.4676354029062087, + "grad_norm": 0.38294535875320435, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 4071 + }, + { + "epoch": 0.4677502728160358, + "grad_norm": 0.39663416147232056, + "learning_rate": 0.0001, + "loss": 1.4622, + "step": 4072 + }, + { + "epoch": 0.46786514272586294, + "grad_norm": 0.4411957263946533, + "learning_rate": 0.0001, + "loss": 1.7604, + "step": 4073 + }, + { + "epoch": 0.46798001263569006, + "grad_norm": 0.43797844648361206, + "learning_rate": 0.0001, + "loss": 1.871, + "step": 4074 + }, + { + "epoch": 0.4680948825455172, + "grad_norm": 0.4069494605064392, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 4075 + }, + { + "epoch": 0.4682097524553443, + "grad_norm": 0.4068356454372406, + "learning_rate": 0.0001, + "loss": 1.6041, + "step": 4076 + }, + { + "epoch": 0.4683246223651714, + "grad_norm": 0.4451930820941925, + "learning_rate": 0.0001, + "loss": 1.6094, + "step": 4077 + }, + { + "epoch": 0.46843949227499854, + "grad_norm": 0.4141843616962433, + "learning_rate": 0.0001, + "loss": 1.6697, + "step": 4078 + }, + { + "epoch": 0.4685543621848257, + "grad_norm": 0.37937334179878235, + "learning_rate": 0.0001, + "loss": 1.5631, + "step": 4079 + }, + { + "epoch": 0.46866923209465283, + "grad_norm": 0.3883213996887207, + "learning_rate": 0.0001, + "loss": 1.7144, + "step": 4080 + }, + { + "epoch": 0.46878410200447995, + "grad_norm": 0.4085143506526947, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 4081 + }, + { + "epoch": 0.46889897191430707, + "grad_norm": 0.4008790850639343, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 4082 + }, + { + "epoch": 0.4690138418241342, + "grad_norm": 0.40579459071159363, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 4083 + }, + { + "epoch": 0.4691287117339613, + "grad_norm": 0.41397562623023987, + "learning_rate": 0.0001, + "loss": 1.7478, + "step": 4084 + }, + { + "epoch": 0.46924358164378843, + "grad_norm": 0.40979722142219543, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 4085 + }, + { + "epoch": 0.46935845155361555, + "grad_norm": 0.3959348201751709, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 4086 + }, + { + "epoch": 0.46947332146344267, + "grad_norm": 0.4095418453216553, + "learning_rate": 0.0001, + "loss": 1.6808, + "step": 4087 + }, + { + "epoch": 0.4695881913732698, + "grad_norm": 0.4053293466567993, + "learning_rate": 0.0001, + "loss": 1.6511, + "step": 4088 + }, + { + "epoch": 0.4697030612830969, + "grad_norm": 0.41647395491600037, + "learning_rate": 0.0001, + "loss": 1.7705, + "step": 4089 + }, + { + "epoch": 0.46981793119292403, + "grad_norm": 0.420941025018692, + "learning_rate": 0.0001, + "loss": 1.5527, + "step": 4090 + }, + { + "epoch": 0.46993280110275115, + "grad_norm": 0.4197264015674591, + "learning_rate": 0.0001, + "loss": 1.7921, + "step": 4091 + }, + { + "epoch": 0.47004767101257827, + "grad_norm": 0.40832772850990295, + "learning_rate": 0.0001, + "loss": 1.6727, + "step": 4092 + }, + { + "epoch": 0.4701625409224054, + "grad_norm": 0.4231039881706238, + "learning_rate": 0.0001, + "loss": 1.5909, + "step": 4093 + }, + { + "epoch": 0.4702774108322325, + "grad_norm": 0.4249723255634308, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 4094 + }, + { + "epoch": 0.47039228074205963, + "grad_norm": 0.41998720169067383, + "learning_rate": 0.0001, + "loss": 1.7629, + "step": 4095 + }, + { + "epoch": 0.47050715065188675, + "grad_norm": 0.38457614183425903, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 4096 + }, + { + "epoch": 0.47062202056171387, + "grad_norm": 0.3916681408882141, + "learning_rate": 0.0001, + "loss": 1.6675, + "step": 4097 + }, + { + "epoch": 0.470736890471541, + "grad_norm": 0.40213701128959656, + "learning_rate": 0.0001, + "loss": 1.6876, + "step": 4098 + }, + { + "epoch": 0.4708517603813681, + "grad_norm": 0.3882690668106079, + "learning_rate": 0.0001, + "loss": 1.6579, + "step": 4099 + }, + { + "epoch": 0.47096663029119523, + "grad_norm": 0.4170128107070923, + "learning_rate": 0.0001, + "loss": 1.5347, + "step": 4100 + }, + { + "epoch": 0.47108150020102235, + "grad_norm": 0.3979085087776184, + "learning_rate": 0.0001, + "loss": 1.6854, + "step": 4101 + }, + { + "epoch": 0.47119637011084947, + "grad_norm": 0.41884052753448486, + "learning_rate": 0.0001, + "loss": 1.7235, + "step": 4102 + }, + { + "epoch": 0.4713112400206766, + "grad_norm": 0.4105212688446045, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 4103 + }, + { + "epoch": 0.4714261099305037, + "grad_norm": 0.4247276186943054, + "learning_rate": 0.0001, + "loss": 1.7434, + "step": 4104 + }, + { + "epoch": 0.47154097984033083, + "grad_norm": 0.4294239580631256, + "learning_rate": 0.0001, + "loss": 1.7376, + "step": 4105 + }, + { + "epoch": 0.47165584975015795, + "grad_norm": 0.4285709857940674, + "learning_rate": 0.0001, + "loss": 1.7798, + "step": 4106 + }, + { + "epoch": 0.47177071965998507, + "grad_norm": 0.44262823462486267, + "learning_rate": 0.0001, + "loss": 1.8391, + "step": 4107 + }, + { + "epoch": 0.4718855895698122, + "grad_norm": 0.4055534601211548, + "learning_rate": 0.0001, + "loss": 1.6681, + "step": 4108 + }, + { + "epoch": 0.4720004594796393, + "grad_norm": 0.39059194922447205, + "learning_rate": 0.0001, + "loss": 1.7303, + "step": 4109 + }, + { + "epoch": 0.47211532938946643, + "grad_norm": 0.3832624554634094, + "learning_rate": 0.0001, + "loss": 1.6018, + "step": 4110 + }, + { + "epoch": 0.47223019929929355, + "grad_norm": 0.42001864314079285, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 4111 + }, + { + "epoch": 0.47234506920912067, + "grad_norm": 0.4122453033924103, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 4112 + }, + { + "epoch": 0.4724599391189478, + "grad_norm": 0.38872405886650085, + "learning_rate": 0.0001, + "loss": 1.5834, + "step": 4113 + }, + { + "epoch": 0.4725748090287749, + "grad_norm": 0.3932107388973236, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 4114 + }, + { + "epoch": 0.47268967893860203, + "grad_norm": 0.3980741798877716, + "learning_rate": 0.0001, + "loss": 1.6973, + "step": 4115 + }, + { + "epoch": 0.47280454884842915, + "grad_norm": 0.4428998827934265, + "learning_rate": 0.0001, + "loss": 1.6528, + "step": 4116 + }, + { + "epoch": 0.47291941875825627, + "grad_norm": 0.37861019372940063, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 4117 + }, + { + "epoch": 0.4730342886680834, + "grad_norm": 0.43132567405700684, + "learning_rate": 0.0001, + "loss": 1.8072, + "step": 4118 + }, + { + "epoch": 0.4731491585779105, + "grad_norm": 0.4315222501754761, + "learning_rate": 0.0001, + "loss": 1.7475, + "step": 4119 + }, + { + "epoch": 0.47326402848773763, + "grad_norm": 0.3742518424987793, + "learning_rate": 0.0001, + "loss": 1.4972, + "step": 4120 + }, + { + "epoch": 0.47337889839756475, + "grad_norm": 0.39365971088409424, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 4121 + }, + { + "epoch": 0.47349376830739187, + "grad_norm": 0.43931326270103455, + "learning_rate": 0.0001, + "loss": 1.7912, + "step": 4122 + }, + { + "epoch": 0.473608638217219, + "grad_norm": 0.4146181046962738, + "learning_rate": 0.0001, + "loss": 1.7775, + "step": 4123 + }, + { + "epoch": 0.4737235081270461, + "grad_norm": 0.4833734929561615, + "learning_rate": 0.0001, + "loss": 1.8526, + "step": 4124 + }, + { + "epoch": 0.47383837803687323, + "grad_norm": 0.38752907514572144, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 4125 + }, + { + "epoch": 0.47395324794670035, + "grad_norm": 0.38719457387924194, + "learning_rate": 0.0001, + "loss": 1.3979, + "step": 4126 + }, + { + "epoch": 0.47406811785652747, + "grad_norm": 0.4148174226284027, + "learning_rate": 0.0001, + "loss": 1.7373, + "step": 4127 + }, + { + "epoch": 0.4741829877663546, + "grad_norm": 0.41567835211753845, + "learning_rate": 0.0001, + "loss": 1.5284, + "step": 4128 + }, + { + "epoch": 0.4742978576761817, + "grad_norm": 0.45437824726104736, + "learning_rate": 0.0001, + "loss": 1.7829, + "step": 4129 + }, + { + "epoch": 0.47441272758600883, + "grad_norm": 0.41790398955345154, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 4130 + }, + { + "epoch": 0.47452759749583595, + "grad_norm": 0.40303486585617065, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 4131 + }, + { + "epoch": 0.47464246740566307, + "grad_norm": 0.391946405172348, + "learning_rate": 0.0001, + "loss": 1.6744, + "step": 4132 + }, + { + "epoch": 0.4747573373154902, + "grad_norm": 0.3910079896450043, + "learning_rate": 0.0001, + "loss": 1.6995, + "step": 4133 + }, + { + "epoch": 0.4748722072253173, + "grad_norm": 0.3853004276752472, + "learning_rate": 0.0001, + "loss": 1.6034, + "step": 4134 + }, + { + "epoch": 0.47498707713514443, + "grad_norm": 0.41160422563552856, + "learning_rate": 0.0001, + "loss": 1.7027, + "step": 4135 + }, + { + "epoch": 0.47510194704497155, + "grad_norm": 0.40281742811203003, + "learning_rate": 0.0001, + "loss": 1.7533, + "step": 4136 + }, + { + "epoch": 0.47521681695479867, + "grad_norm": 0.39282864332199097, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 4137 + }, + { + "epoch": 0.4753316868646258, + "grad_norm": 0.3944832384586334, + "learning_rate": 0.0001, + "loss": 1.5693, + "step": 4138 + }, + { + "epoch": 0.4754465567744529, + "grad_norm": 0.4138934910297394, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 4139 + }, + { + "epoch": 0.47556142668428003, + "grad_norm": 0.40532898902893066, + "learning_rate": 0.0001, + "loss": 1.4911, + "step": 4140 + }, + { + "epoch": 0.47567629659410715, + "grad_norm": 0.3844515383243561, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 4141 + }, + { + "epoch": 0.47579116650393427, + "grad_norm": 0.3857368230819702, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 4142 + }, + { + "epoch": 0.4759060364137614, + "grad_norm": 0.42560672760009766, + "learning_rate": 0.0001, + "loss": 1.8693, + "step": 4143 + }, + { + "epoch": 0.4760209063235885, + "grad_norm": 0.43299341201782227, + "learning_rate": 0.0001, + "loss": 1.6393, + "step": 4144 + }, + { + "epoch": 0.47613577623341563, + "grad_norm": 0.431902676820755, + "learning_rate": 0.0001, + "loss": 1.8618, + "step": 4145 + }, + { + "epoch": 0.47625064614324275, + "grad_norm": 0.43173423409461975, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 4146 + }, + { + "epoch": 0.47636551605306987, + "grad_norm": 0.4200470447540283, + "learning_rate": 0.0001, + "loss": 1.7881, + "step": 4147 + }, + { + "epoch": 0.47648038596289705, + "grad_norm": 0.3650665581226349, + "learning_rate": 0.0001, + "loss": 1.2449, + "step": 4148 + }, + { + "epoch": 0.47659525587272417, + "grad_norm": 0.40055355429649353, + "learning_rate": 0.0001, + "loss": 1.4084, + "step": 4149 + }, + { + "epoch": 0.4767101257825513, + "grad_norm": 0.41404348611831665, + "learning_rate": 0.0001, + "loss": 1.667, + "step": 4150 + }, + { + "epoch": 0.4768249956923784, + "grad_norm": 0.42411214113235474, + "learning_rate": 0.0001, + "loss": 1.4006, + "step": 4151 + }, + { + "epoch": 0.4769398656022055, + "grad_norm": 0.4193376302719116, + "learning_rate": 0.0001, + "loss": 1.7433, + "step": 4152 + }, + { + "epoch": 0.47705473551203265, + "grad_norm": 0.41245442628860474, + "learning_rate": 0.0001, + "loss": 1.5723, + "step": 4153 + }, + { + "epoch": 0.47716960542185977, + "grad_norm": 0.4130132496356964, + "learning_rate": 0.0001, + "loss": 1.6799, + "step": 4154 + }, + { + "epoch": 0.4772844753316869, + "grad_norm": 0.3704391419887543, + "learning_rate": 0.0001, + "loss": 1.4916, + "step": 4155 + }, + { + "epoch": 0.477399345241514, + "grad_norm": 0.4236399531364441, + "learning_rate": 0.0001, + "loss": 1.7303, + "step": 4156 + }, + { + "epoch": 0.4775142151513411, + "grad_norm": 0.4399377703666687, + "learning_rate": 0.0001, + "loss": 1.8676, + "step": 4157 + }, + { + "epoch": 0.47762908506116825, + "grad_norm": 0.4146808683872223, + "learning_rate": 0.0001, + "loss": 1.4918, + "step": 4158 + }, + { + "epoch": 0.47774395497099537, + "grad_norm": 0.40414994955062866, + "learning_rate": 0.0001, + "loss": 1.6722, + "step": 4159 + }, + { + "epoch": 0.4778588248808225, + "grad_norm": 0.44139429926872253, + "learning_rate": 0.0001, + "loss": 1.5388, + "step": 4160 + }, + { + "epoch": 0.4779736947906496, + "grad_norm": 0.35786864161491394, + "learning_rate": 0.0001, + "loss": 1.3425, + "step": 4161 + }, + { + "epoch": 0.4780885647004767, + "grad_norm": 0.4114287793636322, + "learning_rate": 0.0001, + "loss": 1.6002, + "step": 4162 + }, + { + "epoch": 0.47820343461030385, + "grad_norm": 0.436565101146698, + "learning_rate": 0.0001, + "loss": 1.6089, + "step": 4163 + }, + { + "epoch": 0.47831830452013097, + "grad_norm": 0.42503979802131653, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 4164 + }, + { + "epoch": 0.4784331744299581, + "grad_norm": 0.4688960313796997, + "learning_rate": 0.0001, + "loss": 1.6121, + "step": 4165 + }, + { + "epoch": 0.4785480443397852, + "grad_norm": 0.42915526032447815, + "learning_rate": 0.0001, + "loss": 1.7311, + "step": 4166 + }, + { + "epoch": 0.4786629142496123, + "grad_norm": 0.42920050024986267, + "learning_rate": 0.0001, + "loss": 1.8337, + "step": 4167 + }, + { + "epoch": 0.47877778415943945, + "grad_norm": 0.4068220257759094, + "learning_rate": 0.0001, + "loss": 1.2956, + "step": 4168 + }, + { + "epoch": 0.47889265406926657, + "grad_norm": 0.38509076833724976, + "learning_rate": 0.0001, + "loss": 1.5839, + "step": 4169 + }, + { + "epoch": 0.4790075239790937, + "grad_norm": 0.43463391065597534, + "learning_rate": 0.0001, + "loss": 1.8903, + "step": 4170 + }, + { + "epoch": 0.4791223938889208, + "grad_norm": 0.4463506042957306, + "learning_rate": 0.0001, + "loss": 1.8181, + "step": 4171 + }, + { + "epoch": 0.4792372637987479, + "grad_norm": 0.42022788524627686, + "learning_rate": 0.0001, + "loss": 1.8081, + "step": 4172 + }, + { + "epoch": 0.47935213370857505, + "grad_norm": 0.4187469780445099, + "learning_rate": 0.0001, + "loss": 1.6547, + "step": 4173 + }, + { + "epoch": 0.47946700361840217, + "grad_norm": 0.3888328969478607, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 4174 + }, + { + "epoch": 0.4795818735282293, + "grad_norm": 0.4130741059780121, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 4175 + }, + { + "epoch": 0.4796967434380564, + "grad_norm": 0.3948152959346771, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 4176 + }, + { + "epoch": 0.4798116133478835, + "grad_norm": 0.3814520537853241, + "learning_rate": 0.0001, + "loss": 1.5364, + "step": 4177 + }, + { + "epoch": 0.47992648325771065, + "grad_norm": 0.4245845079421997, + "learning_rate": 0.0001, + "loss": 1.6257, + "step": 4178 + }, + { + "epoch": 0.48004135316753777, + "grad_norm": 0.39747804403305054, + "learning_rate": 0.0001, + "loss": 1.4679, + "step": 4179 + }, + { + "epoch": 0.4801562230773649, + "grad_norm": 0.41911858320236206, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 4180 + }, + { + "epoch": 0.480271092987192, + "grad_norm": 0.4305039048194885, + "learning_rate": 0.0001, + "loss": 1.7368, + "step": 4181 + }, + { + "epoch": 0.4803859628970191, + "grad_norm": 0.3938789665699005, + "learning_rate": 0.0001, + "loss": 1.4452, + "step": 4182 + }, + { + "epoch": 0.48050083280684625, + "grad_norm": 0.45596420764923096, + "learning_rate": 0.0001, + "loss": 1.697, + "step": 4183 + }, + { + "epoch": 0.48061570271667337, + "grad_norm": 0.3951970040798187, + "learning_rate": 0.0001, + "loss": 1.602, + "step": 4184 + }, + { + "epoch": 0.4807305726265005, + "grad_norm": 0.40332433581352234, + "learning_rate": 0.0001, + "loss": 1.7118, + "step": 4185 + }, + { + "epoch": 0.4808454425363276, + "grad_norm": 0.4350488781929016, + "learning_rate": 0.0001, + "loss": 1.7018, + "step": 4186 + }, + { + "epoch": 0.4809603124461547, + "grad_norm": 0.4046489894390106, + "learning_rate": 0.0001, + "loss": 1.2882, + "step": 4187 + }, + { + "epoch": 0.48107518235598185, + "grad_norm": 0.4263879358768463, + "learning_rate": 0.0001, + "loss": 1.6298, + "step": 4188 + }, + { + "epoch": 0.48119005226580897, + "grad_norm": 0.42358294129371643, + "learning_rate": 0.0001, + "loss": 1.791, + "step": 4189 + }, + { + "epoch": 0.4813049221756361, + "grad_norm": 0.3843347430229187, + "learning_rate": 0.0001, + "loss": 1.6233, + "step": 4190 + }, + { + "epoch": 0.4814197920854632, + "grad_norm": 0.43530306220054626, + "learning_rate": 0.0001, + "loss": 1.7777, + "step": 4191 + }, + { + "epoch": 0.4815346619952903, + "grad_norm": 0.4170604646205902, + "learning_rate": 0.0001, + "loss": 1.6756, + "step": 4192 + }, + { + "epoch": 0.48164953190511745, + "grad_norm": 0.39548924565315247, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 4193 + }, + { + "epoch": 0.48176440181494457, + "grad_norm": 0.38482916355133057, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 4194 + }, + { + "epoch": 0.4818792717247717, + "grad_norm": 0.38123294711112976, + "learning_rate": 0.0001, + "loss": 1.4144, + "step": 4195 + }, + { + "epoch": 0.4819941416345988, + "grad_norm": 0.4207984209060669, + "learning_rate": 0.0001, + "loss": 1.6567, + "step": 4196 + }, + { + "epoch": 0.4821090115444259, + "grad_norm": 0.40305450558662415, + "learning_rate": 0.0001, + "loss": 1.6715, + "step": 4197 + }, + { + "epoch": 0.48222388145425304, + "grad_norm": 0.3921899199485779, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 4198 + }, + { + "epoch": 0.48233875136408016, + "grad_norm": 0.3937564790248871, + "learning_rate": 0.0001, + "loss": 1.4346, + "step": 4199 + }, + { + "epoch": 0.4824536212739073, + "grad_norm": 0.44775843620300293, + "learning_rate": 0.0001, + "loss": 1.6271, + "step": 4200 + }, + { + "epoch": 0.4825684911837344, + "grad_norm": 0.424165278673172, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 4201 + }, + { + "epoch": 0.4826833610935615, + "grad_norm": 0.4116969406604767, + "learning_rate": 0.0001, + "loss": 1.7062, + "step": 4202 + }, + { + "epoch": 0.48279823100338864, + "grad_norm": 0.4231579303741455, + "learning_rate": 0.0001, + "loss": 1.8064, + "step": 4203 + }, + { + "epoch": 0.48291310091321576, + "grad_norm": 0.4327984154224396, + "learning_rate": 0.0001, + "loss": 1.7143, + "step": 4204 + }, + { + "epoch": 0.4830279708230429, + "grad_norm": 0.45221564173698425, + "learning_rate": 0.0001, + "loss": 1.7857, + "step": 4205 + }, + { + "epoch": 0.48314284073287, + "grad_norm": 0.4953210651874542, + "learning_rate": 0.0001, + "loss": 1.861, + "step": 4206 + }, + { + "epoch": 0.4832577106426971, + "grad_norm": 0.38952094316482544, + "learning_rate": 0.0001, + "loss": 1.4247, + "step": 4207 + }, + { + "epoch": 0.48337258055252424, + "grad_norm": 0.4227844774723053, + "learning_rate": 0.0001, + "loss": 1.6854, + "step": 4208 + }, + { + "epoch": 0.48348745046235136, + "grad_norm": 0.4414394199848175, + "learning_rate": 0.0001, + "loss": 1.9377, + "step": 4209 + }, + { + "epoch": 0.4836023203721785, + "grad_norm": 0.4291560649871826, + "learning_rate": 0.0001, + "loss": 1.4077, + "step": 4210 + }, + { + "epoch": 0.4837171902820056, + "grad_norm": 0.40819114446640015, + "learning_rate": 0.0001, + "loss": 1.8186, + "step": 4211 + }, + { + "epoch": 0.4838320601918327, + "grad_norm": 0.40871623158454895, + "learning_rate": 0.0001, + "loss": 1.545, + "step": 4212 + }, + { + "epoch": 0.48394693010165984, + "grad_norm": 0.41948193311691284, + "learning_rate": 0.0001, + "loss": 1.7082, + "step": 4213 + }, + { + "epoch": 0.48406180001148696, + "grad_norm": 0.40412595868110657, + "learning_rate": 0.0001, + "loss": 1.6625, + "step": 4214 + }, + { + "epoch": 0.4841766699213141, + "grad_norm": 0.4160853624343872, + "learning_rate": 0.0001, + "loss": 1.5709, + "step": 4215 + }, + { + "epoch": 0.48429153983114126, + "grad_norm": 0.3837149143218994, + "learning_rate": 0.0001, + "loss": 1.4121, + "step": 4216 + }, + { + "epoch": 0.4844064097409684, + "grad_norm": 0.443243145942688, + "learning_rate": 0.0001, + "loss": 1.8064, + "step": 4217 + }, + { + "epoch": 0.4845212796507955, + "grad_norm": 0.38426709175109863, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 4218 + }, + { + "epoch": 0.4846361495606226, + "grad_norm": 0.40559226274490356, + "learning_rate": 0.0001, + "loss": 1.6694, + "step": 4219 + }, + { + "epoch": 0.48475101947044974, + "grad_norm": 0.4205404222011566, + "learning_rate": 0.0001, + "loss": 1.8064, + "step": 4220 + }, + { + "epoch": 0.48486588938027686, + "grad_norm": 0.4019322693347931, + "learning_rate": 0.0001, + "loss": 1.6447, + "step": 4221 + }, + { + "epoch": 0.484980759290104, + "grad_norm": 0.43610960245132446, + "learning_rate": 0.0001, + "loss": 1.6533, + "step": 4222 + }, + { + "epoch": 0.4850956291999311, + "grad_norm": 0.45350828766822815, + "learning_rate": 0.0001, + "loss": 1.6381, + "step": 4223 + }, + { + "epoch": 0.4852104991097582, + "grad_norm": 0.420280396938324, + "learning_rate": 0.0001, + "loss": 1.8019, + "step": 4224 + }, + { + "epoch": 0.48532536901958534, + "grad_norm": 0.4381343424320221, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 4225 + }, + { + "epoch": 0.48544023892941246, + "grad_norm": 0.38392412662506104, + "learning_rate": 0.0001, + "loss": 1.4845, + "step": 4226 + }, + { + "epoch": 0.4855551088392396, + "grad_norm": 0.4646781086921692, + "learning_rate": 0.0001, + "loss": 1.8663, + "step": 4227 + }, + { + "epoch": 0.4856699787490667, + "grad_norm": 0.39009374380111694, + "learning_rate": 0.0001, + "loss": 1.5679, + "step": 4228 + }, + { + "epoch": 0.4857848486588938, + "grad_norm": 0.44190630316734314, + "learning_rate": 0.0001, + "loss": 1.891, + "step": 4229 + }, + { + "epoch": 0.48589971856872094, + "grad_norm": 0.38428959250450134, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 4230 + }, + { + "epoch": 0.48601458847854806, + "grad_norm": 0.3903064429759979, + "learning_rate": 0.0001, + "loss": 1.4688, + "step": 4231 + }, + { + "epoch": 0.4861294583883752, + "grad_norm": 0.39627987146377563, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 4232 + }, + { + "epoch": 0.4862443282982023, + "grad_norm": 0.3723553419113159, + "learning_rate": 0.0001, + "loss": 1.4925, + "step": 4233 + }, + { + "epoch": 0.4863591982080294, + "grad_norm": 0.4561592936515808, + "learning_rate": 0.0001, + "loss": 1.8449, + "step": 4234 + }, + { + "epoch": 0.48647406811785654, + "grad_norm": 0.42943283915519714, + "learning_rate": 0.0001, + "loss": 1.6589, + "step": 4235 + }, + { + "epoch": 0.48658893802768366, + "grad_norm": 0.39778202772140503, + "learning_rate": 0.0001, + "loss": 1.4218, + "step": 4236 + }, + { + "epoch": 0.4867038079375108, + "grad_norm": 0.4467572569847107, + "learning_rate": 0.0001, + "loss": 1.7572, + "step": 4237 + }, + { + "epoch": 0.4868186778473379, + "grad_norm": 0.393715500831604, + "learning_rate": 0.0001, + "loss": 1.56, + "step": 4238 + }, + { + "epoch": 0.486933547757165, + "grad_norm": 0.43201369047164917, + "learning_rate": 0.0001, + "loss": 1.8393, + "step": 4239 + }, + { + "epoch": 0.48704841766699214, + "grad_norm": 0.3823792040348053, + "learning_rate": 0.0001, + "loss": 1.3971, + "step": 4240 + }, + { + "epoch": 0.48716328757681926, + "grad_norm": 0.38682422041893005, + "learning_rate": 0.0001, + "loss": 1.3362, + "step": 4241 + }, + { + "epoch": 0.4872781574866464, + "grad_norm": 0.405185341835022, + "learning_rate": 0.0001, + "loss": 1.5936, + "step": 4242 + }, + { + "epoch": 0.4873930273964735, + "grad_norm": 0.4169776141643524, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 4243 + }, + { + "epoch": 0.4875078973063006, + "grad_norm": 0.399635910987854, + "learning_rate": 0.0001, + "loss": 1.4352, + "step": 4244 + }, + { + "epoch": 0.48762276721612774, + "grad_norm": 0.39507734775543213, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 4245 + }, + { + "epoch": 0.48773763712595486, + "grad_norm": 0.42992520332336426, + "learning_rate": 0.0001, + "loss": 1.5337, + "step": 4246 + }, + { + "epoch": 0.487852507035782, + "grad_norm": 0.4018322825431824, + "learning_rate": 0.0001, + "loss": 1.8032, + "step": 4247 + }, + { + "epoch": 0.4879673769456091, + "grad_norm": 0.40367868542671204, + "learning_rate": 0.0001, + "loss": 1.6862, + "step": 4248 + }, + { + "epoch": 0.4880822468554362, + "grad_norm": 0.4355872571468353, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 4249 + }, + { + "epoch": 0.48819711676526334, + "grad_norm": 0.4191339313983917, + "learning_rate": 0.0001, + "loss": 1.8388, + "step": 4250 + }, + { + "epoch": 0.48831198667509046, + "grad_norm": 0.39485347270965576, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 4251 + }, + { + "epoch": 0.4884268565849176, + "grad_norm": 0.40303096175193787, + "learning_rate": 0.0001, + "loss": 1.718, + "step": 4252 + }, + { + "epoch": 0.4885417264947447, + "grad_norm": 0.4248945116996765, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 4253 + }, + { + "epoch": 0.4886565964045718, + "grad_norm": 0.45680445432662964, + "learning_rate": 0.0001, + "loss": 1.8161, + "step": 4254 + }, + { + "epoch": 0.48877146631439894, + "grad_norm": 0.40725842118263245, + "learning_rate": 0.0001, + "loss": 1.6237, + "step": 4255 + }, + { + "epoch": 0.48888633622422606, + "grad_norm": 0.4762290418148041, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 4256 + }, + { + "epoch": 0.4890012061340532, + "grad_norm": 0.42993873357772827, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 4257 + }, + { + "epoch": 0.4891160760438803, + "grad_norm": 0.38512858748435974, + "learning_rate": 0.0001, + "loss": 1.6684, + "step": 4258 + }, + { + "epoch": 0.4892309459537074, + "grad_norm": 0.4009222984313965, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 4259 + }, + { + "epoch": 0.48934581586353454, + "grad_norm": 0.3873811662197113, + "learning_rate": 0.0001, + "loss": 1.6713, + "step": 4260 + }, + { + "epoch": 0.48946068577336166, + "grad_norm": 0.39984700083732605, + "learning_rate": 0.0001, + "loss": 1.577, + "step": 4261 + }, + { + "epoch": 0.4895755556831888, + "grad_norm": 0.4596845209598541, + "learning_rate": 0.0001, + "loss": 1.7131, + "step": 4262 + }, + { + "epoch": 0.4896904255930159, + "grad_norm": 0.4038037657737732, + "learning_rate": 0.0001, + "loss": 1.7656, + "step": 4263 + }, + { + "epoch": 0.489805295502843, + "grad_norm": 0.39577624201774597, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 4264 + }, + { + "epoch": 0.48992016541267014, + "grad_norm": 0.38728025555610657, + "learning_rate": 0.0001, + "loss": 1.5229, + "step": 4265 + }, + { + "epoch": 0.49003503532249726, + "grad_norm": 0.4493556618690491, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 4266 + }, + { + "epoch": 0.4901499052323244, + "grad_norm": 0.4298556447029114, + "learning_rate": 0.0001, + "loss": 1.8001, + "step": 4267 + }, + { + "epoch": 0.4902647751421515, + "grad_norm": 0.4090805947780609, + "learning_rate": 0.0001, + "loss": 1.6775, + "step": 4268 + }, + { + "epoch": 0.4903796450519786, + "grad_norm": 0.4103395640850067, + "learning_rate": 0.0001, + "loss": 1.6592, + "step": 4269 + }, + { + "epoch": 0.49049451496180574, + "grad_norm": 0.41509976983070374, + "learning_rate": 0.0001, + "loss": 1.6015, + "step": 4270 + }, + { + "epoch": 0.49060938487163286, + "grad_norm": 0.4312727153301239, + "learning_rate": 0.0001, + "loss": 1.7256, + "step": 4271 + }, + { + "epoch": 0.49072425478146, + "grad_norm": 0.42118600010871887, + "learning_rate": 0.0001, + "loss": 1.7583, + "step": 4272 + }, + { + "epoch": 0.4908391246912871, + "grad_norm": 0.4323917329311371, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 4273 + }, + { + "epoch": 0.4909539946011142, + "grad_norm": 0.4115605354309082, + "learning_rate": 0.0001, + "loss": 1.7165, + "step": 4274 + }, + { + "epoch": 0.49106886451094134, + "grad_norm": 0.41110193729400635, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 4275 + }, + { + "epoch": 0.49118373442076846, + "grad_norm": 0.43548524379730225, + "learning_rate": 0.0001, + "loss": 1.7275, + "step": 4276 + }, + { + "epoch": 0.4912986043305956, + "grad_norm": 0.4107424020767212, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 4277 + }, + { + "epoch": 0.4914134742404227, + "grad_norm": 0.4132782220840454, + "learning_rate": 0.0001, + "loss": 1.7099, + "step": 4278 + }, + { + "epoch": 0.4915283441502498, + "grad_norm": 0.4486086070537567, + "learning_rate": 0.0001, + "loss": 1.9551, + "step": 4279 + }, + { + "epoch": 0.49164321406007694, + "grad_norm": 0.41517210006713867, + "learning_rate": 0.0001, + "loss": 1.7067, + "step": 4280 + }, + { + "epoch": 0.49175808396990406, + "grad_norm": 0.37078627943992615, + "learning_rate": 0.0001, + "loss": 1.4954, + "step": 4281 + }, + { + "epoch": 0.4918729538797312, + "grad_norm": 0.41808491945266724, + "learning_rate": 0.0001, + "loss": 1.5452, + "step": 4282 + }, + { + "epoch": 0.4919878237895583, + "grad_norm": 0.3952369689941406, + "learning_rate": 0.0001, + "loss": 1.7146, + "step": 4283 + }, + { + "epoch": 0.4921026936993854, + "grad_norm": 0.43479466438293457, + "learning_rate": 0.0001, + "loss": 1.7984, + "step": 4284 + }, + { + "epoch": 0.4922175636092126, + "grad_norm": 0.44229763746261597, + "learning_rate": 0.0001, + "loss": 1.877, + "step": 4285 + }, + { + "epoch": 0.4923324335190397, + "grad_norm": 0.4501185715198517, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 4286 + }, + { + "epoch": 0.49244730342886683, + "grad_norm": 0.43694865703582764, + "learning_rate": 0.0001, + "loss": 1.512, + "step": 4287 + }, + { + "epoch": 0.49256217333869395, + "grad_norm": 0.4092878997325897, + "learning_rate": 0.0001, + "loss": 1.7253, + "step": 4288 + }, + { + "epoch": 0.4926770432485211, + "grad_norm": 0.4009423851966858, + "learning_rate": 0.0001, + "loss": 1.6641, + "step": 4289 + }, + { + "epoch": 0.4927919131583482, + "grad_norm": 0.4281792640686035, + "learning_rate": 0.0001, + "loss": 1.5892, + "step": 4290 + }, + { + "epoch": 0.4929067830681753, + "grad_norm": 0.49694761633872986, + "learning_rate": 0.0001, + "loss": 1.7392, + "step": 4291 + }, + { + "epoch": 0.49302165297800243, + "grad_norm": 0.44447091221809387, + "learning_rate": 0.0001, + "loss": 1.6769, + "step": 4292 + }, + { + "epoch": 0.49313652288782955, + "grad_norm": 0.3885389566421509, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 4293 + }, + { + "epoch": 0.4932513927976567, + "grad_norm": 0.3895886242389679, + "learning_rate": 0.0001, + "loss": 1.5885, + "step": 4294 + }, + { + "epoch": 0.4933662627074838, + "grad_norm": 0.4553350806236267, + "learning_rate": 0.0001, + "loss": 1.793, + "step": 4295 + }, + { + "epoch": 0.4934811326173109, + "grad_norm": 0.384520560503006, + "learning_rate": 0.0001, + "loss": 1.4896, + "step": 4296 + }, + { + "epoch": 0.49359600252713803, + "grad_norm": 0.4104674160480499, + "learning_rate": 0.0001, + "loss": 1.6698, + "step": 4297 + }, + { + "epoch": 0.49371087243696515, + "grad_norm": 0.4543203115463257, + "learning_rate": 0.0001, + "loss": 1.7958, + "step": 4298 + }, + { + "epoch": 0.4938257423467923, + "grad_norm": 0.40907683968544006, + "learning_rate": 0.0001, + "loss": 1.6956, + "step": 4299 + }, + { + "epoch": 0.4939406122566194, + "grad_norm": 0.43240875005722046, + "learning_rate": 0.0001, + "loss": 1.4569, + "step": 4300 + }, + { + "epoch": 0.4940554821664465, + "grad_norm": 0.41229525208473206, + "learning_rate": 0.0001, + "loss": 1.7708, + "step": 4301 + }, + { + "epoch": 0.49417035207627363, + "grad_norm": 0.4220798909664154, + "learning_rate": 0.0001, + "loss": 1.6977, + "step": 4302 + }, + { + "epoch": 0.49428522198610075, + "grad_norm": 0.3954547345638275, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 4303 + }, + { + "epoch": 0.4944000918959279, + "grad_norm": 0.41736140847206116, + "learning_rate": 0.0001, + "loss": 1.7265, + "step": 4304 + }, + { + "epoch": 0.494514961805755, + "grad_norm": 0.39642804861068726, + "learning_rate": 0.0001, + "loss": 1.5365, + "step": 4305 + }, + { + "epoch": 0.4946298317155821, + "grad_norm": 0.4558791220188141, + "learning_rate": 0.0001, + "loss": 1.6012, + "step": 4306 + }, + { + "epoch": 0.49474470162540923, + "grad_norm": 0.39758846163749695, + "learning_rate": 0.0001, + "loss": 1.6934, + "step": 4307 + }, + { + "epoch": 0.49485957153523635, + "grad_norm": 0.43392619490623474, + "learning_rate": 0.0001, + "loss": 1.7523, + "step": 4308 + }, + { + "epoch": 0.4949744414450635, + "grad_norm": 0.3947283625602722, + "learning_rate": 0.0001, + "loss": 1.5171, + "step": 4309 + }, + { + "epoch": 0.4950893113548906, + "grad_norm": 0.3992256224155426, + "learning_rate": 0.0001, + "loss": 1.7432, + "step": 4310 + }, + { + "epoch": 0.4952041812647177, + "grad_norm": 0.4191253185272217, + "learning_rate": 0.0001, + "loss": 1.7122, + "step": 4311 + }, + { + "epoch": 0.49531905117454483, + "grad_norm": 0.41294851899147034, + "learning_rate": 0.0001, + "loss": 1.8674, + "step": 4312 + }, + { + "epoch": 0.49543392108437195, + "grad_norm": 0.4245891869068146, + "learning_rate": 0.0001, + "loss": 1.5189, + "step": 4313 + }, + { + "epoch": 0.4955487909941991, + "grad_norm": 0.44946831464767456, + "learning_rate": 0.0001, + "loss": 1.8442, + "step": 4314 + }, + { + "epoch": 0.4956636609040262, + "grad_norm": 0.4152979254722595, + "learning_rate": 0.0001, + "loss": 1.6727, + "step": 4315 + }, + { + "epoch": 0.4957785308138533, + "grad_norm": 0.3948223888874054, + "learning_rate": 0.0001, + "loss": 1.6683, + "step": 4316 + }, + { + "epoch": 0.49589340072368043, + "grad_norm": 0.423066109418869, + "learning_rate": 0.0001, + "loss": 1.6065, + "step": 4317 + }, + { + "epoch": 0.49600827063350755, + "grad_norm": 0.3875940144062042, + "learning_rate": 0.0001, + "loss": 1.4541, + "step": 4318 + }, + { + "epoch": 0.4961231405433347, + "grad_norm": 0.4109596014022827, + "learning_rate": 0.0001, + "loss": 1.5759, + "step": 4319 + }, + { + "epoch": 0.4962380104531618, + "grad_norm": 0.42905092239379883, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 4320 + }, + { + "epoch": 0.4963528803629889, + "grad_norm": 0.38710954785346985, + "learning_rate": 0.0001, + "loss": 1.3936, + "step": 4321 + }, + { + "epoch": 0.49646775027281603, + "grad_norm": 0.4148370623588562, + "learning_rate": 0.0001, + "loss": 1.3909, + "step": 4322 + }, + { + "epoch": 0.49658262018264315, + "grad_norm": 0.4470541775226593, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 4323 + }, + { + "epoch": 0.4966974900924703, + "grad_norm": 0.3968261480331421, + "learning_rate": 0.0001, + "loss": 1.5145, + "step": 4324 + }, + { + "epoch": 0.4968123600022974, + "grad_norm": 0.43433794379234314, + "learning_rate": 0.0001, + "loss": 1.7366, + "step": 4325 + }, + { + "epoch": 0.4969272299121245, + "grad_norm": 0.39728692173957825, + "learning_rate": 0.0001, + "loss": 1.4233, + "step": 4326 + }, + { + "epoch": 0.49704209982195163, + "grad_norm": 0.3991283178329468, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 4327 + }, + { + "epoch": 0.49715696973177875, + "grad_norm": 0.4111589193344116, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 4328 + }, + { + "epoch": 0.4972718396416059, + "grad_norm": 0.41188472509384155, + "learning_rate": 0.0001, + "loss": 1.7465, + "step": 4329 + }, + { + "epoch": 0.497386709551433, + "grad_norm": 0.435154527425766, + "learning_rate": 0.0001, + "loss": 1.6414, + "step": 4330 + }, + { + "epoch": 0.4975015794612601, + "grad_norm": 0.40389662981033325, + "learning_rate": 0.0001, + "loss": 1.7158, + "step": 4331 + }, + { + "epoch": 0.49761644937108723, + "grad_norm": 0.4168131947517395, + "learning_rate": 0.0001, + "loss": 1.6856, + "step": 4332 + }, + { + "epoch": 0.49773131928091435, + "grad_norm": 0.43243077397346497, + "learning_rate": 0.0001, + "loss": 1.8349, + "step": 4333 + }, + { + "epoch": 0.4978461891907415, + "grad_norm": 0.42854100465774536, + "learning_rate": 0.0001, + "loss": 1.732, + "step": 4334 + }, + { + "epoch": 0.4979610591005686, + "grad_norm": 0.41653481125831604, + "learning_rate": 0.0001, + "loss": 1.6833, + "step": 4335 + }, + { + "epoch": 0.4980759290103957, + "grad_norm": 0.45638951659202576, + "learning_rate": 0.0001, + "loss": 1.844, + "step": 4336 + }, + { + "epoch": 0.49819079892022283, + "grad_norm": 0.4429507553577423, + "learning_rate": 0.0001, + "loss": 1.6555, + "step": 4337 + }, + { + "epoch": 0.49830566883004995, + "grad_norm": 0.45221859216690063, + "learning_rate": 0.0001, + "loss": 1.8052, + "step": 4338 + }, + { + "epoch": 0.4984205387398771, + "grad_norm": 0.42000943422317505, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 4339 + }, + { + "epoch": 0.4985354086497042, + "grad_norm": 0.44999954104423523, + "learning_rate": 0.0001, + "loss": 1.7288, + "step": 4340 + }, + { + "epoch": 0.4986502785595313, + "grad_norm": 0.44586285948753357, + "learning_rate": 0.0001, + "loss": 1.8559, + "step": 4341 + }, + { + "epoch": 0.49876514846935843, + "grad_norm": 0.41515326499938965, + "learning_rate": 0.0001, + "loss": 1.6989, + "step": 4342 + }, + { + "epoch": 0.49888001837918555, + "grad_norm": 0.40862441062927246, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 4343 + }, + { + "epoch": 0.4989948882890127, + "grad_norm": 0.4498591423034668, + "learning_rate": 0.0001, + "loss": 1.7579, + "step": 4344 + }, + { + "epoch": 0.4991097581988398, + "grad_norm": 0.4223852753639221, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 4345 + }, + { + "epoch": 0.4992246281086669, + "grad_norm": 0.39054638147354126, + "learning_rate": 0.0001, + "loss": 1.3893, + "step": 4346 + }, + { + "epoch": 0.49933949801849403, + "grad_norm": 0.4462035298347473, + "learning_rate": 0.0001, + "loss": 1.7637, + "step": 4347 + }, + { + "epoch": 0.49945436792832115, + "grad_norm": 0.4029097855091095, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 4348 + }, + { + "epoch": 0.4995692378381483, + "grad_norm": 0.4124845862388611, + "learning_rate": 0.0001, + "loss": 1.7638, + "step": 4349 + }, + { + "epoch": 0.4996841077479754, + "grad_norm": 0.37937289476394653, + "learning_rate": 0.0001, + "loss": 1.5661, + "step": 4350 + }, + { + "epoch": 0.4997989776578025, + "grad_norm": 0.40648961067199707, + "learning_rate": 0.0001, + "loss": 1.5733, + "step": 4351 + }, + { + "epoch": 0.49991384756762963, + "grad_norm": 0.4147469997406006, + "learning_rate": 0.0001, + "loss": 1.5053, + "step": 4352 + }, + { + "epoch": 0.5000287174774568, + "grad_norm": 0.42763659358024597, + "learning_rate": 0.0001, + "loss": 1.6607, + "step": 4353 + }, + { + "epoch": 0.5001435873872839, + "grad_norm": 0.4175349771976471, + "learning_rate": 0.0001, + "loss": 1.675, + "step": 4354 + }, + { + "epoch": 0.500258457297111, + "grad_norm": 0.4236915707588196, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 4355 + }, + { + "epoch": 0.5003733272069382, + "grad_norm": 0.41402342915534973, + "learning_rate": 0.0001, + "loss": 1.8008, + "step": 4356 + }, + { + "epoch": 0.5004881971167653, + "grad_norm": 0.4136095345020294, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 4357 + }, + { + "epoch": 0.5006030670265924, + "grad_norm": 0.43916845321655273, + "learning_rate": 0.0001, + "loss": 1.8298, + "step": 4358 + }, + { + "epoch": 0.5007179369364195, + "grad_norm": 0.41625019907951355, + "learning_rate": 0.0001, + "loss": 1.6969, + "step": 4359 + }, + { + "epoch": 0.5008328068462466, + "grad_norm": 0.422838419675827, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 4360 + }, + { + "epoch": 0.5009476767560738, + "grad_norm": 0.356978178024292, + "learning_rate": 0.0001, + "loss": 1.3617, + "step": 4361 + }, + { + "epoch": 0.5010625466659009, + "grad_norm": 0.4372119903564453, + "learning_rate": 0.0001, + "loss": 1.7264, + "step": 4362 + }, + { + "epoch": 0.501177416575728, + "grad_norm": 0.43347251415252686, + "learning_rate": 0.0001, + "loss": 1.6472, + "step": 4363 + }, + { + "epoch": 0.5012922864855551, + "grad_norm": 0.38658055663108826, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 4364 + }, + { + "epoch": 0.5014071563953822, + "grad_norm": 0.3823000192642212, + "learning_rate": 0.0001, + "loss": 1.6238, + "step": 4365 + }, + { + "epoch": 0.5015220263052094, + "grad_norm": 0.44494375586509705, + "learning_rate": 0.0001, + "loss": 1.884, + "step": 4366 + }, + { + "epoch": 0.5016368962150365, + "grad_norm": 0.37995579838752747, + "learning_rate": 0.0001, + "loss": 1.5818, + "step": 4367 + }, + { + "epoch": 0.5017517661248636, + "grad_norm": 0.416050523519516, + "learning_rate": 0.0001, + "loss": 1.8128, + "step": 4368 + }, + { + "epoch": 0.5018666360346907, + "grad_norm": 0.41402295231819153, + "learning_rate": 0.0001, + "loss": 1.7219, + "step": 4369 + }, + { + "epoch": 0.5019815059445178, + "grad_norm": 0.4046735167503357, + "learning_rate": 0.0001, + "loss": 1.6491, + "step": 4370 + }, + { + "epoch": 0.502096375854345, + "grad_norm": 0.4079616367816925, + "learning_rate": 0.0001, + "loss": 1.5906, + "step": 4371 + }, + { + "epoch": 0.5022112457641721, + "grad_norm": 0.42762479186058044, + "learning_rate": 0.0001, + "loss": 1.7243, + "step": 4372 + }, + { + "epoch": 0.5023261156739992, + "grad_norm": 0.42116057872772217, + "learning_rate": 0.0001, + "loss": 1.7058, + "step": 4373 + }, + { + "epoch": 0.5024409855838263, + "grad_norm": 0.43111321330070496, + "learning_rate": 0.0001, + "loss": 1.6998, + "step": 4374 + }, + { + "epoch": 0.5025558554936534, + "grad_norm": 0.43248212337493896, + "learning_rate": 0.0001, + "loss": 1.839, + "step": 4375 + }, + { + "epoch": 0.5026707254034806, + "grad_norm": 0.40893498063087463, + "learning_rate": 0.0001, + "loss": 1.6113, + "step": 4376 + }, + { + "epoch": 0.5027855953133077, + "grad_norm": 0.43067118525505066, + "learning_rate": 0.0001, + "loss": 1.5609, + "step": 4377 + }, + { + "epoch": 0.5029004652231348, + "grad_norm": 0.3844601809978485, + "learning_rate": 0.0001, + "loss": 1.5709, + "step": 4378 + }, + { + "epoch": 0.5030153351329619, + "grad_norm": 0.41813233494758606, + "learning_rate": 0.0001, + "loss": 1.7656, + "step": 4379 + }, + { + "epoch": 0.503130205042789, + "grad_norm": 0.4095008373260498, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 4380 + }, + { + "epoch": 0.5032450749526162, + "grad_norm": 0.4308955669403076, + "learning_rate": 0.0001, + "loss": 1.5697, + "step": 4381 + }, + { + "epoch": 0.5033599448624433, + "grad_norm": 0.41021478176116943, + "learning_rate": 0.0001, + "loss": 1.6172, + "step": 4382 + }, + { + "epoch": 0.5034748147722704, + "grad_norm": 0.37911084294319153, + "learning_rate": 0.0001, + "loss": 1.4941, + "step": 4383 + }, + { + "epoch": 0.5035896846820975, + "grad_norm": 0.3981432020664215, + "learning_rate": 0.0001, + "loss": 1.7432, + "step": 4384 + }, + { + "epoch": 0.5037045545919246, + "grad_norm": 0.3934857249259949, + "learning_rate": 0.0001, + "loss": 1.7143, + "step": 4385 + }, + { + "epoch": 0.5038194245017518, + "grad_norm": 0.40038371086120605, + "learning_rate": 0.0001, + "loss": 1.7377, + "step": 4386 + }, + { + "epoch": 0.5039342944115789, + "grad_norm": 0.422829806804657, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 4387 + }, + { + "epoch": 0.504049164321406, + "grad_norm": 0.3928106725215912, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 4388 + }, + { + "epoch": 0.5041640342312331, + "grad_norm": 0.4150887131690979, + "learning_rate": 0.0001, + "loss": 1.4932, + "step": 4389 + }, + { + "epoch": 0.5042789041410602, + "grad_norm": 0.40696507692337036, + "learning_rate": 0.0001, + "loss": 1.6295, + "step": 4390 + }, + { + "epoch": 0.5043937740508874, + "grad_norm": 0.43781277537345886, + "learning_rate": 0.0001, + "loss": 1.6691, + "step": 4391 + }, + { + "epoch": 0.5045086439607145, + "grad_norm": 0.4133634865283966, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 4392 + }, + { + "epoch": 0.5046235138705416, + "grad_norm": 0.3970431387424469, + "learning_rate": 0.0001, + "loss": 1.2777, + "step": 4393 + }, + { + "epoch": 0.5047383837803687, + "grad_norm": 0.44825881719589233, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 4394 + }, + { + "epoch": 0.5048532536901958, + "grad_norm": 0.38739416003227234, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 4395 + }, + { + "epoch": 0.504968123600023, + "grad_norm": 0.41295453906059265, + "learning_rate": 0.0001, + "loss": 1.7472, + "step": 4396 + }, + { + "epoch": 0.5050829935098501, + "grad_norm": 0.3822581470012665, + "learning_rate": 0.0001, + "loss": 1.5052, + "step": 4397 + }, + { + "epoch": 0.5051978634196772, + "grad_norm": 0.40491965413093567, + "learning_rate": 0.0001, + "loss": 1.49, + "step": 4398 + }, + { + "epoch": 0.5053127333295043, + "grad_norm": 0.48093438148498535, + "learning_rate": 0.0001, + "loss": 1.7826, + "step": 4399 + }, + { + "epoch": 0.5054276032393314, + "grad_norm": 0.4143761694431305, + "learning_rate": 0.0001, + "loss": 1.5054, + "step": 4400 + }, + { + "epoch": 0.5055424731491586, + "grad_norm": 0.4251968264579773, + "learning_rate": 0.0001, + "loss": 1.61, + "step": 4401 + }, + { + "epoch": 0.5056573430589857, + "grad_norm": 0.4139546751976013, + "learning_rate": 0.0001, + "loss": 1.4851, + "step": 4402 + }, + { + "epoch": 0.5057722129688128, + "grad_norm": 0.4102967381477356, + "learning_rate": 0.0001, + "loss": 1.6109, + "step": 4403 + }, + { + "epoch": 0.5058870828786399, + "grad_norm": 0.411199688911438, + "learning_rate": 0.0001, + "loss": 1.3911, + "step": 4404 + }, + { + "epoch": 0.506001952788467, + "grad_norm": 0.43025514483451843, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 4405 + }, + { + "epoch": 0.5061168226982942, + "grad_norm": 0.4648951292037964, + "learning_rate": 0.0001, + "loss": 1.8235, + "step": 4406 + }, + { + "epoch": 0.5062316926081213, + "grad_norm": 0.4003261923789978, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 4407 + }, + { + "epoch": 0.5063465625179484, + "grad_norm": 0.4348773956298828, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 4408 + }, + { + "epoch": 0.5064614324277755, + "grad_norm": 0.3802754580974579, + "learning_rate": 0.0001, + "loss": 1.3187, + "step": 4409 + }, + { + "epoch": 0.5065763023376026, + "grad_norm": 0.4715876579284668, + "learning_rate": 0.0001, + "loss": 1.9138, + "step": 4410 + }, + { + "epoch": 0.5066911722474298, + "grad_norm": 0.42883962392807007, + "learning_rate": 0.0001, + "loss": 1.5673, + "step": 4411 + }, + { + "epoch": 0.5068060421572569, + "grad_norm": 0.3982695937156677, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 4412 + }, + { + "epoch": 0.506920912067084, + "grad_norm": 0.4575648605823517, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 4413 + }, + { + "epoch": 0.5070357819769111, + "grad_norm": 0.422573983669281, + "learning_rate": 0.0001, + "loss": 1.7639, + "step": 4414 + }, + { + "epoch": 0.5071506518867382, + "grad_norm": 0.41428160667419434, + "learning_rate": 0.0001, + "loss": 1.5767, + "step": 4415 + }, + { + "epoch": 0.5072655217965654, + "grad_norm": 0.39599475264549255, + "learning_rate": 0.0001, + "loss": 1.442, + "step": 4416 + }, + { + "epoch": 0.5073803917063925, + "grad_norm": 0.41915363073349, + "learning_rate": 0.0001, + "loss": 1.6944, + "step": 4417 + }, + { + "epoch": 0.5074952616162196, + "grad_norm": 0.4034424126148224, + "learning_rate": 0.0001, + "loss": 1.5641, + "step": 4418 + }, + { + "epoch": 0.5076101315260467, + "grad_norm": 0.4440549612045288, + "learning_rate": 0.0001, + "loss": 1.9008, + "step": 4419 + }, + { + "epoch": 0.5077250014358738, + "grad_norm": 0.43546631932258606, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 4420 + }, + { + "epoch": 0.507839871345701, + "grad_norm": 0.4137895107269287, + "learning_rate": 0.0001, + "loss": 1.7555, + "step": 4421 + }, + { + "epoch": 0.5079547412555281, + "grad_norm": 0.4285581111907959, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 4422 + }, + { + "epoch": 0.5080696111653552, + "grad_norm": 0.39121928811073303, + "learning_rate": 0.0001, + "loss": 1.5603, + "step": 4423 + }, + { + "epoch": 0.5081844810751823, + "grad_norm": 0.45524442195892334, + "learning_rate": 0.0001, + "loss": 1.6992, + "step": 4424 + }, + { + "epoch": 0.5082993509850094, + "grad_norm": 0.42169350385665894, + "learning_rate": 0.0001, + "loss": 1.4717, + "step": 4425 + }, + { + "epoch": 0.5084142208948366, + "grad_norm": 0.44150853157043457, + "learning_rate": 0.0001, + "loss": 1.735, + "step": 4426 + }, + { + "epoch": 0.5085290908046637, + "grad_norm": 0.4405604600906372, + "learning_rate": 0.0001, + "loss": 1.7817, + "step": 4427 + }, + { + "epoch": 0.5086439607144908, + "grad_norm": 0.4324178099632263, + "learning_rate": 0.0001, + "loss": 1.8098, + "step": 4428 + }, + { + "epoch": 0.5087588306243179, + "grad_norm": 0.3942691683769226, + "learning_rate": 0.0001, + "loss": 1.5467, + "step": 4429 + }, + { + "epoch": 0.508873700534145, + "grad_norm": 0.3946244716644287, + "learning_rate": 0.0001, + "loss": 1.5341, + "step": 4430 + }, + { + "epoch": 0.5089885704439722, + "grad_norm": 0.4541240930557251, + "learning_rate": 0.0001, + "loss": 1.8342, + "step": 4431 + }, + { + "epoch": 0.5091034403537993, + "grad_norm": 0.4114912450313568, + "learning_rate": 0.0001, + "loss": 1.6808, + "step": 4432 + }, + { + "epoch": 0.5092183102636264, + "grad_norm": 0.40291449427604675, + "learning_rate": 0.0001, + "loss": 1.4467, + "step": 4433 + }, + { + "epoch": 0.5093331801734535, + "grad_norm": 0.43536198139190674, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 4434 + }, + { + "epoch": 0.5094480500832806, + "grad_norm": 0.4481135606765747, + "learning_rate": 0.0001, + "loss": 1.8252, + "step": 4435 + }, + { + "epoch": 0.5095629199931078, + "grad_norm": 0.41042807698249817, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 4436 + }, + { + "epoch": 0.5096777899029349, + "grad_norm": 0.44647055864334106, + "learning_rate": 0.0001, + "loss": 1.7893, + "step": 4437 + }, + { + "epoch": 0.509792659812762, + "grad_norm": 0.4402143061161041, + "learning_rate": 0.0001, + "loss": 1.7, + "step": 4438 + }, + { + "epoch": 0.5099075297225891, + "grad_norm": 0.44720834493637085, + "learning_rate": 0.0001, + "loss": 1.6523, + "step": 4439 + }, + { + "epoch": 0.5100223996324162, + "grad_norm": 0.43396279215812683, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 4440 + }, + { + "epoch": 0.5101372695422434, + "grad_norm": 0.42125704884529114, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 4441 + }, + { + "epoch": 0.5102521394520705, + "grad_norm": 0.4178660213947296, + "learning_rate": 0.0001, + "loss": 1.7153, + "step": 4442 + }, + { + "epoch": 0.5103670093618976, + "grad_norm": 0.490590363740921, + "learning_rate": 0.0001, + "loss": 1.6246, + "step": 4443 + }, + { + "epoch": 0.5104818792717247, + "grad_norm": 0.40945470333099365, + "learning_rate": 0.0001, + "loss": 1.4578, + "step": 4444 + }, + { + "epoch": 0.5105967491815518, + "grad_norm": 0.44269421696662903, + "learning_rate": 0.0001, + "loss": 1.6619, + "step": 4445 + }, + { + "epoch": 0.510711619091379, + "grad_norm": 0.41479024291038513, + "learning_rate": 0.0001, + "loss": 1.6295, + "step": 4446 + }, + { + "epoch": 0.5108264890012061, + "grad_norm": 0.3876959979534149, + "learning_rate": 0.0001, + "loss": 1.505, + "step": 4447 + }, + { + "epoch": 0.5109413589110332, + "grad_norm": 0.4491494297981262, + "learning_rate": 0.0001, + "loss": 1.681, + "step": 4448 + }, + { + "epoch": 0.5110562288208603, + "grad_norm": 0.41451436281204224, + "learning_rate": 0.0001, + "loss": 1.6657, + "step": 4449 + }, + { + "epoch": 0.5111710987306874, + "grad_norm": 0.39285629987716675, + "learning_rate": 0.0001, + "loss": 1.5493, + "step": 4450 + }, + { + "epoch": 0.5112859686405146, + "grad_norm": 0.39508745074272156, + "learning_rate": 0.0001, + "loss": 1.4939, + "step": 4451 + }, + { + "epoch": 0.5114008385503417, + "grad_norm": 0.44860512018203735, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 4452 + }, + { + "epoch": 0.5115157084601688, + "grad_norm": 0.4382397532463074, + "learning_rate": 0.0001, + "loss": 1.6823, + "step": 4453 + }, + { + "epoch": 0.5116305783699959, + "grad_norm": 0.40884271264076233, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 4454 + }, + { + "epoch": 0.511745448279823, + "grad_norm": 0.43925192952156067, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 4455 + }, + { + "epoch": 0.5118603181896503, + "grad_norm": 0.42059096693992615, + "learning_rate": 0.0001, + "loss": 1.6763, + "step": 4456 + }, + { + "epoch": 0.5119751880994774, + "grad_norm": 0.4306631088256836, + "learning_rate": 0.0001, + "loss": 1.5764, + "step": 4457 + }, + { + "epoch": 0.5120900580093045, + "grad_norm": 0.41633883118629456, + "learning_rate": 0.0001, + "loss": 1.6652, + "step": 4458 + }, + { + "epoch": 0.5122049279191316, + "grad_norm": 0.4185430705547333, + "learning_rate": 0.0001, + "loss": 1.6695, + "step": 4459 + }, + { + "epoch": 0.5123197978289588, + "grad_norm": 0.38397789001464844, + "learning_rate": 0.0001, + "loss": 1.3248, + "step": 4460 + }, + { + "epoch": 0.5124346677387859, + "grad_norm": 0.42769157886505127, + "learning_rate": 0.0001, + "loss": 1.6386, + "step": 4461 + }, + { + "epoch": 0.512549537648613, + "grad_norm": 0.40281736850738525, + "learning_rate": 0.0001, + "loss": 1.553, + "step": 4462 + }, + { + "epoch": 0.5126644075584401, + "grad_norm": 0.48889243602752686, + "learning_rate": 0.0001, + "loss": 1.8824, + "step": 4463 + }, + { + "epoch": 0.5127792774682672, + "grad_norm": 0.4288654625415802, + "learning_rate": 0.0001, + "loss": 1.6987, + "step": 4464 + }, + { + "epoch": 0.5128941473780944, + "grad_norm": 0.4444088637828827, + "learning_rate": 0.0001, + "loss": 1.7862, + "step": 4465 + }, + { + "epoch": 0.5130090172879215, + "grad_norm": 0.3586702346801758, + "learning_rate": 0.0001, + "loss": 1.4236, + "step": 4466 + }, + { + "epoch": 0.5131238871977486, + "grad_norm": 0.41832202672958374, + "learning_rate": 0.0001, + "loss": 1.6599, + "step": 4467 + }, + { + "epoch": 0.5132387571075757, + "grad_norm": 0.41678473353385925, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 4468 + }, + { + "epoch": 0.5133536270174028, + "grad_norm": 0.39657461643218994, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 4469 + }, + { + "epoch": 0.51346849692723, + "grad_norm": 0.4119437336921692, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 4470 + }, + { + "epoch": 0.5135833668370571, + "grad_norm": 0.4205493927001953, + "learning_rate": 0.0001, + "loss": 1.5289, + "step": 4471 + }, + { + "epoch": 0.5136982367468842, + "grad_norm": 0.4292590618133545, + "learning_rate": 0.0001, + "loss": 1.6888, + "step": 4472 + }, + { + "epoch": 0.5138131066567113, + "grad_norm": 0.4054730534553528, + "learning_rate": 0.0001, + "loss": 1.6858, + "step": 4473 + }, + { + "epoch": 0.5139279765665384, + "grad_norm": 0.5104745626449585, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 4474 + }, + { + "epoch": 0.5140428464763656, + "grad_norm": 0.4039926528930664, + "learning_rate": 0.0001, + "loss": 1.7071, + "step": 4475 + }, + { + "epoch": 0.5141577163861927, + "grad_norm": 0.38494694232940674, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 4476 + }, + { + "epoch": 0.5142725862960198, + "grad_norm": 0.4241875112056732, + "learning_rate": 0.0001, + "loss": 1.6744, + "step": 4477 + }, + { + "epoch": 0.5143874562058469, + "grad_norm": 0.41834181547164917, + "learning_rate": 0.0001, + "loss": 1.4409, + "step": 4478 + }, + { + "epoch": 0.514502326115674, + "grad_norm": 0.38371768593788147, + "learning_rate": 0.0001, + "loss": 1.4855, + "step": 4479 + }, + { + "epoch": 0.5146171960255012, + "grad_norm": 0.42289024591445923, + "learning_rate": 0.0001, + "loss": 1.8331, + "step": 4480 + }, + { + "epoch": 0.5147320659353283, + "grad_norm": 0.41524538397789, + "learning_rate": 0.0001, + "loss": 1.6312, + "step": 4481 + }, + { + "epoch": 0.5148469358451554, + "grad_norm": 0.4520059823989868, + "learning_rate": 0.0001, + "loss": 1.7944, + "step": 4482 + }, + { + "epoch": 0.5149618057549825, + "grad_norm": 0.46914947032928467, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 4483 + }, + { + "epoch": 0.5150766756648096, + "grad_norm": 0.4201009273529053, + "learning_rate": 0.0001, + "loss": 1.7413, + "step": 4484 + }, + { + "epoch": 0.5151915455746368, + "grad_norm": 0.43544942140579224, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 4485 + }, + { + "epoch": 0.5153064154844639, + "grad_norm": 0.3825484812259674, + "learning_rate": 0.0001, + "loss": 1.543, + "step": 4486 + }, + { + "epoch": 0.515421285394291, + "grad_norm": 0.46979475021362305, + "learning_rate": 0.0001, + "loss": 1.765, + "step": 4487 + }, + { + "epoch": 0.5155361553041181, + "grad_norm": 0.4438627064228058, + "learning_rate": 0.0001, + "loss": 1.796, + "step": 4488 + }, + { + "epoch": 0.5156510252139452, + "grad_norm": 0.4040473699569702, + "learning_rate": 0.0001, + "loss": 1.6934, + "step": 4489 + }, + { + "epoch": 0.5157658951237724, + "grad_norm": 0.3772587776184082, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 4490 + }, + { + "epoch": 0.5158807650335995, + "grad_norm": 0.4254496693611145, + "learning_rate": 0.0001, + "loss": 1.7002, + "step": 4491 + }, + { + "epoch": 0.5159956349434266, + "grad_norm": 0.43310895562171936, + "learning_rate": 0.0001, + "loss": 1.7606, + "step": 4492 + }, + { + "epoch": 0.5161105048532537, + "grad_norm": 0.43428242206573486, + "learning_rate": 0.0001, + "loss": 1.8536, + "step": 4493 + }, + { + "epoch": 0.5162253747630808, + "grad_norm": 0.4234102964401245, + "learning_rate": 0.0001, + "loss": 1.6754, + "step": 4494 + }, + { + "epoch": 0.516340244672908, + "grad_norm": 0.4157521724700928, + "learning_rate": 0.0001, + "loss": 1.717, + "step": 4495 + }, + { + "epoch": 0.5164551145827351, + "grad_norm": 0.4052300751209259, + "learning_rate": 0.0001, + "loss": 1.4448, + "step": 4496 + }, + { + "epoch": 0.5165699844925622, + "grad_norm": 0.4426092207431793, + "learning_rate": 0.0001, + "loss": 1.683, + "step": 4497 + }, + { + "epoch": 0.5166848544023893, + "grad_norm": 0.41626426577568054, + "learning_rate": 0.0001, + "loss": 1.5067, + "step": 4498 + }, + { + "epoch": 0.5167997243122164, + "grad_norm": 0.38398024439811707, + "learning_rate": 0.0001, + "loss": 1.6062, + "step": 4499 + }, + { + "epoch": 0.5169145942220436, + "grad_norm": 0.4056454598903656, + "learning_rate": 0.0001, + "loss": 1.5062, + "step": 4500 + }, + { + "epoch": 0.5170294641318707, + "grad_norm": 0.4996061325073242, + "learning_rate": 0.0001, + "loss": 1.7317, + "step": 4501 + }, + { + "epoch": 0.5171443340416978, + "grad_norm": 0.40144461393356323, + "learning_rate": 0.0001, + "loss": 1.6122, + "step": 4502 + }, + { + "epoch": 0.5172592039515249, + "grad_norm": 0.4855468273162842, + "learning_rate": 0.0001, + "loss": 1.6999, + "step": 4503 + }, + { + "epoch": 0.517374073861352, + "grad_norm": 0.4194789230823517, + "learning_rate": 0.0001, + "loss": 1.6659, + "step": 4504 + }, + { + "epoch": 0.5174889437711792, + "grad_norm": 0.3837035596370697, + "learning_rate": 0.0001, + "loss": 1.5493, + "step": 4505 + }, + { + "epoch": 0.5176038136810063, + "grad_norm": 0.39867258071899414, + "learning_rate": 0.0001, + "loss": 1.3699, + "step": 4506 + }, + { + "epoch": 0.5177186835908334, + "grad_norm": 0.4189174473285675, + "learning_rate": 0.0001, + "loss": 1.6099, + "step": 4507 + }, + { + "epoch": 0.5178335535006605, + "grad_norm": 0.43066421151161194, + "learning_rate": 0.0001, + "loss": 1.7007, + "step": 4508 + }, + { + "epoch": 0.5179484234104876, + "grad_norm": 0.40682274103164673, + "learning_rate": 0.0001, + "loss": 1.4341, + "step": 4509 + }, + { + "epoch": 0.5180632933203148, + "grad_norm": 0.4145391583442688, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 4510 + }, + { + "epoch": 0.5181781632301419, + "grad_norm": 0.42484042048454285, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 4511 + }, + { + "epoch": 0.518293033139969, + "grad_norm": 0.48949721455574036, + "learning_rate": 0.0001, + "loss": 1.7115, + "step": 4512 + }, + { + "epoch": 0.5184079030497961, + "grad_norm": 0.46473968029022217, + "learning_rate": 0.0001, + "loss": 1.778, + "step": 4513 + }, + { + "epoch": 0.5185227729596232, + "grad_norm": 0.4237271845340729, + "learning_rate": 0.0001, + "loss": 1.7289, + "step": 4514 + }, + { + "epoch": 0.5186376428694504, + "grad_norm": 0.4288172721862793, + "learning_rate": 0.0001, + "loss": 1.8229, + "step": 4515 + }, + { + "epoch": 0.5187525127792775, + "grad_norm": 0.39855021238327026, + "learning_rate": 0.0001, + "loss": 1.7383, + "step": 4516 + }, + { + "epoch": 0.5188673826891046, + "grad_norm": 0.4043690860271454, + "learning_rate": 0.0001, + "loss": 1.7437, + "step": 4517 + }, + { + "epoch": 0.5189822525989317, + "grad_norm": 0.4174858331680298, + "learning_rate": 0.0001, + "loss": 1.4415, + "step": 4518 + }, + { + "epoch": 0.5190971225087588, + "grad_norm": 0.4092039167881012, + "learning_rate": 0.0001, + "loss": 1.7563, + "step": 4519 + }, + { + "epoch": 0.519211992418586, + "grad_norm": 0.44929736852645874, + "learning_rate": 0.0001, + "loss": 1.8381, + "step": 4520 + }, + { + "epoch": 0.5193268623284131, + "grad_norm": 0.38499635457992554, + "learning_rate": 0.0001, + "loss": 1.5668, + "step": 4521 + }, + { + "epoch": 0.5194417322382402, + "grad_norm": 0.4187855124473572, + "learning_rate": 0.0001, + "loss": 1.6891, + "step": 4522 + }, + { + "epoch": 0.5195566021480673, + "grad_norm": 0.43714576959609985, + "learning_rate": 0.0001, + "loss": 1.7564, + "step": 4523 + }, + { + "epoch": 0.5196714720578944, + "grad_norm": 0.42951980233192444, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 4524 + }, + { + "epoch": 0.5197863419677216, + "grad_norm": 0.4339199662208557, + "learning_rate": 0.0001, + "loss": 1.5095, + "step": 4525 + }, + { + "epoch": 0.5199012118775487, + "grad_norm": 0.434725821018219, + "learning_rate": 0.0001, + "loss": 1.6993, + "step": 4526 + }, + { + "epoch": 0.5200160817873758, + "grad_norm": 0.42468908429145813, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 4527 + }, + { + "epoch": 0.5201309516972029, + "grad_norm": 0.42453402280807495, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 4528 + }, + { + "epoch": 0.52024582160703, + "grad_norm": 0.4131234586238861, + "learning_rate": 0.0001, + "loss": 1.4308, + "step": 4529 + }, + { + "epoch": 0.5203606915168572, + "grad_norm": 0.40424486994743347, + "learning_rate": 0.0001, + "loss": 1.5853, + "step": 4530 + }, + { + "epoch": 0.5204755614266843, + "grad_norm": 0.45649197697639465, + "learning_rate": 0.0001, + "loss": 1.8034, + "step": 4531 + }, + { + "epoch": 0.5205904313365114, + "grad_norm": 0.412251740694046, + "learning_rate": 0.0001, + "loss": 1.6628, + "step": 4532 + }, + { + "epoch": 0.5207053012463385, + "grad_norm": 0.4387061297893524, + "learning_rate": 0.0001, + "loss": 1.8548, + "step": 4533 + }, + { + "epoch": 0.5208201711561656, + "grad_norm": 0.4186844229698181, + "learning_rate": 0.0001, + "loss": 1.7774, + "step": 4534 + }, + { + "epoch": 0.5209350410659928, + "grad_norm": 0.41150832176208496, + "learning_rate": 0.0001, + "loss": 1.6646, + "step": 4535 + }, + { + "epoch": 0.5210499109758199, + "grad_norm": 0.4150691628456116, + "learning_rate": 0.0001, + "loss": 1.579, + "step": 4536 + }, + { + "epoch": 0.521164780885647, + "grad_norm": 0.4016995131969452, + "learning_rate": 0.0001, + "loss": 1.6122, + "step": 4537 + }, + { + "epoch": 0.5212796507954741, + "grad_norm": 0.4465637505054474, + "learning_rate": 0.0001, + "loss": 1.8251, + "step": 4538 + }, + { + "epoch": 0.5213945207053012, + "grad_norm": 0.38491567969322205, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 4539 + }, + { + "epoch": 0.5215093906151284, + "grad_norm": 0.3878459632396698, + "learning_rate": 0.0001, + "loss": 1.6515, + "step": 4540 + }, + { + "epoch": 0.5216242605249555, + "grad_norm": 0.38580122590065, + "learning_rate": 0.0001, + "loss": 1.55, + "step": 4541 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.3909989595413208, + "learning_rate": 0.0001, + "loss": 1.6571, + "step": 4542 + }, + { + "epoch": 0.5218540003446097, + "grad_norm": 0.49450787901878357, + "learning_rate": 0.0001, + "loss": 1.744, + "step": 4543 + }, + { + "epoch": 0.5219688702544368, + "grad_norm": 0.43028536438941956, + "learning_rate": 0.0001, + "loss": 1.6926, + "step": 4544 + }, + { + "epoch": 0.522083740164264, + "grad_norm": 0.4146410822868347, + "learning_rate": 0.0001, + "loss": 1.5019, + "step": 4545 + }, + { + "epoch": 0.5221986100740911, + "grad_norm": 0.40828803181648254, + "learning_rate": 0.0001, + "loss": 1.696, + "step": 4546 + }, + { + "epoch": 0.5223134799839182, + "grad_norm": 0.38090986013412476, + "learning_rate": 0.0001, + "loss": 1.4923, + "step": 4547 + }, + { + "epoch": 0.5224283498937453, + "grad_norm": 0.3952951431274414, + "learning_rate": 0.0001, + "loss": 1.524, + "step": 4548 + }, + { + "epoch": 0.5225432198035724, + "grad_norm": 0.4446341395378113, + "learning_rate": 0.0001, + "loss": 1.7717, + "step": 4549 + }, + { + "epoch": 0.5226580897133996, + "grad_norm": 0.4435203969478607, + "learning_rate": 0.0001, + "loss": 1.6232, + "step": 4550 + }, + { + "epoch": 0.5227729596232267, + "grad_norm": 0.4518386423587799, + "learning_rate": 0.0001, + "loss": 1.6649, + "step": 4551 + }, + { + "epoch": 0.5228878295330538, + "grad_norm": 0.4210054576396942, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 4552 + }, + { + "epoch": 0.5230026994428809, + "grad_norm": 0.464871346950531, + "learning_rate": 0.0001, + "loss": 1.7242, + "step": 4553 + }, + { + "epoch": 0.523117569352708, + "grad_norm": 0.4055049419403076, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 4554 + }, + { + "epoch": 0.5232324392625352, + "grad_norm": 0.4289097189903259, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 4555 + }, + { + "epoch": 0.5233473091723623, + "grad_norm": 0.40787026286125183, + "learning_rate": 0.0001, + "loss": 1.5455, + "step": 4556 + }, + { + "epoch": 0.5234621790821894, + "grad_norm": 0.41342809796333313, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 4557 + }, + { + "epoch": 0.5235770489920165, + "grad_norm": 0.3959392011165619, + "learning_rate": 0.0001, + "loss": 1.5286, + "step": 4558 + }, + { + "epoch": 0.5236919189018436, + "grad_norm": 0.41905930638313293, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 4559 + }, + { + "epoch": 0.5238067888116708, + "grad_norm": 0.4155671000480652, + "learning_rate": 0.0001, + "loss": 1.6784, + "step": 4560 + }, + { + "epoch": 0.5239216587214979, + "grad_norm": 0.3972344398498535, + "learning_rate": 0.0001, + "loss": 1.5804, + "step": 4561 + }, + { + "epoch": 0.524036528631325, + "grad_norm": 0.40942510962486267, + "learning_rate": 0.0001, + "loss": 1.648, + "step": 4562 + }, + { + "epoch": 0.5241513985411521, + "grad_norm": 0.43989264965057373, + "learning_rate": 0.0001, + "loss": 1.8291, + "step": 4563 + }, + { + "epoch": 0.5242662684509792, + "grad_norm": 0.38364219665527344, + "learning_rate": 0.0001, + "loss": 1.6185, + "step": 4564 + }, + { + "epoch": 0.5243811383608064, + "grad_norm": 0.3964410424232483, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 4565 + }, + { + "epoch": 0.5244960082706335, + "grad_norm": 0.3990534842014313, + "learning_rate": 0.0001, + "loss": 1.62, + "step": 4566 + }, + { + "epoch": 0.5246108781804606, + "grad_norm": 0.4154457747936249, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 4567 + }, + { + "epoch": 0.5247257480902877, + "grad_norm": 0.4000798761844635, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 4568 + }, + { + "epoch": 0.5248406180001148, + "grad_norm": 0.4398530423641205, + "learning_rate": 0.0001, + "loss": 1.7647, + "step": 4569 + }, + { + "epoch": 0.524955487909942, + "grad_norm": 0.41386568546295166, + "learning_rate": 0.0001, + "loss": 1.7705, + "step": 4570 + }, + { + "epoch": 0.5250703578197691, + "grad_norm": 0.48047158122062683, + "learning_rate": 0.0001, + "loss": 1.8748, + "step": 4571 + }, + { + "epoch": 0.5251852277295962, + "grad_norm": 0.38870546221733093, + "learning_rate": 0.0001, + "loss": 1.4763, + "step": 4572 + }, + { + "epoch": 0.5253000976394233, + "grad_norm": 0.48835334181785583, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 4573 + }, + { + "epoch": 0.5254149675492504, + "grad_norm": 0.39779576659202576, + "learning_rate": 0.0001, + "loss": 1.5337, + "step": 4574 + }, + { + "epoch": 0.5255298374590776, + "grad_norm": 0.3713628947734833, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 4575 + }, + { + "epoch": 0.5256447073689047, + "grad_norm": 0.39374494552612305, + "learning_rate": 0.0001, + "loss": 1.5818, + "step": 4576 + }, + { + "epoch": 0.5257595772787318, + "grad_norm": 0.40991827845573425, + "learning_rate": 0.0001, + "loss": 1.6499, + "step": 4577 + }, + { + "epoch": 0.5258744471885589, + "grad_norm": 0.4329836964607239, + "learning_rate": 0.0001, + "loss": 1.7166, + "step": 4578 + }, + { + "epoch": 0.525989317098386, + "grad_norm": 0.44446802139282227, + "learning_rate": 0.0001, + "loss": 1.7636, + "step": 4579 + }, + { + "epoch": 0.5261041870082132, + "grad_norm": 0.42868572473526, + "learning_rate": 0.0001, + "loss": 1.8013, + "step": 4580 + }, + { + "epoch": 0.5262190569180403, + "grad_norm": 0.38663357496261597, + "learning_rate": 0.0001, + "loss": 1.5617, + "step": 4581 + }, + { + "epoch": 0.5263339268278674, + "grad_norm": 0.4398927390575409, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 4582 + }, + { + "epoch": 0.5264487967376945, + "grad_norm": 0.4069356620311737, + "learning_rate": 0.0001, + "loss": 1.5597, + "step": 4583 + }, + { + "epoch": 0.5265636666475216, + "grad_norm": 0.40244022011756897, + "learning_rate": 0.0001, + "loss": 1.5808, + "step": 4584 + }, + { + "epoch": 0.5266785365573488, + "grad_norm": 0.42762094736099243, + "learning_rate": 0.0001, + "loss": 1.7083, + "step": 4585 + }, + { + "epoch": 0.5267934064671759, + "grad_norm": 0.4556163251399994, + "learning_rate": 0.0001, + "loss": 1.824, + "step": 4586 + }, + { + "epoch": 0.526908276377003, + "grad_norm": 0.4342927634716034, + "learning_rate": 0.0001, + "loss": 1.496, + "step": 4587 + }, + { + "epoch": 0.5270231462868301, + "grad_norm": 0.4018527865409851, + "learning_rate": 0.0001, + "loss": 1.6768, + "step": 4588 + }, + { + "epoch": 0.5271380161966572, + "grad_norm": 0.3828233480453491, + "learning_rate": 0.0001, + "loss": 1.31, + "step": 4589 + }, + { + "epoch": 0.5272528861064844, + "grad_norm": 0.4347113072872162, + "learning_rate": 0.0001, + "loss": 1.7758, + "step": 4590 + }, + { + "epoch": 0.5273677560163115, + "grad_norm": 0.38657423853874207, + "learning_rate": 0.0001, + "loss": 1.6179, + "step": 4591 + }, + { + "epoch": 0.5274826259261386, + "grad_norm": 0.42126983404159546, + "learning_rate": 0.0001, + "loss": 1.4377, + "step": 4592 + }, + { + "epoch": 0.5275974958359658, + "grad_norm": 0.45303142070770264, + "learning_rate": 0.0001, + "loss": 1.7843, + "step": 4593 + }, + { + "epoch": 0.5277123657457929, + "grad_norm": 0.395085871219635, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 4594 + }, + { + "epoch": 0.5278272356556201, + "grad_norm": 0.3973239064216614, + "learning_rate": 0.0001, + "loss": 1.6489, + "step": 4595 + }, + { + "epoch": 0.5279421055654472, + "grad_norm": 0.3768349587917328, + "learning_rate": 0.0001, + "loss": 1.325, + "step": 4596 + }, + { + "epoch": 0.5280569754752743, + "grad_norm": 0.43345212936401367, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 4597 + }, + { + "epoch": 0.5281718453851014, + "grad_norm": 0.4553665816783905, + "learning_rate": 0.0001, + "loss": 1.7439, + "step": 4598 + }, + { + "epoch": 0.5282867152949285, + "grad_norm": 0.4854236841201782, + "learning_rate": 0.0001, + "loss": 1.9161, + "step": 4599 + }, + { + "epoch": 0.5284015852047557, + "grad_norm": 0.421282559633255, + "learning_rate": 0.0001, + "loss": 1.5079, + "step": 4600 + }, + { + "epoch": 0.5285164551145828, + "grad_norm": 0.4243543744087219, + "learning_rate": 0.0001, + "loss": 1.6191, + "step": 4601 + }, + { + "epoch": 0.5286313250244099, + "grad_norm": 0.4060072600841522, + "learning_rate": 0.0001, + "loss": 1.5374, + "step": 4602 + }, + { + "epoch": 0.528746194934237, + "grad_norm": 0.4349260628223419, + "learning_rate": 0.0001, + "loss": 1.8664, + "step": 4603 + }, + { + "epoch": 0.5288610648440641, + "grad_norm": 0.40768831968307495, + "learning_rate": 0.0001, + "loss": 1.5477, + "step": 4604 + }, + { + "epoch": 0.5289759347538913, + "grad_norm": 0.39794713258743286, + "learning_rate": 0.0001, + "loss": 1.6151, + "step": 4605 + }, + { + "epoch": 0.5290908046637184, + "grad_norm": 0.4161005914211273, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 4606 + }, + { + "epoch": 0.5292056745735455, + "grad_norm": 0.44163885712623596, + "learning_rate": 0.0001, + "loss": 1.8116, + "step": 4607 + }, + { + "epoch": 0.5293205444833726, + "grad_norm": 0.5268819332122803, + "learning_rate": 0.0001, + "loss": 1.8569, + "step": 4608 + }, + { + "epoch": 0.5294354143931997, + "grad_norm": 0.46857014298439026, + "learning_rate": 0.0001, + "loss": 1.6907, + "step": 4609 + }, + { + "epoch": 0.5295502843030269, + "grad_norm": 0.401594340801239, + "learning_rate": 0.0001, + "loss": 1.5846, + "step": 4610 + }, + { + "epoch": 0.529665154212854, + "grad_norm": 0.42373210191726685, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 4611 + }, + { + "epoch": 0.5297800241226811, + "grad_norm": 0.40101566910743713, + "learning_rate": 0.0001, + "loss": 1.6537, + "step": 4612 + }, + { + "epoch": 0.5298948940325082, + "grad_norm": 0.4653320014476776, + "learning_rate": 0.0001, + "loss": 1.6741, + "step": 4613 + }, + { + "epoch": 0.5300097639423353, + "grad_norm": 0.4147876501083374, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 4614 + }, + { + "epoch": 0.5301246338521625, + "grad_norm": 0.4062972068786621, + "learning_rate": 0.0001, + "loss": 1.5747, + "step": 4615 + }, + { + "epoch": 0.5302395037619896, + "grad_norm": 0.4018900394439697, + "learning_rate": 0.0001, + "loss": 1.6461, + "step": 4616 + }, + { + "epoch": 0.5303543736718167, + "grad_norm": 0.41791051626205444, + "learning_rate": 0.0001, + "loss": 1.7225, + "step": 4617 + }, + { + "epoch": 0.5304692435816438, + "grad_norm": 0.41558316349983215, + "learning_rate": 0.0001, + "loss": 1.8203, + "step": 4618 + }, + { + "epoch": 0.5305841134914709, + "grad_norm": 0.4309985935688019, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 4619 + }, + { + "epoch": 0.5306989834012981, + "grad_norm": 0.4364014267921448, + "learning_rate": 0.0001, + "loss": 1.8082, + "step": 4620 + }, + { + "epoch": 0.5308138533111252, + "grad_norm": 0.41828298568725586, + "learning_rate": 0.0001, + "loss": 1.6167, + "step": 4621 + }, + { + "epoch": 0.5309287232209523, + "grad_norm": 0.3910794258117676, + "learning_rate": 0.0001, + "loss": 1.5617, + "step": 4622 + }, + { + "epoch": 0.5310435931307794, + "grad_norm": 0.4651833772659302, + "learning_rate": 0.0001, + "loss": 1.7743, + "step": 4623 + }, + { + "epoch": 0.5311584630406065, + "grad_norm": 0.44561001658439636, + "learning_rate": 0.0001, + "loss": 1.7388, + "step": 4624 + }, + { + "epoch": 0.5312733329504337, + "grad_norm": 0.3969671130180359, + "learning_rate": 0.0001, + "loss": 1.5671, + "step": 4625 + }, + { + "epoch": 0.5313882028602608, + "grad_norm": 0.4239133894443512, + "learning_rate": 0.0001, + "loss": 1.6436, + "step": 4626 + }, + { + "epoch": 0.5315030727700879, + "grad_norm": 0.3926708698272705, + "learning_rate": 0.0001, + "loss": 1.6082, + "step": 4627 + }, + { + "epoch": 0.531617942679915, + "grad_norm": 0.4034792184829712, + "learning_rate": 0.0001, + "loss": 1.5234, + "step": 4628 + }, + { + "epoch": 0.5317328125897421, + "grad_norm": 0.4075685143470764, + "learning_rate": 0.0001, + "loss": 1.597, + "step": 4629 + }, + { + "epoch": 0.5318476824995693, + "grad_norm": 0.41798871755599976, + "learning_rate": 0.0001, + "loss": 1.7057, + "step": 4630 + }, + { + "epoch": 0.5319625524093964, + "grad_norm": 0.44857409596443176, + "learning_rate": 0.0001, + "loss": 1.7721, + "step": 4631 + }, + { + "epoch": 0.5320774223192235, + "grad_norm": 0.39396408200263977, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 4632 + }, + { + "epoch": 0.5321922922290506, + "grad_norm": 0.4028686285018921, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 4633 + }, + { + "epoch": 0.5323071621388777, + "grad_norm": 0.4449384808540344, + "learning_rate": 0.0001, + "loss": 1.6526, + "step": 4634 + }, + { + "epoch": 0.5324220320487049, + "grad_norm": 0.4593904912471771, + "learning_rate": 0.0001, + "loss": 1.8295, + "step": 4635 + }, + { + "epoch": 0.532536901958532, + "grad_norm": 0.40952321887016296, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 4636 + }, + { + "epoch": 0.5326517718683591, + "grad_norm": 0.421491801738739, + "learning_rate": 0.0001, + "loss": 1.5559, + "step": 4637 + }, + { + "epoch": 0.5327666417781862, + "grad_norm": 0.5018342733383179, + "learning_rate": 0.0001, + "loss": 1.7603, + "step": 4638 + }, + { + "epoch": 0.5328815116880133, + "grad_norm": 0.394999235868454, + "learning_rate": 0.0001, + "loss": 1.4546, + "step": 4639 + }, + { + "epoch": 0.5329963815978405, + "grad_norm": 0.3952369689941406, + "learning_rate": 0.0001, + "loss": 1.5503, + "step": 4640 + }, + { + "epoch": 0.5331112515076676, + "grad_norm": 0.44612810015678406, + "learning_rate": 0.0001, + "loss": 1.8486, + "step": 4641 + }, + { + "epoch": 0.5332261214174947, + "grad_norm": 0.4116518497467041, + "learning_rate": 0.0001, + "loss": 1.7686, + "step": 4642 + }, + { + "epoch": 0.5333409913273218, + "grad_norm": 0.4047883152961731, + "learning_rate": 0.0001, + "loss": 1.6333, + "step": 4643 + }, + { + "epoch": 0.5334558612371489, + "grad_norm": 0.4536430239677429, + "learning_rate": 0.0001, + "loss": 1.8459, + "step": 4644 + }, + { + "epoch": 0.5335707311469761, + "grad_norm": 0.4192769527435303, + "learning_rate": 0.0001, + "loss": 1.7317, + "step": 4645 + }, + { + "epoch": 0.5336856010568032, + "grad_norm": 0.426587849855423, + "learning_rate": 0.0001, + "loss": 1.8026, + "step": 4646 + }, + { + "epoch": 0.5338004709666303, + "grad_norm": 0.40313446521759033, + "learning_rate": 0.0001, + "loss": 1.6022, + "step": 4647 + }, + { + "epoch": 0.5339153408764574, + "grad_norm": 0.37919333577156067, + "learning_rate": 0.0001, + "loss": 1.3846, + "step": 4648 + }, + { + "epoch": 0.5340302107862845, + "grad_norm": 0.4192160367965698, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 4649 + }, + { + "epoch": 0.5341450806961117, + "grad_norm": 0.42048901319503784, + "learning_rate": 0.0001, + "loss": 1.6995, + "step": 4650 + }, + { + "epoch": 0.5342599506059388, + "grad_norm": 0.4094361960887909, + "learning_rate": 0.0001, + "loss": 1.6199, + "step": 4651 + }, + { + "epoch": 0.5343748205157659, + "grad_norm": 0.4340735971927643, + "learning_rate": 0.0001, + "loss": 1.6967, + "step": 4652 + }, + { + "epoch": 0.534489690425593, + "grad_norm": 0.41635647416114807, + "learning_rate": 0.0001, + "loss": 1.7417, + "step": 4653 + }, + { + "epoch": 0.5346045603354201, + "grad_norm": 0.42730745673179626, + "learning_rate": 0.0001, + "loss": 1.6606, + "step": 4654 + }, + { + "epoch": 0.5347194302452473, + "grad_norm": 0.43178364634513855, + "learning_rate": 0.0001, + "loss": 1.5394, + "step": 4655 + }, + { + "epoch": 0.5348343001550744, + "grad_norm": 0.4151526391506195, + "learning_rate": 0.0001, + "loss": 1.7846, + "step": 4656 + }, + { + "epoch": 0.5349491700649015, + "grad_norm": 0.4182411730289459, + "learning_rate": 0.0001, + "loss": 1.6844, + "step": 4657 + }, + { + "epoch": 0.5350640399747286, + "grad_norm": 0.423403263092041, + "learning_rate": 0.0001, + "loss": 1.7087, + "step": 4658 + }, + { + "epoch": 0.5351789098845557, + "grad_norm": 0.41044798493385315, + "learning_rate": 0.0001, + "loss": 1.6801, + "step": 4659 + }, + { + "epoch": 0.5352937797943829, + "grad_norm": 0.4252939224243164, + "learning_rate": 0.0001, + "loss": 1.7553, + "step": 4660 + }, + { + "epoch": 0.53540864970421, + "grad_norm": 0.4315437078475952, + "learning_rate": 0.0001, + "loss": 1.7822, + "step": 4661 + }, + { + "epoch": 0.5355235196140371, + "grad_norm": 0.4264442026615143, + "learning_rate": 0.0001, + "loss": 1.7567, + "step": 4662 + }, + { + "epoch": 0.5356383895238642, + "grad_norm": 0.3990129232406616, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 4663 + }, + { + "epoch": 0.5357532594336913, + "grad_norm": 0.4486422836780548, + "learning_rate": 0.0001, + "loss": 1.9106, + "step": 4664 + }, + { + "epoch": 0.5358681293435185, + "grad_norm": 0.4086044430732727, + "learning_rate": 0.0001, + "loss": 1.7818, + "step": 4665 + }, + { + "epoch": 0.5359829992533456, + "grad_norm": 0.39344853162765503, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 4666 + }, + { + "epoch": 0.5360978691631727, + "grad_norm": 0.4144117534160614, + "learning_rate": 0.0001, + "loss": 1.4561, + "step": 4667 + }, + { + "epoch": 0.5362127390729998, + "grad_norm": 0.40345409512519836, + "learning_rate": 0.0001, + "loss": 1.6228, + "step": 4668 + }, + { + "epoch": 0.5363276089828269, + "grad_norm": 0.4093291461467743, + "learning_rate": 0.0001, + "loss": 1.5001, + "step": 4669 + }, + { + "epoch": 0.5364424788926541, + "grad_norm": 0.39114460349082947, + "learning_rate": 0.0001, + "loss": 1.4858, + "step": 4670 + }, + { + "epoch": 0.5365573488024812, + "grad_norm": 0.4037148058414459, + "learning_rate": 0.0001, + "loss": 1.5966, + "step": 4671 + }, + { + "epoch": 0.5366722187123083, + "grad_norm": 0.3919554054737091, + "learning_rate": 0.0001, + "loss": 1.5426, + "step": 4672 + }, + { + "epoch": 0.5367870886221354, + "grad_norm": 0.45879948139190674, + "learning_rate": 0.0001, + "loss": 1.8282, + "step": 4673 + }, + { + "epoch": 0.5369019585319625, + "grad_norm": 0.4263037443161011, + "learning_rate": 0.0001, + "loss": 1.7901, + "step": 4674 + }, + { + "epoch": 0.5370168284417897, + "grad_norm": 0.4132154583930969, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 4675 + }, + { + "epoch": 0.5371316983516168, + "grad_norm": 0.4135374426841736, + "learning_rate": 0.0001, + "loss": 1.6075, + "step": 4676 + }, + { + "epoch": 0.5372465682614439, + "grad_norm": 0.43650367856025696, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 4677 + }, + { + "epoch": 0.537361438171271, + "grad_norm": 0.5079533457756042, + "learning_rate": 0.0001, + "loss": 1.7532, + "step": 4678 + }, + { + "epoch": 0.5374763080810981, + "grad_norm": 0.46438872814178467, + "learning_rate": 0.0001, + "loss": 1.7692, + "step": 4679 + }, + { + "epoch": 0.5375911779909253, + "grad_norm": 0.43697604537010193, + "learning_rate": 0.0001, + "loss": 1.7386, + "step": 4680 + }, + { + "epoch": 0.5377060479007524, + "grad_norm": 0.41365137696266174, + "learning_rate": 0.0001, + "loss": 1.5255, + "step": 4681 + }, + { + "epoch": 0.5378209178105795, + "grad_norm": 0.42320525646209717, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 4682 + }, + { + "epoch": 0.5379357877204066, + "grad_norm": 0.40122270584106445, + "learning_rate": 0.0001, + "loss": 1.4893, + "step": 4683 + }, + { + "epoch": 0.5380506576302337, + "grad_norm": 0.434253990650177, + "learning_rate": 0.0001, + "loss": 1.5002, + "step": 4684 + }, + { + "epoch": 0.5381655275400609, + "grad_norm": 0.4121977686882019, + "learning_rate": 0.0001, + "loss": 1.639, + "step": 4685 + }, + { + "epoch": 0.538280397449888, + "grad_norm": 0.4040560722351074, + "learning_rate": 0.0001, + "loss": 1.5075, + "step": 4686 + }, + { + "epoch": 0.5383952673597151, + "grad_norm": 0.4145204722881317, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 4687 + }, + { + "epoch": 0.5385101372695422, + "grad_norm": 0.39439859986305237, + "learning_rate": 0.0001, + "loss": 1.4481, + "step": 4688 + }, + { + "epoch": 0.5386250071793693, + "grad_norm": 0.4014344811439514, + "learning_rate": 0.0001, + "loss": 1.5778, + "step": 4689 + }, + { + "epoch": 0.5387398770891965, + "grad_norm": 0.42101871967315674, + "learning_rate": 0.0001, + "loss": 1.6869, + "step": 4690 + }, + { + "epoch": 0.5388547469990236, + "grad_norm": 0.45102766156196594, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 4691 + }, + { + "epoch": 0.5389696169088507, + "grad_norm": 0.43720656633377075, + "learning_rate": 0.0001, + "loss": 1.6872, + "step": 4692 + }, + { + "epoch": 0.5390844868186778, + "grad_norm": 0.4264827072620392, + "learning_rate": 0.0001, + "loss": 1.4308, + "step": 4693 + }, + { + "epoch": 0.5391993567285049, + "grad_norm": 0.4725572466850281, + "learning_rate": 0.0001, + "loss": 1.7914, + "step": 4694 + }, + { + "epoch": 0.5393142266383321, + "grad_norm": 0.4113958477973938, + "learning_rate": 0.0001, + "loss": 1.625, + "step": 4695 + }, + { + "epoch": 0.5394290965481592, + "grad_norm": 0.4475966691970825, + "learning_rate": 0.0001, + "loss": 1.7054, + "step": 4696 + }, + { + "epoch": 0.5395439664579863, + "grad_norm": 0.43861621618270874, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 4697 + }, + { + "epoch": 0.5396588363678134, + "grad_norm": 0.4491176903247833, + "learning_rate": 0.0001, + "loss": 1.5547, + "step": 4698 + }, + { + "epoch": 0.5397737062776405, + "grad_norm": 0.4262305498123169, + "learning_rate": 0.0001, + "loss": 1.7657, + "step": 4699 + }, + { + "epoch": 0.5398885761874677, + "grad_norm": 0.4179171025753021, + "learning_rate": 0.0001, + "loss": 1.5766, + "step": 4700 + }, + { + "epoch": 0.5400034460972948, + "grad_norm": 0.39834854006767273, + "learning_rate": 0.0001, + "loss": 1.5324, + "step": 4701 + }, + { + "epoch": 0.5401183160071219, + "grad_norm": 0.42186471819877625, + "learning_rate": 0.0001, + "loss": 1.8005, + "step": 4702 + }, + { + "epoch": 0.540233185916949, + "grad_norm": 0.42857295274734497, + "learning_rate": 0.0001, + "loss": 1.7399, + "step": 4703 + }, + { + "epoch": 0.5403480558267761, + "grad_norm": 0.4083864390850067, + "learning_rate": 0.0001, + "loss": 1.707, + "step": 4704 + }, + { + "epoch": 0.5404629257366033, + "grad_norm": 0.4472416341304779, + "learning_rate": 0.0001, + "loss": 1.5065, + "step": 4705 + }, + { + "epoch": 0.5405777956464304, + "grad_norm": 0.41945722699165344, + "learning_rate": 0.0001, + "loss": 1.5004, + "step": 4706 + }, + { + "epoch": 0.5406926655562575, + "grad_norm": 0.41861411929130554, + "learning_rate": 0.0001, + "loss": 1.6596, + "step": 4707 + }, + { + "epoch": 0.5408075354660846, + "grad_norm": 0.45254752039909363, + "learning_rate": 0.0001, + "loss": 1.6306, + "step": 4708 + }, + { + "epoch": 0.5409224053759117, + "grad_norm": 0.42132070660591125, + "learning_rate": 0.0001, + "loss": 1.672, + "step": 4709 + }, + { + "epoch": 0.5410372752857389, + "grad_norm": 0.4404768943786621, + "learning_rate": 0.0001, + "loss": 1.6902, + "step": 4710 + }, + { + "epoch": 0.541152145195566, + "grad_norm": 0.42923909425735474, + "learning_rate": 0.0001, + "loss": 1.7181, + "step": 4711 + }, + { + "epoch": 0.5412670151053931, + "grad_norm": 0.4722338020801544, + "learning_rate": 0.0001, + "loss": 1.8949, + "step": 4712 + }, + { + "epoch": 0.5413818850152202, + "grad_norm": 0.4394582211971283, + "learning_rate": 0.0001, + "loss": 1.7403, + "step": 4713 + }, + { + "epoch": 0.5414967549250473, + "grad_norm": 0.3982559144496918, + "learning_rate": 0.0001, + "loss": 1.594, + "step": 4714 + }, + { + "epoch": 0.5416116248348745, + "grad_norm": 0.42785125970840454, + "learning_rate": 0.0001, + "loss": 1.7779, + "step": 4715 + }, + { + "epoch": 0.5417264947447016, + "grad_norm": 0.38715437054634094, + "learning_rate": 0.0001, + "loss": 1.6039, + "step": 4716 + }, + { + "epoch": 0.5418413646545287, + "grad_norm": 0.4336237609386444, + "learning_rate": 0.0001, + "loss": 1.6872, + "step": 4717 + }, + { + "epoch": 0.5419562345643558, + "grad_norm": 0.4250262975692749, + "learning_rate": 0.0001, + "loss": 1.6197, + "step": 4718 + }, + { + "epoch": 0.5420711044741829, + "grad_norm": 4.473871231079102, + "learning_rate": 0.0001, + "loss": 1.6598, + "step": 4719 + }, + { + "epoch": 0.5421859743840101, + "grad_norm": 0.4131799042224884, + "learning_rate": 0.0001, + "loss": 1.5988, + "step": 4720 + }, + { + "epoch": 0.5423008442938372, + "grad_norm": 0.43611302971839905, + "learning_rate": 0.0001, + "loss": 1.6548, + "step": 4721 + }, + { + "epoch": 0.5424157142036643, + "grad_norm": 0.4363075792789459, + "learning_rate": 0.0001, + "loss": 1.5381, + "step": 4722 + }, + { + "epoch": 0.5425305841134914, + "grad_norm": 0.37694451212882996, + "learning_rate": 0.0001, + "loss": 1.4295, + "step": 4723 + }, + { + "epoch": 0.5426454540233185, + "grad_norm": 0.4214974641799927, + "learning_rate": 0.0001, + "loss": 1.5914, + "step": 4724 + }, + { + "epoch": 0.5427603239331457, + "grad_norm": 0.46396970748901367, + "learning_rate": 0.0001, + "loss": 1.6362, + "step": 4725 + }, + { + "epoch": 0.5428751938429728, + "grad_norm": 0.45121777057647705, + "learning_rate": 0.0001, + "loss": 1.7544, + "step": 4726 + }, + { + "epoch": 0.5429900637527999, + "grad_norm": 0.40204212069511414, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 4727 + }, + { + "epoch": 0.543104933662627, + "grad_norm": 0.4438580572605133, + "learning_rate": 0.0001, + "loss": 1.7001, + "step": 4728 + }, + { + "epoch": 0.5432198035724541, + "grad_norm": 0.41876137256622314, + "learning_rate": 0.0001, + "loss": 1.612, + "step": 4729 + }, + { + "epoch": 0.5433346734822814, + "grad_norm": 0.4323340952396393, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 4730 + }, + { + "epoch": 0.5434495433921085, + "grad_norm": 0.4222297668457031, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 4731 + }, + { + "epoch": 0.5435644133019356, + "grad_norm": 0.4332796633243561, + "learning_rate": 0.0001, + "loss": 1.6464, + "step": 4732 + }, + { + "epoch": 0.5436792832117627, + "grad_norm": 0.4457077383995056, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 4733 + }, + { + "epoch": 0.5437941531215899, + "grad_norm": 0.40646892786026, + "learning_rate": 0.0001, + "loss": 1.3511, + "step": 4734 + }, + { + "epoch": 0.543909023031417, + "grad_norm": 0.42264634370803833, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 4735 + }, + { + "epoch": 0.5440238929412441, + "grad_norm": 7.803822994232178, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 4736 + }, + { + "epoch": 0.5441387628510712, + "grad_norm": 0.4260199964046478, + "learning_rate": 0.0001, + "loss": 1.7074, + "step": 4737 + }, + { + "epoch": 0.5442536327608983, + "grad_norm": 0.4420236051082611, + "learning_rate": 0.0001, + "loss": 1.8486, + "step": 4738 + }, + { + "epoch": 0.5443685026707255, + "grad_norm": 0.39475566148757935, + "learning_rate": 0.0001, + "loss": 1.448, + "step": 4739 + }, + { + "epoch": 0.5444833725805526, + "grad_norm": 0.48185980319976807, + "learning_rate": 0.0001, + "loss": 1.9924, + "step": 4740 + }, + { + "epoch": 0.5445982424903797, + "grad_norm": 0.4522726833820343, + "learning_rate": 0.0001, + "loss": 1.7875, + "step": 4741 + }, + { + "epoch": 0.5447131124002068, + "grad_norm": 0.45154809951782227, + "learning_rate": 0.0001, + "loss": 1.5931, + "step": 4742 + }, + { + "epoch": 0.5448279823100339, + "grad_norm": 0.39332887530326843, + "learning_rate": 0.0001, + "loss": 1.6388, + "step": 4743 + }, + { + "epoch": 0.544942852219861, + "grad_norm": 0.4441022574901581, + "learning_rate": 0.0001, + "loss": 1.5344, + "step": 4744 + }, + { + "epoch": 0.5450577221296882, + "grad_norm": 0.42498818039894104, + "learning_rate": 0.0001, + "loss": 1.7215, + "step": 4745 + }, + { + "epoch": 0.5451725920395153, + "grad_norm": 0.39692699909210205, + "learning_rate": 0.0001, + "loss": 1.5205, + "step": 4746 + }, + { + "epoch": 0.5452874619493424, + "grad_norm": 0.4348728060722351, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 4747 + }, + { + "epoch": 0.5454023318591695, + "grad_norm": 0.4652867615222931, + "learning_rate": 0.0001, + "loss": 1.6545, + "step": 4748 + }, + { + "epoch": 0.5455172017689967, + "grad_norm": 3.736886739730835, + "learning_rate": 0.0001, + "loss": 1.74, + "step": 4749 + }, + { + "epoch": 0.5456320716788238, + "grad_norm": 0.4334099292755127, + "learning_rate": 0.0001, + "loss": 1.7789, + "step": 4750 + }, + { + "epoch": 0.5457469415886509, + "grad_norm": 0.4378454387187958, + "learning_rate": 0.0001, + "loss": 1.6962, + "step": 4751 + }, + { + "epoch": 0.545861811498478, + "grad_norm": 0.46604326367378235, + "learning_rate": 0.0001, + "loss": 1.6623, + "step": 4752 + }, + { + "epoch": 0.5459766814083051, + "grad_norm": 13.21650505065918, + "learning_rate": 0.0001, + "loss": 2.8761, + "step": 4753 + }, + { + "epoch": 0.5460915513181323, + "grad_norm": 0.6902241706848145, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 4754 + }, + { + "epoch": 0.5462064212279594, + "grad_norm": 14.453927040100098, + "learning_rate": 0.0001, + "loss": 1.5613, + "step": 4755 + }, + { + "epoch": 0.5463212911377865, + "grad_norm": 1.930003046989441, + "learning_rate": 0.0001, + "loss": 1.6478, + "step": 4756 + }, + { + "epoch": 0.5464361610476136, + "grad_norm": 0.43484631180763245, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 4757 + }, + { + "epoch": 0.5465510309574407, + "grad_norm": 0.4576379656791687, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 4758 + }, + { + "epoch": 0.5466659008672679, + "grad_norm": 0.4986869990825653, + "learning_rate": 0.0001, + "loss": 1.5963, + "step": 4759 + }, + { + "epoch": 0.546780770777095, + "grad_norm": 0.4431321620941162, + "learning_rate": 0.0001, + "loss": 1.5434, + "step": 4760 + }, + { + "epoch": 0.5468956406869221, + "grad_norm": 0.9614063501358032, + "learning_rate": 0.0001, + "loss": 1.7638, + "step": 4761 + }, + { + "epoch": 0.5470105105967492, + "grad_norm": 0.505508303642273, + "learning_rate": 0.0001, + "loss": 1.7313, + "step": 4762 + }, + { + "epoch": 0.5471253805065763, + "grad_norm": 0.44963207840919495, + "learning_rate": 0.0001, + "loss": 1.465, + "step": 4763 + }, + { + "epoch": 0.5472402504164035, + "grad_norm": 0.4592747092247009, + "learning_rate": 0.0001, + "loss": 1.7584, + "step": 4764 + }, + { + "epoch": 0.5473551203262306, + "grad_norm": 0.5370006561279297, + "learning_rate": 0.0001, + "loss": 1.744, + "step": 4765 + }, + { + "epoch": 0.5474699902360577, + "grad_norm": 0.4782159924507141, + "learning_rate": 0.0001, + "loss": 1.5303, + "step": 4766 + }, + { + "epoch": 0.5475848601458848, + "grad_norm": 0.48043233156204224, + "learning_rate": 0.0001, + "loss": 1.886, + "step": 4767 + }, + { + "epoch": 0.5476997300557119, + "grad_norm": 0.47774550318717957, + "learning_rate": 0.0001, + "loss": 1.6583, + "step": 4768 + }, + { + "epoch": 0.547814599965539, + "grad_norm": 0.4518395662307739, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 4769 + }, + { + "epoch": 0.5479294698753662, + "grad_norm": 0.4505947530269623, + "learning_rate": 0.0001, + "loss": 1.6369, + "step": 4770 + }, + { + "epoch": 0.5480443397851933, + "grad_norm": 0.480541467666626, + "learning_rate": 0.0001, + "loss": 1.8852, + "step": 4771 + }, + { + "epoch": 0.5481592096950204, + "grad_norm": 0.48814818263053894, + "learning_rate": 0.0001, + "loss": 1.6918, + "step": 4772 + }, + { + "epoch": 0.5482740796048475, + "grad_norm": 0.5180444121360779, + "learning_rate": 0.0001, + "loss": 1.684, + "step": 4773 + }, + { + "epoch": 0.5483889495146747, + "grad_norm": 0.4888029396533966, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 4774 + }, + { + "epoch": 0.5485038194245018, + "grad_norm": 0.5578180551528931, + "learning_rate": 0.0001, + "loss": 1.5176, + "step": 4775 + }, + { + "epoch": 0.5486186893343289, + "grad_norm": 0.47964030504226685, + "learning_rate": 0.0001, + "loss": 1.7186, + "step": 4776 + }, + { + "epoch": 0.548733559244156, + "grad_norm": 0.45499885082244873, + "learning_rate": 0.0001, + "loss": 1.6304, + "step": 4777 + }, + { + "epoch": 0.5488484291539831, + "grad_norm": 0.4086899757385254, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 4778 + }, + { + "epoch": 0.5489632990638103, + "grad_norm": 0.4332214593887329, + "learning_rate": 0.0001, + "loss": 1.6879, + "step": 4779 + }, + { + "epoch": 0.5490781689736374, + "grad_norm": 0.4639904499053955, + "learning_rate": 0.0001, + "loss": 1.741, + "step": 4780 + }, + { + "epoch": 0.5491930388834645, + "grad_norm": 0.4308798313140869, + "learning_rate": 0.0001, + "loss": 1.6483, + "step": 4781 + }, + { + "epoch": 0.5493079087932916, + "grad_norm": 0.4329991042613983, + "learning_rate": 0.0001, + "loss": 1.6593, + "step": 4782 + }, + { + "epoch": 0.5494227787031187, + "grad_norm": 0.48464497923851013, + "learning_rate": 0.0001, + "loss": 1.6769, + "step": 4783 + }, + { + "epoch": 0.5495376486129459, + "grad_norm": 0.44216474890708923, + "learning_rate": 0.0001, + "loss": 1.744, + "step": 4784 + }, + { + "epoch": 0.549652518522773, + "grad_norm": 0.44045913219451904, + "learning_rate": 0.0001, + "loss": 1.6436, + "step": 4785 + }, + { + "epoch": 0.5497673884326001, + "grad_norm": 0.4664250910282135, + "learning_rate": 0.0001, + "loss": 1.634, + "step": 4786 + }, + { + "epoch": 0.5498822583424272, + "grad_norm": 0.41855260729789734, + "learning_rate": 0.0001, + "loss": 1.7692, + "step": 4787 + }, + { + "epoch": 0.5499971282522543, + "grad_norm": 0.41777145862579346, + "learning_rate": 0.0001, + "loss": 1.5246, + "step": 4788 + }, + { + "epoch": 0.5501119981620815, + "grad_norm": 0.4096580147743225, + "learning_rate": 0.0001, + "loss": 1.7369, + "step": 4789 + }, + { + "epoch": 0.5502268680719086, + "grad_norm": 0.4547279477119446, + "learning_rate": 0.0001, + "loss": 1.7846, + "step": 4790 + }, + { + "epoch": 0.5503417379817357, + "grad_norm": 0.4305475354194641, + "learning_rate": 0.0001, + "loss": 1.5839, + "step": 4791 + }, + { + "epoch": 0.5504566078915628, + "grad_norm": 0.461092084646225, + "learning_rate": 0.0001, + "loss": 1.5893, + "step": 4792 + }, + { + "epoch": 0.5505714778013899, + "grad_norm": 0.43664494156837463, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 4793 + }, + { + "epoch": 0.550686347711217, + "grad_norm": 0.4284107983112335, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 4794 + }, + { + "epoch": 0.5508012176210442, + "grad_norm": 0.47756850719451904, + "learning_rate": 0.0001, + "loss": 1.7553, + "step": 4795 + }, + { + "epoch": 0.5509160875308713, + "grad_norm": 0.39941316843032837, + "learning_rate": 0.0001, + "loss": 1.5273, + "step": 4796 + }, + { + "epoch": 0.5510309574406984, + "grad_norm": 0.4312814474105835, + "learning_rate": 0.0001, + "loss": 1.4235, + "step": 4797 + }, + { + "epoch": 0.5511458273505255, + "grad_norm": 0.4587937891483307, + "learning_rate": 0.0001, + "loss": 1.7078, + "step": 4798 + }, + { + "epoch": 0.5512606972603527, + "grad_norm": 0.43980634212493896, + "learning_rate": 0.0001, + "loss": 1.7708, + "step": 4799 + }, + { + "epoch": 0.5513755671701798, + "grad_norm": 0.4155730903148651, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 4800 + }, + { + "epoch": 0.5514904370800069, + "grad_norm": 0.4062232971191406, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 4801 + }, + { + "epoch": 0.551605306989834, + "grad_norm": 0.43724343180656433, + "learning_rate": 0.0001, + "loss": 1.6522, + "step": 4802 + }, + { + "epoch": 0.5517201768996611, + "grad_norm": 0.4811611473560333, + "learning_rate": 0.0001, + "loss": 1.4776, + "step": 4803 + }, + { + "epoch": 0.5518350468094882, + "grad_norm": 0.3989713191986084, + "learning_rate": 0.0001, + "loss": 1.2717, + "step": 4804 + }, + { + "epoch": 0.5519499167193154, + "grad_norm": 0.43779945373535156, + "learning_rate": 0.0001, + "loss": 1.7521, + "step": 4805 + }, + { + "epoch": 0.5520647866291425, + "grad_norm": 0.46011802554130554, + "learning_rate": 0.0001, + "loss": 1.7228, + "step": 4806 + }, + { + "epoch": 0.5521796565389696, + "grad_norm": 0.3978514075279236, + "learning_rate": 0.0001, + "loss": 1.5681, + "step": 4807 + }, + { + "epoch": 0.5522945264487967, + "grad_norm": 0.4245090186595917, + "learning_rate": 0.0001, + "loss": 1.6901, + "step": 4808 + }, + { + "epoch": 0.5524093963586238, + "grad_norm": 0.46046197414398193, + "learning_rate": 0.0001, + "loss": 1.6541, + "step": 4809 + }, + { + "epoch": 0.552524266268451, + "grad_norm": 0.40693244338035583, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 4810 + }, + { + "epoch": 0.5526391361782781, + "grad_norm": 0.4510886073112488, + "learning_rate": 0.0001, + "loss": 1.6634, + "step": 4811 + }, + { + "epoch": 0.5527540060881052, + "grad_norm": 0.43159717321395874, + "learning_rate": 0.0001, + "loss": 1.7195, + "step": 4812 + }, + { + "epoch": 0.5528688759979323, + "grad_norm": 0.6375383734703064, + "learning_rate": 0.0001, + "loss": 1.4753, + "step": 4813 + }, + { + "epoch": 0.5529837459077594, + "grad_norm": 0.4993058741092682, + "learning_rate": 0.0001, + "loss": 1.8643, + "step": 4814 + }, + { + "epoch": 0.5530986158175866, + "grad_norm": 0.44548937678337097, + "learning_rate": 0.0001, + "loss": 1.6889, + "step": 4815 + }, + { + "epoch": 0.5532134857274137, + "grad_norm": 0.416087806224823, + "learning_rate": 0.0001, + "loss": 1.62, + "step": 4816 + }, + { + "epoch": 0.5533283556372408, + "grad_norm": 0.4004303216934204, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 4817 + }, + { + "epoch": 0.5534432255470679, + "grad_norm": 0.46140408515930176, + "learning_rate": 0.0001, + "loss": 1.8474, + "step": 4818 + }, + { + "epoch": 0.553558095456895, + "grad_norm": 0.41780680418014526, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 4819 + }, + { + "epoch": 0.5536729653667222, + "grad_norm": 0.4246324598789215, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 4820 + }, + { + "epoch": 0.5537878352765493, + "grad_norm": 0.4020591080188751, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 4821 + }, + { + "epoch": 0.5539027051863764, + "grad_norm": 0.4175722301006317, + "learning_rate": 0.0001, + "loss": 1.6808, + "step": 4822 + }, + { + "epoch": 0.5540175750962035, + "grad_norm": 0.41285720467567444, + "learning_rate": 0.0001, + "loss": 1.7206, + "step": 4823 + }, + { + "epoch": 0.5541324450060306, + "grad_norm": 0.3958023190498352, + "learning_rate": 0.0001, + "loss": 1.4307, + "step": 4824 + }, + { + "epoch": 0.5542473149158578, + "grad_norm": 0.43072789907455444, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 4825 + }, + { + "epoch": 0.5543621848256849, + "grad_norm": 0.4155644178390503, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 4826 + }, + { + "epoch": 0.554477054735512, + "grad_norm": 0.4563823640346527, + "learning_rate": 0.0001, + "loss": 1.503, + "step": 4827 + }, + { + "epoch": 0.5545919246453391, + "grad_norm": 0.40075716376304626, + "learning_rate": 0.0001, + "loss": 1.4557, + "step": 4828 + }, + { + "epoch": 0.5547067945551662, + "grad_norm": 0.3869268000125885, + "learning_rate": 0.0001, + "loss": 1.375, + "step": 4829 + }, + { + "epoch": 0.5548216644649934, + "grad_norm": 0.4496128559112549, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 4830 + }, + { + "epoch": 0.5549365343748205, + "grad_norm": 0.42789193987846375, + "learning_rate": 0.0001, + "loss": 1.4755, + "step": 4831 + }, + { + "epoch": 0.5550514042846476, + "grad_norm": 0.43896588683128357, + "learning_rate": 0.0001, + "loss": 1.861, + "step": 4832 + }, + { + "epoch": 0.5551662741944747, + "grad_norm": 0.42781296372413635, + "learning_rate": 0.0001, + "loss": 1.4889, + "step": 4833 + }, + { + "epoch": 0.5552811441043018, + "grad_norm": 0.40033093094825745, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 4834 + }, + { + "epoch": 0.555396014014129, + "grad_norm": 0.43149489164352417, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 4835 + }, + { + "epoch": 0.5555108839239561, + "grad_norm": 0.4161946177482605, + "learning_rate": 0.0001, + "loss": 1.4264, + "step": 4836 + }, + { + "epoch": 0.5556257538337832, + "grad_norm": 0.45024940371513367, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 4837 + }, + { + "epoch": 0.5557406237436103, + "grad_norm": 0.48352208733558655, + "learning_rate": 0.0001, + "loss": 1.7129, + "step": 4838 + }, + { + "epoch": 0.5558554936534374, + "grad_norm": 0.40999725461006165, + "learning_rate": 0.0001, + "loss": 1.6054, + "step": 4839 + }, + { + "epoch": 0.5559703635632646, + "grad_norm": 0.43245741724967957, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 4840 + }, + { + "epoch": 0.5560852334730917, + "grad_norm": 0.4161476194858551, + "learning_rate": 0.0001, + "loss": 1.5808, + "step": 4841 + }, + { + "epoch": 0.5562001033829188, + "grad_norm": 0.39406320452690125, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 4842 + }, + { + "epoch": 0.5563149732927459, + "grad_norm": 0.43421339988708496, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 4843 + }, + { + "epoch": 0.556429843202573, + "grad_norm": 0.4208906888961792, + "learning_rate": 0.0001, + "loss": 1.4881, + "step": 4844 + }, + { + "epoch": 0.5565447131124002, + "grad_norm": 0.43722692131996155, + "learning_rate": 0.0001, + "loss": 1.7109, + "step": 4845 + }, + { + "epoch": 0.5566595830222273, + "grad_norm": 0.4307270348072052, + "learning_rate": 0.0001, + "loss": 1.7208, + "step": 4846 + }, + { + "epoch": 0.5567744529320544, + "grad_norm": 0.44914358854293823, + "learning_rate": 0.0001, + "loss": 1.5319, + "step": 4847 + }, + { + "epoch": 0.5568893228418815, + "grad_norm": 0.4444207549095154, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 4848 + }, + { + "epoch": 0.5570041927517086, + "grad_norm": 0.46169236302375793, + "learning_rate": 0.0001, + "loss": 1.7459, + "step": 4849 + }, + { + "epoch": 0.5571190626615358, + "grad_norm": 0.45408234000205994, + "learning_rate": 0.0001, + "loss": 1.6744, + "step": 4850 + }, + { + "epoch": 0.5572339325713629, + "grad_norm": 0.4497997760772705, + "learning_rate": 0.0001, + "loss": 1.7392, + "step": 4851 + }, + { + "epoch": 0.55734880248119, + "grad_norm": 0.4248064160346985, + "learning_rate": 0.0001, + "loss": 1.4848, + "step": 4852 + }, + { + "epoch": 0.5574636723910171, + "grad_norm": 0.436135470867157, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 4853 + }, + { + "epoch": 0.5575785423008442, + "grad_norm": 0.4491892158985138, + "learning_rate": 0.0001, + "loss": 1.7014, + "step": 4854 + }, + { + "epoch": 0.5576934122106714, + "grad_norm": 0.395537793636322, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 4855 + }, + { + "epoch": 0.5578082821204985, + "grad_norm": 0.46674367785453796, + "learning_rate": 0.0001, + "loss": 1.6606, + "step": 4856 + }, + { + "epoch": 0.5579231520303256, + "grad_norm": 0.5027632713317871, + "learning_rate": 0.0001, + "loss": 1.4204, + "step": 4857 + }, + { + "epoch": 0.5580380219401527, + "grad_norm": 0.39720940589904785, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 4858 + }, + { + "epoch": 0.5581528918499798, + "grad_norm": 0.4141447842121124, + "learning_rate": 0.0001, + "loss": 1.5265, + "step": 4859 + }, + { + "epoch": 0.558267761759807, + "grad_norm": 0.4240556061267853, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 4860 + }, + { + "epoch": 0.5583826316696341, + "grad_norm": 0.47687891125679016, + "learning_rate": 0.0001, + "loss": 1.9204, + "step": 4861 + }, + { + "epoch": 0.5584975015794612, + "grad_norm": 0.4583745002746582, + "learning_rate": 0.0001, + "loss": 1.5725, + "step": 4862 + }, + { + "epoch": 0.5586123714892883, + "grad_norm": 0.43266603350639343, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 4863 + }, + { + "epoch": 0.5587272413991154, + "grad_norm": 0.415556401014328, + "learning_rate": 0.0001, + "loss": 1.5649, + "step": 4864 + }, + { + "epoch": 0.5588421113089426, + "grad_norm": 0.4445532262325287, + "learning_rate": 0.0001, + "loss": 1.7728, + "step": 4865 + }, + { + "epoch": 0.5589569812187697, + "grad_norm": 0.43825092911720276, + "learning_rate": 0.0001, + "loss": 1.7384, + "step": 4866 + }, + { + "epoch": 0.5590718511285969, + "grad_norm": 0.4609036147594452, + "learning_rate": 0.0001, + "loss": 1.4781, + "step": 4867 + }, + { + "epoch": 0.559186721038424, + "grad_norm": 0.44946613907814026, + "learning_rate": 0.0001, + "loss": 1.6208, + "step": 4868 + }, + { + "epoch": 0.5593015909482512, + "grad_norm": 0.44441309571266174, + "learning_rate": 0.0001, + "loss": 1.7457, + "step": 4869 + }, + { + "epoch": 0.5594164608580783, + "grad_norm": 0.42556867003440857, + "learning_rate": 0.0001, + "loss": 1.7197, + "step": 4870 + }, + { + "epoch": 0.5595313307679054, + "grad_norm": 0.47983217239379883, + "learning_rate": 0.0001, + "loss": 1.8489, + "step": 4871 + }, + { + "epoch": 0.5596462006777325, + "grad_norm": 0.46023857593536377, + "learning_rate": 0.0001, + "loss": 1.7739, + "step": 4872 + }, + { + "epoch": 0.5597610705875596, + "grad_norm": 0.4268305003643036, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 4873 + }, + { + "epoch": 0.5598759404973868, + "grad_norm": 0.43025729060173035, + "learning_rate": 0.0001, + "loss": 1.3838, + "step": 4874 + }, + { + "epoch": 0.5599908104072139, + "grad_norm": 0.4198823869228363, + "learning_rate": 0.0001, + "loss": 1.5685, + "step": 4875 + }, + { + "epoch": 0.560105680317041, + "grad_norm": 0.39771756529808044, + "learning_rate": 0.0001, + "loss": 1.3502, + "step": 4876 + }, + { + "epoch": 0.5602205502268681, + "grad_norm": 0.42094194889068604, + "learning_rate": 0.0001, + "loss": 1.7735, + "step": 4877 + }, + { + "epoch": 0.5603354201366952, + "grad_norm": 0.4145548343658447, + "learning_rate": 0.0001, + "loss": 1.535, + "step": 4878 + }, + { + "epoch": 0.5604502900465224, + "grad_norm": 0.4257977306842804, + "learning_rate": 0.0001, + "loss": 1.7172, + "step": 4879 + }, + { + "epoch": 0.5605651599563495, + "grad_norm": 0.43406781554222107, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 4880 + }, + { + "epoch": 0.5606800298661766, + "grad_norm": 0.4150993525981903, + "learning_rate": 0.0001, + "loss": 1.5706, + "step": 4881 + }, + { + "epoch": 0.5607948997760037, + "grad_norm": 0.4342502951622009, + "learning_rate": 0.0001, + "loss": 1.7698, + "step": 4882 + }, + { + "epoch": 0.5609097696858308, + "grad_norm": 0.46102142333984375, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 4883 + }, + { + "epoch": 0.561024639595658, + "grad_norm": 0.42644503712654114, + "learning_rate": 0.0001, + "loss": 1.7872, + "step": 4884 + }, + { + "epoch": 0.5611395095054851, + "grad_norm": 0.4223010241985321, + "learning_rate": 0.0001, + "loss": 1.6453, + "step": 4885 + }, + { + "epoch": 0.5612543794153122, + "grad_norm": 0.4439496695995331, + "learning_rate": 0.0001, + "loss": 1.6689, + "step": 4886 + }, + { + "epoch": 0.5613692493251393, + "grad_norm": 0.4849221408367157, + "learning_rate": 0.0001, + "loss": 1.7259, + "step": 4887 + }, + { + "epoch": 0.5614841192349664, + "grad_norm": 0.4383675158023834, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 4888 + }, + { + "epoch": 0.5615989891447936, + "grad_norm": 0.4355122745037079, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 4889 + }, + { + "epoch": 0.5617138590546207, + "grad_norm": 0.4068525731563568, + "learning_rate": 0.0001, + "loss": 1.3924, + "step": 4890 + }, + { + "epoch": 0.5618287289644478, + "grad_norm": 0.4126929044723511, + "learning_rate": 0.0001, + "loss": 1.4923, + "step": 4891 + }, + { + "epoch": 0.5619435988742749, + "grad_norm": 0.43039217591285706, + "learning_rate": 0.0001, + "loss": 1.5134, + "step": 4892 + }, + { + "epoch": 0.562058468784102, + "grad_norm": 0.4148006737232208, + "learning_rate": 0.0001, + "loss": 1.4452, + "step": 4893 + }, + { + "epoch": 0.5621733386939292, + "grad_norm": 0.467264860868454, + "learning_rate": 0.0001, + "loss": 1.7233, + "step": 4894 + }, + { + "epoch": 0.5622882086037563, + "grad_norm": 0.43316879868507385, + "learning_rate": 0.0001, + "loss": 1.6064, + "step": 4895 + }, + { + "epoch": 0.5624030785135834, + "grad_norm": 0.4467410147190094, + "learning_rate": 0.0001, + "loss": 1.7144, + "step": 4896 + }, + { + "epoch": 0.5625179484234105, + "grad_norm": 0.45276862382888794, + "learning_rate": 0.0001, + "loss": 1.7343, + "step": 4897 + }, + { + "epoch": 0.5626328183332376, + "grad_norm": 0.4133438169956207, + "learning_rate": 0.0001, + "loss": 1.6185, + "step": 4898 + }, + { + "epoch": 0.5627476882430648, + "grad_norm": 0.4188348352909088, + "learning_rate": 0.0001, + "loss": 1.5476, + "step": 4899 + }, + { + "epoch": 0.5628625581528919, + "grad_norm": 0.44320085644721985, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 4900 + }, + { + "epoch": 0.562977428062719, + "grad_norm": 0.43440455198287964, + "learning_rate": 0.0001, + "loss": 1.6843, + "step": 4901 + }, + { + "epoch": 0.5630922979725461, + "grad_norm": 0.45006540417671204, + "learning_rate": 0.0001, + "loss": 1.5807, + "step": 4902 + }, + { + "epoch": 0.5632071678823732, + "grad_norm": 0.4506426453590393, + "learning_rate": 0.0001, + "loss": 1.6472, + "step": 4903 + }, + { + "epoch": 0.5633220377922004, + "grad_norm": 0.43011045455932617, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 4904 + }, + { + "epoch": 0.5634369077020275, + "grad_norm": 0.4250010848045349, + "learning_rate": 0.0001, + "loss": 1.5558, + "step": 4905 + }, + { + "epoch": 0.5635517776118546, + "grad_norm": 0.4180524945259094, + "learning_rate": 0.0001, + "loss": 1.5528, + "step": 4906 + }, + { + "epoch": 0.5636666475216817, + "grad_norm": 0.4589906334877014, + "learning_rate": 0.0001, + "loss": 1.69, + "step": 4907 + }, + { + "epoch": 0.5637815174315088, + "grad_norm": 0.40024542808532715, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 4908 + }, + { + "epoch": 0.563896387341336, + "grad_norm": 0.4209577739238739, + "learning_rate": 0.0001, + "loss": 1.7709, + "step": 4909 + }, + { + "epoch": 0.5640112572511631, + "grad_norm": 0.42181211709976196, + "learning_rate": 0.0001, + "loss": 1.6891, + "step": 4910 + }, + { + "epoch": 0.5641261271609902, + "grad_norm": 0.44479602575302124, + "learning_rate": 0.0001, + "loss": 1.8052, + "step": 4911 + }, + { + "epoch": 0.5642409970708173, + "grad_norm": 0.41980117559432983, + "learning_rate": 0.0001, + "loss": 1.7223, + "step": 4912 + }, + { + "epoch": 0.5643558669806444, + "grad_norm": 0.45046502351760864, + "learning_rate": 0.0001, + "loss": 1.6636, + "step": 4913 + }, + { + "epoch": 0.5644707368904716, + "grad_norm": 0.4410959482192993, + "learning_rate": 0.0001, + "loss": 1.751, + "step": 4914 + }, + { + "epoch": 0.5645856068002987, + "grad_norm": 0.4019821286201477, + "learning_rate": 0.0001, + "loss": 1.6362, + "step": 4915 + }, + { + "epoch": 0.5647004767101258, + "grad_norm": 0.4647962152957916, + "learning_rate": 0.0001, + "loss": 1.7969, + "step": 4916 + }, + { + "epoch": 0.5648153466199529, + "grad_norm": 0.4058244824409485, + "learning_rate": 0.0001, + "loss": 1.723, + "step": 4917 + }, + { + "epoch": 0.56493021652978, + "grad_norm": 0.41798049211502075, + "learning_rate": 0.0001, + "loss": 1.5339, + "step": 4918 + }, + { + "epoch": 0.5650450864396072, + "grad_norm": 0.39506152272224426, + "learning_rate": 0.0001, + "loss": 1.2809, + "step": 4919 + }, + { + "epoch": 0.5651599563494343, + "grad_norm": 0.41670095920562744, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 4920 + }, + { + "epoch": 0.5652748262592614, + "grad_norm": 0.41250118613243103, + "learning_rate": 0.0001, + "loss": 1.5185, + "step": 4921 + }, + { + "epoch": 0.5653896961690885, + "grad_norm": 0.4120524227619171, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 4922 + }, + { + "epoch": 0.5655045660789156, + "grad_norm": 0.3931626081466675, + "learning_rate": 0.0001, + "loss": 1.4581, + "step": 4923 + }, + { + "epoch": 0.5656194359887428, + "grad_norm": 0.42506247758865356, + "learning_rate": 0.0001, + "loss": 1.6842, + "step": 4924 + }, + { + "epoch": 0.5657343058985699, + "grad_norm": 0.4446874260902405, + "learning_rate": 0.0001, + "loss": 1.6743, + "step": 4925 + }, + { + "epoch": 0.565849175808397, + "grad_norm": 0.3943353295326233, + "learning_rate": 0.0001, + "loss": 1.5795, + "step": 4926 + }, + { + "epoch": 0.5659640457182241, + "grad_norm": 0.41571202874183655, + "learning_rate": 0.0001, + "loss": 1.3642, + "step": 4927 + }, + { + "epoch": 0.5660789156280512, + "grad_norm": 0.4377326965332031, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 4928 + }, + { + "epoch": 0.5661937855378784, + "grad_norm": 0.4330956041812897, + "learning_rate": 0.0001, + "loss": 1.6974, + "step": 4929 + }, + { + "epoch": 0.5663086554477055, + "grad_norm": 0.4336974024772644, + "learning_rate": 0.0001, + "loss": 1.6062, + "step": 4930 + }, + { + "epoch": 0.5664235253575326, + "grad_norm": 0.43469882011413574, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 4931 + }, + { + "epoch": 0.5665383952673597, + "grad_norm": 0.42299988865852356, + "learning_rate": 0.0001, + "loss": 1.6504, + "step": 4932 + }, + { + "epoch": 0.5666532651771868, + "grad_norm": 0.4548715651035309, + "learning_rate": 0.0001, + "loss": 1.771, + "step": 4933 + }, + { + "epoch": 0.566768135087014, + "grad_norm": 0.45423921942710876, + "learning_rate": 0.0001, + "loss": 1.7278, + "step": 4934 + }, + { + "epoch": 0.5668830049968411, + "grad_norm": 0.4271089434623718, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 4935 + }, + { + "epoch": 0.5669978749066682, + "grad_norm": 0.4774452745914459, + "learning_rate": 0.0001, + "loss": 1.7197, + "step": 4936 + }, + { + "epoch": 0.5671127448164953, + "grad_norm": 0.4133590757846832, + "learning_rate": 0.0001, + "loss": 1.5688, + "step": 4937 + }, + { + "epoch": 0.5672276147263224, + "grad_norm": 0.4909397065639496, + "learning_rate": 0.0001, + "loss": 1.7266, + "step": 4938 + }, + { + "epoch": 0.5673424846361496, + "grad_norm": 0.47322049736976624, + "learning_rate": 0.0001, + "loss": 1.5714, + "step": 4939 + }, + { + "epoch": 0.5674573545459767, + "grad_norm": 0.42633056640625, + "learning_rate": 0.0001, + "loss": 1.5971, + "step": 4940 + }, + { + "epoch": 0.5675722244558038, + "grad_norm": 0.43367111682891846, + "learning_rate": 0.0001, + "loss": 1.5723, + "step": 4941 + }, + { + "epoch": 0.5676870943656309, + "grad_norm": 0.4305420219898224, + "learning_rate": 0.0001, + "loss": 1.6753, + "step": 4942 + }, + { + "epoch": 0.567801964275458, + "grad_norm": 0.4496208727359772, + "learning_rate": 0.0001, + "loss": 1.8287, + "step": 4943 + }, + { + "epoch": 0.5679168341852852, + "grad_norm": 0.4456053376197815, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 4944 + }, + { + "epoch": 0.5680317040951123, + "grad_norm": 0.41399866342544556, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 4945 + }, + { + "epoch": 0.5681465740049394, + "grad_norm": 0.41621148586273193, + "learning_rate": 0.0001, + "loss": 1.5602, + "step": 4946 + }, + { + "epoch": 0.5682614439147665, + "grad_norm": 0.4587562084197998, + "learning_rate": 0.0001, + "loss": 1.8734, + "step": 4947 + }, + { + "epoch": 0.5683763138245936, + "grad_norm": 0.4021860361099243, + "learning_rate": 0.0001, + "loss": 1.3734, + "step": 4948 + }, + { + "epoch": 0.5684911837344208, + "grad_norm": 0.4463704228401184, + "learning_rate": 0.0001, + "loss": 1.8134, + "step": 4949 + }, + { + "epoch": 0.5686060536442479, + "grad_norm": 0.424570769071579, + "learning_rate": 0.0001, + "loss": 1.7018, + "step": 4950 + }, + { + "epoch": 0.568720923554075, + "grad_norm": 0.42182692885398865, + "learning_rate": 0.0001, + "loss": 1.4935, + "step": 4951 + }, + { + "epoch": 0.5688357934639021, + "grad_norm": 0.4009837806224823, + "learning_rate": 0.0001, + "loss": 1.5051, + "step": 4952 + }, + { + "epoch": 0.5689506633737292, + "grad_norm": 0.394553542137146, + "learning_rate": 0.0001, + "loss": 1.5031, + "step": 4953 + }, + { + "epoch": 0.5690655332835564, + "grad_norm": 0.4050164520740509, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 4954 + }, + { + "epoch": 0.5691804031933835, + "grad_norm": 0.4225456714630127, + "learning_rate": 0.0001, + "loss": 1.6712, + "step": 4955 + }, + { + "epoch": 0.5692952731032106, + "grad_norm": 0.4110758304595947, + "learning_rate": 0.0001, + "loss": 1.4407, + "step": 4956 + }, + { + "epoch": 0.5694101430130377, + "grad_norm": 0.434222549200058, + "learning_rate": 0.0001, + "loss": 1.6659, + "step": 4957 + }, + { + "epoch": 0.5695250129228648, + "grad_norm": 0.4162403345108032, + "learning_rate": 0.0001, + "loss": 1.6895, + "step": 4958 + }, + { + "epoch": 0.569639882832692, + "grad_norm": 0.43041545152664185, + "learning_rate": 0.0001, + "loss": 1.6472, + "step": 4959 + }, + { + "epoch": 0.5697547527425191, + "grad_norm": 0.4041105806827545, + "learning_rate": 0.0001, + "loss": 1.4989, + "step": 4960 + }, + { + "epoch": 0.5698696226523462, + "grad_norm": 0.4230792820453644, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 4961 + }, + { + "epoch": 0.5699844925621733, + "grad_norm": 0.4470837414264679, + "learning_rate": 0.0001, + "loss": 1.9244, + "step": 4962 + }, + { + "epoch": 0.5700993624720004, + "grad_norm": 0.40653735399246216, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 4963 + }, + { + "epoch": 0.5702142323818276, + "grad_norm": 0.42262402176856995, + "learning_rate": 0.0001, + "loss": 1.6871, + "step": 4964 + }, + { + "epoch": 0.5703291022916547, + "grad_norm": 0.4305260181427002, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 4965 + }, + { + "epoch": 0.5704439722014818, + "grad_norm": 0.4613392651081085, + "learning_rate": 0.0001, + "loss": 1.7875, + "step": 4966 + }, + { + "epoch": 0.5705588421113089, + "grad_norm": 0.446806937456131, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 4967 + }, + { + "epoch": 0.570673712021136, + "grad_norm": 0.40673378109931946, + "learning_rate": 0.0001, + "loss": 1.5248, + "step": 4968 + }, + { + "epoch": 0.5707885819309632, + "grad_norm": 0.46812543272972107, + "learning_rate": 0.0001, + "loss": 1.5529, + "step": 4969 + }, + { + "epoch": 0.5709034518407903, + "grad_norm": 0.406392902135849, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 4970 + }, + { + "epoch": 0.5710183217506174, + "grad_norm": 0.4076795279979706, + "learning_rate": 0.0001, + "loss": 1.4139, + "step": 4971 + }, + { + "epoch": 0.5711331916604445, + "grad_norm": 0.4175947308540344, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 4972 + }, + { + "epoch": 0.5712480615702716, + "grad_norm": 0.4206025004386902, + "learning_rate": 0.0001, + "loss": 1.7126, + "step": 4973 + }, + { + "epoch": 0.5713629314800988, + "grad_norm": 0.41030603647232056, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 4974 + }, + { + "epoch": 0.5714778013899259, + "grad_norm": 0.42624354362487793, + "learning_rate": 0.0001, + "loss": 1.5886, + "step": 4975 + }, + { + "epoch": 0.571592671299753, + "grad_norm": 0.39379024505615234, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 4976 + }, + { + "epoch": 0.5717075412095801, + "grad_norm": 0.42009881138801575, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 4977 + }, + { + "epoch": 0.5718224111194072, + "grad_norm": 0.4457103908061981, + "learning_rate": 0.0001, + "loss": 1.4205, + "step": 4978 + }, + { + "epoch": 0.5719372810292344, + "grad_norm": 0.4116220474243164, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 4979 + }, + { + "epoch": 0.5720521509390615, + "grad_norm": 0.44594505429267883, + "learning_rate": 0.0001, + "loss": 1.5149, + "step": 4980 + }, + { + "epoch": 0.5721670208488886, + "grad_norm": 0.4323948919773102, + "learning_rate": 0.0001, + "loss": 1.4976, + "step": 4981 + }, + { + "epoch": 0.5722818907587157, + "grad_norm": 0.38895127177238464, + "learning_rate": 0.0001, + "loss": 1.4075, + "step": 4982 + }, + { + "epoch": 0.5723967606685428, + "grad_norm": 0.4395938515663147, + "learning_rate": 0.0001, + "loss": 1.5436, + "step": 4983 + }, + { + "epoch": 0.57251163057837, + "grad_norm": 0.43774986267089844, + "learning_rate": 0.0001, + "loss": 1.4892, + "step": 4984 + }, + { + "epoch": 0.5726265004881971, + "grad_norm": 0.4183255136013031, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 4985 + }, + { + "epoch": 0.5727413703980242, + "grad_norm": 0.439797580242157, + "learning_rate": 0.0001, + "loss": 1.4632, + "step": 4986 + }, + { + "epoch": 0.5728562403078513, + "grad_norm": 0.413779616355896, + "learning_rate": 0.0001, + "loss": 1.3779, + "step": 4987 + }, + { + "epoch": 0.5729711102176784, + "grad_norm": 0.4428330361843109, + "learning_rate": 0.0001, + "loss": 1.7449, + "step": 4988 + }, + { + "epoch": 0.5730859801275056, + "grad_norm": 0.41953855752944946, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 4989 + }, + { + "epoch": 0.5732008500373327, + "grad_norm": 0.4438232183456421, + "learning_rate": 0.0001, + "loss": 1.7179, + "step": 4990 + }, + { + "epoch": 0.5733157199471598, + "grad_norm": 0.4678000807762146, + "learning_rate": 0.0001, + "loss": 1.8077, + "step": 4991 + }, + { + "epoch": 0.5734305898569869, + "grad_norm": 0.48187345266342163, + "learning_rate": 0.0001, + "loss": 1.7687, + "step": 4992 + }, + { + "epoch": 0.573545459766814, + "grad_norm": 0.4310360848903656, + "learning_rate": 0.0001, + "loss": 1.5992, + "step": 4993 + }, + { + "epoch": 0.5736603296766412, + "grad_norm": 0.4390687346458435, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 4994 + }, + { + "epoch": 0.5737751995864683, + "grad_norm": 0.44566598534584045, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 4995 + }, + { + "epoch": 0.5738900694962954, + "grad_norm": 0.399006724357605, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 4996 + }, + { + "epoch": 0.5740049394061225, + "grad_norm": 0.42733508348464966, + "learning_rate": 0.0001, + "loss": 1.7704, + "step": 4997 + }, + { + "epoch": 0.5741198093159496, + "grad_norm": 0.4273543655872345, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 4998 + }, + { + "epoch": 0.5742346792257768, + "grad_norm": 0.43150708079338074, + "learning_rate": 0.0001, + "loss": 1.7348, + "step": 4999 + }, + { + "epoch": 0.5743495491356039, + "grad_norm": 0.4143429398536682, + "learning_rate": 0.0001, + "loss": 1.5195, + "step": 5000 + }, + { + "epoch": 0.574464419045431, + "grad_norm": 0.41837653517723083, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 5001 + }, + { + "epoch": 0.5745792889552581, + "grad_norm": 0.4500226080417633, + "learning_rate": 0.0001, + "loss": 1.8321, + "step": 5002 + }, + { + "epoch": 0.5746941588650852, + "grad_norm": 0.43939411640167236, + "learning_rate": 0.0001, + "loss": 1.7294, + "step": 5003 + }, + { + "epoch": 0.5748090287749125, + "grad_norm": 0.4168783128261566, + "learning_rate": 0.0001, + "loss": 1.7016, + "step": 5004 + }, + { + "epoch": 0.5749238986847396, + "grad_norm": 0.4361497759819031, + "learning_rate": 0.0001, + "loss": 1.7038, + "step": 5005 + }, + { + "epoch": 0.5750387685945667, + "grad_norm": 0.4564398527145386, + "learning_rate": 0.0001, + "loss": 1.738, + "step": 5006 + }, + { + "epoch": 0.5751536385043938, + "grad_norm": 0.4513961672782898, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 5007 + }, + { + "epoch": 0.575268508414221, + "grad_norm": 0.4509972631931305, + "learning_rate": 0.0001, + "loss": 1.6103, + "step": 5008 + }, + { + "epoch": 0.5753833783240481, + "grad_norm": 0.42755764722824097, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 5009 + }, + { + "epoch": 0.5754982482338752, + "grad_norm": 0.4352383017539978, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 5010 + }, + { + "epoch": 0.5756131181437023, + "grad_norm": 0.4069184362888336, + "learning_rate": 0.0001, + "loss": 1.5665, + "step": 5011 + }, + { + "epoch": 0.5757279880535294, + "grad_norm": 0.4504094421863556, + "learning_rate": 0.0001, + "loss": 1.7465, + "step": 5012 + }, + { + "epoch": 0.5758428579633565, + "grad_norm": 0.4137848913669586, + "learning_rate": 0.0001, + "loss": 1.419, + "step": 5013 + }, + { + "epoch": 0.5759577278731837, + "grad_norm": 0.43577203154563904, + "learning_rate": 0.0001, + "loss": 1.8129, + "step": 5014 + }, + { + "epoch": 0.5760725977830108, + "grad_norm": 0.45914286375045776, + "learning_rate": 0.0001, + "loss": 1.8664, + "step": 5015 + }, + { + "epoch": 0.5761874676928379, + "grad_norm": 0.4018266499042511, + "learning_rate": 0.0001, + "loss": 1.4698, + "step": 5016 + }, + { + "epoch": 0.576302337602665, + "grad_norm": 0.4395134449005127, + "learning_rate": 0.0001, + "loss": 1.7316, + "step": 5017 + }, + { + "epoch": 0.5764172075124921, + "grad_norm": 0.42709699273109436, + "learning_rate": 0.0001, + "loss": 1.7313, + "step": 5018 + }, + { + "epoch": 0.5765320774223193, + "grad_norm": 0.40391069650650024, + "learning_rate": 0.0001, + "loss": 1.6898, + "step": 5019 + }, + { + "epoch": 0.5766469473321464, + "grad_norm": 0.4373941421508789, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 5020 + }, + { + "epoch": 0.5767618172419735, + "grad_norm": 0.4111486077308655, + "learning_rate": 0.0001, + "loss": 1.5335, + "step": 5021 + }, + { + "epoch": 0.5768766871518006, + "grad_norm": 0.44314584136009216, + "learning_rate": 0.0001, + "loss": 1.7859, + "step": 5022 + }, + { + "epoch": 0.5769915570616277, + "grad_norm": 0.38819631934165955, + "learning_rate": 0.0001, + "loss": 1.3416, + "step": 5023 + }, + { + "epoch": 0.5771064269714549, + "grad_norm": 0.4158842861652374, + "learning_rate": 0.0001, + "loss": 1.7234, + "step": 5024 + }, + { + "epoch": 0.577221296881282, + "grad_norm": 0.41062501072883606, + "learning_rate": 0.0001, + "loss": 1.51, + "step": 5025 + }, + { + "epoch": 0.5773361667911091, + "grad_norm": 0.4440518617630005, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 5026 + }, + { + "epoch": 0.5774510367009362, + "grad_norm": 0.3918763995170593, + "learning_rate": 0.0001, + "loss": 1.5668, + "step": 5027 + }, + { + "epoch": 0.5775659066107633, + "grad_norm": 0.4936619997024536, + "learning_rate": 0.0001, + "loss": 1.9151, + "step": 5028 + }, + { + "epoch": 0.5776807765205905, + "grad_norm": 0.4350280463695526, + "learning_rate": 0.0001, + "loss": 1.534, + "step": 5029 + }, + { + "epoch": 0.5777956464304176, + "grad_norm": 0.41786620020866394, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 5030 + }, + { + "epoch": 0.5779105163402447, + "grad_norm": 0.44200530648231506, + "learning_rate": 0.0001, + "loss": 1.8449, + "step": 5031 + }, + { + "epoch": 0.5780253862500718, + "grad_norm": 0.49857649207115173, + "learning_rate": 0.0001, + "loss": 1.7837, + "step": 5032 + }, + { + "epoch": 0.578140256159899, + "grad_norm": 0.40374937653541565, + "learning_rate": 0.0001, + "loss": 1.3166, + "step": 5033 + }, + { + "epoch": 0.5782551260697261, + "grad_norm": 0.425289124250412, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 5034 + }, + { + "epoch": 0.5783699959795532, + "grad_norm": 0.42704570293426514, + "learning_rate": 0.0001, + "loss": 1.6993, + "step": 5035 + }, + { + "epoch": 0.5784848658893803, + "grad_norm": 0.46401023864746094, + "learning_rate": 0.0001, + "loss": 1.7889, + "step": 5036 + }, + { + "epoch": 0.5785997357992074, + "grad_norm": 0.45945799350738525, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 5037 + }, + { + "epoch": 0.5787146057090345, + "grad_norm": 0.4467889070510864, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 5038 + }, + { + "epoch": 0.5788294756188617, + "grad_norm": 0.418954998254776, + "learning_rate": 0.0001, + "loss": 1.5479, + "step": 5039 + }, + { + "epoch": 0.5789443455286888, + "grad_norm": 0.4095263183116913, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 5040 + }, + { + "epoch": 0.5790592154385159, + "grad_norm": 0.39436352252960205, + "learning_rate": 0.0001, + "loss": 1.3867, + "step": 5041 + }, + { + "epoch": 0.579174085348343, + "grad_norm": 0.43269625306129456, + "learning_rate": 0.0001, + "loss": 1.686, + "step": 5042 + }, + { + "epoch": 0.5792889552581701, + "grad_norm": 0.4262961745262146, + "learning_rate": 0.0001, + "loss": 1.7065, + "step": 5043 + }, + { + "epoch": 0.5794038251679973, + "grad_norm": 0.42846837639808655, + "learning_rate": 0.0001, + "loss": 1.5565, + "step": 5044 + }, + { + "epoch": 0.5795186950778244, + "grad_norm": 0.4515441656112671, + "learning_rate": 0.0001, + "loss": 1.7085, + "step": 5045 + }, + { + "epoch": 0.5796335649876515, + "grad_norm": 0.4548323452472687, + "learning_rate": 0.0001, + "loss": 1.9127, + "step": 5046 + }, + { + "epoch": 0.5797484348974786, + "grad_norm": 0.4489242136478424, + "learning_rate": 0.0001, + "loss": 1.7654, + "step": 5047 + }, + { + "epoch": 0.5798633048073057, + "grad_norm": 0.4277878403663635, + "learning_rate": 0.0001, + "loss": 1.7829, + "step": 5048 + }, + { + "epoch": 0.5799781747171329, + "grad_norm": 0.4465863108634949, + "learning_rate": 0.0001, + "loss": 1.4391, + "step": 5049 + }, + { + "epoch": 0.58009304462696, + "grad_norm": 0.4445357322692871, + "learning_rate": 0.0001, + "loss": 1.5621, + "step": 5050 + }, + { + "epoch": 0.5802079145367871, + "grad_norm": 0.41989558935165405, + "learning_rate": 0.0001, + "loss": 1.5528, + "step": 5051 + }, + { + "epoch": 0.5803227844466142, + "grad_norm": 0.4065076410770416, + "learning_rate": 0.0001, + "loss": 1.5414, + "step": 5052 + }, + { + "epoch": 0.5804376543564413, + "grad_norm": 0.42565080523490906, + "learning_rate": 0.0001, + "loss": 1.6824, + "step": 5053 + }, + { + "epoch": 0.5805525242662685, + "grad_norm": 0.4369344115257263, + "learning_rate": 0.0001, + "loss": 1.6905, + "step": 5054 + }, + { + "epoch": 0.5806673941760956, + "grad_norm": 0.4437665343284607, + "learning_rate": 0.0001, + "loss": 1.702, + "step": 5055 + }, + { + "epoch": 0.5807822640859227, + "grad_norm": 0.4107830822467804, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 5056 + }, + { + "epoch": 0.5808971339957498, + "grad_norm": 0.4161568582057953, + "learning_rate": 0.0001, + "loss": 1.6756, + "step": 5057 + }, + { + "epoch": 0.581012003905577, + "grad_norm": 0.4613957405090332, + "learning_rate": 0.0001, + "loss": 1.7078, + "step": 5058 + }, + { + "epoch": 0.5811268738154041, + "grad_norm": 0.44085410237312317, + "learning_rate": 0.0001, + "loss": 1.7637, + "step": 5059 + }, + { + "epoch": 0.5812417437252312, + "grad_norm": 0.4186076819896698, + "learning_rate": 0.0001, + "loss": 1.571, + "step": 5060 + }, + { + "epoch": 0.5813566136350583, + "grad_norm": 0.4560279846191406, + "learning_rate": 0.0001, + "loss": 1.7631, + "step": 5061 + }, + { + "epoch": 0.5814714835448854, + "grad_norm": 0.44726184010505676, + "learning_rate": 0.0001, + "loss": 1.8003, + "step": 5062 + }, + { + "epoch": 0.5815863534547125, + "grad_norm": 0.4400385320186615, + "learning_rate": 0.0001, + "loss": 1.7035, + "step": 5063 + }, + { + "epoch": 0.5817012233645397, + "grad_norm": 0.42711982131004333, + "learning_rate": 0.0001, + "loss": 1.5343, + "step": 5064 + }, + { + "epoch": 0.5818160932743668, + "grad_norm": 0.40731683373451233, + "learning_rate": 0.0001, + "loss": 1.715, + "step": 5065 + }, + { + "epoch": 0.5819309631841939, + "grad_norm": 0.4120251536369324, + "learning_rate": 0.0001, + "loss": 1.4904, + "step": 5066 + }, + { + "epoch": 0.582045833094021, + "grad_norm": 0.4294714033603668, + "learning_rate": 0.0001, + "loss": 1.7013, + "step": 5067 + }, + { + "epoch": 0.5821607030038481, + "grad_norm": 0.4543513059616089, + "learning_rate": 0.0001, + "loss": 1.7307, + "step": 5068 + }, + { + "epoch": 0.5822755729136753, + "grad_norm": 0.4606270492076874, + "learning_rate": 0.0001, + "loss": 1.7362, + "step": 5069 + }, + { + "epoch": 0.5823904428235024, + "grad_norm": 0.47500404715538025, + "learning_rate": 0.0001, + "loss": 1.7057, + "step": 5070 + }, + { + "epoch": 0.5825053127333295, + "grad_norm": 0.39924708008766174, + "learning_rate": 0.0001, + "loss": 1.4902, + "step": 5071 + }, + { + "epoch": 0.5826201826431566, + "grad_norm": 0.4525192379951477, + "learning_rate": 0.0001, + "loss": 1.5355, + "step": 5072 + }, + { + "epoch": 0.5827350525529837, + "grad_norm": 0.462971568107605, + "learning_rate": 0.0001, + "loss": 1.5256, + "step": 5073 + }, + { + "epoch": 0.5828499224628109, + "grad_norm": 0.3806494474411011, + "learning_rate": 0.0001, + "loss": 1.3727, + "step": 5074 + }, + { + "epoch": 0.582964792372638, + "grad_norm": 0.4327925741672516, + "learning_rate": 0.0001, + "loss": 1.5462, + "step": 5075 + }, + { + "epoch": 0.5830796622824651, + "grad_norm": 0.39075303077697754, + "learning_rate": 0.0001, + "loss": 1.2952, + "step": 5076 + }, + { + "epoch": 0.5831945321922922, + "grad_norm": 0.4309884011745453, + "learning_rate": 0.0001, + "loss": 1.5131, + "step": 5077 + }, + { + "epoch": 0.5833094021021193, + "grad_norm": 0.4743507206439972, + "learning_rate": 0.0001, + "loss": 1.7537, + "step": 5078 + }, + { + "epoch": 0.5834242720119465, + "grad_norm": 0.39666301012039185, + "learning_rate": 0.0001, + "loss": 1.4356, + "step": 5079 + }, + { + "epoch": 0.5835391419217736, + "grad_norm": 0.431244432926178, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 5080 + }, + { + "epoch": 0.5836540118316007, + "grad_norm": 0.4098447859287262, + "learning_rate": 0.0001, + "loss": 1.4811, + "step": 5081 + }, + { + "epoch": 0.5837688817414278, + "grad_norm": 0.44986143708229065, + "learning_rate": 0.0001, + "loss": 1.6683, + "step": 5082 + }, + { + "epoch": 0.583883751651255, + "grad_norm": 0.40975865721702576, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 5083 + }, + { + "epoch": 0.5839986215610821, + "grad_norm": 0.42169931530952454, + "learning_rate": 0.0001, + "loss": 1.6373, + "step": 5084 + }, + { + "epoch": 0.5841134914709092, + "grad_norm": 0.4497613310813904, + "learning_rate": 0.0001, + "loss": 1.7151, + "step": 5085 + }, + { + "epoch": 0.5842283613807363, + "grad_norm": 0.4520888030529022, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 5086 + }, + { + "epoch": 0.5843432312905634, + "grad_norm": 0.42139726877212524, + "learning_rate": 0.0001, + "loss": 1.7561, + "step": 5087 + }, + { + "epoch": 0.5844581012003905, + "grad_norm": 0.4443289041519165, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 5088 + }, + { + "epoch": 0.5845729711102177, + "grad_norm": 0.3971986770629883, + "learning_rate": 0.0001, + "loss": 1.5313, + "step": 5089 + }, + { + "epoch": 0.5846878410200448, + "grad_norm": 0.4299372434616089, + "learning_rate": 0.0001, + "loss": 1.6881, + "step": 5090 + }, + { + "epoch": 0.5848027109298719, + "grad_norm": 0.40114906430244446, + "learning_rate": 0.0001, + "loss": 1.5197, + "step": 5091 + }, + { + "epoch": 0.584917580839699, + "grad_norm": 0.4438854157924652, + "learning_rate": 0.0001, + "loss": 1.6968, + "step": 5092 + }, + { + "epoch": 0.5850324507495261, + "grad_norm": 0.44349342584609985, + "learning_rate": 0.0001, + "loss": 1.6948, + "step": 5093 + }, + { + "epoch": 0.5851473206593533, + "grad_norm": 0.42675167322158813, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 5094 + }, + { + "epoch": 0.5852621905691804, + "grad_norm": 0.43637698888778687, + "learning_rate": 0.0001, + "loss": 1.7436, + "step": 5095 + }, + { + "epoch": 0.5853770604790075, + "grad_norm": 0.44595828652381897, + "learning_rate": 0.0001, + "loss": 1.7227, + "step": 5096 + }, + { + "epoch": 0.5854919303888346, + "grad_norm": 0.4193721413612366, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 5097 + }, + { + "epoch": 0.5856068002986617, + "grad_norm": 0.42000266909599304, + "learning_rate": 0.0001, + "loss": 1.6702, + "step": 5098 + }, + { + "epoch": 0.5857216702084889, + "grad_norm": 0.4284018874168396, + "learning_rate": 0.0001, + "loss": 1.6606, + "step": 5099 + }, + { + "epoch": 0.585836540118316, + "grad_norm": 0.40313681960105896, + "learning_rate": 0.0001, + "loss": 1.3916, + "step": 5100 + }, + { + "epoch": 0.5859514100281431, + "grad_norm": 0.4598637521266937, + "learning_rate": 0.0001, + "loss": 1.7123, + "step": 5101 + }, + { + "epoch": 0.5860662799379702, + "grad_norm": 0.45406660437583923, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 5102 + }, + { + "epoch": 0.5861811498477973, + "grad_norm": 0.40444087982177734, + "learning_rate": 0.0001, + "loss": 1.4982, + "step": 5103 + }, + { + "epoch": 0.5862960197576245, + "grad_norm": 0.4442274272441864, + "learning_rate": 0.0001, + "loss": 1.5654, + "step": 5104 + }, + { + "epoch": 0.5864108896674516, + "grad_norm": 0.43179601430892944, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 5105 + }, + { + "epoch": 0.5865257595772787, + "grad_norm": 0.5134614706039429, + "learning_rate": 0.0001, + "loss": 1.8153, + "step": 5106 + }, + { + "epoch": 0.5866406294871058, + "grad_norm": 0.4080960750579834, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 5107 + }, + { + "epoch": 0.586755499396933, + "grad_norm": 0.4044763743877411, + "learning_rate": 0.0001, + "loss": 1.5646, + "step": 5108 + }, + { + "epoch": 0.5868703693067601, + "grad_norm": 0.40748360753059387, + "learning_rate": 0.0001, + "loss": 1.3913, + "step": 5109 + }, + { + "epoch": 0.5869852392165872, + "grad_norm": 0.46773701906204224, + "learning_rate": 0.0001, + "loss": 1.724, + "step": 5110 + }, + { + "epoch": 0.5871001091264143, + "grad_norm": 0.46918338537216187, + "learning_rate": 0.0001, + "loss": 1.8717, + "step": 5111 + }, + { + "epoch": 0.5872149790362414, + "grad_norm": 0.45832809805870056, + "learning_rate": 0.0001, + "loss": 1.8843, + "step": 5112 + }, + { + "epoch": 0.5873298489460685, + "grad_norm": 0.4251342713832855, + "learning_rate": 0.0001, + "loss": 1.5424, + "step": 5113 + }, + { + "epoch": 0.5874447188558957, + "grad_norm": 0.4477440118789673, + "learning_rate": 0.0001, + "loss": 1.729, + "step": 5114 + }, + { + "epoch": 0.5875595887657228, + "grad_norm": 0.47431480884552, + "learning_rate": 0.0001, + "loss": 1.8087, + "step": 5115 + }, + { + "epoch": 0.5876744586755499, + "grad_norm": 0.4649442434310913, + "learning_rate": 0.0001, + "loss": 1.8614, + "step": 5116 + }, + { + "epoch": 0.587789328585377, + "grad_norm": 0.40479037165641785, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 5117 + }, + { + "epoch": 0.5879041984952041, + "grad_norm": 0.4071318805217743, + "learning_rate": 0.0001, + "loss": 1.6169, + "step": 5118 + }, + { + "epoch": 0.5880190684050313, + "grad_norm": 0.3959406316280365, + "learning_rate": 0.0001, + "loss": 1.4992, + "step": 5119 + }, + { + "epoch": 0.5881339383148584, + "grad_norm": 0.4508780241012573, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 5120 + }, + { + "epoch": 0.5882488082246855, + "grad_norm": 0.41295212507247925, + "learning_rate": 0.0001, + "loss": 1.4935, + "step": 5121 + }, + { + "epoch": 0.5883636781345126, + "grad_norm": 0.39473357796669006, + "learning_rate": 0.0001, + "loss": 1.4775, + "step": 5122 + }, + { + "epoch": 0.5884785480443397, + "grad_norm": 0.4360009431838989, + "learning_rate": 0.0001, + "loss": 1.8118, + "step": 5123 + }, + { + "epoch": 0.5885934179541669, + "grad_norm": 0.4192604720592499, + "learning_rate": 0.0001, + "loss": 1.5552, + "step": 5124 + }, + { + "epoch": 0.588708287863994, + "grad_norm": 0.44447702169418335, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 5125 + }, + { + "epoch": 0.5888231577738211, + "grad_norm": 0.4330410063266754, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 5126 + }, + { + "epoch": 0.5889380276836482, + "grad_norm": 0.44558635354042053, + "learning_rate": 0.0001, + "loss": 1.8103, + "step": 5127 + }, + { + "epoch": 0.5890528975934753, + "grad_norm": 0.4943927526473999, + "learning_rate": 0.0001, + "loss": 1.77, + "step": 5128 + }, + { + "epoch": 0.5891677675033025, + "grad_norm": 0.4345741271972656, + "learning_rate": 0.0001, + "loss": 1.5057, + "step": 5129 + }, + { + "epoch": 0.5892826374131296, + "grad_norm": 0.4273737967014313, + "learning_rate": 0.0001, + "loss": 1.6475, + "step": 5130 + }, + { + "epoch": 0.5893975073229567, + "grad_norm": 0.4054570198059082, + "learning_rate": 0.0001, + "loss": 1.7624, + "step": 5131 + }, + { + "epoch": 0.5895123772327838, + "grad_norm": 0.43333131074905396, + "learning_rate": 0.0001, + "loss": 1.434, + "step": 5132 + }, + { + "epoch": 0.5896272471426109, + "grad_norm": 0.4401973485946655, + "learning_rate": 0.0001, + "loss": 1.4911, + "step": 5133 + }, + { + "epoch": 0.5897421170524381, + "grad_norm": 0.43175458908081055, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 5134 + }, + { + "epoch": 0.5898569869622652, + "grad_norm": 0.4577193558216095, + "learning_rate": 0.0001, + "loss": 1.825, + "step": 5135 + }, + { + "epoch": 0.5899718568720923, + "grad_norm": 0.45421233773231506, + "learning_rate": 0.0001, + "loss": 1.7356, + "step": 5136 + }, + { + "epoch": 0.5900867267819194, + "grad_norm": 0.4477955102920532, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 5137 + }, + { + "epoch": 0.5902015966917465, + "grad_norm": 0.41615384817123413, + "learning_rate": 0.0001, + "loss": 1.5265, + "step": 5138 + }, + { + "epoch": 0.5903164666015737, + "grad_norm": 0.4334186613559723, + "learning_rate": 0.0001, + "loss": 1.7663, + "step": 5139 + }, + { + "epoch": 0.5904313365114008, + "grad_norm": 0.4008978605270386, + "learning_rate": 0.0001, + "loss": 1.6154, + "step": 5140 + }, + { + "epoch": 0.5905462064212279, + "grad_norm": 0.4319349229335785, + "learning_rate": 0.0001, + "loss": 1.6769, + "step": 5141 + }, + { + "epoch": 0.5906610763310551, + "grad_norm": 0.4281979501247406, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 5142 + }, + { + "epoch": 0.5907759462408823, + "grad_norm": 0.4367743134498596, + "learning_rate": 0.0001, + "loss": 1.7232, + "step": 5143 + }, + { + "epoch": 0.5908908161507094, + "grad_norm": 0.4409448504447937, + "learning_rate": 0.0001, + "loss": 1.5421, + "step": 5144 + }, + { + "epoch": 0.5910056860605365, + "grad_norm": 0.4476580023765564, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 5145 + }, + { + "epoch": 0.5911205559703636, + "grad_norm": 0.43535661697387695, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 5146 + }, + { + "epoch": 0.5912354258801907, + "grad_norm": 0.4497022330760956, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 5147 + }, + { + "epoch": 0.5913502957900179, + "grad_norm": 0.42515555024147034, + "learning_rate": 0.0001, + "loss": 1.4534, + "step": 5148 + }, + { + "epoch": 0.591465165699845, + "grad_norm": 0.4538893699645996, + "learning_rate": 0.0001, + "loss": 1.6042, + "step": 5149 + }, + { + "epoch": 0.5915800356096721, + "grad_norm": 0.4361916184425354, + "learning_rate": 0.0001, + "loss": 1.6162, + "step": 5150 + }, + { + "epoch": 0.5916949055194992, + "grad_norm": 0.43373700976371765, + "learning_rate": 0.0001, + "loss": 1.7429, + "step": 5151 + }, + { + "epoch": 0.5918097754293263, + "grad_norm": 0.46759071946144104, + "learning_rate": 0.0001, + "loss": 1.6652, + "step": 5152 + }, + { + "epoch": 0.5919246453391535, + "grad_norm": 0.45296040177345276, + "learning_rate": 0.0001, + "loss": 1.6969, + "step": 5153 + }, + { + "epoch": 0.5920395152489806, + "grad_norm": 0.3908827602863312, + "learning_rate": 0.0001, + "loss": 1.5251, + "step": 5154 + }, + { + "epoch": 0.5921543851588077, + "grad_norm": 0.41083312034606934, + "learning_rate": 0.0001, + "loss": 1.53, + "step": 5155 + }, + { + "epoch": 0.5922692550686348, + "grad_norm": 0.4350758492946625, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 5156 + }, + { + "epoch": 0.5923841249784619, + "grad_norm": 0.44962963461875916, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 5157 + }, + { + "epoch": 0.592498994888289, + "grad_norm": 0.47061699628829956, + "learning_rate": 0.0001, + "loss": 1.9426, + "step": 5158 + }, + { + "epoch": 0.5926138647981162, + "grad_norm": 0.4288279712200165, + "learning_rate": 0.0001, + "loss": 1.4317, + "step": 5159 + }, + { + "epoch": 0.5927287347079433, + "grad_norm": 0.4379502534866333, + "learning_rate": 0.0001, + "loss": 1.6085, + "step": 5160 + }, + { + "epoch": 0.5928436046177704, + "grad_norm": 0.41820356249809265, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 5161 + }, + { + "epoch": 0.5929584745275975, + "grad_norm": 0.4359080493450165, + "learning_rate": 0.0001, + "loss": 1.7123, + "step": 5162 + }, + { + "epoch": 0.5930733444374247, + "grad_norm": 0.40916740894317627, + "learning_rate": 0.0001, + "loss": 1.6091, + "step": 5163 + }, + { + "epoch": 0.5931882143472518, + "grad_norm": 0.4382950961589813, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 5164 + }, + { + "epoch": 0.5933030842570789, + "grad_norm": 0.4391472041606903, + "learning_rate": 0.0001, + "loss": 1.8258, + "step": 5165 + }, + { + "epoch": 0.593417954166906, + "grad_norm": 0.4115656316280365, + "learning_rate": 0.0001, + "loss": 1.5322, + "step": 5166 + }, + { + "epoch": 0.5935328240767331, + "grad_norm": 0.4537063241004944, + "learning_rate": 0.0001, + "loss": 1.7013, + "step": 5167 + }, + { + "epoch": 0.5936476939865603, + "grad_norm": 0.4141421616077423, + "learning_rate": 0.0001, + "loss": 1.5697, + "step": 5168 + }, + { + "epoch": 0.5937625638963874, + "grad_norm": 0.42686209082603455, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 5169 + }, + { + "epoch": 0.5938774338062145, + "grad_norm": 0.4201597273349762, + "learning_rate": 0.0001, + "loss": 1.3904, + "step": 5170 + }, + { + "epoch": 0.5939923037160416, + "grad_norm": 0.43558233976364136, + "learning_rate": 0.0001, + "loss": 1.529, + "step": 5171 + }, + { + "epoch": 0.5941071736258687, + "grad_norm": 0.43399038910865784, + "learning_rate": 0.0001, + "loss": 1.6307, + "step": 5172 + }, + { + "epoch": 0.5942220435356959, + "grad_norm": 0.40567871928215027, + "learning_rate": 0.0001, + "loss": 1.4648, + "step": 5173 + }, + { + "epoch": 0.594336913445523, + "grad_norm": 0.44226759672164917, + "learning_rate": 0.0001, + "loss": 1.6939, + "step": 5174 + }, + { + "epoch": 0.5944517833553501, + "grad_norm": 0.4169192910194397, + "learning_rate": 0.0001, + "loss": 1.5668, + "step": 5175 + }, + { + "epoch": 0.5945666532651772, + "grad_norm": 0.4189157485961914, + "learning_rate": 0.0001, + "loss": 1.6832, + "step": 5176 + }, + { + "epoch": 0.5946815231750043, + "grad_norm": 0.4361801743507385, + "learning_rate": 0.0001, + "loss": 1.53, + "step": 5177 + }, + { + "epoch": 0.5947963930848315, + "grad_norm": 0.41981688141822815, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 5178 + }, + { + "epoch": 0.5949112629946586, + "grad_norm": 0.4301121234893799, + "learning_rate": 0.0001, + "loss": 1.6233, + "step": 5179 + }, + { + "epoch": 0.5950261329044857, + "grad_norm": 0.42863231897354126, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 5180 + }, + { + "epoch": 0.5951410028143128, + "grad_norm": 0.4355190694332123, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 5181 + }, + { + "epoch": 0.5952558727241399, + "grad_norm": 0.4153485894203186, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 5182 + }, + { + "epoch": 0.595370742633967, + "grad_norm": 0.44687628746032715, + "learning_rate": 0.0001, + "loss": 1.7095, + "step": 5183 + }, + { + "epoch": 0.5954856125437942, + "grad_norm": 0.4202732443809509, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 5184 + }, + { + "epoch": 0.5956004824536213, + "grad_norm": 0.4697241187095642, + "learning_rate": 0.0001, + "loss": 1.7244, + "step": 5185 + }, + { + "epoch": 0.5957153523634484, + "grad_norm": 0.428061306476593, + "learning_rate": 0.0001, + "loss": 1.4106, + "step": 5186 + }, + { + "epoch": 0.5958302222732755, + "grad_norm": 0.4194773733615875, + "learning_rate": 0.0001, + "loss": 1.6237, + "step": 5187 + }, + { + "epoch": 0.5959450921831027, + "grad_norm": 0.45670002698898315, + "learning_rate": 0.0001, + "loss": 1.6503, + "step": 5188 + }, + { + "epoch": 0.5960599620929298, + "grad_norm": 0.43582040071487427, + "learning_rate": 0.0001, + "loss": 1.6735, + "step": 5189 + }, + { + "epoch": 0.5961748320027569, + "grad_norm": 0.4162776470184326, + "learning_rate": 0.0001, + "loss": 1.5631, + "step": 5190 + }, + { + "epoch": 0.596289701912584, + "grad_norm": 0.4055253565311432, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 5191 + }, + { + "epoch": 0.5964045718224111, + "grad_norm": 0.4150947332382202, + "learning_rate": 0.0001, + "loss": 1.6579, + "step": 5192 + }, + { + "epoch": 0.5965194417322383, + "grad_norm": 0.4119030237197876, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 5193 + }, + { + "epoch": 0.5966343116420654, + "grad_norm": 0.43656405806541443, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 5194 + }, + { + "epoch": 0.5967491815518925, + "grad_norm": 0.4502866864204407, + "learning_rate": 0.0001, + "loss": 1.7509, + "step": 5195 + }, + { + "epoch": 0.5968640514617196, + "grad_norm": 0.4506380558013916, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 5196 + }, + { + "epoch": 0.5969789213715467, + "grad_norm": 0.48698195815086365, + "learning_rate": 0.0001, + "loss": 1.7874, + "step": 5197 + }, + { + "epoch": 0.5970937912813739, + "grad_norm": 0.42721307277679443, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 5198 + }, + { + "epoch": 0.597208661191201, + "grad_norm": 0.428030788898468, + "learning_rate": 0.0001, + "loss": 1.752, + "step": 5199 + }, + { + "epoch": 0.5973235311010281, + "grad_norm": 0.43251699209213257, + "learning_rate": 0.0001, + "loss": 1.8308, + "step": 5200 + }, + { + "epoch": 0.5974384010108552, + "grad_norm": 0.4091329872608185, + "learning_rate": 0.0001, + "loss": 1.5663, + "step": 5201 + }, + { + "epoch": 0.5975532709206823, + "grad_norm": 0.4518309235572815, + "learning_rate": 0.0001, + "loss": 1.6743, + "step": 5202 + }, + { + "epoch": 0.5976681408305095, + "grad_norm": 0.4639250338077545, + "learning_rate": 0.0001, + "loss": 1.6573, + "step": 5203 + }, + { + "epoch": 0.5977830107403366, + "grad_norm": 0.41630759835243225, + "learning_rate": 0.0001, + "loss": 1.6118, + "step": 5204 + }, + { + "epoch": 0.5978978806501637, + "grad_norm": 0.47292619943618774, + "learning_rate": 0.0001, + "loss": 1.8113, + "step": 5205 + }, + { + "epoch": 0.5980127505599908, + "grad_norm": 0.4484859108924866, + "learning_rate": 0.0001, + "loss": 1.6676, + "step": 5206 + }, + { + "epoch": 0.5981276204698179, + "grad_norm": 0.42833200097084045, + "learning_rate": 0.0001, + "loss": 1.7428, + "step": 5207 + }, + { + "epoch": 0.598242490379645, + "grad_norm": 0.40881600975990295, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 5208 + }, + { + "epoch": 0.5983573602894722, + "grad_norm": 0.412045955657959, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 5209 + }, + { + "epoch": 0.5984722301992993, + "grad_norm": 0.4817086160182953, + "learning_rate": 0.0001, + "loss": 1.7922, + "step": 5210 + }, + { + "epoch": 0.5985871001091264, + "grad_norm": 0.46558666229248047, + "learning_rate": 0.0001, + "loss": 1.7812, + "step": 5211 + }, + { + "epoch": 0.5987019700189535, + "grad_norm": 0.43379732966423035, + "learning_rate": 0.0001, + "loss": 1.7301, + "step": 5212 + }, + { + "epoch": 0.5988168399287807, + "grad_norm": 0.4646340310573578, + "learning_rate": 0.0001, + "loss": 1.7395, + "step": 5213 + }, + { + "epoch": 0.5989317098386078, + "grad_norm": 0.4490647315979004, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 5214 + }, + { + "epoch": 0.5990465797484349, + "grad_norm": 0.404638409614563, + "learning_rate": 0.0001, + "loss": 1.6963, + "step": 5215 + }, + { + "epoch": 0.599161449658262, + "grad_norm": 0.4501255452632904, + "learning_rate": 0.0001, + "loss": 1.7297, + "step": 5216 + }, + { + "epoch": 0.5992763195680891, + "grad_norm": 0.39730724692344666, + "learning_rate": 0.0001, + "loss": 1.4956, + "step": 5217 + }, + { + "epoch": 0.5993911894779163, + "grad_norm": 0.5064243674278259, + "learning_rate": 0.0001, + "loss": 1.8652, + "step": 5218 + }, + { + "epoch": 0.5995060593877434, + "grad_norm": 0.41334420442581177, + "learning_rate": 0.0001, + "loss": 1.7021, + "step": 5219 + }, + { + "epoch": 0.5996209292975705, + "grad_norm": 0.44307664036750793, + "learning_rate": 0.0001, + "loss": 1.7261, + "step": 5220 + }, + { + "epoch": 0.5997357992073976, + "grad_norm": 0.4100275933742523, + "learning_rate": 0.0001, + "loss": 1.5175, + "step": 5221 + }, + { + "epoch": 0.5998506691172247, + "grad_norm": 0.4305122494697571, + "learning_rate": 0.0001, + "loss": 1.5417, + "step": 5222 + }, + { + "epoch": 0.5999655390270519, + "grad_norm": 0.8222983479499817, + "learning_rate": 0.0001, + "loss": 1.5928, + "step": 5223 + }, + { + "epoch": 0.600080408936879, + "grad_norm": 0.46560120582580566, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 5224 + }, + { + "epoch": 0.6001952788467061, + "grad_norm": 0.413687139749527, + "learning_rate": 0.0001, + "loss": 1.5065, + "step": 5225 + }, + { + "epoch": 0.6003101487565332, + "grad_norm": 0.4451950192451477, + "learning_rate": 0.0001, + "loss": 1.6965, + "step": 5226 + }, + { + "epoch": 0.6004250186663603, + "grad_norm": 0.42893052101135254, + "learning_rate": 0.0001, + "loss": 1.4295, + "step": 5227 + }, + { + "epoch": 0.6005398885761875, + "grad_norm": 0.4320535659790039, + "learning_rate": 0.0001, + "loss": 1.6712, + "step": 5228 + }, + { + "epoch": 0.6006547584860146, + "grad_norm": 0.4111475944519043, + "learning_rate": 0.0001, + "loss": 1.5597, + "step": 5229 + }, + { + "epoch": 0.6007696283958417, + "grad_norm": 0.4269995093345642, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 5230 + }, + { + "epoch": 0.6008844983056688, + "grad_norm": 0.4943247139453888, + "learning_rate": 0.0001, + "loss": 1.4476, + "step": 5231 + }, + { + "epoch": 0.6009993682154959, + "grad_norm": 0.43460339307785034, + "learning_rate": 0.0001, + "loss": 1.738, + "step": 5232 + }, + { + "epoch": 0.601114238125323, + "grad_norm": 0.45859336853027344, + "learning_rate": 0.0001, + "loss": 1.7815, + "step": 5233 + }, + { + "epoch": 0.6012291080351502, + "grad_norm": 0.4369470775127411, + "learning_rate": 0.0001, + "loss": 1.745, + "step": 5234 + }, + { + "epoch": 0.6013439779449773, + "grad_norm": 0.39994218945503235, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 5235 + }, + { + "epoch": 0.6014588478548044, + "grad_norm": 0.4235687851905823, + "learning_rate": 0.0001, + "loss": 1.6046, + "step": 5236 + }, + { + "epoch": 0.6015737177646315, + "grad_norm": 0.45614656805992126, + "learning_rate": 0.0001, + "loss": 1.6661, + "step": 5237 + }, + { + "epoch": 0.6016885876744587, + "grad_norm": 0.4340771436691284, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 5238 + }, + { + "epoch": 0.6018034575842858, + "grad_norm": 0.458238422870636, + "learning_rate": 0.0001, + "loss": 1.796, + "step": 5239 + }, + { + "epoch": 0.6019183274941129, + "grad_norm": 0.42685821652412415, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 5240 + }, + { + "epoch": 0.60203319740394, + "grad_norm": 0.4220592975616455, + "learning_rate": 0.0001, + "loss": 1.602, + "step": 5241 + }, + { + "epoch": 0.6021480673137671, + "grad_norm": 0.41730085015296936, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 5242 + }, + { + "epoch": 0.6022629372235943, + "grad_norm": 0.4415682852268219, + "learning_rate": 0.0001, + "loss": 1.6285, + "step": 5243 + }, + { + "epoch": 0.6023778071334214, + "grad_norm": 0.41799354553222656, + "learning_rate": 0.0001, + "loss": 1.5479, + "step": 5244 + }, + { + "epoch": 0.6024926770432485, + "grad_norm": 0.47606340050697327, + "learning_rate": 0.0001, + "loss": 1.8567, + "step": 5245 + }, + { + "epoch": 0.6026075469530756, + "grad_norm": 0.4121111333370209, + "learning_rate": 0.0001, + "loss": 1.5148, + "step": 5246 + }, + { + "epoch": 0.6027224168629027, + "grad_norm": 0.43468672037124634, + "learning_rate": 0.0001, + "loss": 1.5985, + "step": 5247 + }, + { + "epoch": 0.6028372867727299, + "grad_norm": 0.3789305090904236, + "learning_rate": 0.0001, + "loss": 1.2786, + "step": 5248 + }, + { + "epoch": 0.602952156682557, + "grad_norm": 0.3968721330165863, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 5249 + }, + { + "epoch": 0.6030670265923841, + "grad_norm": 0.46839994192123413, + "learning_rate": 0.0001, + "loss": 1.9271, + "step": 5250 + }, + { + "epoch": 0.6031818965022112, + "grad_norm": 0.4195137023925781, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 5251 + }, + { + "epoch": 0.6032967664120383, + "grad_norm": 0.4006030261516571, + "learning_rate": 0.0001, + "loss": 1.509, + "step": 5252 + }, + { + "epoch": 0.6034116363218655, + "grad_norm": 0.4589271545410156, + "learning_rate": 0.0001, + "loss": 1.791, + "step": 5253 + }, + { + "epoch": 0.6035265062316926, + "grad_norm": 0.4122845232486725, + "learning_rate": 0.0001, + "loss": 1.6732, + "step": 5254 + }, + { + "epoch": 0.6036413761415197, + "grad_norm": 0.4685841500759125, + "learning_rate": 0.0001, + "loss": 1.7381, + "step": 5255 + }, + { + "epoch": 0.6037562460513468, + "grad_norm": 0.455402672290802, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 5256 + }, + { + "epoch": 0.6038711159611739, + "grad_norm": 0.4526597261428833, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 5257 + }, + { + "epoch": 0.603985985871001, + "grad_norm": 0.42606300115585327, + "learning_rate": 0.0001, + "loss": 1.4687, + "step": 5258 + }, + { + "epoch": 0.6041008557808282, + "grad_norm": 0.45859774947166443, + "learning_rate": 0.0001, + "loss": 1.7105, + "step": 5259 + }, + { + "epoch": 0.6042157256906553, + "grad_norm": 0.4001207947731018, + "learning_rate": 0.0001, + "loss": 1.446, + "step": 5260 + }, + { + "epoch": 0.6043305956004824, + "grad_norm": 0.4063998758792877, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 5261 + }, + { + "epoch": 0.6044454655103095, + "grad_norm": 0.4181976616382599, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 5262 + }, + { + "epoch": 0.6045603354201367, + "grad_norm": 0.4905684292316437, + "learning_rate": 0.0001, + "loss": 1.8801, + "step": 5263 + }, + { + "epoch": 0.6046752053299638, + "grad_norm": 0.4444495737552643, + "learning_rate": 0.0001, + "loss": 1.4989, + "step": 5264 + }, + { + "epoch": 0.6047900752397909, + "grad_norm": 0.4244244694709778, + "learning_rate": 0.0001, + "loss": 1.5065, + "step": 5265 + }, + { + "epoch": 0.604904945149618, + "grad_norm": 0.48309603333473206, + "learning_rate": 0.0001, + "loss": 1.7415, + "step": 5266 + }, + { + "epoch": 0.6050198150594451, + "grad_norm": 0.4249451756477356, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 5267 + }, + { + "epoch": 0.6051346849692723, + "grad_norm": 0.4714745283126831, + "learning_rate": 0.0001, + "loss": 1.6689, + "step": 5268 + }, + { + "epoch": 0.6052495548790994, + "grad_norm": 0.4268541634082794, + "learning_rate": 0.0001, + "loss": 1.6959, + "step": 5269 + }, + { + "epoch": 0.6053644247889265, + "grad_norm": 0.4523731768131256, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 5270 + }, + { + "epoch": 0.6054792946987536, + "grad_norm": 0.47980424761772156, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 5271 + }, + { + "epoch": 0.6055941646085807, + "grad_norm": 0.43983185291290283, + "learning_rate": 0.0001, + "loss": 1.4902, + "step": 5272 + }, + { + "epoch": 0.6057090345184079, + "grad_norm": 0.41765522956848145, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 5273 + }, + { + "epoch": 0.605823904428235, + "grad_norm": 0.4522044360637665, + "learning_rate": 0.0001, + "loss": 1.7149, + "step": 5274 + }, + { + "epoch": 0.6059387743380621, + "grad_norm": 0.4616985321044922, + "learning_rate": 0.0001, + "loss": 1.714, + "step": 5275 + }, + { + "epoch": 0.6060536442478892, + "grad_norm": 0.4109836518764496, + "learning_rate": 0.0001, + "loss": 1.6499, + "step": 5276 + }, + { + "epoch": 0.6061685141577163, + "grad_norm": 0.4296359717845917, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 5277 + }, + { + "epoch": 0.6062833840675435, + "grad_norm": 0.469368577003479, + "learning_rate": 0.0001, + "loss": 1.7734, + "step": 5278 + }, + { + "epoch": 0.6063982539773707, + "grad_norm": 0.41937753558158875, + "learning_rate": 0.0001, + "loss": 1.7647, + "step": 5279 + }, + { + "epoch": 0.6065131238871978, + "grad_norm": 0.4088084399700165, + "learning_rate": 0.0001, + "loss": 1.3934, + "step": 5280 + }, + { + "epoch": 0.6066279937970249, + "grad_norm": 0.4353596568107605, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 5281 + }, + { + "epoch": 0.606742863706852, + "grad_norm": 0.4809739589691162, + "learning_rate": 0.0001, + "loss": 1.8213, + "step": 5282 + }, + { + "epoch": 0.6068577336166792, + "grad_norm": 0.4168318808078766, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 5283 + }, + { + "epoch": 0.6069726035265063, + "grad_norm": 0.4438597857952118, + "learning_rate": 0.0001, + "loss": 1.6453, + "step": 5284 + }, + { + "epoch": 0.6070874734363334, + "grad_norm": 0.4545292556285858, + "learning_rate": 0.0001, + "loss": 1.4997, + "step": 5285 + }, + { + "epoch": 0.6072023433461605, + "grad_norm": 0.4365883469581604, + "learning_rate": 0.0001, + "loss": 1.4618, + "step": 5286 + }, + { + "epoch": 0.6073172132559876, + "grad_norm": 0.4614627957344055, + "learning_rate": 0.0001, + "loss": 1.6091, + "step": 5287 + }, + { + "epoch": 0.6074320831658148, + "grad_norm": 0.4482710659503937, + "learning_rate": 0.0001, + "loss": 1.7119, + "step": 5288 + }, + { + "epoch": 0.6075469530756419, + "grad_norm": 0.404892235994339, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 5289 + }, + { + "epoch": 0.607661822985469, + "grad_norm": 0.4186989367008209, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 5290 + }, + { + "epoch": 0.6077766928952961, + "grad_norm": 0.39953678846359253, + "learning_rate": 0.0001, + "loss": 1.5322, + "step": 5291 + }, + { + "epoch": 0.6078915628051232, + "grad_norm": 0.4357908070087433, + "learning_rate": 0.0001, + "loss": 1.7891, + "step": 5292 + }, + { + "epoch": 0.6080064327149504, + "grad_norm": 0.4001089334487915, + "learning_rate": 0.0001, + "loss": 1.489, + "step": 5293 + }, + { + "epoch": 0.6081213026247775, + "grad_norm": 0.44134122133255005, + "learning_rate": 0.0001, + "loss": 1.5262, + "step": 5294 + }, + { + "epoch": 0.6082361725346046, + "grad_norm": 0.4329252243041992, + "learning_rate": 0.0001, + "loss": 1.5406, + "step": 5295 + }, + { + "epoch": 0.6083510424444317, + "grad_norm": 0.41654008626937866, + "learning_rate": 0.0001, + "loss": 1.5814, + "step": 5296 + }, + { + "epoch": 0.6084659123542588, + "grad_norm": 0.43320518732070923, + "learning_rate": 0.0001, + "loss": 1.5642, + "step": 5297 + }, + { + "epoch": 0.608580782264086, + "grad_norm": 0.4640411138534546, + "learning_rate": 0.0001, + "loss": 1.7619, + "step": 5298 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 0.4832577705383301, + "learning_rate": 0.0001, + "loss": 1.886, + "step": 5299 + }, + { + "epoch": 0.6088105220837402, + "grad_norm": 0.4550410509109497, + "learning_rate": 0.0001, + "loss": 1.6412, + "step": 5300 + }, + { + "epoch": 0.6089253919935673, + "grad_norm": 0.4446395933628082, + "learning_rate": 0.0001, + "loss": 1.5463, + "step": 5301 + }, + { + "epoch": 0.6090402619033944, + "grad_norm": 0.44712090492248535, + "learning_rate": 0.0001, + "loss": 1.7342, + "step": 5302 + }, + { + "epoch": 0.6091551318132216, + "grad_norm": 0.46963006258010864, + "learning_rate": 0.0001, + "loss": 1.7253, + "step": 5303 + }, + { + "epoch": 0.6092700017230487, + "grad_norm": 0.4730952978134155, + "learning_rate": 0.0001, + "loss": 1.537, + "step": 5304 + }, + { + "epoch": 0.6093848716328758, + "grad_norm": 0.4162120521068573, + "learning_rate": 0.0001, + "loss": 1.3652, + "step": 5305 + }, + { + "epoch": 0.6094997415427029, + "grad_norm": 0.4251774251461029, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 5306 + }, + { + "epoch": 0.60961461145253, + "grad_norm": 0.4354092478752136, + "learning_rate": 0.0001, + "loss": 1.6629, + "step": 5307 + }, + { + "epoch": 0.6097294813623572, + "grad_norm": 0.41313931345939636, + "learning_rate": 0.0001, + "loss": 1.6076, + "step": 5308 + }, + { + "epoch": 0.6098443512721843, + "grad_norm": 0.47585734724998474, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 5309 + }, + { + "epoch": 0.6099592211820114, + "grad_norm": 0.41451969742774963, + "learning_rate": 0.0001, + "loss": 1.5368, + "step": 5310 + }, + { + "epoch": 0.6100740910918385, + "grad_norm": 0.42730268836021423, + "learning_rate": 0.0001, + "loss": 1.4848, + "step": 5311 + }, + { + "epoch": 0.6101889610016656, + "grad_norm": 0.4963441789150238, + "learning_rate": 0.0001, + "loss": 1.719, + "step": 5312 + }, + { + "epoch": 0.6103038309114928, + "grad_norm": 0.46581605076789856, + "learning_rate": 0.0001, + "loss": 1.8281, + "step": 5313 + }, + { + "epoch": 0.6104187008213199, + "grad_norm": 0.4398311972618103, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 5314 + }, + { + "epoch": 0.610533570731147, + "grad_norm": 0.44360533356666565, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 5315 + }, + { + "epoch": 0.6106484406409741, + "grad_norm": 0.4808873236179352, + "learning_rate": 0.0001, + "loss": 1.7675, + "step": 5316 + }, + { + "epoch": 0.6107633105508012, + "grad_norm": 0.40180516242980957, + "learning_rate": 0.0001, + "loss": 1.4326, + "step": 5317 + }, + { + "epoch": 0.6108781804606284, + "grad_norm": 0.4601239860057831, + "learning_rate": 0.0001, + "loss": 1.5387, + "step": 5318 + }, + { + "epoch": 0.6109930503704555, + "grad_norm": 0.4126811921596527, + "learning_rate": 0.0001, + "loss": 1.5125, + "step": 5319 + }, + { + "epoch": 0.6111079202802826, + "grad_norm": 0.45912793278694153, + "learning_rate": 0.0001, + "loss": 1.716, + "step": 5320 + }, + { + "epoch": 0.6112227901901097, + "grad_norm": 0.43586671352386475, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 5321 + }, + { + "epoch": 0.6113376600999368, + "grad_norm": 0.4181722402572632, + "learning_rate": 0.0001, + "loss": 1.6628, + "step": 5322 + }, + { + "epoch": 0.611452530009764, + "grad_norm": 0.4227443039417267, + "learning_rate": 0.0001, + "loss": 1.4071, + "step": 5323 + }, + { + "epoch": 0.6115673999195911, + "grad_norm": 0.42024925351142883, + "learning_rate": 0.0001, + "loss": 1.6544, + "step": 5324 + }, + { + "epoch": 0.6116822698294182, + "grad_norm": 0.42331016063690186, + "learning_rate": 0.0001, + "loss": 1.6499, + "step": 5325 + }, + { + "epoch": 0.6117971397392453, + "grad_norm": 0.4389086663722992, + "learning_rate": 0.0001, + "loss": 1.5742, + "step": 5326 + }, + { + "epoch": 0.6119120096490724, + "grad_norm": 0.42871832847595215, + "learning_rate": 0.0001, + "loss": 1.5348, + "step": 5327 + }, + { + "epoch": 0.6120268795588996, + "grad_norm": 0.4434123635292053, + "learning_rate": 0.0001, + "loss": 1.358, + "step": 5328 + }, + { + "epoch": 0.6121417494687267, + "grad_norm": 0.460252046585083, + "learning_rate": 0.0001, + "loss": 1.5934, + "step": 5329 + }, + { + "epoch": 0.6122566193785538, + "grad_norm": 0.4134620428085327, + "learning_rate": 0.0001, + "loss": 1.5169, + "step": 5330 + }, + { + "epoch": 0.6123714892883809, + "grad_norm": 0.4522944986820221, + "learning_rate": 0.0001, + "loss": 1.754, + "step": 5331 + }, + { + "epoch": 0.612486359198208, + "grad_norm": 0.47153130173683167, + "learning_rate": 0.0001, + "loss": 1.6896, + "step": 5332 + }, + { + "epoch": 0.6126012291080352, + "grad_norm": 0.4305824041366577, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 5333 + }, + { + "epoch": 0.6127160990178623, + "grad_norm": 0.4275292158126831, + "learning_rate": 0.0001, + "loss": 1.6582, + "step": 5334 + }, + { + "epoch": 0.6128309689276894, + "grad_norm": 0.44580233097076416, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 5335 + }, + { + "epoch": 0.6129458388375165, + "grad_norm": 0.42193603515625, + "learning_rate": 0.0001, + "loss": 1.5258, + "step": 5336 + }, + { + "epoch": 0.6130607087473436, + "grad_norm": 0.4233403503894806, + "learning_rate": 0.0001, + "loss": 1.6786, + "step": 5337 + }, + { + "epoch": 0.6131755786571708, + "grad_norm": 0.45567867159843445, + "learning_rate": 0.0001, + "loss": 1.76, + "step": 5338 + }, + { + "epoch": 0.6132904485669979, + "grad_norm": 0.4384849965572357, + "learning_rate": 0.0001, + "loss": 1.4506, + "step": 5339 + }, + { + "epoch": 0.613405318476825, + "grad_norm": 0.442428320646286, + "learning_rate": 0.0001, + "loss": 1.7631, + "step": 5340 + }, + { + "epoch": 0.6135201883866521, + "grad_norm": 0.42773962020874023, + "learning_rate": 0.0001, + "loss": 1.6109, + "step": 5341 + }, + { + "epoch": 0.6136350582964792, + "grad_norm": 0.4419418275356293, + "learning_rate": 0.0001, + "loss": 1.6981, + "step": 5342 + }, + { + "epoch": 0.6137499282063064, + "grad_norm": 0.4556467533111572, + "learning_rate": 0.0001, + "loss": 1.726, + "step": 5343 + }, + { + "epoch": 0.6138647981161335, + "grad_norm": 0.41763773560523987, + "learning_rate": 0.0001, + "loss": 1.7347, + "step": 5344 + }, + { + "epoch": 0.6139796680259606, + "grad_norm": 0.39312151074409485, + "learning_rate": 0.0001, + "loss": 1.4951, + "step": 5345 + }, + { + "epoch": 0.6140945379357877, + "grad_norm": 0.4212815761566162, + "learning_rate": 0.0001, + "loss": 1.711, + "step": 5346 + }, + { + "epoch": 0.6142094078456148, + "grad_norm": 0.425071656703949, + "learning_rate": 0.0001, + "loss": 1.5043, + "step": 5347 + }, + { + "epoch": 0.614324277755442, + "grad_norm": 0.4074556827545166, + "learning_rate": 0.0001, + "loss": 1.5206, + "step": 5348 + }, + { + "epoch": 0.6144391476652691, + "grad_norm": 0.45755401253700256, + "learning_rate": 0.0001, + "loss": 1.6133, + "step": 5349 + }, + { + "epoch": 0.6145540175750962, + "grad_norm": 0.43361806869506836, + "learning_rate": 0.0001, + "loss": 1.6676, + "step": 5350 + }, + { + "epoch": 0.6146688874849233, + "grad_norm": 0.45359665155410767, + "learning_rate": 0.0001, + "loss": 1.7329, + "step": 5351 + }, + { + "epoch": 0.6147837573947504, + "grad_norm": 0.4411814510822296, + "learning_rate": 0.0001, + "loss": 1.5939, + "step": 5352 + }, + { + "epoch": 0.6148986273045776, + "grad_norm": 0.4266383647918701, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 5353 + }, + { + "epoch": 0.6150134972144047, + "grad_norm": 0.4333922564983368, + "learning_rate": 0.0001, + "loss": 1.7112, + "step": 5354 + }, + { + "epoch": 0.6151283671242318, + "grad_norm": 0.48001882433891296, + "learning_rate": 0.0001, + "loss": 1.8559, + "step": 5355 + }, + { + "epoch": 0.6152432370340589, + "grad_norm": 0.43073657155036926, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 5356 + }, + { + "epoch": 0.615358106943886, + "grad_norm": 0.42971518635749817, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 5357 + }, + { + "epoch": 0.6154729768537132, + "grad_norm": 0.41858428716659546, + "learning_rate": 0.0001, + "loss": 1.7038, + "step": 5358 + }, + { + "epoch": 0.6155878467635403, + "grad_norm": 0.4066700041294098, + "learning_rate": 0.0001, + "loss": 1.5025, + "step": 5359 + }, + { + "epoch": 0.6157027166733674, + "grad_norm": 0.42960700392723083, + "learning_rate": 0.0001, + "loss": 1.7926, + "step": 5360 + }, + { + "epoch": 0.6158175865831945, + "grad_norm": 0.425981342792511, + "learning_rate": 0.0001, + "loss": 1.5527, + "step": 5361 + }, + { + "epoch": 0.6159324564930216, + "grad_norm": 0.44844427704811096, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 5362 + }, + { + "epoch": 0.6160473264028488, + "grad_norm": 0.44269078969955444, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 5363 + }, + { + "epoch": 0.6161621963126759, + "grad_norm": 0.4440799951553345, + "learning_rate": 0.0001, + "loss": 1.5691, + "step": 5364 + }, + { + "epoch": 0.616277066222503, + "grad_norm": 0.46443289518356323, + "learning_rate": 0.0001, + "loss": 1.7149, + "step": 5365 + }, + { + "epoch": 0.6163919361323301, + "grad_norm": 0.43236956000328064, + "learning_rate": 0.0001, + "loss": 1.4569, + "step": 5366 + }, + { + "epoch": 0.6165068060421572, + "grad_norm": 0.4475458860397339, + "learning_rate": 0.0001, + "loss": 1.6878, + "step": 5367 + }, + { + "epoch": 0.6166216759519844, + "grad_norm": 0.4264468252658844, + "learning_rate": 0.0001, + "loss": 1.6352, + "step": 5368 + }, + { + "epoch": 0.6167365458618115, + "grad_norm": 0.4327240586280823, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 5369 + }, + { + "epoch": 0.6168514157716386, + "grad_norm": 0.43093448877334595, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 5370 + }, + { + "epoch": 0.6169662856814657, + "grad_norm": 0.4652967154979706, + "learning_rate": 0.0001, + "loss": 1.8944, + "step": 5371 + }, + { + "epoch": 0.6170811555912928, + "grad_norm": 0.40782034397125244, + "learning_rate": 0.0001, + "loss": 1.4373, + "step": 5372 + }, + { + "epoch": 0.61719602550112, + "grad_norm": 0.3958606421947479, + "learning_rate": 0.0001, + "loss": 1.4501, + "step": 5373 + }, + { + "epoch": 0.6173108954109471, + "grad_norm": 0.4088459312915802, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 5374 + }, + { + "epoch": 0.6174257653207742, + "grad_norm": 0.42610761523246765, + "learning_rate": 0.0001, + "loss": 1.7281, + "step": 5375 + }, + { + "epoch": 0.6175406352306013, + "grad_norm": 0.4308398365974426, + "learning_rate": 0.0001, + "loss": 1.5579, + "step": 5376 + }, + { + "epoch": 0.6176555051404284, + "grad_norm": 0.4212062358856201, + "learning_rate": 0.0001, + "loss": 1.5424, + "step": 5377 + }, + { + "epoch": 0.6177703750502556, + "grad_norm": 0.4277915358543396, + "learning_rate": 0.0001, + "loss": 1.4436, + "step": 5378 + }, + { + "epoch": 0.6178852449600827, + "grad_norm": 0.45164668560028076, + "learning_rate": 0.0001, + "loss": 1.3541, + "step": 5379 + }, + { + "epoch": 0.6180001148699098, + "grad_norm": 0.46653538942337036, + "learning_rate": 0.0001, + "loss": 1.7403, + "step": 5380 + }, + { + "epoch": 0.6181149847797369, + "grad_norm": 0.4493342936038971, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 5381 + }, + { + "epoch": 0.618229854689564, + "grad_norm": 0.46748727560043335, + "learning_rate": 0.0001, + "loss": 1.7097, + "step": 5382 + }, + { + "epoch": 0.6183447245993912, + "grad_norm": 0.46008816361427307, + "learning_rate": 0.0001, + "loss": 1.7876, + "step": 5383 + }, + { + "epoch": 0.6184595945092183, + "grad_norm": 0.4166385531425476, + "learning_rate": 0.0001, + "loss": 1.666, + "step": 5384 + }, + { + "epoch": 0.6185744644190454, + "grad_norm": 0.46595972776412964, + "learning_rate": 0.0001, + "loss": 1.7317, + "step": 5385 + }, + { + "epoch": 0.6186893343288725, + "grad_norm": 0.41361644864082336, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 5386 + }, + { + "epoch": 0.6188042042386996, + "grad_norm": 0.4521470069885254, + "learning_rate": 0.0001, + "loss": 1.7181, + "step": 5387 + }, + { + "epoch": 0.6189190741485268, + "grad_norm": 0.4314734637737274, + "learning_rate": 0.0001, + "loss": 1.7027, + "step": 5388 + }, + { + "epoch": 0.6190339440583539, + "grad_norm": 0.49784350395202637, + "learning_rate": 0.0001, + "loss": 1.473, + "step": 5389 + }, + { + "epoch": 0.619148813968181, + "grad_norm": 0.4353158473968506, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 5390 + }, + { + "epoch": 0.6192636838780081, + "grad_norm": 0.405387818813324, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 5391 + }, + { + "epoch": 0.6193785537878352, + "grad_norm": 0.42280814051628113, + "learning_rate": 0.0001, + "loss": 1.6584, + "step": 5392 + }, + { + "epoch": 0.6194934236976624, + "grad_norm": 0.42651262879371643, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 5393 + }, + { + "epoch": 0.6196082936074895, + "grad_norm": 0.4402959644794464, + "learning_rate": 0.0001, + "loss": 1.5753, + "step": 5394 + }, + { + "epoch": 0.6197231635173166, + "grad_norm": 0.41979432106018066, + "learning_rate": 0.0001, + "loss": 1.5584, + "step": 5395 + }, + { + "epoch": 0.6198380334271437, + "grad_norm": 0.40594765543937683, + "learning_rate": 0.0001, + "loss": 1.5995, + "step": 5396 + }, + { + "epoch": 0.6199529033369708, + "grad_norm": 0.4297271966934204, + "learning_rate": 0.0001, + "loss": 1.6042, + "step": 5397 + }, + { + "epoch": 0.620067773246798, + "grad_norm": 0.47684258222579956, + "learning_rate": 0.0001, + "loss": 1.7715, + "step": 5398 + }, + { + "epoch": 0.6201826431566251, + "grad_norm": 0.44073134660720825, + "learning_rate": 0.0001, + "loss": 1.6191, + "step": 5399 + }, + { + "epoch": 0.6202975130664522, + "grad_norm": 0.4722387194633484, + "learning_rate": 0.0001, + "loss": 1.6578, + "step": 5400 + }, + { + "epoch": 0.6204123829762793, + "grad_norm": 0.4637337327003479, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 5401 + }, + { + "epoch": 0.6205272528861064, + "grad_norm": 0.43157336115837097, + "learning_rate": 0.0001, + "loss": 1.7081, + "step": 5402 + }, + { + "epoch": 0.6206421227959336, + "grad_norm": 0.4264506995677948, + "learning_rate": 0.0001, + "loss": 1.5326, + "step": 5403 + }, + { + "epoch": 0.6207569927057607, + "grad_norm": 0.42598217725753784, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 5404 + }, + { + "epoch": 0.6208718626155878, + "grad_norm": 0.4123907685279846, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 5405 + }, + { + "epoch": 0.6209867325254149, + "grad_norm": 0.4402943551540375, + "learning_rate": 0.0001, + "loss": 1.7162, + "step": 5406 + }, + { + "epoch": 0.621101602435242, + "grad_norm": 0.4304845929145813, + "learning_rate": 0.0001, + "loss": 1.7301, + "step": 5407 + }, + { + "epoch": 0.6212164723450692, + "grad_norm": 0.43135395646095276, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 5408 + }, + { + "epoch": 0.6213313422548963, + "grad_norm": 0.42243319749832153, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 5409 + }, + { + "epoch": 0.6214462121647234, + "grad_norm": 0.4920620024204254, + "learning_rate": 0.0001, + "loss": 1.7013, + "step": 5410 + }, + { + "epoch": 0.6215610820745505, + "grad_norm": 0.41864511370658875, + "learning_rate": 0.0001, + "loss": 1.6426, + "step": 5411 + }, + { + "epoch": 0.6216759519843776, + "grad_norm": 0.4509679675102234, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 5412 + }, + { + "epoch": 0.6217908218942048, + "grad_norm": 0.4350128769874573, + "learning_rate": 0.0001, + "loss": 1.7858, + "step": 5413 + }, + { + "epoch": 0.6219056918040319, + "grad_norm": 0.4748633801937103, + "learning_rate": 0.0001, + "loss": 1.7502, + "step": 5414 + }, + { + "epoch": 0.622020561713859, + "grad_norm": 0.47240012884140015, + "learning_rate": 0.0001, + "loss": 1.88, + "step": 5415 + }, + { + "epoch": 0.6221354316236862, + "grad_norm": 0.42443564534187317, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 5416 + }, + { + "epoch": 0.6222503015335134, + "grad_norm": 0.4257674217224121, + "learning_rate": 0.0001, + "loss": 1.726, + "step": 5417 + }, + { + "epoch": 0.6223651714433405, + "grad_norm": 0.4025082588195801, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 5418 + }, + { + "epoch": 0.6224800413531676, + "grad_norm": 0.4154866337776184, + "learning_rate": 0.0001, + "loss": 1.6686, + "step": 5419 + }, + { + "epoch": 0.6225949112629947, + "grad_norm": 0.41281425952911377, + "learning_rate": 0.0001, + "loss": 1.5721, + "step": 5420 + }, + { + "epoch": 0.6227097811728218, + "grad_norm": 0.44021958112716675, + "learning_rate": 0.0001, + "loss": 1.6801, + "step": 5421 + }, + { + "epoch": 0.622824651082649, + "grad_norm": 0.4224892556667328, + "learning_rate": 0.0001, + "loss": 1.7219, + "step": 5422 + }, + { + "epoch": 0.6229395209924761, + "grad_norm": 0.44819357991218567, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 5423 + }, + { + "epoch": 0.6230543909023032, + "grad_norm": 0.4217240512371063, + "learning_rate": 0.0001, + "loss": 1.5709, + "step": 5424 + }, + { + "epoch": 0.6231692608121303, + "grad_norm": 0.45825833082199097, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 5425 + }, + { + "epoch": 0.6232841307219574, + "grad_norm": 0.427919864654541, + "learning_rate": 0.0001, + "loss": 1.6708, + "step": 5426 + }, + { + "epoch": 0.6233990006317846, + "grad_norm": 0.42998164892196655, + "learning_rate": 0.0001, + "loss": 1.4521, + "step": 5427 + }, + { + "epoch": 0.6235138705416117, + "grad_norm": 0.4373745024204254, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 5428 + }, + { + "epoch": 0.6236287404514388, + "grad_norm": 0.4681239426136017, + "learning_rate": 0.0001, + "loss": 1.7811, + "step": 5429 + }, + { + "epoch": 0.6237436103612659, + "grad_norm": 0.4634624719619751, + "learning_rate": 0.0001, + "loss": 1.8239, + "step": 5430 + }, + { + "epoch": 0.623858480271093, + "grad_norm": 0.4383297562599182, + "learning_rate": 0.0001, + "loss": 1.5753, + "step": 5431 + }, + { + "epoch": 0.6239733501809201, + "grad_norm": 0.46512508392333984, + "learning_rate": 0.0001, + "loss": 1.8332, + "step": 5432 + }, + { + "epoch": 0.6240882200907473, + "grad_norm": 0.4566185474395752, + "learning_rate": 0.0001, + "loss": 1.6087, + "step": 5433 + }, + { + "epoch": 0.6242030900005744, + "grad_norm": 0.3922889828681946, + "learning_rate": 0.0001, + "loss": 1.525, + "step": 5434 + }, + { + "epoch": 0.6243179599104015, + "grad_norm": 0.4435923993587494, + "learning_rate": 0.0001, + "loss": 1.7022, + "step": 5435 + }, + { + "epoch": 0.6244328298202286, + "grad_norm": 0.39931339025497437, + "learning_rate": 0.0001, + "loss": 1.4173, + "step": 5436 + }, + { + "epoch": 0.6245476997300557, + "grad_norm": 0.44321221113204956, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 5437 + }, + { + "epoch": 0.6246625696398829, + "grad_norm": 0.40193137526512146, + "learning_rate": 0.0001, + "loss": 1.5338, + "step": 5438 + }, + { + "epoch": 0.62477743954971, + "grad_norm": 0.45055443048477173, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 5439 + }, + { + "epoch": 0.6248923094595371, + "grad_norm": 0.38443759083747864, + "learning_rate": 0.0001, + "loss": 1.4288, + "step": 5440 + }, + { + "epoch": 0.6250071793693642, + "grad_norm": 0.47293904423713684, + "learning_rate": 0.0001, + "loss": 1.7437, + "step": 5441 + }, + { + "epoch": 0.6251220492791913, + "grad_norm": 0.4477282464504242, + "learning_rate": 0.0001, + "loss": 1.709, + "step": 5442 + }, + { + "epoch": 0.6252369191890185, + "grad_norm": 0.4328649938106537, + "learning_rate": 0.0001, + "loss": 1.4047, + "step": 5443 + }, + { + "epoch": 0.6253517890988456, + "grad_norm": 0.4671032428741455, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 5444 + }, + { + "epoch": 0.6254666590086727, + "grad_norm": 0.4175901412963867, + "learning_rate": 0.0001, + "loss": 1.5299, + "step": 5445 + }, + { + "epoch": 0.6255815289184998, + "grad_norm": 0.45132726430892944, + "learning_rate": 0.0001, + "loss": 1.5784, + "step": 5446 + }, + { + "epoch": 0.625696398828327, + "grad_norm": 0.45002493262290955, + "learning_rate": 0.0001, + "loss": 1.6631, + "step": 5447 + }, + { + "epoch": 0.6258112687381541, + "grad_norm": 0.49596941471099854, + "learning_rate": 0.0001, + "loss": 1.7334, + "step": 5448 + }, + { + "epoch": 0.6259261386479812, + "grad_norm": 0.5087515711784363, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 5449 + }, + { + "epoch": 0.6260410085578083, + "grad_norm": 0.43160462379455566, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 5450 + }, + { + "epoch": 0.6261558784676354, + "grad_norm": 0.41861292719841003, + "learning_rate": 0.0001, + "loss": 1.4826, + "step": 5451 + }, + { + "epoch": 0.6262707483774625, + "grad_norm": 0.3896733820438385, + "learning_rate": 0.0001, + "loss": 1.4015, + "step": 5452 + }, + { + "epoch": 0.6263856182872897, + "grad_norm": 0.4674106538295746, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 5453 + }, + { + "epoch": 0.6265004881971168, + "grad_norm": 0.4450713098049164, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 5454 + }, + { + "epoch": 0.6266153581069439, + "grad_norm": 0.5066030621528625, + "learning_rate": 0.0001, + "loss": 1.7224, + "step": 5455 + }, + { + "epoch": 0.626730228016771, + "grad_norm": 0.41346675157546997, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 5456 + }, + { + "epoch": 0.6268450979265981, + "grad_norm": 0.45128896832466125, + "learning_rate": 0.0001, + "loss": 1.5715, + "step": 5457 + }, + { + "epoch": 0.6269599678364253, + "grad_norm": 0.42013174295425415, + "learning_rate": 0.0001, + "loss": 1.7011, + "step": 5458 + }, + { + "epoch": 0.6270748377462524, + "grad_norm": 0.4740227460861206, + "learning_rate": 0.0001, + "loss": 1.6902, + "step": 5459 + }, + { + "epoch": 0.6271897076560795, + "grad_norm": 0.4719151258468628, + "learning_rate": 0.0001, + "loss": 1.905, + "step": 5460 + }, + { + "epoch": 0.6273045775659066, + "grad_norm": 0.46546176075935364, + "learning_rate": 0.0001, + "loss": 1.6859, + "step": 5461 + }, + { + "epoch": 0.6274194474757337, + "grad_norm": 0.44801339507102966, + "learning_rate": 0.0001, + "loss": 1.606, + "step": 5462 + }, + { + "epoch": 0.6275343173855609, + "grad_norm": 0.49131953716278076, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 5463 + }, + { + "epoch": 0.627649187295388, + "grad_norm": 0.4395316243171692, + "learning_rate": 0.0001, + "loss": 1.5935, + "step": 5464 + }, + { + "epoch": 0.6277640572052151, + "grad_norm": 0.43068498373031616, + "learning_rate": 0.0001, + "loss": 1.6478, + "step": 5465 + }, + { + "epoch": 0.6278789271150422, + "grad_norm": 0.41301921010017395, + "learning_rate": 0.0001, + "loss": 1.41, + "step": 5466 + }, + { + "epoch": 0.6279937970248693, + "grad_norm": 0.44340062141418457, + "learning_rate": 0.0001, + "loss": 1.7067, + "step": 5467 + }, + { + "epoch": 0.6281086669346965, + "grad_norm": 0.43528372049331665, + "learning_rate": 0.0001, + "loss": 1.5209, + "step": 5468 + }, + { + "epoch": 0.6282235368445236, + "grad_norm": 0.4367297291755676, + "learning_rate": 0.0001, + "loss": 1.574, + "step": 5469 + }, + { + "epoch": 0.6283384067543507, + "grad_norm": 0.4735720455646515, + "learning_rate": 0.0001, + "loss": 1.6138, + "step": 5470 + }, + { + "epoch": 0.6284532766641778, + "grad_norm": 0.43283382058143616, + "learning_rate": 0.0001, + "loss": 1.6992, + "step": 5471 + }, + { + "epoch": 0.628568146574005, + "grad_norm": 0.446654349565506, + "learning_rate": 0.0001, + "loss": 1.4944, + "step": 5472 + }, + { + "epoch": 0.6286830164838321, + "grad_norm": 0.4372828006744385, + "learning_rate": 0.0001, + "loss": 1.4353, + "step": 5473 + }, + { + "epoch": 0.6287978863936592, + "grad_norm": 0.4139867424964905, + "learning_rate": 0.0001, + "loss": 1.5969, + "step": 5474 + }, + { + "epoch": 0.6289127563034863, + "grad_norm": 0.4681108891963959, + "learning_rate": 0.0001, + "loss": 1.7943, + "step": 5475 + }, + { + "epoch": 0.6290276262133134, + "grad_norm": 0.42920827865600586, + "learning_rate": 0.0001, + "loss": 1.5649, + "step": 5476 + }, + { + "epoch": 0.6291424961231405, + "grad_norm": 0.42748481035232544, + "learning_rate": 0.0001, + "loss": 1.5491, + "step": 5477 + }, + { + "epoch": 0.6292573660329677, + "grad_norm": 0.4460189938545227, + "learning_rate": 0.0001, + "loss": 1.5892, + "step": 5478 + }, + { + "epoch": 0.6293722359427948, + "grad_norm": 0.43394696712493896, + "learning_rate": 0.0001, + "loss": 1.5284, + "step": 5479 + }, + { + "epoch": 0.6294871058526219, + "grad_norm": 0.429083913564682, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 5480 + }, + { + "epoch": 0.629601975762449, + "grad_norm": 0.4215359687805176, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 5481 + }, + { + "epoch": 0.6297168456722761, + "grad_norm": 0.45450299978256226, + "learning_rate": 0.0001, + "loss": 1.3634, + "step": 5482 + }, + { + "epoch": 0.6298317155821033, + "grad_norm": 0.4605744481086731, + "learning_rate": 0.0001, + "loss": 1.7417, + "step": 5483 + }, + { + "epoch": 0.6299465854919304, + "grad_norm": 0.4325323700904846, + "learning_rate": 0.0001, + "loss": 1.4853, + "step": 5484 + }, + { + "epoch": 0.6300614554017575, + "grad_norm": 0.441438764333725, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 5485 + }, + { + "epoch": 0.6301763253115846, + "grad_norm": 0.5233652591705322, + "learning_rate": 0.0001, + "loss": 1.9394, + "step": 5486 + }, + { + "epoch": 0.6302911952214117, + "grad_norm": 0.44866594672203064, + "learning_rate": 0.0001, + "loss": 1.8056, + "step": 5487 + }, + { + "epoch": 0.6304060651312389, + "grad_norm": 0.4310179352760315, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 5488 + }, + { + "epoch": 0.630520935041066, + "grad_norm": 0.46676987409591675, + "learning_rate": 0.0001, + "loss": 1.7363, + "step": 5489 + }, + { + "epoch": 0.6306358049508931, + "grad_norm": 0.45726069808006287, + "learning_rate": 0.0001, + "loss": 1.5432, + "step": 5490 + }, + { + "epoch": 0.6307506748607202, + "grad_norm": 0.46568796038627625, + "learning_rate": 0.0001, + "loss": 1.772, + "step": 5491 + }, + { + "epoch": 0.6308655447705473, + "grad_norm": 0.43564847111701965, + "learning_rate": 0.0001, + "loss": 1.7335, + "step": 5492 + }, + { + "epoch": 0.6309804146803745, + "grad_norm": 0.43100833892822266, + "learning_rate": 0.0001, + "loss": 1.5743, + "step": 5493 + }, + { + "epoch": 0.6310952845902016, + "grad_norm": 0.48715853691101074, + "learning_rate": 0.0001, + "loss": 1.6999, + "step": 5494 + }, + { + "epoch": 0.6312101545000287, + "grad_norm": 0.4202541708946228, + "learning_rate": 0.0001, + "loss": 1.4744, + "step": 5495 + }, + { + "epoch": 0.6313250244098558, + "grad_norm": 0.49194201827049255, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 5496 + }, + { + "epoch": 0.631439894319683, + "grad_norm": 0.4225814938545227, + "learning_rate": 0.0001, + "loss": 1.6509, + "step": 5497 + }, + { + "epoch": 0.6315547642295101, + "grad_norm": 0.41638097167015076, + "learning_rate": 0.0001, + "loss": 1.6172, + "step": 5498 + }, + { + "epoch": 0.6316696341393372, + "grad_norm": 0.44772642850875854, + "learning_rate": 0.0001, + "loss": 1.8187, + "step": 5499 + }, + { + "epoch": 0.6317845040491643, + "grad_norm": 0.4238649904727936, + "learning_rate": 0.0001, + "loss": 1.6478, + "step": 5500 + }, + { + "epoch": 0.6318993739589914, + "grad_norm": 0.4199509918689728, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 5501 + }, + { + "epoch": 0.6320142438688185, + "grad_norm": 0.4321916699409485, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 5502 + }, + { + "epoch": 0.6321291137786457, + "grad_norm": 0.4737303853034973, + "learning_rate": 0.0001, + "loss": 1.4796, + "step": 5503 + }, + { + "epoch": 0.6322439836884728, + "grad_norm": 0.450839102268219, + "learning_rate": 0.0001, + "loss": 1.5807, + "step": 5504 + }, + { + "epoch": 0.6323588535982999, + "grad_norm": 0.4439679682254791, + "learning_rate": 0.0001, + "loss": 1.6076, + "step": 5505 + }, + { + "epoch": 0.632473723508127, + "grad_norm": 0.438607782125473, + "learning_rate": 0.0001, + "loss": 1.6273, + "step": 5506 + }, + { + "epoch": 0.6325885934179541, + "grad_norm": 0.44356659054756165, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 5507 + }, + { + "epoch": 0.6327034633277813, + "grad_norm": 0.47536197304725647, + "learning_rate": 0.0001, + "loss": 1.673, + "step": 5508 + }, + { + "epoch": 0.6328183332376084, + "grad_norm": 0.4841341972351074, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 5509 + }, + { + "epoch": 0.6329332031474355, + "grad_norm": 0.4180241823196411, + "learning_rate": 0.0001, + "loss": 1.4014, + "step": 5510 + }, + { + "epoch": 0.6330480730572626, + "grad_norm": 0.46159014105796814, + "learning_rate": 0.0001, + "loss": 1.7947, + "step": 5511 + }, + { + "epoch": 0.6331629429670897, + "grad_norm": 0.43560919165611267, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 5512 + }, + { + "epoch": 0.6332778128769169, + "grad_norm": 0.41563135385513306, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 5513 + }, + { + "epoch": 0.633392682786744, + "grad_norm": 0.4384070634841919, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 5514 + }, + { + "epoch": 0.6335075526965711, + "grad_norm": 0.45419734716415405, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 5515 + }, + { + "epoch": 0.6336224226063982, + "grad_norm": 0.438769668340683, + "learning_rate": 0.0001, + "loss": 1.5075, + "step": 5516 + }, + { + "epoch": 0.6337372925162253, + "grad_norm": 0.4500299394130707, + "learning_rate": 0.0001, + "loss": 1.7686, + "step": 5517 + }, + { + "epoch": 0.6338521624260525, + "grad_norm": 0.543213963508606, + "learning_rate": 0.0001, + "loss": 1.7704, + "step": 5518 + }, + { + "epoch": 0.6339670323358796, + "grad_norm": 0.4491806924343109, + "learning_rate": 0.0001, + "loss": 1.7672, + "step": 5519 + }, + { + "epoch": 0.6340819022457067, + "grad_norm": 0.4237130284309387, + "learning_rate": 0.0001, + "loss": 1.5714, + "step": 5520 + }, + { + "epoch": 0.6341967721555338, + "grad_norm": 0.4394038915634155, + "learning_rate": 0.0001, + "loss": 1.5889, + "step": 5521 + }, + { + "epoch": 0.634311642065361, + "grad_norm": 0.4348239302635193, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 5522 + }, + { + "epoch": 0.6344265119751881, + "grad_norm": 0.45309606194496155, + "learning_rate": 0.0001, + "loss": 1.6889, + "step": 5523 + }, + { + "epoch": 0.6345413818850152, + "grad_norm": 0.40822720527648926, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 5524 + }, + { + "epoch": 0.6346562517948423, + "grad_norm": 0.48253944516181946, + "learning_rate": 0.0001, + "loss": 1.727, + "step": 5525 + }, + { + "epoch": 0.6347711217046694, + "grad_norm": 0.4541143476963043, + "learning_rate": 0.0001, + "loss": 1.8615, + "step": 5526 + }, + { + "epoch": 0.6348859916144965, + "grad_norm": 0.4158259630203247, + "learning_rate": 0.0001, + "loss": 1.4537, + "step": 5527 + }, + { + "epoch": 0.6350008615243237, + "grad_norm": 0.41893231868743896, + "learning_rate": 0.0001, + "loss": 1.6787, + "step": 5528 + }, + { + "epoch": 0.6351157314341508, + "grad_norm": 0.4827512502670288, + "learning_rate": 0.0001, + "loss": 1.7608, + "step": 5529 + }, + { + "epoch": 0.6352306013439779, + "grad_norm": 0.44435709714889526, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 5530 + }, + { + "epoch": 0.635345471253805, + "grad_norm": 0.4805011451244354, + "learning_rate": 0.0001, + "loss": 1.7446, + "step": 5531 + }, + { + "epoch": 0.6354603411636321, + "grad_norm": 0.43575429916381836, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 5532 + }, + { + "epoch": 0.6355752110734593, + "grad_norm": 0.4281097650527954, + "learning_rate": 0.0001, + "loss": 1.7776, + "step": 5533 + }, + { + "epoch": 0.6356900809832864, + "grad_norm": 0.41298815608024597, + "learning_rate": 0.0001, + "loss": 1.5776, + "step": 5534 + }, + { + "epoch": 0.6358049508931135, + "grad_norm": 0.43981999158859253, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 5535 + }, + { + "epoch": 0.6359198208029406, + "grad_norm": 0.46365123987197876, + "learning_rate": 0.0001, + "loss": 1.7387, + "step": 5536 + }, + { + "epoch": 0.6360346907127677, + "grad_norm": 0.40434473752975464, + "learning_rate": 0.0001, + "loss": 1.5259, + "step": 5537 + }, + { + "epoch": 0.6361495606225949, + "grad_norm": 0.42286887764930725, + "learning_rate": 0.0001, + "loss": 1.4093, + "step": 5538 + }, + { + "epoch": 0.636264430532422, + "grad_norm": 0.4497700035572052, + "learning_rate": 0.0001, + "loss": 1.8652, + "step": 5539 + }, + { + "epoch": 0.6363793004422491, + "grad_norm": 0.4267425537109375, + "learning_rate": 0.0001, + "loss": 1.704, + "step": 5540 + }, + { + "epoch": 0.6364941703520762, + "grad_norm": 0.448066383600235, + "learning_rate": 0.0001, + "loss": 1.5721, + "step": 5541 + }, + { + "epoch": 0.6366090402619033, + "grad_norm": 0.46805888414382935, + "learning_rate": 0.0001, + "loss": 1.7208, + "step": 5542 + }, + { + "epoch": 0.6367239101717305, + "grad_norm": 0.42206695675849915, + "learning_rate": 0.0001, + "loss": 1.3684, + "step": 5543 + }, + { + "epoch": 0.6368387800815576, + "grad_norm": 0.39510831236839294, + "learning_rate": 0.0001, + "loss": 1.6721, + "step": 5544 + }, + { + "epoch": 0.6369536499913847, + "grad_norm": 0.4754282236099243, + "learning_rate": 0.0001, + "loss": 1.8412, + "step": 5545 + }, + { + "epoch": 0.6370685199012118, + "grad_norm": 0.42278730869293213, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 5546 + }, + { + "epoch": 0.637183389811039, + "grad_norm": 0.4810383915901184, + "learning_rate": 0.0001, + "loss": 1.6435, + "step": 5547 + }, + { + "epoch": 0.6372982597208661, + "grad_norm": 0.4392484128475189, + "learning_rate": 0.0001, + "loss": 1.4485, + "step": 5548 + }, + { + "epoch": 0.6374131296306932, + "grad_norm": 0.46849682927131653, + "learning_rate": 0.0001, + "loss": 1.7611, + "step": 5549 + }, + { + "epoch": 0.6375279995405203, + "grad_norm": 0.428786039352417, + "learning_rate": 0.0001, + "loss": 1.7185, + "step": 5550 + }, + { + "epoch": 0.6376428694503474, + "grad_norm": 0.41656240820884705, + "learning_rate": 0.0001, + "loss": 1.5787, + "step": 5551 + }, + { + "epoch": 0.6377577393601745, + "grad_norm": 0.447986364364624, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 5552 + }, + { + "epoch": 0.6378726092700018, + "grad_norm": 0.4375683665275574, + "learning_rate": 0.0001, + "loss": 1.7039, + "step": 5553 + }, + { + "epoch": 0.6379874791798289, + "grad_norm": 0.4190763831138611, + "learning_rate": 0.0001, + "loss": 1.5491, + "step": 5554 + }, + { + "epoch": 0.638102349089656, + "grad_norm": 0.42789924144744873, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 5555 + }, + { + "epoch": 0.6382172189994831, + "grad_norm": 0.42377930879592896, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 5556 + }, + { + "epoch": 0.6383320889093103, + "grad_norm": 0.43022698163986206, + "learning_rate": 0.0001, + "loss": 1.5902, + "step": 5557 + }, + { + "epoch": 0.6384469588191374, + "grad_norm": 0.40309348702430725, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 5558 + }, + { + "epoch": 0.6385618287289645, + "grad_norm": 0.4346220791339874, + "learning_rate": 0.0001, + "loss": 1.598, + "step": 5559 + }, + { + "epoch": 0.6386766986387916, + "grad_norm": 0.45233309268951416, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 5560 + }, + { + "epoch": 0.6387915685486187, + "grad_norm": 0.44141674041748047, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 5561 + }, + { + "epoch": 0.6389064384584459, + "grad_norm": 0.45340585708618164, + "learning_rate": 0.0001, + "loss": 1.7477, + "step": 5562 + }, + { + "epoch": 0.639021308368273, + "grad_norm": 0.46206724643707275, + "learning_rate": 0.0001, + "loss": 1.8006, + "step": 5563 + }, + { + "epoch": 0.6391361782781001, + "grad_norm": 0.42310798168182373, + "learning_rate": 0.0001, + "loss": 1.7606, + "step": 5564 + }, + { + "epoch": 0.6392510481879272, + "grad_norm": 0.40606430172920227, + "learning_rate": 0.0001, + "loss": 1.4893, + "step": 5565 + }, + { + "epoch": 0.6393659180977543, + "grad_norm": 0.4227276146411896, + "learning_rate": 0.0001, + "loss": 1.5051, + "step": 5566 + }, + { + "epoch": 0.6394807880075815, + "grad_norm": 0.42999976873397827, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 5567 + }, + { + "epoch": 0.6395956579174086, + "grad_norm": 0.4548937678337097, + "learning_rate": 0.0001, + "loss": 1.606, + "step": 5568 + }, + { + "epoch": 0.6397105278272357, + "grad_norm": 0.4163641333580017, + "learning_rate": 0.0001, + "loss": 1.4716, + "step": 5569 + }, + { + "epoch": 0.6398253977370628, + "grad_norm": 0.41462451219558716, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 5570 + }, + { + "epoch": 0.6399402676468899, + "grad_norm": 0.4123389422893524, + "learning_rate": 0.0001, + "loss": 1.5769, + "step": 5571 + }, + { + "epoch": 0.6400551375567171, + "grad_norm": 0.447255939245224, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 5572 + }, + { + "epoch": 0.6401700074665442, + "grad_norm": 0.46185576915740967, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 5573 + }, + { + "epoch": 0.6402848773763713, + "grad_norm": 0.46707770228385925, + "learning_rate": 0.0001, + "loss": 1.7298, + "step": 5574 + }, + { + "epoch": 0.6403997472861984, + "grad_norm": 0.4091823399066925, + "learning_rate": 0.0001, + "loss": 1.4606, + "step": 5575 + }, + { + "epoch": 0.6405146171960255, + "grad_norm": 0.45736071467399597, + "learning_rate": 0.0001, + "loss": 1.764, + "step": 5576 + }, + { + "epoch": 0.6406294871058527, + "grad_norm": 0.4426022469997406, + "learning_rate": 0.0001, + "loss": 1.6942, + "step": 5577 + }, + { + "epoch": 0.6407443570156798, + "grad_norm": 0.42908066511154175, + "learning_rate": 0.0001, + "loss": 1.6928, + "step": 5578 + }, + { + "epoch": 0.6408592269255069, + "grad_norm": 0.4217868149280548, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 5579 + }, + { + "epoch": 0.640974096835334, + "grad_norm": 0.4545440673828125, + "learning_rate": 0.0001, + "loss": 1.7952, + "step": 5580 + }, + { + "epoch": 0.6410889667451611, + "grad_norm": 0.42396777868270874, + "learning_rate": 0.0001, + "loss": 1.54, + "step": 5581 + }, + { + "epoch": 0.6412038366549883, + "grad_norm": 0.43753355741500854, + "learning_rate": 0.0001, + "loss": 1.5584, + "step": 5582 + }, + { + "epoch": 0.6413187065648154, + "grad_norm": 0.46905073523521423, + "learning_rate": 0.0001, + "loss": 1.5581, + "step": 5583 + }, + { + "epoch": 0.6414335764746425, + "grad_norm": 0.4515920877456665, + "learning_rate": 0.0001, + "loss": 1.4264, + "step": 5584 + }, + { + "epoch": 0.6415484463844696, + "grad_norm": 0.434110552072525, + "learning_rate": 0.0001, + "loss": 1.6005, + "step": 5585 + }, + { + "epoch": 0.6416633162942967, + "grad_norm": 0.4196917414665222, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 5586 + }, + { + "epoch": 0.6417781862041239, + "grad_norm": 0.4676187336444855, + "learning_rate": 0.0001, + "loss": 1.7113, + "step": 5587 + }, + { + "epoch": 0.641893056113951, + "grad_norm": 0.4227234423160553, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 5588 + }, + { + "epoch": 0.6420079260237781, + "grad_norm": 0.45535120368003845, + "learning_rate": 0.0001, + "loss": 1.6142, + "step": 5589 + }, + { + "epoch": 0.6421227959336052, + "grad_norm": 0.4416712820529938, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 5590 + }, + { + "epoch": 0.6422376658434323, + "grad_norm": 0.454698383808136, + "learning_rate": 0.0001, + "loss": 1.835, + "step": 5591 + }, + { + "epoch": 0.6423525357532595, + "grad_norm": 0.429575115442276, + "learning_rate": 0.0001, + "loss": 1.6046, + "step": 5592 + }, + { + "epoch": 0.6424674056630866, + "grad_norm": 0.47407978773117065, + "learning_rate": 0.0001, + "loss": 1.7451, + "step": 5593 + }, + { + "epoch": 0.6425822755729137, + "grad_norm": 0.4476969838142395, + "learning_rate": 0.0001, + "loss": 1.6611, + "step": 5594 + }, + { + "epoch": 0.6426971454827408, + "grad_norm": 0.48670604825019836, + "learning_rate": 0.0001, + "loss": 1.8238, + "step": 5595 + }, + { + "epoch": 0.6428120153925679, + "grad_norm": 0.4715462923049927, + "learning_rate": 0.0001, + "loss": 1.5888, + "step": 5596 + }, + { + "epoch": 0.6429268853023951, + "grad_norm": 0.4310706555843353, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 5597 + }, + { + "epoch": 0.6430417552122222, + "grad_norm": 0.4375722408294678, + "learning_rate": 0.0001, + "loss": 1.7016, + "step": 5598 + }, + { + "epoch": 0.6431566251220493, + "grad_norm": 0.44785434007644653, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 5599 + }, + { + "epoch": 0.6432714950318764, + "grad_norm": 0.43473127484321594, + "learning_rate": 0.0001, + "loss": 1.5382, + "step": 5600 + }, + { + "epoch": 0.6433863649417035, + "grad_norm": 0.5235137939453125, + "learning_rate": 0.0001, + "loss": 1.7376, + "step": 5601 + }, + { + "epoch": 0.6435012348515307, + "grad_norm": 0.47962141036987305, + "learning_rate": 0.0001, + "loss": 1.4158, + "step": 5602 + }, + { + "epoch": 0.6436161047613578, + "grad_norm": 0.42864152789115906, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 5603 + }, + { + "epoch": 0.6437309746711849, + "grad_norm": 0.4481607675552368, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 5604 + }, + { + "epoch": 0.643845844581012, + "grad_norm": 0.503777027130127, + "learning_rate": 0.0001, + "loss": 1.9532, + "step": 5605 + }, + { + "epoch": 0.6439607144908391, + "grad_norm": 0.44352078437805176, + "learning_rate": 0.0001, + "loss": 1.7002, + "step": 5606 + }, + { + "epoch": 0.6440755844006663, + "grad_norm": 0.43461933732032776, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 5607 + }, + { + "epoch": 0.6441904543104934, + "grad_norm": 0.41317033767700195, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 5608 + }, + { + "epoch": 0.6443053242203205, + "grad_norm": 0.47428956627845764, + "learning_rate": 0.0001, + "loss": 1.768, + "step": 5609 + }, + { + "epoch": 0.6444201941301476, + "grad_norm": 0.4190300703048706, + "learning_rate": 0.0001, + "loss": 1.5919, + "step": 5610 + }, + { + "epoch": 0.6445350640399747, + "grad_norm": 0.4992254376411438, + "learning_rate": 0.0001, + "loss": 1.9133, + "step": 5611 + }, + { + "epoch": 0.6446499339498019, + "grad_norm": 0.4316772222518921, + "learning_rate": 0.0001, + "loss": 1.3204, + "step": 5612 + }, + { + "epoch": 0.644764803859629, + "grad_norm": 0.4693394899368286, + "learning_rate": 0.0001, + "loss": 1.8395, + "step": 5613 + }, + { + "epoch": 0.6448796737694561, + "grad_norm": 0.4208371043205261, + "learning_rate": 0.0001, + "loss": 1.4802, + "step": 5614 + }, + { + "epoch": 0.6449945436792832, + "grad_norm": 0.44202813506126404, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 5615 + }, + { + "epoch": 0.6451094135891103, + "grad_norm": 0.4268895387649536, + "learning_rate": 0.0001, + "loss": 1.6182, + "step": 5616 + }, + { + "epoch": 0.6452242834989375, + "grad_norm": 0.4346774220466614, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 5617 + }, + { + "epoch": 0.6453391534087646, + "grad_norm": 0.46032214164733887, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 5618 + }, + { + "epoch": 0.6454540233185917, + "grad_norm": 0.4409688413143158, + "learning_rate": 0.0001, + "loss": 1.5832, + "step": 5619 + }, + { + "epoch": 0.6455688932284188, + "grad_norm": 0.4379858374595642, + "learning_rate": 0.0001, + "loss": 1.6493, + "step": 5620 + }, + { + "epoch": 0.6456837631382459, + "grad_norm": 0.43202537298202515, + "learning_rate": 0.0001, + "loss": 1.5685, + "step": 5621 + }, + { + "epoch": 0.6457986330480731, + "grad_norm": 0.44553086161613464, + "learning_rate": 0.0001, + "loss": 1.5579, + "step": 5622 + }, + { + "epoch": 0.6459135029579002, + "grad_norm": 0.42607608437538147, + "learning_rate": 0.0001, + "loss": 1.6773, + "step": 5623 + }, + { + "epoch": 0.6460283728677273, + "grad_norm": 0.45914703607559204, + "learning_rate": 0.0001, + "loss": 1.8061, + "step": 5624 + }, + { + "epoch": 0.6461432427775544, + "grad_norm": 0.46458396315574646, + "learning_rate": 0.0001, + "loss": 1.701, + "step": 5625 + }, + { + "epoch": 0.6462581126873815, + "grad_norm": 0.4472190737724304, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 5626 + }, + { + "epoch": 0.6463729825972087, + "grad_norm": 0.4329933524131775, + "learning_rate": 0.0001, + "loss": 1.5421, + "step": 5627 + }, + { + "epoch": 0.6464878525070358, + "grad_norm": 0.4526257812976837, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 5628 + }, + { + "epoch": 0.6466027224168629, + "grad_norm": 0.4655599892139435, + "learning_rate": 0.0001, + "loss": 1.5488, + "step": 5629 + }, + { + "epoch": 0.64671759232669, + "grad_norm": 0.40564316511154175, + "learning_rate": 0.0001, + "loss": 1.602, + "step": 5630 + }, + { + "epoch": 0.6468324622365171, + "grad_norm": 0.41868773102760315, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 5631 + }, + { + "epoch": 0.6469473321463443, + "grad_norm": 0.4080910384654999, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 5632 + }, + { + "epoch": 0.6470622020561714, + "grad_norm": 0.45903560519218445, + "learning_rate": 0.0001, + "loss": 1.8245, + "step": 5633 + }, + { + "epoch": 0.6471770719659985, + "grad_norm": 0.4414862394332886, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 5634 + }, + { + "epoch": 0.6472919418758256, + "grad_norm": 0.43176713585853577, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 5635 + }, + { + "epoch": 0.6474068117856527, + "grad_norm": 0.41198432445526123, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 5636 + }, + { + "epoch": 0.6475216816954799, + "grad_norm": 0.47420576214790344, + "learning_rate": 0.0001, + "loss": 1.7435, + "step": 5637 + }, + { + "epoch": 0.647636551605307, + "grad_norm": 0.44818076491355896, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 5638 + }, + { + "epoch": 0.6477514215151341, + "grad_norm": 0.43701666593551636, + "learning_rate": 0.0001, + "loss": 1.5689, + "step": 5639 + }, + { + "epoch": 0.6478662914249612, + "grad_norm": 0.4181070923805237, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 5640 + }, + { + "epoch": 0.6479811613347883, + "grad_norm": 0.4229847192764282, + "learning_rate": 0.0001, + "loss": 1.594, + "step": 5641 + }, + { + "epoch": 0.6480960312446155, + "grad_norm": 0.4145675003528595, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 5642 + }, + { + "epoch": 0.6482109011544426, + "grad_norm": 0.43566563725471497, + "learning_rate": 0.0001, + "loss": 1.7148, + "step": 5643 + }, + { + "epoch": 0.6483257710642697, + "grad_norm": 0.4276692867279053, + "learning_rate": 0.0001, + "loss": 1.4687, + "step": 5644 + }, + { + "epoch": 0.6484406409740968, + "grad_norm": 0.3920123875141144, + "learning_rate": 0.0001, + "loss": 1.4192, + "step": 5645 + }, + { + "epoch": 0.6485555108839239, + "grad_norm": 0.4451933801174164, + "learning_rate": 0.0001, + "loss": 1.6342, + "step": 5646 + }, + { + "epoch": 0.6486703807937511, + "grad_norm": 0.404407799243927, + "learning_rate": 0.0001, + "loss": 1.5801, + "step": 5647 + }, + { + "epoch": 0.6487852507035782, + "grad_norm": 0.4093591570854187, + "learning_rate": 0.0001, + "loss": 1.4817, + "step": 5648 + }, + { + "epoch": 0.6489001206134053, + "grad_norm": 0.4147598147392273, + "learning_rate": 0.0001, + "loss": 1.7049, + "step": 5649 + }, + { + "epoch": 0.6490149905232324, + "grad_norm": 0.45576632022857666, + "learning_rate": 0.0001, + "loss": 1.6524, + "step": 5650 + }, + { + "epoch": 0.6491298604330595, + "grad_norm": 0.4459483027458191, + "learning_rate": 0.0001, + "loss": 1.7097, + "step": 5651 + }, + { + "epoch": 0.6492447303428867, + "grad_norm": 0.47733941674232483, + "learning_rate": 0.0001, + "loss": 1.8741, + "step": 5652 + }, + { + "epoch": 0.6493596002527138, + "grad_norm": 0.45850062370300293, + "learning_rate": 0.0001, + "loss": 1.7533, + "step": 5653 + }, + { + "epoch": 0.6494744701625409, + "grad_norm": 0.42292144894599915, + "learning_rate": 0.0001, + "loss": 1.5892, + "step": 5654 + }, + { + "epoch": 0.649589340072368, + "grad_norm": 0.4376376271247864, + "learning_rate": 0.0001, + "loss": 1.7886, + "step": 5655 + }, + { + "epoch": 0.6497042099821951, + "grad_norm": 0.4351494610309601, + "learning_rate": 0.0001, + "loss": 1.6099, + "step": 5656 + }, + { + "epoch": 0.6498190798920223, + "grad_norm": 0.4215450882911682, + "learning_rate": 0.0001, + "loss": 1.6642, + "step": 5657 + }, + { + "epoch": 0.6499339498018494, + "grad_norm": 0.4552718997001648, + "learning_rate": 0.0001, + "loss": 1.7277, + "step": 5658 + }, + { + "epoch": 0.6500488197116765, + "grad_norm": 0.4284704625606537, + "learning_rate": 0.0001, + "loss": 1.6748, + "step": 5659 + }, + { + "epoch": 0.6501636896215036, + "grad_norm": 0.42294004559516907, + "learning_rate": 0.0001, + "loss": 1.575, + "step": 5660 + }, + { + "epoch": 0.6502785595313307, + "grad_norm": 0.43804609775543213, + "learning_rate": 0.0001, + "loss": 1.6529, + "step": 5661 + }, + { + "epoch": 0.6503934294411579, + "grad_norm": 0.4440459907054901, + "learning_rate": 0.0001, + "loss": 1.7683, + "step": 5662 + }, + { + "epoch": 0.650508299350985, + "grad_norm": 0.4592325687408447, + "learning_rate": 0.0001, + "loss": 1.5892, + "step": 5663 + }, + { + "epoch": 0.6506231692608121, + "grad_norm": 0.44203251600265503, + "learning_rate": 0.0001, + "loss": 1.6765, + "step": 5664 + }, + { + "epoch": 0.6507380391706392, + "grad_norm": 0.4115028977394104, + "learning_rate": 0.0001, + "loss": 1.3906, + "step": 5665 + }, + { + "epoch": 0.6508529090804663, + "grad_norm": 0.456676721572876, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 5666 + }, + { + "epoch": 0.6509677789902935, + "grad_norm": 0.4849773645401001, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 5667 + }, + { + "epoch": 0.6510826489001206, + "grad_norm": 0.4652778208255768, + "learning_rate": 0.0001, + "loss": 1.8482, + "step": 5668 + }, + { + "epoch": 0.6511975188099477, + "grad_norm": 0.42466142773628235, + "learning_rate": 0.0001, + "loss": 1.4337, + "step": 5669 + }, + { + "epoch": 0.6513123887197748, + "grad_norm": 0.4781624674797058, + "learning_rate": 0.0001, + "loss": 1.884, + "step": 5670 + }, + { + "epoch": 0.6514272586296019, + "grad_norm": 0.4678051769733429, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 5671 + }, + { + "epoch": 0.651542128539429, + "grad_norm": 0.4526500701904297, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 5672 + }, + { + "epoch": 0.6516569984492562, + "grad_norm": 0.44952574372291565, + "learning_rate": 0.0001, + "loss": 1.5678, + "step": 5673 + }, + { + "epoch": 0.6517718683590833, + "grad_norm": 0.4886074960231781, + "learning_rate": 0.0001, + "loss": 1.8023, + "step": 5674 + }, + { + "epoch": 0.6518867382689104, + "grad_norm": 0.43213948607444763, + "learning_rate": 0.0001, + "loss": 1.6028, + "step": 5675 + }, + { + "epoch": 0.6520016081787375, + "grad_norm": 0.41444772481918335, + "learning_rate": 0.0001, + "loss": 1.5833, + "step": 5676 + }, + { + "epoch": 0.6521164780885647, + "grad_norm": 0.46063005924224854, + "learning_rate": 0.0001, + "loss": 1.686, + "step": 5677 + }, + { + "epoch": 0.6522313479983918, + "grad_norm": 0.4700499176979065, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 5678 + }, + { + "epoch": 0.6523462179082189, + "grad_norm": 0.48296359181404114, + "learning_rate": 0.0001, + "loss": 1.5319, + "step": 5679 + }, + { + "epoch": 0.652461087818046, + "grad_norm": 0.4892067015171051, + "learning_rate": 0.0001, + "loss": 1.8181, + "step": 5680 + }, + { + "epoch": 0.6525759577278731, + "grad_norm": 0.426547646522522, + "learning_rate": 0.0001, + "loss": 1.7042, + "step": 5681 + }, + { + "epoch": 0.6526908276377003, + "grad_norm": 0.48687633872032166, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 5682 + }, + { + "epoch": 0.6528056975475274, + "grad_norm": 0.449789434671402, + "learning_rate": 0.0001, + "loss": 1.7252, + "step": 5683 + }, + { + "epoch": 0.6529205674573545, + "grad_norm": 0.43669000267982483, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 5684 + }, + { + "epoch": 0.6530354373671816, + "grad_norm": 0.4181348383426666, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 5685 + }, + { + "epoch": 0.6531503072770087, + "grad_norm": 0.4275439381599426, + "learning_rate": 0.0001, + "loss": 1.7064, + "step": 5686 + }, + { + "epoch": 0.6532651771868359, + "grad_norm": 0.44249075651168823, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 5687 + }, + { + "epoch": 0.653380047096663, + "grad_norm": 0.4645352363586426, + "learning_rate": 0.0001, + "loss": 1.733, + "step": 5688 + }, + { + "epoch": 0.6534949170064901, + "grad_norm": 0.47578033804893494, + "learning_rate": 0.0001, + "loss": 1.8043, + "step": 5689 + }, + { + "epoch": 0.6536097869163173, + "grad_norm": 0.4634343981742859, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 5690 + }, + { + "epoch": 0.6537246568261444, + "grad_norm": 0.4607868194580078, + "learning_rate": 0.0001, + "loss": 1.6108, + "step": 5691 + }, + { + "epoch": 0.6538395267359716, + "grad_norm": 0.43927955627441406, + "learning_rate": 0.0001, + "loss": 1.7641, + "step": 5692 + }, + { + "epoch": 0.6539543966457987, + "grad_norm": 0.42264971137046814, + "learning_rate": 0.0001, + "loss": 1.6051, + "step": 5693 + }, + { + "epoch": 0.6540692665556258, + "grad_norm": 0.4350726902484894, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 5694 + }, + { + "epoch": 0.6541841364654529, + "grad_norm": 0.45460623502731323, + "learning_rate": 0.0001, + "loss": 1.5079, + "step": 5695 + }, + { + "epoch": 0.65429900637528, + "grad_norm": 0.45324012637138367, + "learning_rate": 0.0001, + "loss": 1.458, + "step": 5696 + }, + { + "epoch": 0.6544138762851072, + "grad_norm": 0.4302757680416107, + "learning_rate": 0.0001, + "loss": 1.4266, + "step": 5697 + }, + { + "epoch": 0.6545287461949343, + "grad_norm": 0.4926580488681793, + "learning_rate": 0.0001, + "loss": 1.9029, + "step": 5698 + }, + { + "epoch": 0.6546436161047614, + "grad_norm": 0.4108467698097229, + "learning_rate": 0.0001, + "loss": 1.4296, + "step": 5699 + }, + { + "epoch": 0.6547584860145885, + "grad_norm": 0.4439946413040161, + "learning_rate": 0.0001, + "loss": 1.4945, + "step": 5700 + }, + { + "epoch": 0.6548733559244156, + "grad_norm": 0.42766717076301575, + "learning_rate": 0.0001, + "loss": 1.6439, + "step": 5701 + }, + { + "epoch": 0.6549882258342428, + "grad_norm": 0.4058894217014313, + "learning_rate": 0.0001, + "loss": 1.5366, + "step": 5702 + }, + { + "epoch": 0.6551030957440699, + "grad_norm": 0.42546913027763367, + "learning_rate": 0.0001, + "loss": 1.5646, + "step": 5703 + }, + { + "epoch": 0.655217965653897, + "grad_norm": 0.452540785074234, + "learning_rate": 0.0001, + "loss": 1.5635, + "step": 5704 + }, + { + "epoch": 0.6553328355637241, + "grad_norm": 0.4617590308189392, + "learning_rate": 0.0001, + "loss": 1.6154, + "step": 5705 + }, + { + "epoch": 0.6554477054735512, + "grad_norm": 0.41302189230918884, + "learning_rate": 0.0001, + "loss": 1.5644, + "step": 5706 + }, + { + "epoch": 0.6555625753833784, + "grad_norm": 0.43837377429008484, + "learning_rate": 0.0001, + "loss": 1.6912, + "step": 5707 + }, + { + "epoch": 0.6556774452932055, + "grad_norm": 0.4599246382713318, + "learning_rate": 0.0001, + "loss": 1.7636, + "step": 5708 + }, + { + "epoch": 0.6557923152030326, + "grad_norm": 0.44273531436920166, + "learning_rate": 0.0001, + "loss": 1.5861, + "step": 5709 + }, + { + "epoch": 0.6559071851128597, + "grad_norm": 0.40543317794799805, + "learning_rate": 0.0001, + "loss": 1.4967, + "step": 5710 + }, + { + "epoch": 0.6560220550226868, + "grad_norm": 0.44224417209625244, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 5711 + }, + { + "epoch": 0.656136924932514, + "grad_norm": 0.4091229736804962, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 5712 + }, + { + "epoch": 0.6562517948423411, + "grad_norm": 0.44809257984161377, + "learning_rate": 0.0001, + "loss": 1.7865, + "step": 5713 + }, + { + "epoch": 0.6563666647521682, + "grad_norm": 0.4188038110733032, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 5714 + }, + { + "epoch": 0.6564815346619953, + "grad_norm": 0.4264969229698181, + "learning_rate": 0.0001, + "loss": 1.3747, + "step": 5715 + }, + { + "epoch": 0.6565964045718224, + "grad_norm": 0.46266868710517883, + "learning_rate": 0.0001, + "loss": 1.8456, + "step": 5716 + }, + { + "epoch": 0.6567112744816496, + "grad_norm": 0.45116865634918213, + "learning_rate": 0.0001, + "loss": 1.4373, + "step": 5717 + }, + { + "epoch": 0.6568261443914767, + "grad_norm": 0.4384673833847046, + "learning_rate": 0.0001, + "loss": 1.5562, + "step": 5718 + }, + { + "epoch": 0.6569410143013038, + "grad_norm": 0.4380660355091095, + "learning_rate": 0.0001, + "loss": 1.5475, + "step": 5719 + }, + { + "epoch": 0.6570558842111309, + "grad_norm": 0.4457499086856842, + "learning_rate": 0.0001, + "loss": 1.5831, + "step": 5720 + }, + { + "epoch": 0.657170754120958, + "grad_norm": 0.45870745182037354, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 5721 + }, + { + "epoch": 0.6572856240307852, + "grad_norm": 0.44491496682167053, + "learning_rate": 0.0001, + "loss": 1.6472, + "step": 5722 + }, + { + "epoch": 0.6574004939406123, + "grad_norm": 0.4536452293395996, + "learning_rate": 0.0001, + "loss": 1.5365, + "step": 5723 + }, + { + "epoch": 0.6575153638504394, + "grad_norm": 0.4279188811779022, + "learning_rate": 0.0001, + "loss": 1.6861, + "step": 5724 + }, + { + "epoch": 0.6576302337602665, + "grad_norm": 0.49207669496536255, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 5725 + }, + { + "epoch": 0.6577451036700936, + "grad_norm": 0.47983112931251526, + "learning_rate": 0.0001, + "loss": 1.77, + "step": 5726 + }, + { + "epoch": 0.6578599735799208, + "grad_norm": 0.42303088307380676, + "learning_rate": 0.0001, + "loss": 1.439, + "step": 5727 + }, + { + "epoch": 0.6579748434897479, + "grad_norm": 0.40364500880241394, + "learning_rate": 0.0001, + "loss": 1.4152, + "step": 5728 + }, + { + "epoch": 0.658089713399575, + "grad_norm": 0.44114431738853455, + "learning_rate": 0.0001, + "loss": 1.5885, + "step": 5729 + }, + { + "epoch": 0.6582045833094021, + "grad_norm": 0.4176272451877594, + "learning_rate": 0.0001, + "loss": 1.5793, + "step": 5730 + }, + { + "epoch": 0.6583194532192292, + "grad_norm": 0.4215009808540344, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 5731 + }, + { + "epoch": 0.6584343231290564, + "grad_norm": 0.4414535164833069, + "learning_rate": 0.0001, + "loss": 1.6113, + "step": 5732 + }, + { + "epoch": 0.6585491930388835, + "grad_norm": 0.46415939927101135, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 5733 + }, + { + "epoch": 0.6586640629487106, + "grad_norm": 0.42953890562057495, + "learning_rate": 0.0001, + "loss": 1.5175, + "step": 5734 + }, + { + "epoch": 0.6587789328585377, + "grad_norm": 0.4586336016654968, + "learning_rate": 0.0001, + "loss": 1.6979, + "step": 5735 + }, + { + "epoch": 0.6588938027683648, + "grad_norm": 0.41848480701446533, + "learning_rate": 0.0001, + "loss": 1.724, + "step": 5736 + }, + { + "epoch": 0.659008672678192, + "grad_norm": 0.43761909008026123, + "learning_rate": 0.0001, + "loss": 1.7051, + "step": 5737 + }, + { + "epoch": 0.6591235425880191, + "grad_norm": 0.4685233235359192, + "learning_rate": 0.0001, + "loss": 1.7588, + "step": 5738 + }, + { + "epoch": 0.6592384124978462, + "grad_norm": 0.4934740364551544, + "learning_rate": 0.0001, + "loss": 1.7505, + "step": 5739 + }, + { + "epoch": 0.6593532824076733, + "grad_norm": 0.44371286034584045, + "learning_rate": 0.0001, + "loss": 1.6174, + "step": 5740 + }, + { + "epoch": 0.6594681523175004, + "grad_norm": 0.429153710603714, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 5741 + }, + { + "epoch": 0.6595830222273276, + "grad_norm": 0.46635177731513977, + "learning_rate": 0.0001, + "loss": 1.6942, + "step": 5742 + }, + { + "epoch": 0.6596978921371547, + "grad_norm": 0.39945048093795776, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 5743 + }, + { + "epoch": 0.6598127620469818, + "grad_norm": 0.46601754426956177, + "learning_rate": 0.0001, + "loss": 1.6118, + "step": 5744 + }, + { + "epoch": 0.6599276319568089, + "grad_norm": 0.48034870624542236, + "learning_rate": 0.0001, + "loss": 1.5462, + "step": 5745 + }, + { + "epoch": 0.660042501866636, + "grad_norm": 0.42302387952804565, + "learning_rate": 0.0001, + "loss": 1.4733, + "step": 5746 + }, + { + "epoch": 0.6601573717764632, + "grad_norm": 0.4531380236148834, + "learning_rate": 0.0001, + "loss": 1.72, + "step": 5747 + }, + { + "epoch": 0.6602722416862903, + "grad_norm": 0.43431270122528076, + "learning_rate": 0.0001, + "loss": 1.5529, + "step": 5748 + }, + { + "epoch": 0.6603871115961174, + "grad_norm": 0.44529253244400024, + "learning_rate": 0.0001, + "loss": 1.5221, + "step": 5749 + }, + { + "epoch": 0.6605019815059445, + "grad_norm": 0.43237540125846863, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 5750 + }, + { + "epoch": 0.6606168514157716, + "grad_norm": 0.4513912796974182, + "learning_rate": 0.0001, + "loss": 1.5776, + "step": 5751 + }, + { + "epoch": 0.6607317213255988, + "grad_norm": 0.42277923226356506, + "learning_rate": 0.0001, + "loss": 1.7517, + "step": 5752 + }, + { + "epoch": 0.6608465912354259, + "grad_norm": 0.40695422887802124, + "learning_rate": 0.0001, + "loss": 1.5718, + "step": 5753 + }, + { + "epoch": 0.660961461145253, + "grad_norm": 0.4478527009487152, + "learning_rate": 0.0001, + "loss": 1.7765, + "step": 5754 + }, + { + "epoch": 0.6610763310550801, + "grad_norm": 0.4173600971698761, + "learning_rate": 0.0001, + "loss": 1.5796, + "step": 5755 + }, + { + "epoch": 0.6611912009649072, + "grad_norm": 0.4463992416858673, + "learning_rate": 0.0001, + "loss": 1.6569, + "step": 5756 + }, + { + "epoch": 0.6613060708747344, + "grad_norm": 0.4375052750110626, + "learning_rate": 0.0001, + "loss": 1.4407, + "step": 5757 + }, + { + "epoch": 0.6614209407845615, + "grad_norm": 0.4219117760658264, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 5758 + }, + { + "epoch": 0.6615358106943886, + "grad_norm": 0.45714181661605835, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 5759 + }, + { + "epoch": 0.6616506806042157, + "grad_norm": 0.4279073178768158, + "learning_rate": 0.0001, + "loss": 1.5276, + "step": 5760 + }, + { + "epoch": 0.6617655505140428, + "grad_norm": 0.4680931568145752, + "learning_rate": 0.0001, + "loss": 1.7313, + "step": 5761 + }, + { + "epoch": 0.66188042042387, + "grad_norm": 0.4030158817768097, + "learning_rate": 0.0001, + "loss": 1.5361, + "step": 5762 + }, + { + "epoch": 0.6619952903336971, + "grad_norm": 0.4316936433315277, + "learning_rate": 0.0001, + "loss": 1.6257, + "step": 5763 + }, + { + "epoch": 0.6621101602435242, + "grad_norm": 0.45963194966316223, + "learning_rate": 0.0001, + "loss": 1.7182, + "step": 5764 + }, + { + "epoch": 0.6622250301533513, + "grad_norm": 0.4106147289276123, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 5765 + }, + { + "epoch": 0.6623399000631784, + "grad_norm": 0.4534708261489868, + "learning_rate": 0.0001, + "loss": 1.7293, + "step": 5766 + }, + { + "epoch": 0.6624547699730056, + "grad_norm": 0.4368360936641693, + "learning_rate": 0.0001, + "loss": 1.5357, + "step": 5767 + }, + { + "epoch": 0.6625696398828327, + "grad_norm": 0.40505045652389526, + "learning_rate": 0.0001, + "loss": 1.5054, + "step": 5768 + }, + { + "epoch": 0.6626845097926598, + "grad_norm": 0.42817744612693787, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 5769 + }, + { + "epoch": 0.6627993797024869, + "grad_norm": 0.46093493700027466, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 5770 + }, + { + "epoch": 0.662914249612314, + "grad_norm": 0.44305554032325745, + "learning_rate": 0.0001, + "loss": 1.6918, + "step": 5771 + }, + { + "epoch": 0.6630291195221412, + "grad_norm": 0.39377158880233765, + "learning_rate": 0.0001, + "loss": 1.5552, + "step": 5772 + }, + { + "epoch": 0.6631439894319683, + "grad_norm": 0.4344120919704437, + "learning_rate": 0.0001, + "loss": 1.5301, + "step": 5773 + }, + { + "epoch": 0.6632588593417954, + "grad_norm": 0.44036102294921875, + "learning_rate": 0.0001, + "loss": 1.6784, + "step": 5774 + }, + { + "epoch": 0.6633737292516225, + "grad_norm": 0.4466957449913025, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 5775 + }, + { + "epoch": 0.6634885991614496, + "grad_norm": 0.43694761395454407, + "learning_rate": 0.0001, + "loss": 1.6732, + "step": 5776 + }, + { + "epoch": 0.6636034690712768, + "grad_norm": 0.4368375539779663, + "learning_rate": 0.0001, + "loss": 1.5941, + "step": 5777 + }, + { + "epoch": 0.6637183389811039, + "grad_norm": 0.4765366017818451, + "learning_rate": 0.0001, + "loss": 1.7931, + "step": 5778 + }, + { + "epoch": 0.663833208890931, + "grad_norm": 0.43931543827056885, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 5779 + }, + { + "epoch": 0.6639480788007581, + "grad_norm": 0.43625199794769287, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 5780 + }, + { + "epoch": 0.6640629487105852, + "grad_norm": 0.4167712926864624, + "learning_rate": 0.0001, + "loss": 1.4647, + "step": 5781 + }, + { + "epoch": 0.6641778186204124, + "grad_norm": 0.40136420726776123, + "learning_rate": 0.0001, + "loss": 1.5526, + "step": 5782 + }, + { + "epoch": 0.6642926885302395, + "grad_norm": 0.4109748303890228, + "learning_rate": 0.0001, + "loss": 1.4092, + "step": 5783 + }, + { + "epoch": 0.6644075584400666, + "grad_norm": 0.4257631301879883, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 5784 + }, + { + "epoch": 0.6645224283498937, + "grad_norm": 0.4686290919780731, + "learning_rate": 0.0001, + "loss": 1.7784, + "step": 5785 + }, + { + "epoch": 0.6646372982597208, + "grad_norm": 0.4436352252960205, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 5786 + }, + { + "epoch": 0.664752168169548, + "grad_norm": 0.4808412790298462, + "learning_rate": 0.0001, + "loss": 1.7645, + "step": 5787 + }, + { + "epoch": 0.6648670380793751, + "grad_norm": 0.5230939984321594, + "learning_rate": 0.0001, + "loss": 1.8253, + "step": 5788 + }, + { + "epoch": 0.6649819079892022, + "grad_norm": 0.4324409067630768, + "learning_rate": 0.0001, + "loss": 1.4929, + "step": 5789 + }, + { + "epoch": 0.6650967778990293, + "grad_norm": 0.48814013600349426, + "learning_rate": 0.0001, + "loss": 1.8974, + "step": 5790 + }, + { + "epoch": 0.6652116478088564, + "grad_norm": 0.4277442991733551, + "learning_rate": 0.0001, + "loss": 1.4797, + "step": 5791 + }, + { + "epoch": 0.6653265177186836, + "grad_norm": 0.43875277042388916, + "learning_rate": 0.0001, + "loss": 1.624, + "step": 5792 + }, + { + "epoch": 0.6654413876285107, + "grad_norm": 0.47888147830963135, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 5793 + }, + { + "epoch": 0.6655562575383378, + "grad_norm": 0.43682193756103516, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 5794 + }, + { + "epoch": 0.6656711274481649, + "grad_norm": 0.4496222734451294, + "learning_rate": 0.0001, + "loss": 1.7213, + "step": 5795 + }, + { + "epoch": 0.665785997357992, + "grad_norm": 0.44679972529411316, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 5796 + }, + { + "epoch": 0.6659008672678192, + "grad_norm": 0.44004032015800476, + "learning_rate": 0.0001, + "loss": 1.6932, + "step": 5797 + }, + { + "epoch": 0.6660157371776463, + "grad_norm": 0.46193936467170715, + "learning_rate": 0.0001, + "loss": 1.5795, + "step": 5798 + }, + { + "epoch": 0.6661306070874734, + "grad_norm": 0.4364147186279297, + "learning_rate": 0.0001, + "loss": 1.6024, + "step": 5799 + }, + { + "epoch": 0.6662454769973005, + "grad_norm": 0.46602505445480347, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 5800 + }, + { + "epoch": 0.6663603469071276, + "grad_norm": 0.4367719888687134, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 5801 + }, + { + "epoch": 0.6664752168169548, + "grad_norm": 0.4234819710254669, + "learning_rate": 0.0001, + "loss": 1.654, + "step": 5802 + }, + { + "epoch": 0.6665900867267819, + "grad_norm": 0.46970421075820923, + "learning_rate": 0.0001, + "loss": 1.8436, + "step": 5803 + }, + { + "epoch": 0.666704956636609, + "grad_norm": 0.45983707904815674, + "learning_rate": 0.0001, + "loss": 1.5, + "step": 5804 + }, + { + "epoch": 0.6668198265464361, + "grad_norm": 0.4581074118614197, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 5805 + }, + { + "epoch": 0.6669346964562632, + "grad_norm": 0.4409380853176117, + "learning_rate": 0.0001, + "loss": 1.6336, + "step": 5806 + }, + { + "epoch": 0.6670495663660904, + "grad_norm": 0.46238353848457336, + "learning_rate": 0.0001, + "loss": 1.8162, + "step": 5807 + }, + { + "epoch": 0.6671644362759175, + "grad_norm": 0.4390278160572052, + "learning_rate": 0.0001, + "loss": 1.7262, + "step": 5808 + }, + { + "epoch": 0.6672793061857446, + "grad_norm": 0.44605204463005066, + "learning_rate": 0.0001, + "loss": 1.6926, + "step": 5809 + }, + { + "epoch": 0.6673941760955717, + "grad_norm": 0.41407981514930725, + "learning_rate": 0.0001, + "loss": 1.6147, + "step": 5810 + }, + { + "epoch": 0.6675090460053988, + "grad_norm": 0.44644859433174133, + "learning_rate": 0.0001, + "loss": 1.5268, + "step": 5811 + }, + { + "epoch": 0.667623915915226, + "grad_norm": 0.4450972080230713, + "learning_rate": 0.0001, + "loss": 1.7008, + "step": 5812 + }, + { + "epoch": 0.6677387858250531, + "grad_norm": 0.45438942313194275, + "learning_rate": 0.0001, + "loss": 1.572, + "step": 5813 + }, + { + "epoch": 0.6678536557348802, + "grad_norm": 0.4170825481414795, + "learning_rate": 0.0001, + "loss": 1.5343, + "step": 5814 + }, + { + "epoch": 0.6679685256447073, + "grad_norm": 0.4803465008735657, + "learning_rate": 0.0001, + "loss": 1.8373, + "step": 5815 + }, + { + "epoch": 0.6680833955545344, + "grad_norm": 0.4364113211631775, + "learning_rate": 0.0001, + "loss": 1.5873, + "step": 5816 + }, + { + "epoch": 0.6681982654643616, + "grad_norm": 0.4389718174934387, + "learning_rate": 0.0001, + "loss": 1.3752, + "step": 5817 + }, + { + "epoch": 0.6683131353741887, + "grad_norm": 0.4332119822502136, + "learning_rate": 0.0001, + "loss": 1.5222, + "step": 5818 + }, + { + "epoch": 0.6684280052840158, + "grad_norm": 0.42295655608177185, + "learning_rate": 0.0001, + "loss": 1.4948, + "step": 5819 + }, + { + "epoch": 0.6685428751938429, + "grad_norm": 0.39874276518821716, + "learning_rate": 0.0001, + "loss": 1.5184, + "step": 5820 + }, + { + "epoch": 0.66865774510367, + "grad_norm": 0.4008484482765198, + "learning_rate": 0.0001, + "loss": 1.6389, + "step": 5821 + }, + { + "epoch": 0.6687726150134972, + "grad_norm": 0.43164539337158203, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 5822 + }, + { + "epoch": 0.6688874849233243, + "grad_norm": 0.39950746297836304, + "learning_rate": 0.0001, + "loss": 1.5272, + "step": 5823 + }, + { + "epoch": 0.6690023548331514, + "grad_norm": 0.442396879196167, + "learning_rate": 0.0001, + "loss": 1.6061, + "step": 5824 + }, + { + "epoch": 0.6691172247429785, + "grad_norm": 0.4410782754421234, + "learning_rate": 0.0001, + "loss": 1.4986, + "step": 5825 + }, + { + "epoch": 0.6692320946528056, + "grad_norm": 0.4995570480823517, + "learning_rate": 0.0001, + "loss": 1.7851, + "step": 5826 + }, + { + "epoch": 0.6693469645626329, + "grad_norm": 0.43698373436927795, + "learning_rate": 0.0001, + "loss": 1.5612, + "step": 5827 + }, + { + "epoch": 0.66946183447246, + "grad_norm": 0.5518622994422913, + "learning_rate": 0.0001, + "loss": 1.6748, + "step": 5828 + }, + { + "epoch": 0.6695767043822871, + "grad_norm": 0.4583264887332916, + "learning_rate": 0.0001, + "loss": 1.7106, + "step": 5829 + }, + { + "epoch": 0.6696915742921142, + "grad_norm": 0.40689265727996826, + "learning_rate": 0.0001, + "loss": 1.5925, + "step": 5830 + }, + { + "epoch": 0.6698064442019414, + "grad_norm": 0.45366060733795166, + "learning_rate": 0.0001, + "loss": 1.6063, + "step": 5831 + }, + { + "epoch": 0.6699213141117685, + "grad_norm": 0.4742351472377777, + "learning_rate": 0.0001, + "loss": 1.5794, + "step": 5832 + }, + { + "epoch": 0.6700361840215956, + "grad_norm": 0.43246540427207947, + "learning_rate": 0.0001, + "loss": 1.524, + "step": 5833 + }, + { + "epoch": 0.6701510539314227, + "grad_norm": 0.4532145857810974, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 5834 + }, + { + "epoch": 0.6702659238412498, + "grad_norm": 0.4628235399723053, + "learning_rate": 0.0001, + "loss": 1.7553, + "step": 5835 + }, + { + "epoch": 0.670380793751077, + "grad_norm": 0.5169723629951477, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 5836 + }, + { + "epoch": 0.6704956636609041, + "grad_norm": 0.4652624726295471, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 5837 + }, + { + "epoch": 0.6706105335707312, + "grad_norm": 0.45965853333473206, + "learning_rate": 0.0001, + "loss": 1.6612, + "step": 5838 + }, + { + "epoch": 0.6707254034805583, + "grad_norm": 0.4296656548976898, + "learning_rate": 0.0001, + "loss": 1.5995, + "step": 5839 + }, + { + "epoch": 0.6708402733903854, + "grad_norm": 0.4188656806945801, + "learning_rate": 0.0001, + "loss": 1.594, + "step": 5840 + }, + { + "epoch": 0.6709551433002126, + "grad_norm": 0.442221075296402, + "learning_rate": 0.0001, + "loss": 1.6074, + "step": 5841 + }, + { + "epoch": 0.6710700132100397, + "grad_norm": 0.4227312207221985, + "learning_rate": 0.0001, + "loss": 1.5262, + "step": 5842 + }, + { + "epoch": 0.6711848831198668, + "grad_norm": 0.4592028856277466, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 5843 + }, + { + "epoch": 0.6712997530296939, + "grad_norm": 0.4832910895347595, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 5844 + }, + { + "epoch": 0.671414622939521, + "grad_norm": 0.44571515917778015, + "learning_rate": 0.0001, + "loss": 1.7643, + "step": 5845 + }, + { + "epoch": 0.6715294928493482, + "grad_norm": 0.45350438356399536, + "learning_rate": 0.0001, + "loss": 1.7148, + "step": 5846 + }, + { + "epoch": 0.6716443627591753, + "grad_norm": 0.4490528106689453, + "learning_rate": 0.0001, + "loss": 1.6748, + "step": 5847 + }, + { + "epoch": 0.6717592326690024, + "grad_norm": 0.4567161500453949, + "learning_rate": 0.0001, + "loss": 1.6875, + "step": 5848 + }, + { + "epoch": 0.6718741025788295, + "grad_norm": 0.44406652450561523, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 5849 + }, + { + "epoch": 0.6719889724886566, + "grad_norm": 0.447835773229599, + "learning_rate": 0.0001, + "loss": 1.6824, + "step": 5850 + }, + { + "epoch": 0.6721038423984838, + "grad_norm": 0.597815752029419, + "learning_rate": 0.0001, + "loss": 1.5491, + "step": 5851 + }, + { + "epoch": 0.6722187123083109, + "grad_norm": 0.4304426610469818, + "learning_rate": 0.0001, + "loss": 1.5914, + "step": 5852 + }, + { + "epoch": 0.672333582218138, + "grad_norm": 0.4473625719547272, + "learning_rate": 0.0001, + "loss": 1.511, + "step": 5853 + }, + { + "epoch": 0.6724484521279651, + "grad_norm": 0.4547788202762604, + "learning_rate": 0.0001, + "loss": 1.8366, + "step": 5854 + }, + { + "epoch": 0.6725633220377922, + "grad_norm": 0.43215829133987427, + "learning_rate": 0.0001, + "loss": 1.611, + "step": 5855 + }, + { + "epoch": 0.6726781919476194, + "grad_norm": 0.515165388584137, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 5856 + }, + { + "epoch": 0.6727930618574465, + "grad_norm": 0.4932282865047455, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 5857 + }, + { + "epoch": 0.6729079317672736, + "grad_norm": 0.46201443672180176, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 5858 + }, + { + "epoch": 0.6730228016771007, + "grad_norm": 0.470007061958313, + "learning_rate": 0.0001, + "loss": 1.6675, + "step": 5859 + }, + { + "epoch": 0.6731376715869278, + "grad_norm": 0.4670080542564392, + "learning_rate": 0.0001, + "loss": 1.7879, + "step": 5860 + }, + { + "epoch": 0.673252541496755, + "grad_norm": 0.4259360730648041, + "learning_rate": 0.0001, + "loss": 1.581, + "step": 5861 + }, + { + "epoch": 0.6733674114065821, + "grad_norm": 0.47610583901405334, + "learning_rate": 0.0001, + "loss": 1.8552, + "step": 5862 + }, + { + "epoch": 0.6734822813164092, + "grad_norm": 0.4580018222332001, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 5863 + }, + { + "epoch": 0.6735971512262363, + "grad_norm": 0.4032216966152191, + "learning_rate": 0.0001, + "loss": 1.5547, + "step": 5864 + }, + { + "epoch": 0.6737120211360634, + "grad_norm": 0.45826172828674316, + "learning_rate": 0.0001, + "loss": 1.7622, + "step": 5865 + }, + { + "epoch": 0.6738268910458906, + "grad_norm": 0.43257811665534973, + "learning_rate": 0.0001, + "loss": 1.6669, + "step": 5866 + }, + { + "epoch": 0.6739417609557177, + "grad_norm": 0.434496134519577, + "learning_rate": 0.0001, + "loss": 1.6081, + "step": 5867 + }, + { + "epoch": 0.6740566308655448, + "grad_norm": 0.4298497438430786, + "learning_rate": 0.0001, + "loss": 1.5388, + "step": 5868 + }, + { + "epoch": 0.6741715007753719, + "grad_norm": 0.46281471848487854, + "learning_rate": 0.0001, + "loss": 1.7304, + "step": 5869 + }, + { + "epoch": 0.674286370685199, + "grad_norm": 0.42578932642936707, + "learning_rate": 0.0001, + "loss": 1.5891, + "step": 5870 + }, + { + "epoch": 0.6744012405950262, + "grad_norm": 0.431417316198349, + "learning_rate": 0.0001, + "loss": 1.411, + "step": 5871 + }, + { + "epoch": 0.6745161105048533, + "grad_norm": 0.4810255169868469, + "learning_rate": 0.0001, + "loss": 1.5891, + "step": 5872 + }, + { + "epoch": 0.6746309804146804, + "grad_norm": 0.4635904133319855, + "learning_rate": 0.0001, + "loss": 1.8039, + "step": 5873 + }, + { + "epoch": 0.6747458503245075, + "grad_norm": 0.46906232833862305, + "learning_rate": 0.0001, + "loss": 1.86, + "step": 5874 + }, + { + "epoch": 0.6748607202343346, + "grad_norm": 0.44870471954345703, + "learning_rate": 0.0001, + "loss": 1.5199, + "step": 5875 + }, + { + "epoch": 0.6749755901441618, + "grad_norm": 0.4645586609840393, + "learning_rate": 0.0001, + "loss": 1.7069, + "step": 5876 + }, + { + "epoch": 0.6750904600539889, + "grad_norm": 0.45580583810806274, + "learning_rate": 0.0001, + "loss": 1.6092, + "step": 5877 + }, + { + "epoch": 0.675205329963816, + "grad_norm": 0.4602547585964203, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 5878 + }, + { + "epoch": 0.6753201998736431, + "grad_norm": 0.45218491554260254, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 5879 + }, + { + "epoch": 0.6754350697834702, + "grad_norm": 0.4633892774581909, + "learning_rate": 0.0001, + "loss": 1.5366, + "step": 5880 + }, + { + "epoch": 0.6755499396932974, + "grad_norm": 0.47411301732063293, + "learning_rate": 0.0001, + "loss": 1.6984, + "step": 5881 + }, + { + "epoch": 0.6756648096031245, + "grad_norm": 0.4305265545845032, + "learning_rate": 0.0001, + "loss": 1.6009, + "step": 5882 + }, + { + "epoch": 0.6757796795129516, + "grad_norm": 0.444837749004364, + "learning_rate": 0.0001, + "loss": 1.5941, + "step": 5883 + }, + { + "epoch": 0.6758945494227787, + "grad_norm": 0.4728662967681885, + "learning_rate": 0.0001, + "loss": 1.7626, + "step": 5884 + }, + { + "epoch": 0.6760094193326058, + "grad_norm": 0.4483180046081543, + "learning_rate": 0.0001, + "loss": 1.7023, + "step": 5885 + }, + { + "epoch": 0.676124289242433, + "grad_norm": 0.4406643211841583, + "learning_rate": 0.0001, + "loss": 1.7195, + "step": 5886 + }, + { + "epoch": 0.6762391591522601, + "grad_norm": 0.44651803374290466, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 5887 + }, + { + "epoch": 0.6763540290620872, + "grad_norm": 0.4229283630847931, + "learning_rate": 0.0001, + "loss": 1.7038, + "step": 5888 + }, + { + "epoch": 0.6764688989719143, + "grad_norm": 0.4413118064403534, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 5889 + }, + { + "epoch": 0.6765837688817414, + "grad_norm": 0.44531190395355225, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 5890 + }, + { + "epoch": 0.6766986387915686, + "grad_norm": 0.4322546720504761, + "learning_rate": 0.0001, + "loss": 1.7821, + "step": 5891 + }, + { + "epoch": 0.6768135087013957, + "grad_norm": 0.43194666504859924, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 5892 + }, + { + "epoch": 0.6769283786112228, + "grad_norm": 0.45931199193000793, + "learning_rate": 0.0001, + "loss": 1.5343, + "step": 5893 + }, + { + "epoch": 0.6770432485210499, + "grad_norm": 0.4326918423175812, + "learning_rate": 0.0001, + "loss": 1.394, + "step": 5894 + }, + { + "epoch": 0.677158118430877, + "grad_norm": 0.43630099296569824, + "learning_rate": 0.0001, + "loss": 1.4559, + "step": 5895 + }, + { + "epoch": 0.6772729883407042, + "grad_norm": 0.4202820360660553, + "learning_rate": 0.0001, + "loss": 1.6519, + "step": 5896 + }, + { + "epoch": 0.6773878582505313, + "grad_norm": 0.508823812007904, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 5897 + }, + { + "epoch": 0.6775027281603584, + "grad_norm": 0.43873631954193115, + "learning_rate": 0.0001, + "loss": 1.522, + "step": 5898 + }, + { + "epoch": 0.6776175980701855, + "grad_norm": 0.4063846170902252, + "learning_rate": 0.0001, + "loss": 1.6185, + "step": 5899 + }, + { + "epoch": 0.6777324679800126, + "grad_norm": 0.4242575466632843, + "learning_rate": 0.0001, + "loss": 1.6539, + "step": 5900 + }, + { + "epoch": 0.6778473378898398, + "grad_norm": 0.44178491830825806, + "learning_rate": 0.0001, + "loss": 1.4893, + "step": 5901 + }, + { + "epoch": 0.6779622077996669, + "grad_norm": 0.4485710561275482, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 5902 + }, + { + "epoch": 0.678077077709494, + "grad_norm": 0.4207887351512909, + "learning_rate": 0.0001, + "loss": 1.5055, + "step": 5903 + }, + { + "epoch": 0.6781919476193211, + "grad_norm": 1.451164722442627, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 5904 + }, + { + "epoch": 0.6783068175291482, + "grad_norm": 0.46162787079811096, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 5905 + }, + { + "epoch": 0.6784216874389754, + "grad_norm": 0.4590327739715576, + "learning_rate": 0.0001, + "loss": 1.703, + "step": 5906 + }, + { + "epoch": 0.6785365573488025, + "grad_norm": 0.44896572828292847, + "learning_rate": 0.0001, + "loss": 1.7352, + "step": 5907 + }, + { + "epoch": 0.6786514272586296, + "grad_norm": 0.43614816665649414, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 5908 + }, + { + "epoch": 0.6787662971684567, + "grad_norm": 0.4527941942214966, + "learning_rate": 0.0001, + "loss": 1.5409, + "step": 5909 + }, + { + "epoch": 0.6788811670782838, + "grad_norm": 0.4711673855781555, + "learning_rate": 0.0001, + "loss": 1.5503, + "step": 5910 + }, + { + "epoch": 0.678996036988111, + "grad_norm": 0.5014576315879822, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 5911 + }, + { + "epoch": 0.6791109068979381, + "grad_norm": 0.4469035565853119, + "learning_rate": 0.0001, + "loss": 1.5892, + "step": 5912 + }, + { + "epoch": 0.6792257768077652, + "grad_norm": 0.47193190455436707, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 5913 + }, + { + "epoch": 0.6793406467175923, + "grad_norm": 0.4429560899734497, + "learning_rate": 0.0001, + "loss": 1.6693, + "step": 5914 + }, + { + "epoch": 0.6794555166274194, + "grad_norm": 0.46765467524528503, + "learning_rate": 0.0001, + "loss": 1.6985, + "step": 5915 + }, + { + "epoch": 0.6795703865372466, + "grad_norm": 0.4488067328929901, + "learning_rate": 0.0001, + "loss": 1.6629, + "step": 5916 + }, + { + "epoch": 0.6796852564470737, + "grad_norm": 0.4276580214500427, + "learning_rate": 0.0001, + "loss": 1.579, + "step": 5917 + }, + { + "epoch": 0.6798001263569008, + "grad_norm": 0.47405701875686646, + "learning_rate": 0.0001, + "loss": 1.7718, + "step": 5918 + }, + { + "epoch": 0.6799149962667279, + "grad_norm": 0.4314579367637634, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 5919 + }, + { + "epoch": 0.680029866176555, + "grad_norm": 0.4281775951385498, + "learning_rate": 0.0001, + "loss": 1.4945, + "step": 5920 + }, + { + "epoch": 0.6801447360863822, + "grad_norm": 0.4201895594596863, + "learning_rate": 0.0001, + "loss": 1.519, + "step": 5921 + }, + { + "epoch": 0.6802596059962093, + "grad_norm": 0.42386960983276367, + "learning_rate": 0.0001, + "loss": 1.5078, + "step": 5922 + }, + { + "epoch": 0.6803744759060364, + "grad_norm": 0.49620696902275085, + "learning_rate": 0.0001, + "loss": 1.8191, + "step": 5923 + }, + { + "epoch": 0.6804893458158635, + "grad_norm": 0.4446530044078827, + "learning_rate": 0.0001, + "loss": 1.7002, + "step": 5924 + }, + { + "epoch": 0.6806042157256906, + "grad_norm": 0.436937540769577, + "learning_rate": 0.0001, + "loss": 1.5185, + "step": 5925 + }, + { + "epoch": 0.6807190856355178, + "grad_norm": 0.41910529136657715, + "learning_rate": 0.0001, + "loss": 1.3554, + "step": 5926 + }, + { + "epoch": 0.6808339555453449, + "grad_norm": 0.4524851143360138, + "learning_rate": 0.0001, + "loss": 1.5111, + "step": 5927 + }, + { + "epoch": 0.680948825455172, + "grad_norm": 0.5210041999816895, + "learning_rate": 0.0001, + "loss": 1.8005, + "step": 5928 + }, + { + "epoch": 0.6810636953649991, + "grad_norm": 0.4216352105140686, + "learning_rate": 0.0001, + "loss": 1.5726, + "step": 5929 + }, + { + "epoch": 0.6811785652748262, + "grad_norm": 0.44295185804367065, + "learning_rate": 0.0001, + "loss": 1.5998, + "step": 5930 + }, + { + "epoch": 0.6812934351846534, + "grad_norm": 0.4396561086177826, + "learning_rate": 0.0001, + "loss": 1.6761, + "step": 5931 + }, + { + "epoch": 0.6814083050944805, + "grad_norm": 0.440660297870636, + "learning_rate": 0.0001, + "loss": 1.7889, + "step": 5932 + }, + { + "epoch": 0.6815231750043076, + "grad_norm": 0.4529242217540741, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 5933 + }, + { + "epoch": 0.6816380449141347, + "grad_norm": 0.43237507343292236, + "learning_rate": 0.0001, + "loss": 1.569, + "step": 5934 + }, + { + "epoch": 0.6817529148239618, + "grad_norm": 0.41882574558258057, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 5935 + }, + { + "epoch": 0.681867784733789, + "grad_norm": 0.501582682132721, + "learning_rate": 0.0001, + "loss": 1.7749, + "step": 5936 + }, + { + "epoch": 0.6819826546436161, + "grad_norm": 0.4597390294075012, + "learning_rate": 0.0001, + "loss": 1.7377, + "step": 5937 + }, + { + "epoch": 0.6820975245534432, + "grad_norm": 0.42805132269859314, + "learning_rate": 0.0001, + "loss": 1.5204, + "step": 5938 + }, + { + "epoch": 0.6822123944632703, + "grad_norm": 0.4181848168373108, + "learning_rate": 0.0001, + "loss": 1.3851, + "step": 5939 + }, + { + "epoch": 0.6823272643730974, + "grad_norm": 0.5363374352455139, + "learning_rate": 0.0001, + "loss": 1.9319, + "step": 5940 + }, + { + "epoch": 0.6824421342829246, + "grad_norm": 0.47564831376075745, + "learning_rate": 0.0001, + "loss": 1.7815, + "step": 5941 + }, + { + "epoch": 0.6825570041927517, + "grad_norm": 0.4543587267398834, + "learning_rate": 0.0001, + "loss": 1.626, + "step": 5942 + }, + { + "epoch": 0.6826718741025788, + "grad_norm": 0.43262454867362976, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 5943 + }, + { + "epoch": 0.6827867440124059, + "grad_norm": 0.44168633222579956, + "learning_rate": 0.0001, + "loss": 1.3962, + "step": 5944 + }, + { + "epoch": 0.682901613922233, + "grad_norm": 0.4552295207977295, + "learning_rate": 0.0001, + "loss": 1.6962, + "step": 5945 + }, + { + "epoch": 0.6830164838320602, + "grad_norm": 0.4199536442756653, + "learning_rate": 0.0001, + "loss": 1.6031, + "step": 5946 + }, + { + "epoch": 0.6831313537418873, + "grad_norm": 0.44757145643234253, + "learning_rate": 0.0001, + "loss": 1.7174, + "step": 5947 + }, + { + "epoch": 0.6832462236517144, + "grad_norm": 0.4116254448890686, + "learning_rate": 0.0001, + "loss": 1.5348, + "step": 5948 + }, + { + "epoch": 0.6833610935615415, + "grad_norm": 0.4747268259525299, + "learning_rate": 0.0001, + "loss": 1.6715, + "step": 5949 + }, + { + "epoch": 0.6834759634713686, + "grad_norm": 0.41993069648742676, + "learning_rate": 0.0001, + "loss": 1.459, + "step": 5950 + }, + { + "epoch": 0.6835908333811958, + "grad_norm": 0.4138827621936798, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 5951 + }, + { + "epoch": 0.6837057032910229, + "grad_norm": 0.45296478271484375, + "learning_rate": 0.0001, + "loss": 1.7347, + "step": 5952 + }, + { + "epoch": 0.68382057320085, + "grad_norm": 0.510452389717102, + "learning_rate": 0.0001, + "loss": 1.6208, + "step": 5953 + }, + { + "epoch": 0.6839354431106771, + "grad_norm": 0.474874883890152, + "learning_rate": 0.0001, + "loss": 1.6779, + "step": 5954 + }, + { + "epoch": 0.6840503130205042, + "grad_norm": 0.45102521777153015, + "learning_rate": 0.0001, + "loss": 1.5271, + "step": 5955 + }, + { + "epoch": 0.6841651829303314, + "grad_norm": 0.4434516429901123, + "learning_rate": 0.0001, + "loss": 1.4887, + "step": 5956 + }, + { + "epoch": 0.6842800528401585, + "grad_norm": 0.44997087121009827, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 5957 + }, + { + "epoch": 0.6843949227499856, + "grad_norm": 0.42875492572784424, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 5958 + }, + { + "epoch": 0.6845097926598127, + "grad_norm": 0.4152733087539673, + "learning_rate": 0.0001, + "loss": 1.5594, + "step": 5959 + }, + { + "epoch": 0.6846246625696398, + "grad_norm": 0.43790438771247864, + "learning_rate": 0.0001, + "loss": 1.5427, + "step": 5960 + }, + { + "epoch": 0.684739532479467, + "grad_norm": 0.45538073778152466, + "learning_rate": 0.0001, + "loss": 1.8264, + "step": 5961 + }, + { + "epoch": 0.6848544023892941, + "grad_norm": 0.671297013759613, + "learning_rate": 0.0001, + "loss": 1.6119, + "step": 5962 + }, + { + "epoch": 0.6849692722991212, + "grad_norm": 0.454745888710022, + "learning_rate": 0.0001, + "loss": 1.6478, + "step": 5963 + }, + { + "epoch": 0.6850841422089484, + "grad_norm": 0.43612053990364075, + "learning_rate": 0.0001, + "loss": 1.6183, + "step": 5964 + }, + { + "epoch": 0.6851990121187755, + "grad_norm": 0.42518892884254456, + "learning_rate": 0.0001, + "loss": 1.5105, + "step": 5965 + }, + { + "epoch": 0.6853138820286027, + "grad_norm": 0.4237819015979767, + "learning_rate": 0.0001, + "loss": 1.5573, + "step": 5966 + }, + { + "epoch": 0.6854287519384298, + "grad_norm": 0.4369647204875946, + "learning_rate": 0.0001, + "loss": 1.5347, + "step": 5967 + }, + { + "epoch": 0.6855436218482569, + "grad_norm": 0.4214072525501251, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 5968 + }, + { + "epoch": 0.685658491758084, + "grad_norm": 0.45082470774650574, + "learning_rate": 0.0001, + "loss": 1.7264, + "step": 5969 + }, + { + "epoch": 0.6857733616679111, + "grad_norm": 0.44140127301216125, + "learning_rate": 0.0001, + "loss": 1.447, + "step": 5970 + }, + { + "epoch": 0.6858882315777383, + "grad_norm": 0.4286753833293915, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 5971 + }, + { + "epoch": 0.6860031014875654, + "grad_norm": 0.43218451738357544, + "learning_rate": 0.0001, + "loss": 1.6167, + "step": 5972 + }, + { + "epoch": 0.6861179713973925, + "grad_norm": 0.40552157163619995, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 5973 + }, + { + "epoch": 0.6862328413072196, + "grad_norm": 0.449190229177475, + "learning_rate": 0.0001, + "loss": 1.5253, + "step": 5974 + }, + { + "epoch": 0.6863477112170467, + "grad_norm": 0.42103078961372375, + "learning_rate": 0.0001, + "loss": 1.485, + "step": 5975 + }, + { + "epoch": 0.6864625811268739, + "grad_norm": 0.4737277328968048, + "learning_rate": 0.0001, + "loss": 1.5635, + "step": 5976 + }, + { + "epoch": 0.686577451036701, + "grad_norm": 0.44992202520370483, + "learning_rate": 0.0001, + "loss": 1.6039, + "step": 5977 + }, + { + "epoch": 0.6866923209465281, + "grad_norm": 0.4830211400985718, + "learning_rate": 0.0001, + "loss": 1.3773, + "step": 5978 + }, + { + "epoch": 0.6868071908563552, + "grad_norm": 0.48300692439079285, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 5979 + }, + { + "epoch": 0.6869220607661823, + "grad_norm": 0.4886820614337921, + "learning_rate": 0.0001, + "loss": 1.7533, + "step": 5980 + }, + { + "epoch": 0.6870369306760095, + "grad_norm": 0.4783158600330353, + "learning_rate": 0.0001, + "loss": 1.5382, + "step": 5981 + }, + { + "epoch": 0.6871518005858366, + "grad_norm": 0.44039803743362427, + "learning_rate": 0.0001, + "loss": 1.6981, + "step": 5982 + }, + { + "epoch": 0.6872666704956637, + "grad_norm": 0.44316425919532776, + "learning_rate": 0.0001, + "loss": 1.6761, + "step": 5983 + }, + { + "epoch": 0.6873815404054908, + "grad_norm": 0.442717581987381, + "learning_rate": 0.0001, + "loss": 1.6243, + "step": 5984 + }, + { + "epoch": 0.6874964103153179, + "grad_norm": 0.40507930517196655, + "learning_rate": 0.0001, + "loss": 1.5069, + "step": 5985 + }, + { + "epoch": 0.6876112802251451, + "grad_norm": 0.4449567496776581, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 5986 + }, + { + "epoch": 0.6877261501349722, + "grad_norm": 0.4152354300022125, + "learning_rate": 0.0001, + "loss": 1.5178, + "step": 5987 + }, + { + "epoch": 0.6878410200447993, + "grad_norm": 0.4327658414840698, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 5988 + }, + { + "epoch": 0.6879558899546264, + "grad_norm": 0.39523184299468994, + "learning_rate": 0.0001, + "loss": 1.4862, + "step": 5989 + }, + { + "epoch": 0.6880707598644535, + "grad_norm": 0.4782007336616516, + "learning_rate": 0.0001, + "loss": 1.7516, + "step": 5990 + }, + { + "epoch": 0.6881856297742807, + "grad_norm": 0.4494955539703369, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 5991 + }, + { + "epoch": 0.6883004996841078, + "grad_norm": 0.44850456714630127, + "learning_rate": 0.0001, + "loss": 1.6185, + "step": 5992 + }, + { + "epoch": 0.6884153695939349, + "grad_norm": 0.43033161759376526, + "learning_rate": 0.0001, + "loss": 1.4819, + "step": 5993 + }, + { + "epoch": 0.688530239503762, + "grad_norm": 0.4402123987674713, + "learning_rate": 0.0001, + "loss": 1.6092, + "step": 5994 + }, + { + "epoch": 0.6886451094135891, + "grad_norm": 0.41884496808052063, + "learning_rate": 0.0001, + "loss": 1.5384, + "step": 5995 + }, + { + "epoch": 0.6887599793234163, + "grad_norm": 0.4290192127227783, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 5996 + }, + { + "epoch": 0.6888748492332434, + "grad_norm": 0.5004449486732483, + "learning_rate": 0.0001, + "loss": 1.6538, + "step": 5997 + }, + { + "epoch": 0.6889897191430705, + "grad_norm": 0.4282512068748474, + "learning_rate": 0.0001, + "loss": 1.554, + "step": 5998 + }, + { + "epoch": 0.6891045890528976, + "grad_norm": 0.4622134566307068, + "learning_rate": 0.0001, + "loss": 1.5423, + "step": 5999 + }, + { + "epoch": 0.6892194589627247, + "grad_norm": 0.43330010771751404, + "learning_rate": 0.0001, + "loss": 1.2861, + "step": 6000 + }, + { + "epoch": 0.6893343288725519, + "grad_norm": 0.47380784153938293, + "learning_rate": 0.0001, + "loss": 1.5154, + "step": 6001 + }, + { + "epoch": 0.689449198782379, + "grad_norm": 0.4909721314907074, + "learning_rate": 0.0001, + "loss": 1.6611, + "step": 6002 + }, + { + "epoch": 0.6895640686922061, + "grad_norm": 0.444553405046463, + "learning_rate": 0.0001, + "loss": 1.5056, + "step": 6003 + }, + { + "epoch": 0.6896789386020332, + "grad_norm": 0.4200032651424408, + "learning_rate": 0.0001, + "loss": 1.4871, + "step": 6004 + }, + { + "epoch": 0.6897938085118603, + "grad_norm": 0.45052191615104675, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 6005 + }, + { + "epoch": 0.6899086784216875, + "grad_norm": 0.44020742177963257, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 6006 + }, + { + "epoch": 0.6900235483315146, + "grad_norm": 0.4479757249355316, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 6007 + }, + { + "epoch": 0.6901384182413417, + "grad_norm": 0.40676623582839966, + "learning_rate": 0.0001, + "loss": 1.4707, + "step": 6008 + }, + { + "epoch": 0.6902532881511688, + "grad_norm": 0.44237807393074036, + "learning_rate": 0.0001, + "loss": 1.6429, + "step": 6009 + }, + { + "epoch": 0.6903681580609959, + "grad_norm": 0.5069726705551147, + "learning_rate": 0.0001, + "loss": 1.819, + "step": 6010 + }, + { + "epoch": 0.6904830279708231, + "grad_norm": 0.4417176842689514, + "learning_rate": 0.0001, + "loss": 1.5259, + "step": 6011 + }, + { + "epoch": 0.6905978978806502, + "grad_norm": 0.4071502685546875, + "learning_rate": 0.0001, + "loss": 1.4589, + "step": 6012 + }, + { + "epoch": 0.6907127677904773, + "grad_norm": 0.4580749273300171, + "learning_rate": 0.0001, + "loss": 1.4879, + "step": 6013 + }, + { + "epoch": 0.6908276377003044, + "grad_norm": 0.4313982427120209, + "learning_rate": 0.0001, + "loss": 1.584, + "step": 6014 + }, + { + "epoch": 0.6909425076101315, + "grad_norm": 0.4240618944168091, + "learning_rate": 0.0001, + "loss": 1.5827, + "step": 6015 + }, + { + "epoch": 0.6910573775199587, + "grad_norm": 0.6877708435058594, + "learning_rate": 0.0001, + "loss": 1.4963, + "step": 6016 + }, + { + "epoch": 0.6911722474297858, + "grad_norm": 0.4607912003993988, + "learning_rate": 0.0001, + "loss": 1.6725, + "step": 6017 + }, + { + "epoch": 0.6912871173396129, + "grad_norm": 0.44218817353248596, + "learning_rate": 0.0001, + "loss": 1.7323, + "step": 6018 + }, + { + "epoch": 0.69140198724944, + "grad_norm": 0.4411420226097107, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 6019 + }, + { + "epoch": 0.6915168571592671, + "grad_norm": 0.4698505103588104, + "learning_rate": 0.0001, + "loss": 1.8326, + "step": 6020 + }, + { + "epoch": 0.6916317270690943, + "grad_norm": 0.4248715937137604, + "learning_rate": 0.0001, + "loss": 1.7072, + "step": 6021 + }, + { + "epoch": 0.6917465969789214, + "grad_norm": 0.4034425616264343, + "learning_rate": 0.0001, + "loss": 1.4084, + "step": 6022 + }, + { + "epoch": 0.6918614668887485, + "grad_norm": 0.4377814829349518, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 6023 + }, + { + "epoch": 0.6919763367985756, + "grad_norm": 0.4794885218143463, + "learning_rate": 0.0001, + "loss": 1.6453, + "step": 6024 + }, + { + "epoch": 0.6920912067084027, + "grad_norm": 0.4388996958732605, + "learning_rate": 0.0001, + "loss": 1.6848, + "step": 6025 + }, + { + "epoch": 0.6922060766182299, + "grad_norm": 0.3899931311607361, + "learning_rate": 0.0001, + "loss": 1.3496, + "step": 6026 + }, + { + "epoch": 0.692320946528057, + "grad_norm": 0.49666503071784973, + "learning_rate": 0.0001, + "loss": 1.7286, + "step": 6027 + }, + { + "epoch": 0.6924358164378841, + "grad_norm": 0.4482400119304657, + "learning_rate": 0.0001, + "loss": 1.6767, + "step": 6028 + }, + { + "epoch": 0.6925506863477112, + "grad_norm": 0.41266587376594543, + "learning_rate": 0.0001, + "loss": 1.4796, + "step": 6029 + }, + { + "epoch": 0.6926655562575383, + "grad_norm": 0.3807138502597809, + "learning_rate": 0.0001, + "loss": 1.34, + "step": 6030 + }, + { + "epoch": 0.6927804261673655, + "grad_norm": 0.46382012963294983, + "learning_rate": 0.0001, + "loss": 1.5642, + "step": 6031 + }, + { + "epoch": 0.6928952960771926, + "grad_norm": 0.45717358589172363, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 6032 + }, + { + "epoch": 0.6930101659870197, + "grad_norm": 0.44662654399871826, + "learning_rate": 0.0001, + "loss": 1.7359, + "step": 6033 + }, + { + "epoch": 0.6931250358968468, + "grad_norm": 0.4659220278263092, + "learning_rate": 0.0001, + "loss": 1.7403, + "step": 6034 + }, + { + "epoch": 0.6932399058066739, + "grad_norm": 0.4265783727169037, + "learning_rate": 0.0001, + "loss": 1.6654, + "step": 6035 + }, + { + "epoch": 0.6933547757165011, + "grad_norm": 0.45917513966560364, + "learning_rate": 0.0001, + "loss": 1.5656, + "step": 6036 + }, + { + "epoch": 0.6934696456263282, + "grad_norm": 0.4457080066204071, + "learning_rate": 0.0001, + "loss": 1.6824, + "step": 6037 + }, + { + "epoch": 0.6935845155361553, + "grad_norm": 0.428594172000885, + "learning_rate": 0.0001, + "loss": 1.4754, + "step": 6038 + }, + { + "epoch": 0.6936993854459824, + "grad_norm": 0.4832080602645874, + "learning_rate": 0.0001, + "loss": 1.7493, + "step": 6039 + }, + { + "epoch": 0.6938142553558095, + "grad_norm": 0.4549182653427124, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 6040 + }, + { + "epoch": 0.6939291252656367, + "grad_norm": 0.46311667561531067, + "learning_rate": 0.0001, + "loss": 1.6486, + "step": 6041 + }, + { + "epoch": 0.6940439951754638, + "grad_norm": 0.43765565752983093, + "learning_rate": 0.0001, + "loss": 1.5472, + "step": 6042 + }, + { + "epoch": 0.6941588650852909, + "grad_norm": 0.4676961600780487, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 6043 + }, + { + "epoch": 0.694273734995118, + "grad_norm": 0.48037439584732056, + "learning_rate": 0.0001, + "loss": 1.7783, + "step": 6044 + }, + { + "epoch": 0.6943886049049451, + "grad_norm": 0.4451299011707306, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 6045 + }, + { + "epoch": 0.6945034748147723, + "grad_norm": 0.4473123848438263, + "learning_rate": 0.0001, + "loss": 1.7079, + "step": 6046 + }, + { + "epoch": 0.6946183447245994, + "grad_norm": 0.4191644489765167, + "learning_rate": 0.0001, + "loss": 1.4635, + "step": 6047 + }, + { + "epoch": 0.6947332146344265, + "grad_norm": 0.5033878087997437, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 6048 + }, + { + "epoch": 0.6948480845442536, + "grad_norm": 0.4745533764362335, + "learning_rate": 0.0001, + "loss": 1.6522, + "step": 6049 + }, + { + "epoch": 0.6949629544540807, + "grad_norm": 0.44269341230392456, + "learning_rate": 0.0001, + "loss": 1.5863, + "step": 6050 + }, + { + "epoch": 0.6950778243639079, + "grad_norm": 0.47186774015426636, + "learning_rate": 0.0001, + "loss": 1.6539, + "step": 6051 + }, + { + "epoch": 0.695192694273735, + "grad_norm": 0.5083333849906921, + "learning_rate": 0.0001, + "loss": 1.7727, + "step": 6052 + }, + { + "epoch": 0.6953075641835621, + "grad_norm": 0.4614897072315216, + "learning_rate": 0.0001, + "loss": 1.669, + "step": 6053 + }, + { + "epoch": 0.6954224340933892, + "grad_norm": 0.47114160656929016, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 6054 + }, + { + "epoch": 0.6955373040032163, + "grad_norm": 0.4435749650001526, + "learning_rate": 0.0001, + "loss": 1.5528, + "step": 6055 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.4462308883666992, + "learning_rate": 0.0001, + "loss": 1.5747, + "step": 6056 + }, + { + "epoch": 0.6957670438228706, + "grad_norm": 0.4508498013019562, + "learning_rate": 0.0001, + "loss": 1.772, + "step": 6057 + }, + { + "epoch": 0.6958819137326977, + "grad_norm": 0.45102837681770325, + "learning_rate": 0.0001, + "loss": 1.5337, + "step": 6058 + }, + { + "epoch": 0.6959967836425248, + "grad_norm": 0.427142471075058, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 6059 + }, + { + "epoch": 0.6961116535523519, + "grad_norm": 0.44649988412857056, + "learning_rate": 0.0001, + "loss": 1.7092, + "step": 6060 + }, + { + "epoch": 0.6962265234621791, + "grad_norm": 0.44798871874809265, + "learning_rate": 0.0001, + "loss": 1.6849, + "step": 6061 + }, + { + "epoch": 0.6963413933720062, + "grad_norm": 0.5189127326011658, + "learning_rate": 0.0001, + "loss": 1.1203, + "step": 6062 + }, + { + "epoch": 0.6964562632818333, + "grad_norm": 0.4362175762653351, + "learning_rate": 0.0001, + "loss": 1.6535, + "step": 6063 + }, + { + "epoch": 0.6965711331916604, + "grad_norm": 0.4506964683532715, + "learning_rate": 0.0001, + "loss": 1.5871, + "step": 6064 + }, + { + "epoch": 0.6966860031014875, + "grad_norm": 0.4647710621356964, + "learning_rate": 0.0001, + "loss": 1.5128, + "step": 6065 + }, + { + "epoch": 0.6968008730113147, + "grad_norm": 0.43700385093688965, + "learning_rate": 0.0001, + "loss": 1.5093, + "step": 6066 + }, + { + "epoch": 0.6969157429211418, + "grad_norm": 0.4622689187526703, + "learning_rate": 0.0001, + "loss": 1.6878, + "step": 6067 + }, + { + "epoch": 0.6970306128309689, + "grad_norm": 0.4806528687477112, + "learning_rate": 0.0001, + "loss": 1.7705, + "step": 6068 + }, + { + "epoch": 0.697145482740796, + "grad_norm": 0.4175126254558563, + "learning_rate": 0.0001, + "loss": 1.4511, + "step": 6069 + }, + { + "epoch": 0.6972603526506231, + "grad_norm": 0.4509715735912323, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 6070 + }, + { + "epoch": 0.6973752225604503, + "grad_norm": 0.4509661793708801, + "learning_rate": 0.0001, + "loss": 1.5706, + "step": 6071 + }, + { + "epoch": 0.6974900924702774, + "grad_norm": 0.4647543728351593, + "learning_rate": 0.0001, + "loss": 1.7786, + "step": 6072 + }, + { + "epoch": 0.6976049623801045, + "grad_norm": 0.4497321546077728, + "learning_rate": 0.0001, + "loss": 1.4881, + "step": 6073 + }, + { + "epoch": 0.6977198322899316, + "grad_norm": 0.431309312582016, + "learning_rate": 0.0001, + "loss": 1.5486, + "step": 6074 + }, + { + "epoch": 0.6978347021997587, + "grad_norm": 0.423828661441803, + "learning_rate": 0.0001, + "loss": 1.717, + "step": 6075 + }, + { + "epoch": 0.6979495721095859, + "grad_norm": 0.4408908784389496, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 6076 + }, + { + "epoch": 0.698064442019413, + "grad_norm": 0.4263383448123932, + "learning_rate": 0.0001, + "loss": 1.5551, + "step": 6077 + }, + { + "epoch": 0.6981793119292401, + "grad_norm": 0.4382713735103607, + "learning_rate": 0.0001, + "loss": 1.6121, + "step": 6078 + }, + { + "epoch": 0.6982941818390672, + "grad_norm": 0.4006011486053467, + "learning_rate": 0.0001, + "loss": 1.4744, + "step": 6079 + }, + { + "epoch": 0.6984090517488943, + "grad_norm": 0.4670262038707733, + "learning_rate": 0.0001, + "loss": 1.7082, + "step": 6080 + }, + { + "epoch": 0.6985239216587215, + "grad_norm": 0.46280163526535034, + "learning_rate": 0.0001, + "loss": 1.7092, + "step": 6081 + }, + { + "epoch": 0.6986387915685486, + "grad_norm": 0.4739822745323181, + "learning_rate": 0.0001, + "loss": 1.6459, + "step": 6082 + }, + { + "epoch": 0.6987536614783757, + "grad_norm": 0.44407156109809875, + "learning_rate": 0.0001, + "loss": 1.6641, + "step": 6083 + }, + { + "epoch": 0.6988685313882028, + "grad_norm": 0.44577091932296753, + "learning_rate": 0.0001, + "loss": 1.5051, + "step": 6084 + }, + { + "epoch": 0.6989834012980299, + "grad_norm": 0.465145468711853, + "learning_rate": 0.0001, + "loss": 1.5938, + "step": 6085 + }, + { + "epoch": 0.6990982712078571, + "grad_norm": 0.47078871726989746, + "learning_rate": 0.0001, + "loss": 1.7108, + "step": 6086 + }, + { + "epoch": 0.6992131411176842, + "grad_norm": 0.4897402226924896, + "learning_rate": 0.0001, + "loss": 1.5122, + "step": 6087 + }, + { + "epoch": 0.6993280110275113, + "grad_norm": 0.46980687975883484, + "learning_rate": 0.0001, + "loss": 1.7254, + "step": 6088 + }, + { + "epoch": 0.6994428809373384, + "grad_norm": 0.41509878635406494, + "learning_rate": 0.0001, + "loss": 1.4403, + "step": 6089 + }, + { + "epoch": 0.6995577508471655, + "grad_norm": 0.3764444887638092, + "learning_rate": 0.0001, + "loss": 1.2123, + "step": 6090 + }, + { + "epoch": 0.6996726207569927, + "grad_norm": 0.4393060803413391, + "learning_rate": 0.0001, + "loss": 1.4743, + "step": 6091 + }, + { + "epoch": 0.6997874906668198, + "grad_norm": 0.45182573795318604, + "learning_rate": 0.0001, + "loss": 1.7701, + "step": 6092 + }, + { + "epoch": 0.6999023605766469, + "grad_norm": 0.4436628818511963, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 6093 + }, + { + "epoch": 0.700017230486474, + "grad_norm": 0.44743016362190247, + "learning_rate": 0.0001, + "loss": 1.7463, + "step": 6094 + }, + { + "epoch": 0.7001321003963011, + "grad_norm": 0.4477095901966095, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 6095 + }, + { + "epoch": 0.7002469703061283, + "grad_norm": 0.43921157717704773, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 6096 + }, + { + "epoch": 0.7003618402159554, + "grad_norm": 0.4518073797225952, + "learning_rate": 0.0001, + "loss": 1.715, + "step": 6097 + }, + { + "epoch": 0.7004767101257825, + "grad_norm": 0.4737764298915863, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 6098 + }, + { + "epoch": 0.7005915800356096, + "grad_norm": 0.42427653074264526, + "learning_rate": 0.0001, + "loss": 1.4087, + "step": 6099 + }, + { + "epoch": 0.7007064499454367, + "grad_norm": 0.42667144536972046, + "learning_rate": 0.0001, + "loss": 1.5881, + "step": 6100 + }, + { + "epoch": 0.700821319855264, + "grad_norm": 0.49589815735816956, + "learning_rate": 0.0001, + "loss": 1.712, + "step": 6101 + }, + { + "epoch": 0.7009361897650911, + "grad_norm": 0.43956196308135986, + "learning_rate": 0.0001, + "loss": 1.5506, + "step": 6102 + }, + { + "epoch": 0.7010510596749182, + "grad_norm": 0.45342060923576355, + "learning_rate": 0.0001, + "loss": 1.6568, + "step": 6103 + }, + { + "epoch": 0.7011659295847453, + "grad_norm": 0.4133893847465515, + "learning_rate": 0.0001, + "loss": 1.4956, + "step": 6104 + }, + { + "epoch": 0.7012807994945724, + "grad_norm": 0.41374027729034424, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 6105 + }, + { + "epoch": 0.7013956694043996, + "grad_norm": 0.44979679584503174, + "learning_rate": 0.0001, + "loss": 1.7201, + "step": 6106 + }, + { + "epoch": 0.7015105393142267, + "grad_norm": 0.42173275351524353, + "learning_rate": 0.0001, + "loss": 1.6273, + "step": 6107 + }, + { + "epoch": 0.7016254092240538, + "grad_norm": 0.4458094835281372, + "learning_rate": 0.0001, + "loss": 1.6467, + "step": 6108 + }, + { + "epoch": 0.7017402791338809, + "grad_norm": 0.44119831919670105, + "learning_rate": 0.0001, + "loss": 1.7528, + "step": 6109 + }, + { + "epoch": 0.701855149043708, + "grad_norm": 0.4866656959056854, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 6110 + }, + { + "epoch": 0.7019700189535352, + "grad_norm": 0.43976983428001404, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 6111 + }, + { + "epoch": 0.7020848888633623, + "grad_norm": 0.4584594666957855, + "learning_rate": 0.0001, + "loss": 1.5199, + "step": 6112 + }, + { + "epoch": 0.7021997587731894, + "grad_norm": 0.4671233594417572, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 6113 + }, + { + "epoch": 0.7023146286830165, + "grad_norm": 0.4047556519508362, + "learning_rate": 0.0001, + "loss": 1.3305, + "step": 6114 + }, + { + "epoch": 0.7024294985928436, + "grad_norm": 0.44763243198394775, + "learning_rate": 0.0001, + "loss": 1.5656, + "step": 6115 + }, + { + "epoch": 0.7025443685026708, + "grad_norm": 0.42392697930336, + "learning_rate": 0.0001, + "loss": 1.5303, + "step": 6116 + }, + { + "epoch": 0.7026592384124979, + "grad_norm": 0.4174063205718994, + "learning_rate": 0.0001, + "loss": 1.4395, + "step": 6117 + }, + { + "epoch": 0.702774108322325, + "grad_norm": 0.44011473655700684, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 6118 + }, + { + "epoch": 0.7028889782321521, + "grad_norm": 0.467358261346817, + "learning_rate": 0.0001, + "loss": 1.7407, + "step": 6119 + }, + { + "epoch": 0.7030038481419792, + "grad_norm": 0.43859270215034485, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 6120 + }, + { + "epoch": 0.7031187180518064, + "grad_norm": 0.45992812514305115, + "learning_rate": 0.0001, + "loss": 1.6784, + "step": 6121 + }, + { + "epoch": 0.7032335879616335, + "grad_norm": 0.5027499198913574, + "learning_rate": 0.0001, + "loss": 1.8963, + "step": 6122 + }, + { + "epoch": 0.7033484578714606, + "grad_norm": 0.4613763689994812, + "learning_rate": 0.0001, + "loss": 1.7305, + "step": 6123 + }, + { + "epoch": 0.7034633277812877, + "grad_norm": 0.44609150290489197, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 6124 + }, + { + "epoch": 0.7035781976911148, + "grad_norm": 0.47288355231285095, + "learning_rate": 0.0001, + "loss": 1.5033, + "step": 6125 + }, + { + "epoch": 0.703693067600942, + "grad_norm": 0.4981932044029236, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 6126 + }, + { + "epoch": 0.7038079375107691, + "grad_norm": 0.43476441502571106, + "learning_rate": 0.0001, + "loss": 1.6783, + "step": 6127 + }, + { + "epoch": 0.7039228074205962, + "grad_norm": 0.4434894323348999, + "learning_rate": 0.0001, + "loss": 1.5384, + "step": 6128 + }, + { + "epoch": 0.7040376773304233, + "grad_norm": 0.4626059830188751, + "learning_rate": 0.0001, + "loss": 1.6981, + "step": 6129 + }, + { + "epoch": 0.7041525472402504, + "grad_norm": 0.43429237604141235, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 6130 + }, + { + "epoch": 0.7042674171500776, + "grad_norm": 0.4962638318538666, + "learning_rate": 0.0001, + "loss": 1.6528, + "step": 6131 + }, + { + "epoch": 0.7043822870599047, + "grad_norm": 0.47368475794792175, + "learning_rate": 0.0001, + "loss": 1.7509, + "step": 6132 + }, + { + "epoch": 0.7044971569697318, + "grad_norm": 0.44226646423339844, + "learning_rate": 0.0001, + "loss": 1.5321, + "step": 6133 + }, + { + "epoch": 0.7046120268795589, + "grad_norm": 0.4369940757751465, + "learning_rate": 0.0001, + "loss": 1.6187, + "step": 6134 + }, + { + "epoch": 0.704726896789386, + "grad_norm": 0.4563344120979309, + "learning_rate": 0.0001, + "loss": 1.7639, + "step": 6135 + }, + { + "epoch": 0.7048417666992132, + "grad_norm": 0.4372788965702057, + "learning_rate": 0.0001, + "loss": 1.6908, + "step": 6136 + }, + { + "epoch": 0.7049566366090403, + "grad_norm": 0.43250662088394165, + "learning_rate": 0.0001, + "loss": 1.6608, + "step": 6137 + }, + { + "epoch": 0.7050715065188674, + "grad_norm": 0.43698424100875854, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 6138 + }, + { + "epoch": 0.7051863764286945, + "grad_norm": 0.4592958390712738, + "learning_rate": 0.0001, + "loss": 1.6603, + "step": 6139 + }, + { + "epoch": 0.7053012463385216, + "grad_norm": 0.4927108883857727, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 6140 + }, + { + "epoch": 0.7054161162483488, + "grad_norm": 0.41103577613830566, + "learning_rate": 0.0001, + "loss": 1.4693, + "step": 6141 + }, + { + "epoch": 0.7055309861581759, + "grad_norm": 0.429688423871994, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 6142 + }, + { + "epoch": 0.705645856068003, + "grad_norm": 0.4417214095592499, + "learning_rate": 0.0001, + "loss": 1.769, + "step": 6143 + }, + { + "epoch": 0.7057607259778301, + "grad_norm": 0.4288574457168579, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 6144 + }, + { + "epoch": 0.7058755958876572, + "grad_norm": 0.4483475089073181, + "learning_rate": 0.0001, + "loss": 1.6976, + "step": 6145 + }, + { + "epoch": 0.7059904657974844, + "grad_norm": 0.44906046986579895, + "learning_rate": 0.0001, + "loss": 1.771, + "step": 6146 + }, + { + "epoch": 0.7061053357073115, + "grad_norm": 0.43787476420402527, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 6147 + }, + { + "epoch": 0.7062202056171386, + "grad_norm": 0.47539424896240234, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 6148 + }, + { + "epoch": 0.7063350755269657, + "grad_norm": 0.4609004855155945, + "learning_rate": 0.0001, + "loss": 1.5423, + "step": 6149 + }, + { + "epoch": 0.7064499454367928, + "grad_norm": 0.443278431892395, + "learning_rate": 0.0001, + "loss": 1.5154, + "step": 6150 + }, + { + "epoch": 0.70656481534662, + "grad_norm": 0.43776434659957886, + "learning_rate": 0.0001, + "loss": 1.518, + "step": 6151 + }, + { + "epoch": 0.7066796852564471, + "grad_norm": 0.4367440342903137, + "learning_rate": 0.0001, + "loss": 1.6544, + "step": 6152 + }, + { + "epoch": 0.7067945551662742, + "grad_norm": 0.4293498694896698, + "learning_rate": 0.0001, + "loss": 1.6271, + "step": 6153 + }, + { + "epoch": 0.7069094250761013, + "grad_norm": 0.46721354126930237, + "learning_rate": 0.0001, + "loss": 1.8351, + "step": 6154 + }, + { + "epoch": 0.7070242949859284, + "grad_norm": 0.4610436260700226, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 6155 + }, + { + "epoch": 0.7071391648957556, + "grad_norm": 0.44414547085762024, + "learning_rate": 0.0001, + "loss": 1.6412, + "step": 6156 + }, + { + "epoch": 0.7072540348055827, + "grad_norm": 0.4219599962234497, + "learning_rate": 0.0001, + "loss": 1.5344, + "step": 6157 + }, + { + "epoch": 0.7073689047154098, + "grad_norm": 0.45348840951919556, + "learning_rate": 0.0001, + "loss": 1.6561, + "step": 6158 + }, + { + "epoch": 0.7074837746252369, + "grad_norm": 0.44765985012054443, + "learning_rate": 0.0001, + "loss": 1.7357, + "step": 6159 + }, + { + "epoch": 0.707598644535064, + "grad_norm": 0.43484190106391907, + "learning_rate": 0.0001, + "loss": 1.6815, + "step": 6160 + }, + { + "epoch": 0.7077135144448912, + "grad_norm": 0.44024690985679626, + "learning_rate": 0.0001, + "loss": 1.5954, + "step": 6161 + }, + { + "epoch": 0.7078283843547183, + "grad_norm": 0.4765664339065552, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 6162 + }, + { + "epoch": 0.7079432542645454, + "grad_norm": 0.4749075174331665, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 6163 + }, + { + "epoch": 0.7080581241743725, + "grad_norm": 0.47169381380081177, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 6164 + }, + { + "epoch": 0.7081729940841996, + "grad_norm": 0.45661336183547974, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 6165 + }, + { + "epoch": 0.7082878639940268, + "grad_norm": 0.45333951711654663, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 6166 + }, + { + "epoch": 0.7084027339038539, + "grad_norm": 0.4652560353279114, + "learning_rate": 0.0001, + "loss": 1.7598, + "step": 6167 + }, + { + "epoch": 0.708517603813681, + "grad_norm": 0.46927064657211304, + "learning_rate": 0.0001, + "loss": 1.782, + "step": 6168 + }, + { + "epoch": 0.7086324737235081, + "grad_norm": 0.44694873690605164, + "learning_rate": 0.0001, + "loss": 1.7146, + "step": 6169 + }, + { + "epoch": 0.7087473436333352, + "grad_norm": 0.4289446473121643, + "learning_rate": 0.0001, + "loss": 1.6457, + "step": 6170 + }, + { + "epoch": 0.7088622135431624, + "grad_norm": 0.41569510102272034, + "learning_rate": 0.0001, + "loss": 1.4681, + "step": 6171 + }, + { + "epoch": 0.7089770834529895, + "grad_norm": 0.42229345440864563, + "learning_rate": 0.0001, + "loss": 1.6183, + "step": 6172 + }, + { + "epoch": 0.7090919533628166, + "grad_norm": 0.4784461557865143, + "learning_rate": 0.0001, + "loss": 1.727, + "step": 6173 + }, + { + "epoch": 0.7092068232726437, + "grad_norm": 0.40896356105804443, + "learning_rate": 0.0001, + "loss": 1.3902, + "step": 6174 + }, + { + "epoch": 0.7093216931824708, + "grad_norm": 0.4578010141849518, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 6175 + }, + { + "epoch": 0.709436563092298, + "grad_norm": 0.4804964065551758, + "learning_rate": 0.0001, + "loss": 1.5759, + "step": 6176 + }, + { + "epoch": 0.7095514330021251, + "grad_norm": 0.4631587266921997, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 6177 + }, + { + "epoch": 0.7096663029119522, + "grad_norm": 0.4732271134853363, + "learning_rate": 0.0001, + "loss": 1.7075, + "step": 6178 + }, + { + "epoch": 0.7097811728217793, + "grad_norm": 0.4341810643672943, + "learning_rate": 0.0001, + "loss": 1.6526, + "step": 6179 + }, + { + "epoch": 0.7098960427316064, + "grad_norm": 0.4346090853214264, + "learning_rate": 0.0001, + "loss": 1.6048, + "step": 6180 + }, + { + "epoch": 0.7100109126414336, + "grad_norm": 0.4299982190132141, + "learning_rate": 0.0001, + "loss": 1.4397, + "step": 6181 + }, + { + "epoch": 0.7101257825512607, + "grad_norm": 0.4570772647857666, + "learning_rate": 0.0001, + "loss": 1.7246, + "step": 6182 + }, + { + "epoch": 0.7102406524610878, + "grad_norm": 0.44084087014198303, + "learning_rate": 0.0001, + "loss": 1.664, + "step": 6183 + }, + { + "epoch": 0.7103555223709149, + "grad_norm": 0.4548969566822052, + "learning_rate": 0.0001, + "loss": 1.707, + "step": 6184 + }, + { + "epoch": 0.710470392280742, + "grad_norm": 0.40112459659576416, + "learning_rate": 0.0001, + "loss": 1.4677, + "step": 6185 + }, + { + "epoch": 0.7105852621905692, + "grad_norm": 0.4894128143787384, + "learning_rate": 0.0001, + "loss": 1.7654, + "step": 6186 + }, + { + "epoch": 0.7107001321003963, + "grad_norm": 0.46307554841041565, + "learning_rate": 0.0001, + "loss": 1.731, + "step": 6187 + }, + { + "epoch": 0.7108150020102234, + "grad_norm": 0.44500499963760376, + "learning_rate": 0.0001, + "loss": 1.4718, + "step": 6188 + }, + { + "epoch": 0.7109298719200505, + "grad_norm": 0.46841320395469666, + "learning_rate": 0.0001, + "loss": 1.7267, + "step": 6189 + }, + { + "epoch": 0.7110447418298776, + "grad_norm": 0.452532023191452, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 6190 + }, + { + "epoch": 0.7111596117397048, + "grad_norm": 0.4364396929740906, + "learning_rate": 0.0001, + "loss": 1.4916, + "step": 6191 + }, + { + "epoch": 0.7112744816495319, + "grad_norm": 0.46478208899497986, + "learning_rate": 0.0001, + "loss": 1.7538, + "step": 6192 + }, + { + "epoch": 0.711389351559359, + "grad_norm": 0.5013893246650696, + "learning_rate": 0.0001, + "loss": 1.9132, + "step": 6193 + }, + { + "epoch": 0.7115042214691861, + "grad_norm": 0.4351854622364044, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 6194 + }, + { + "epoch": 0.7116190913790132, + "grad_norm": 0.4540832042694092, + "learning_rate": 0.0001, + "loss": 1.782, + "step": 6195 + }, + { + "epoch": 0.7117339612888404, + "grad_norm": 0.444970041513443, + "learning_rate": 0.0001, + "loss": 1.5446, + "step": 6196 + }, + { + "epoch": 0.7118488311986675, + "grad_norm": 0.4413135349750519, + "learning_rate": 0.0001, + "loss": 1.4519, + "step": 6197 + }, + { + "epoch": 0.7119637011084946, + "grad_norm": 0.4258001148700714, + "learning_rate": 0.0001, + "loss": 1.6815, + "step": 6198 + }, + { + "epoch": 0.7120785710183217, + "grad_norm": 0.4716213047504425, + "learning_rate": 0.0001, + "loss": 1.7128, + "step": 6199 + }, + { + "epoch": 0.7121934409281488, + "grad_norm": 0.4352003335952759, + "learning_rate": 0.0001, + "loss": 1.6439, + "step": 6200 + }, + { + "epoch": 0.712308310837976, + "grad_norm": 0.43441304564476013, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 6201 + }, + { + "epoch": 0.7124231807478031, + "grad_norm": 0.4781912565231323, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 6202 + }, + { + "epoch": 0.7125380506576302, + "grad_norm": 0.4544585347175598, + "learning_rate": 0.0001, + "loss": 1.7234, + "step": 6203 + }, + { + "epoch": 0.7126529205674573, + "grad_norm": 0.398957222700119, + "learning_rate": 0.0001, + "loss": 1.431, + "step": 6204 + }, + { + "epoch": 0.7127677904772844, + "grad_norm": 0.44276830554008484, + "learning_rate": 0.0001, + "loss": 1.4976, + "step": 6205 + }, + { + "epoch": 0.7128826603871116, + "grad_norm": 0.44275638461112976, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 6206 + }, + { + "epoch": 0.7129975302969387, + "grad_norm": 0.5160487294197083, + "learning_rate": 0.0001, + "loss": 1.6689, + "step": 6207 + }, + { + "epoch": 0.7131124002067658, + "grad_norm": 0.44017454981803894, + "learning_rate": 0.0001, + "loss": 1.6119, + "step": 6208 + }, + { + "epoch": 0.7132272701165929, + "grad_norm": 0.4619143307209015, + "learning_rate": 0.0001, + "loss": 1.8167, + "step": 6209 + }, + { + "epoch": 0.71334214002642, + "grad_norm": 0.4661194086074829, + "learning_rate": 0.0001, + "loss": 1.5478, + "step": 6210 + }, + { + "epoch": 0.7134570099362472, + "grad_norm": 0.4605958163738251, + "learning_rate": 0.0001, + "loss": 1.7684, + "step": 6211 + }, + { + "epoch": 0.7135718798460743, + "grad_norm": 0.463900089263916, + "learning_rate": 0.0001, + "loss": 1.7286, + "step": 6212 + }, + { + "epoch": 0.7136867497559014, + "grad_norm": 0.48637497425079346, + "learning_rate": 0.0001, + "loss": 1.8437, + "step": 6213 + }, + { + "epoch": 0.7138016196657285, + "grad_norm": 0.40677204728126526, + "learning_rate": 0.0001, + "loss": 1.4627, + "step": 6214 + }, + { + "epoch": 0.7139164895755556, + "grad_norm": 0.5679096579551697, + "learning_rate": 0.0001, + "loss": 1.4847, + "step": 6215 + }, + { + "epoch": 0.7140313594853828, + "grad_norm": 0.4575541019439697, + "learning_rate": 0.0001, + "loss": 1.7186, + "step": 6216 + }, + { + "epoch": 0.7141462293952099, + "grad_norm": 0.4405764043331146, + "learning_rate": 0.0001, + "loss": 1.7805, + "step": 6217 + }, + { + "epoch": 0.714261099305037, + "grad_norm": 0.4583107531070709, + "learning_rate": 0.0001, + "loss": 1.625, + "step": 6218 + }, + { + "epoch": 0.7143759692148641, + "grad_norm": 0.44853004813194275, + "learning_rate": 0.0001, + "loss": 1.4672, + "step": 6219 + }, + { + "epoch": 0.7144908391246912, + "grad_norm": 0.4117748439311981, + "learning_rate": 0.0001, + "loss": 1.4321, + "step": 6220 + }, + { + "epoch": 0.7146057090345184, + "grad_norm": 0.43110406398773193, + "learning_rate": 0.0001, + "loss": 1.483, + "step": 6221 + }, + { + "epoch": 0.7147205789443455, + "grad_norm": 0.46626466512680054, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 6222 + }, + { + "epoch": 0.7148354488541726, + "grad_norm": 0.4862065017223358, + "learning_rate": 0.0001, + "loss": 1.6176, + "step": 6223 + }, + { + "epoch": 0.7149503187639997, + "grad_norm": 0.4461599588394165, + "learning_rate": 0.0001, + "loss": 1.4846, + "step": 6224 + }, + { + "epoch": 0.7150651886738268, + "grad_norm": 0.4415801763534546, + "learning_rate": 0.0001, + "loss": 1.7068, + "step": 6225 + }, + { + "epoch": 0.715180058583654, + "grad_norm": 0.4292342960834503, + "learning_rate": 0.0001, + "loss": 1.5751, + "step": 6226 + }, + { + "epoch": 0.7152949284934811, + "grad_norm": 0.4549332857131958, + "learning_rate": 0.0001, + "loss": 1.699, + "step": 6227 + }, + { + "epoch": 0.7154097984033082, + "grad_norm": 0.4223959147930145, + "learning_rate": 0.0001, + "loss": 1.4203, + "step": 6228 + }, + { + "epoch": 0.7155246683131353, + "grad_norm": 0.42485642433166504, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 6229 + }, + { + "epoch": 0.7156395382229624, + "grad_norm": 0.45224228501319885, + "learning_rate": 0.0001, + "loss": 1.4644, + "step": 6230 + }, + { + "epoch": 0.7157544081327896, + "grad_norm": 0.4319050908088684, + "learning_rate": 0.0001, + "loss": 1.4666, + "step": 6231 + }, + { + "epoch": 0.7158692780426167, + "grad_norm": 0.4529288113117218, + "learning_rate": 0.0001, + "loss": 1.7083, + "step": 6232 + }, + { + "epoch": 0.7159841479524438, + "grad_norm": 0.4431191682815552, + "learning_rate": 0.0001, + "loss": 1.5841, + "step": 6233 + }, + { + "epoch": 0.7160990178622709, + "grad_norm": 0.44510093331336975, + "learning_rate": 0.0001, + "loss": 1.54, + "step": 6234 + }, + { + "epoch": 0.716213887772098, + "grad_norm": 0.43626534938812256, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 6235 + }, + { + "epoch": 0.7163287576819252, + "grad_norm": 0.4344061017036438, + "learning_rate": 0.0001, + "loss": 1.3115, + "step": 6236 + }, + { + "epoch": 0.7164436275917523, + "grad_norm": 0.5267593860626221, + "learning_rate": 0.0001, + "loss": 1.9086, + "step": 6237 + }, + { + "epoch": 0.7165584975015795, + "grad_norm": 0.4149724245071411, + "learning_rate": 0.0001, + "loss": 1.6038, + "step": 6238 + }, + { + "epoch": 0.7166733674114066, + "grad_norm": 0.4510195255279541, + "learning_rate": 0.0001, + "loss": 1.8163, + "step": 6239 + }, + { + "epoch": 0.7167882373212338, + "grad_norm": 0.4458303451538086, + "learning_rate": 0.0001, + "loss": 1.5324, + "step": 6240 + }, + { + "epoch": 0.7169031072310609, + "grad_norm": 0.4366897642612457, + "learning_rate": 0.0001, + "loss": 1.6088, + "step": 6241 + }, + { + "epoch": 0.717017977140888, + "grad_norm": 0.43107160925865173, + "learning_rate": 0.0001, + "loss": 1.5355, + "step": 6242 + }, + { + "epoch": 0.7171328470507151, + "grad_norm": 0.43756523728370667, + "learning_rate": 0.0001, + "loss": 1.5967, + "step": 6243 + }, + { + "epoch": 0.7172477169605422, + "grad_norm": 0.4852414131164551, + "learning_rate": 0.0001, + "loss": 1.8807, + "step": 6244 + }, + { + "epoch": 0.7173625868703694, + "grad_norm": 0.43804264068603516, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 6245 + }, + { + "epoch": 0.7174774567801965, + "grad_norm": 0.4281727075576782, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 6246 + }, + { + "epoch": 0.7175923266900236, + "grad_norm": 0.45637455582618713, + "learning_rate": 0.0001, + "loss": 1.6597, + "step": 6247 + }, + { + "epoch": 0.7177071965998507, + "grad_norm": 0.48942601680755615, + "learning_rate": 0.0001, + "loss": 1.7886, + "step": 6248 + }, + { + "epoch": 0.7178220665096778, + "grad_norm": 0.44699156284332275, + "learning_rate": 0.0001, + "loss": 1.6049, + "step": 6249 + }, + { + "epoch": 0.717936936419505, + "grad_norm": 0.45388197898864746, + "learning_rate": 0.0001, + "loss": 1.5923, + "step": 6250 + }, + { + "epoch": 0.7180518063293321, + "grad_norm": 0.4573292136192322, + "learning_rate": 0.0001, + "loss": 1.3832, + "step": 6251 + }, + { + "epoch": 0.7181666762391592, + "grad_norm": 0.4720889925956726, + "learning_rate": 0.0001, + "loss": 1.698, + "step": 6252 + }, + { + "epoch": 0.7182815461489863, + "grad_norm": 0.44124001264572144, + "learning_rate": 0.0001, + "loss": 1.468, + "step": 6253 + }, + { + "epoch": 0.7183964160588134, + "grad_norm": 0.48664864897727966, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 6254 + }, + { + "epoch": 0.7185112859686406, + "grad_norm": 0.44402679800987244, + "learning_rate": 0.0001, + "loss": 1.5786, + "step": 6255 + }, + { + "epoch": 0.7186261558784677, + "grad_norm": 0.46472397446632385, + "learning_rate": 0.0001, + "loss": 1.4484, + "step": 6256 + }, + { + "epoch": 0.7187410257882948, + "grad_norm": 0.49754804372787476, + "learning_rate": 0.0001, + "loss": 1.9748, + "step": 6257 + }, + { + "epoch": 0.7188558956981219, + "grad_norm": 0.439335435628891, + "learning_rate": 0.0001, + "loss": 1.695, + "step": 6258 + }, + { + "epoch": 0.718970765607949, + "grad_norm": 0.4374498426914215, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 6259 + }, + { + "epoch": 0.7190856355177762, + "grad_norm": 0.4089290201663971, + "learning_rate": 0.0001, + "loss": 1.4817, + "step": 6260 + }, + { + "epoch": 0.7192005054276033, + "grad_norm": 0.4366007447242737, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 6261 + }, + { + "epoch": 0.7193153753374304, + "grad_norm": 0.44272562861442566, + "learning_rate": 0.0001, + "loss": 1.4592, + "step": 6262 + }, + { + "epoch": 0.7194302452472575, + "grad_norm": 0.4497889280319214, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 6263 + }, + { + "epoch": 0.7195451151570846, + "grad_norm": 0.43780991435050964, + "learning_rate": 0.0001, + "loss": 1.5423, + "step": 6264 + }, + { + "epoch": 0.7196599850669118, + "grad_norm": 0.4534786343574524, + "learning_rate": 0.0001, + "loss": 1.7093, + "step": 6265 + }, + { + "epoch": 0.7197748549767389, + "grad_norm": 0.42275941371917725, + "learning_rate": 0.0001, + "loss": 1.4069, + "step": 6266 + }, + { + "epoch": 0.719889724886566, + "grad_norm": 0.47264930605888367, + "learning_rate": 0.0001, + "loss": 1.6001, + "step": 6267 + }, + { + "epoch": 0.7200045947963931, + "grad_norm": 0.4754311740398407, + "learning_rate": 0.0001, + "loss": 1.8423, + "step": 6268 + }, + { + "epoch": 0.7201194647062202, + "grad_norm": 0.4460528492927551, + "learning_rate": 0.0001, + "loss": 1.4606, + "step": 6269 + }, + { + "epoch": 0.7202343346160474, + "grad_norm": 0.43937763571739197, + "learning_rate": 0.0001, + "loss": 1.5779, + "step": 6270 + }, + { + "epoch": 0.7203492045258745, + "grad_norm": 0.4385170638561249, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 6271 + }, + { + "epoch": 0.7204640744357016, + "grad_norm": 0.4786663353443146, + "learning_rate": 0.0001, + "loss": 1.3311, + "step": 6272 + }, + { + "epoch": 0.7205789443455287, + "grad_norm": 0.45746463537216187, + "learning_rate": 0.0001, + "loss": 1.6661, + "step": 6273 + }, + { + "epoch": 0.7206938142553558, + "grad_norm": 0.4555452764034271, + "learning_rate": 0.0001, + "loss": 1.6727, + "step": 6274 + }, + { + "epoch": 0.720808684165183, + "grad_norm": 0.4532817006111145, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 6275 + }, + { + "epoch": 0.7209235540750101, + "grad_norm": 0.42538461089134216, + "learning_rate": 0.0001, + "loss": 1.5689, + "step": 6276 + }, + { + "epoch": 0.7210384239848372, + "grad_norm": 0.4184345006942749, + "learning_rate": 0.0001, + "loss": 1.442, + "step": 6277 + }, + { + "epoch": 0.7211532938946643, + "grad_norm": 0.4728604853153229, + "learning_rate": 0.0001, + "loss": 1.7058, + "step": 6278 + }, + { + "epoch": 0.7212681638044914, + "grad_norm": 0.45347052812576294, + "learning_rate": 0.0001, + "loss": 1.4433, + "step": 6279 + }, + { + "epoch": 0.7213830337143186, + "grad_norm": 0.4866488575935364, + "learning_rate": 0.0001, + "loss": 1.7027, + "step": 6280 + }, + { + "epoch": 0.7214979036241457, + "grad_norm": 0.48550790548324585, + "learning_rate": 0.0001, + "loss": 1.6111, + "step": 6281 + }, + { + "epoch": 0.7216127735339728, + "grad_norm": 0.45319709181785583, + "learning_rate": 0.0001, + "loss": 1.5712, + "step": 6282 + }, + { + "epoch": 0.7217276434437999, + "grad_norm": 0.49187415838241577, + "learning_rate": 0.0001, + "loss": 1.7311, + "step": 6283 + }, + { + "epoch": 0.721842513353627, + "grad_norm": 0.4277302324771881, + "learning_rate": 0.0001, + "loss": 1.5716, + "step": 6284 + }, + { + "epoch": 0.7219573832634542, + "grad_norm": 0.4738651216030121, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 6285 + }, + { + "epoch": 0.7220722531732813, + "grad_norm": 0.4568105638027191, + "learning_rate": 0.0001, + "loss": 1.5111, + "step": 6286 + }, + { + "epoch": 0.7221871230831084, + "grad_norm": 0.41216906905174255, + "learning_rate": 0.0001, + "loss": 1.4644, + "step": 6287 + }, + { + "epoch": 0.7223019929929355, + "grad_norm": 0.4425976574420929, + "learning_rate": 0.0001, + "loss": 1.6404, + "step": 6288 + }, + { + "epoch": 0.7224168629027626, + "grad_norm": 0.42881807684898376, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 6289 + }, + { + "epoch": 0.7225317328125898, + "grad_norm": 0.41956013441085815, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 6290 + }, + { + "epoch": 0.7226466027224169, + "grad_norm": 0.46805068850517273, + "learning_rate": 0.0001, + "loss": 1.727, + "step": 6291 + }, + { + "epoch": 0.722761472632244, + "grad_norm": 0.432712197303772, + "learning_rate": 0.0001, + "loss": 1.4838, + "step": 6292 + }, + { + "epoch": 0.7228763425420711, + "grad_norm": 0.4402911067008972, + "learning_rate": 0.0001, + "loss": 1.7633, + "step": 6293 + }, + { + "epoch": 0.7229912124518982, + "grad_norm": 0.44642138481140137, + "learning_rate": 0.0001, + "loss": 1.5206, + "step": 6294 + }, + { + "epoch": 0.7231060823617254, + "grad_norm": 0.43020543456077576, + "learning_rate": 0.0001, + "loss": 1.7095, + "step": 6295 + }, + { + "epoch": 0.7232209522715525, + "grad_norm": 0.4489821195602417, + "learning_rate": 0.0001, + "loss": 1.5012, + "step": 6296 + }, + { + "epoch": 0.7233358221813796, + "grad_norm": 0.427055299282074, + "learning_rate": 0.0001, + "loss": 1.5981, + "step": 6297 + }, + { + "epoch": 0.7234506920912067, + "grad_norm": 0.41614022850990295, + "learning_rate": 0.0001, + "loss": 1.3573, + "step": 6298 + }, + { + "epoch": 0.7235655620010338, + "grad_norm": 0.4344491958618164, + "learning_rate": 0.0001, + "loss": 1.6139, + "step": 6299 + }, + { + "epoch": 0.723680431910861, + "grad_norm": 0.43843087553977966, + "learning_rate": 0.0001, + "loss": 1.1252, + "step": 6300 + }, + { + "epoch": 0.7237953018206881, + "grad_norm": 0.4700905680656433, + "learning_rate": 0.0001, + "loss": 1.48, + "step": 6301 + }, + { + "epoch": 0.7239101717305152, + "grad_norm": 0.48474234342575073, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 6302 + }, + { + "epoch": 0.7240250416403423, + "grad_norm": 0.44834601879119873, + "learning_rate": 0.0001, + "loss": 1.3831, + "step": 6303 + }, + { + "epoch": 0.7241399115501694, + "grad_norm": 0.4446815252304077, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 6304 + }, + { + "epoch": 0.7242547814599966, + "grad_norm": 0.4817836880683899, + "learning_rate": 0.0001, + "loss": 1.5911, + "step": 6305 + }, + { + "epoch": 0.7243696513698237, + "grad_norm": 0.5483472347259521, + "learning_rate": 0.0001, + "loss": 1.6487, + "step": 6306 + }, + { + "epoch": 0.7244845212796508, + "grad_norm": 0.42818188667297363, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 6307 + }, + { + "epoch": 0.7245993911894779, + "grad_norm": 0.49449649453163147, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 6308 + }, + { + "epoch": 0.724714261099305, + "grad_norm": 0.4369608163833618, + "learning_rate": 0.0001, + "loss": 1.5778, + "step": 6309 + }, + { + "epoch": 0.7248291310091322, + "grad_norm": 0.5256768465042114, + "learning_rate": 0.0001, + "loss": 1.583, + "step": 6310 + }, + { + "epoch": 0.7249440009189593, + "grad_norm": 0.44161832332611084, + "learning_rate": 0.0001, + "loss": 1.5239, + "step": 6311 + }, + { + "epoch": 0.7250588708287864, + "grad_norm": 0.4450240731239319, + "learning_rate": 0.0001, + "loss": 1.3923, + "step": 6312 + }, + { + "epoch": 0.7251737407386135, + "grad_norm": 0.45049697160720825, + "learning_rate": 0.0001, + "loss": 1.6166, + "step": 6313 + }, + { + "epoch": 0.7252886106484406, + "grad_norm": 0.43609675765037537, + "learning_rate": 0.0001, + "loss": 1.7061, + "step": 6314 + }, + { + "epoch": 0.7254034805582678, + "grad_norm": 0.516049325466156, + "learning_rate": 0.0001, + "loss": 1.7453, + "step": 6315 + }, + { + "epoch": 0.7255183504680949, + "grad_norm": 0.45927536487579346, + "learning_rate": 0.0001, + "loss": 1.5909, + "step": 6316 + }, + { + "epoch": 0.725633220377922, + "grad_norm": 0.4337046146392822, + "learning_rate": 0.0001, + "loss": 1.6932, + "step": 6317 + }, + { + "epoch": 0.7257480902877491, + "grad_norm": 0.44162049889564514, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 6318 + }, + { + "epoch": 0.7258629601975762, + "grad_norm": 0.4465755224227905, + "learning_rate": 0.0001, + "loss": 1.5295, + "step": 6319 + }, + { + "epoch": 0.7259778301074034, + "grad_norm": 0.4506109654903412, + "learning_rate": 0.0001, + "loss": 1.6735, + "step": 6320 + }, + { + "epoch": 0.7260927000172305, + "grad_norm": 0.45097821950912476, + "learning_rate": 0.0001, + "loss": 1.595, + "step": 6321 + }, + { + "epoch": 0.7262075699270576, + "grad_norm": 0.4603080749511719, + "learning_rate": 0.0001, + "loss": 1.7865, + "step": 6322 + }, + { + "epoch": 0.7263224398368847, + "grad_norm": 0.4264637231826782, + "learning_rate": 0.0001, + "loss": 1.7495, + "step": 6323 + }, + { + "epoch": 0.7264373097467118, + "grad_norm": 0.4358821213245392, + "learning_rate": 0.0001, + "loss": 1.7046, + "step": 6324 + }, + { + "epoch": 0.726552179656539, + "grad_norm": 0.4452092945575714, + "learning_rate": 0.0001, + "loss": 1.6058, + "step": 6325 + }, + { + "epoch": 0.7266670495663661, + "grad_norm": 0.5116655230522156, + "learning_rate": 0.0001, + "loss": 1.7254, + "step": 6326 + }, + { + "epoch": 0.7267819194761932, + "grad_norm": 0.4238274097442627, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 6327 + }, + { + "epoch": 0.7268967893860203, + "grad_norm": 0.4614119231700897, + "learning_rate": 0.0001, + "loss": 1.7239, + "step": 6328 + }, + { + "epoch": 0.7270116592958474, + "grad_norm": 0.47386687994003296, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 6329 + }, + { + "epoch": 0.7271265292056746, + "grad_norm": 0.4244793653488159, + "learning_rate": 0.0001, + "loss": 1.5649, + "step": 6330 + }, + { + "epoch": 0.7272413991155017, + "grad_norm": 0.44631174206733704, + "learning_rate": 0.0001, + "loss": 1.4767, + "step": 6331 + }, + { + "epoch": 0.7273562690253288, + "grad_norm": 0.4230101406574249, + "learning_rate": 0.0001, + "loss": 1.4192, + "step": 6332 + }, + { + "epoch": 0.7274711389351559, + "grad_norm": 0.42536047101020813, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 6333 + }, + { + "epoch": 0.727586008844983, + "grad_norm": 0.45973941683769226, + "learning_rate": 0.0001, + "loss": 1.7336, + "step": 6334 + }, + { + "epoch": 0.7277008787548102, + "grad_norm": 0.4620080888271332, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 6335 + }, + { + "epoch": 0.7278157486646373, + "grad_norm": 0.5099964737892151, + "learning_rate": 0.0001, + "loss": 1.5348, + "step": 6336 + }, + { + "epoch": 0.7279306185744644, + "grad_norm": 0.4300444722175598, + "learning_rate": 0.0001, + "loss": 1.3811, + "step": 6337 + }, + { + "epoch": 0.7280454884842915, + "grad_norm": 0.5135782957077026, + "learning_rate": 0.0001, + "loss": 1.8914, + "step": 6338 + }, + { + "epoch": 0.7281603583941186, + "grad_norm": 0.4119094908237457, + "learning_rate": 0.0001, + "loss": 1.5381, + "step": 6339 + }, + { + "epoch": 0.7282752283039458, + "grad_norm": 0.4423353374004364, + "learning_rate": 0.0001, + "loss": 1.6617, + "step": 6340 + }, + { + "epoch": 0.7283900982137729, + "grad_norm": 0.4462561011314392, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 6341 + }, + { + "epoch": 0.7285049681236, + "grad_norm": 0.47333210706710815, + "learning_rate": 0.0001, + "loss": 1.6761, + "step": 6342 + }, + { + "epoch": 0.7286198380334271, + "grad_norm": 0.46242082118988037, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 6343 + }, + { + "epoch": 0.7287347079432542, + "grad_norm": 0.4549300968647003, + "learning_rate": 0.0001, + "loss": 1.6149, + "step": 6344 + }, + { + "epoch": 0.7288495778530814, + "grad_norm": 0.46361663937568665, + "learning_rate": 0.0001, + "loss": 1.7257, + "step": 6345 + }, + { + "epoch": 0.7289644477629085, + "grad_norm": 0.4788486361503601, + "learning_rate": 0.0001, + "loss": 1.6031, + "step": 6346 + }, + { + "epoch": 0.7290793176727356, + "grad_norm": 0.4863881766796112, + "learning_rate": 0.0001, + "loss": 1.8181, + "step": 6347 + }, + { + "epoch": 0.7291941875825627, + "grad_norm": 0.4203285276889801, + "learning_rate": 0.0001, + "loss": 1.5597, + "step": 6348 + }, + { + "epoch": 0.7293090574923898, + "grad_norm": 0.4476950168609619, + "learning_rate": 0.0001, + "loss": 1.5288, + "step": 6349 + }, + { + "epoch": 0.729423927402217, + "grad_norm": 0.4235599935054779, + "learning_rate": 0.0001, + "loss": 1.3824, + "step": 6350 + }, + { + "epoch": 0.7295387973120441, + "grad_norm": 0.4462408423423767, + "learning_rate": 0.0001, + "loss": 1.5426, + "step": 6351 + }, + { + "epoch": 0.7296536672218712, + "grad_norm": 0.40480706095695496, + "learning_rate": 0.0001, + "loss": 1.4957, + "step": 6352 + }, + { + "epoch": 0.7297685371316983, + "grad_norm": 0.4449676275253296, + "learning_rate": 0.0001, + "loss": 1.5833, + "step": 6353 + }, + { + "epoch": 0.7298834070415254, + "grad_norm": 0.43793755769729614, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 6354 + }, + { + "epoch": 0.7299982769513526, + "grad_norm": 0.45420295000076294, + "learning_rate": 0.0001, + "loss": 1.7223, + "step": 6355 + }, + { + "epoch": 0.7301131468611797, + "grad_norm": 0.5009284615516663, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 6356 + }, + { + "epoch": 0.7302280167710068, + "grad_norm": 0.5149348974227905, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 6357 + }, + { + "epoch": 0.7303428866808339, + "grad_norm": 0.4348343014717102, + "learning_rate": 0.0001, + "loss": 1.6413, + "step": 6358 + }, + { + "epoch": 0.730457756590661, + "grad_norm": 0.44643691182136536, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 6359 + }, + { + "epoch": 0.7305726265004882, + "grad_norm": 0.42023977637290955, + "learning_rate": 0.0001, + "loss": 1.5319, + "step": 6360 + }, + { + "epoch": 0.7306874964103153, + "grad_norm": 0.49339669942855835, + "learning_rate": 0.0001, + "loss": 1.7135, + "step": 6361 + }, + { + "epoch": 0.7308023663201424, + "grad_norm": 0.44311895966529846, + "learning_rate": 0.0001, + "loss": 1.5733, + "step": 6362 + }, + { + "epoch": 0.7309172362299695, + "grad_norm": 0.4121879041194916, + "learning_rate": 0.0001, + "loss": 1.3989, + "step": 6363 + }, + { + "epoch": 0.7310321061397966, + "grad_norm": 0.43048325181007385, + "learning_rate": 0.0001, + "loss": 1.6959, + "step": 6364 + }, + { + "epoch": 0.7311469760496238, + "grad_norm": 0.4717782735824585, + "learning_rate": 0.0001, + "loss": 1.6826, + "step": 6365 + }, + { + "epoch": 0.7312618459594509, + "grad_norm": 0.46323728561401367, + "learning_rate": 0.0001, + "loss": 1.6805, + "step": 6366 + }, + { + "epoch": 0.731376715869278, + "grad_norm": 0.4346836805343628, + "learning_rate": 0.0001, + "loss": 1.5547, + "step": 6367 + }, + { + "epoch": 0.7314915857791051, + "grad_norm": 0.47042396664619446, + "learning_rate": 0.0001, + "loss": 1.5663, + "step": 6368 + }, + { + "epoch": 0.7316064556889322, + "grad_norm": 0.44026100635528564, + "learning_rate": 0.0001, + "loss": 1.618, + "step": 6369 + }, + { + "epoch": 0.7317213255987594, + "grad_norm": 0.428864449262619, + "learning_rate": 0.0001, + "loss": 1.4316, + "step": 6370 + }, + { + "epoch": 0.7318361955085865, + "grad_norm": 0.4537869989871979, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 6371 + }, + { + "epoch": 0.7319510654184136, + "grad_norm": 0.45302248001098633, + "learning_rate": 0.0001, + "loss": 1.5901, + "step": 6372 + }, + { + "epoch": 0.7320659353282407, + "grad_norm": 0.4309695363044739, + "learning_rate": 0.0001, + "loss": 1.6486, + "step": 6373 + }, + { + "epoch": 0.7321808052380678, + "grad_norm": 0.4645009934902191, + "learning_rate": 0.0001, + "loss": 1.5925, + "step": 6374 + }, + { + "epoch": 0.7322956751478951, + "grad_norm": 0.42796799540519714, + "learning_rate": 0.0001, + "loss": 1.6142, + "step": 6375 + }, + { + "epoch": 0.7324105450577222, + "grad_norm": 0.4301031231880188, + "learning_rate": 0.0001, + "loss": 1.4068, + "step": 6376 + }, + { + "epoch": 0.7325254149675493, + "grad_norm": 0.4349769055843353, + "learning_rate": 0.0001, + "loss": 1.5291, + "step": 6377 + }, + { + "epoch": 0.7326402848773764, + "grad_norm": 0.4733826518058777, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 6378 + }, + { + "epoch": 0.7327551547872035, + "grad_norm": 0.4366433322429657, + "learning_rate": 0.0001, + "loss": 1.5781, + "step": 6379 + }, + { + "epoch": 0.7328700246970307, + "grad_norm": 0.47278228402137756, + "learning_rate": 0.0001, + "loss": 1.6445, + "step": 6380 + }, + { + "epoch": 0.7329848946068578, + "grad_norm": 0.4604305922985077, + "learning_rate": 0.0001, + "loss": 1.7437, + "step": 6381 + }, + { + "epoch": 0.7330997645166849, + "grad_norm": 0.45044025778770447, + "learning_rate": 0.0001, + "loss": 1.649, + "step": 6382 + }, + { + "epoch": 0.733214634426512, + "grad_norm": 0.4326942563056946, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 6383 + }, + { + "epoch": 0.7333295043363391, + "grad_norm": 0.4268265664577484, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 6384 + }, + { + "epoch": 0.7334443742461663, + "grad_norm": 0.49201422929763794, + "learning_rate": 0.0001, + "loss": 1.8518, + "step": 6385 + }, + { + "epoch": 0.7335592441559934, + "grad_norm": 0.4553898274898529, + "learning_rate": 0.0001, + "loss": 1.8142, + "step": 6386 + }, + { + "epoch": 0.7336741140658205, + "grad_norm": 0.4906349778175354, + "learning_rate": 0.0001, + "loss": 1.8773, + "step": 6387 + }, + { + "epoch": 0.7337889839756476, + "grad_norm": 0.4062640070915222, + "learning_rate": 0.0001, + "loss": 1.4207, + "step": 6388 + }, + { + "epoch": 0.7339038538854747, + "grad_norm": 0.4532264173030853, + "learning_rate": 0.0001, + "loss": 1.7094, + "step": 6389 + }, + { + "epoch": 0.7340187237953019, + "grad_norm": 0.4875713586807251, + "learning_rate": 0.0001, + "loss": 1.835, + "step": 6390 + }, + { + "epoch": 0.734133593705129, + "grad_norm": 0.4259282052516937, + "learning_rate": 0.0001, + "loss": 1.4534, + "step": 6391 + }, + { + "epoch": 0.7342484636149561, + "grad_norm": 0.4660018980503082, + "learning_rate": 0.0001, + "loss": 1.7193, + "step": 6392 + }, + { + "epoch": 0.7343633335247832, + "grad_norm": 0.4709480106830597, + "learning_rate": 0.0001, + "loss": 1.7843, + "step": 6393 + }, + { + "epoch": 0.7344782034346103, + "grad_norm": 0.4932401180267334, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 6394 + }, + { + "epoch": 0.7345930733444375, + "grad_norm": 0.4418211281299591, + "learning_rate": 0.0001, + "loss": 1.51, + "step": 6395 + }, + { + "epoch": 0.7347079432542646, + "grad_norm": 0.4617857038974762, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 6396 + }, + { + "epoch": 0.7348228131640917, + "grad_norm": 0.44936829805374146, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 6397 + }, + { + "epoch": 0.7349376830739188, + "grad_norm": 0.49701419472694397, + "learning_rate": 0.0001, + "loss": 1.7478, + "step": 6398 + }, + { + "epoch": 0.7350525529837459, + "grad_norm": 0.42285898327827454, + "learning_rate": 0.0001, + "loss": 1.4871, + "step": 6399 + }, + { + "epoch": 0.7351674228935731, + "grad_norm": 0.46721500158309937, + "learning_rate": 0.0001, + "loss": 1.535, + "step": 6400 + }, + { + "epoch": 0.7352822928034002, + "grad_norm": 0.4647897779941559, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 6401 + }, + { + "epoch": 0.7353971627132273, + "grad_norm": 0.48368552327156067, + "learning_rate": 0.0001, + "loss": 1.734, + "step": 6402 + }, + { + "epoch": 0.7355120326230544, + "grad_norm": 0.4766364097595215, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 6403 + }, + { + "epoch": 0.7356269025328815, + "grad_norm": 0.4311745762825012, + "learning_rate": 0.0001, + "loss": 1.5385, + "step": 6404 + }, + { + "epoch": 0.7357417724427087, + "grad_norm": 0.424921452999115, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 6405 + }, + { + "epoch": 0.7358566423525358, + "grad_norm": 0.44613373279571533, + "learning_rate": 0.0001, + "loss": 1.6768, + "step": 6406 + }, + { + "epoch": 0.7359715122623629, + "grad_norm": 0.52595055103302, + "learning_rate": 0.0001, + "loss": 1.8678, + "step": 6407 + }, + { + "epoch": 0.73608638217219, + "grad_norm": 0.4704437851905823, + "learning_rate": 0.0001, + "loss": 1.7375, + "step": 6408 + }, + { + "epoch": 0.7362012520820171, + "grad_norm": 0.4089406132698059, + "learning_rate": 0.0001, + "loss": 1.5, + "step": 6409 + }, + { + "epoch": 0.7363161219918443, + "grad_norm": 0.4442186951637268, + "learning_rate": 0.0001, + "loss": 1.4664, + "step": 6410 + }, + { + "epoch": 0.7364309919016714, + "grad_norm": 0.4903067350387573, + "learning_rate": 0.0001, + "loss": 1.7815, + "step": 6411 + }, + { + "epoch": 0.7365458618114985, + "grad_norm": 0.47199535369873047, + "learning_rate": 0.0001, + "loss": 1.7579, + "step": 6412 + }, + { + "epoch": 0.7366607317213256, + "grad_norm": 0.4860610365867615, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 6413 + }, + { + "epoch": 0.7367756016311527, + "grad_norm": 0.44982248544692993, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 6414 + }, + { + "epoch": 0.7368904715409799, + "grad_norm": 0.41628536581993103, + "learning_rate": 0.0001, + "loss": 1.3993, + "step": 6415 + }, + { + "epoch": 0.737005341450807, + "grad_norm": 0.4167647063732147, + "learning_rate": 0.0001, + "loss": 1.4185, + "step": 6416 + }, + { + "epoch": 0.7371202113606341, + "grad_norm": 0.44325876235961914, + "learning_rate": 0.0001, + "loss": 1.6582, + "step": 6417 + }, + { + "epoch": 0.7372350812704612, + "grad_norm": 0.4645753800868988, + "learning_rate": 0.0001, + "loss": 1.44, + "step": 6418 + }, + { + "epoch": 0.7373499511802883, + "grad_norm": 0.41918548941612244, + "learning_rate": 0.0001, + "loss": 1.584, + "step": 6419 + }, + { + "epoch": 0.7374648210901155, + "grad_norm": 0.4537114202976227, + "learning_rate": 0.0001, + "loss": 1.664, + "step": 6420 + }, + { + "epoch": 0.7375796909999426, + "grad_norm": 0.4690401554107666, + "learning_rate": 0.0001, + "loss": 1.7344, + "step": 6421 + }, + { + "epoch": 0.7376945609097697, + "grad_norm": 0.5302848815917969, + "learning_rate": 0.0001, + "loss": 1.8387, + "step": 6422 + }, + { + "epoch": 0.7378094308195968, + "grad_norm": 0.49464482069015503, + "learning_rate": 0.0001, + "loss": 1.7305, + "step": 6423 + }, + { + "epoch": 0.7379243007294239, + "grad_norm": 0.48099836707115173, + "learning_rate": 0.0001, + "loss": 1.5702, + "step": 6424 + }, + { + "epoch": 0.7380391706392511, + "grad_norm": 0.4626838266849518, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 6425 + }, + { + "epoch": 0.7381540405490782, + "grad_norm": 0.5171931385993958, + "learning_rate": 0.0001, + "loss": 1.562, + "step": 6426 + }, + { + "epoch": 0.7382689104589053, + "grad_norm": 0.46404412388801575, + "learning_rate": 0.0001, + "loss": 1.767, + "step": 6427 + }, + { + "epoch": 0.7383837803687324, + "grad_norm": 0.44521862268447876, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 6428 + }, + { + "epoch": 0.7384986502785595, + "grad_norm": 0.4961501359939575, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 6429 + }, + { + "epoch": 0.7386135201883867, + "grad_norm": 0.43013089895248413, + "learning_rate": 0.0001, + "loss": 1.5063, + "step": 6430 + }, + { + "epoch": 0.7387283900982138, + "grad_norm": 0.43474483489990234, + "learning_rate": 0.0001, + "loss": 1.459, + "step": 6431 + }, + { + "epoch": 0.7388432600080409, + "grad_norm": 0.4235991835594177, + "learning_rate": 0.0001, + "loss": 1.6437, + "step": 6432 + }, + { + "epoch": 0.738958129917868, + "grad_norm": 0.45976969599723816, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 6433 + }, + { + "epoch": 0.7390729998276951, + "grad_norm": 0.5007357597351074, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 6434 + }, + { + "epoch": 0.7391878697375223, + "grad_norm": 0.4222777783870697, + "learning_rate": 0.0001, + "loss": 1.4098, + "step": 6435 + }, + { + "epoch": 0.7393027396473494, + "grad_norm": 0.4068715274333954, + "learning_rate": 0.0001, + "loss": 1.2175, + "step": 6436 + }, + { + "epoch": 0.7394176095571765, + "grad_norm": 0.4571983516216278, + "learning_rate": 0.0001, + "loss": 1.5671, + "step": 6437 + }, + { + "epoch": 0.7395324794670036, + "grad_norm": 0.4495130181312561, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 6438 + }, + { + "epoch": 0.7396473493768307, + "grad_norm": 0.48614737391471863, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 6439 + }, + { + "epoch": 0.7397622192866579, + "grad_norm": 0.48480525612831116, + "learning_rate": 0.0001, + "loss": 1.6996, + "step": 6440 + }, + { + "epoch": 0.739877089196485, + "grad_norm": 0.46960780024528503, + "learning_rate": 0.0001, + "loss": 1.3502, + "step": 6441 + }, + { + "epoch": 0.7399919591063121, + "grad_norm": 0.45793652534484863, + "learning_rate": 0.0001, + "loss": 1.6815, + "step": 6442 + }, + { + "epoch": 0.7401068290161392, + "grad_norm": 0.46224090456962585, + "learning_rate": 0.0001, + "loss": 1.6644, + "step": 6443 + }, + { + "epoch": 0.7402216989259663, + "grad_norm": 0.41686657071113586, + "learning_rate": 0.0001, + "loss": 1.5682, + "step": 6444 + }, + { + "epoch": 0.7403365688357935, + "grad_norm": 0.42895323038101196, + "learning_rate": 0.0001, + "loss": 1.5301, + "step": 6445 + }, + { + "epoch": 0.7404514387456206, + "grad_norm": 0.42914414405822754, + "learning_rate": 0.0001, + "loss": 1.6159, + "step": 6446 + }, + { + "epoch": 0.7405663086554477, + "grad_norm": 0.4458409249782562, + "learning_rate": 0.0001, + "loss": 1.5835, + "step": 6447 + }, + { + "epoch": 0.7406811785652748, + "grad_norm": 0.4255877435207367, + "learning_rate": 0.0001, + "loss": 1.4395, + "step": 6448 + }, + { + "epoch": 0.7407960484751019, + "grad_norm": 0.4231339991092682, + "learning_rate": 0.0001, + "loss": 1.495, + "step": 6449 + }, + { + "epoch": 0.7409109183849291, + "grad_norm": 0.4323672652244568, + "learning_rate": 0.0001, + "loss": 1.4857, + "step": 6450 + }, + { + "epoch": 0.7410257882947562, + "grad_norm": 0.4640507996082306, + "learning_rate": 0.0001, + "loss": 1.4276, + "step": 6451 + }, + { + "epoch": 0.7411406582045833, + "grad_norm": 0.442341685295105, + "learning_rate": 0.0001, + "loss": 1.6777, + "step": 6452 + }, + { + "epoch": 0.7412555281144104, + "grad_norm": 0.44191497564315796, + "learning_rate": 0.0001, + "loss": 1.561, + "step": 6453 + }, + { + "epoch": 0.7413703980242375, + "grad_norm": 0.4533472955226898, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 6454 + }, + { + "epoch": 0.7414852679340647, + "grad_norm": 0.45057639479637146, + "learning_rate": 0.0001, + "loss": 1.7246, + "step": 6455 + }, + { + "epoch": 0.7416001378438918, + "grad_norm": 0.5106820464134216, + "learning_rate": 0.0001, + "loss": 1.7923, + "step": 6456 + }, + { + "epoch": 0.7417150077537189, + "grad_norm": 0.4512089788913727, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 6457 + }, + { + "epoch": 0.741829877663546, + "grad_norm": 0.44131678342819214, + "learning_rate": 0.0001, + "loss": 1.547, + "step": 6458 + }, + { + "epoch": 0.7419447475733731, + "grad_norm": 0.4136804938316345, + "learning_rate": 0.0001, + "loss": 1.5348, + "step": 6459 + }, + { + "epoch": 0.7420596174832003, + "grad_norm": 0.48369288444519043, + "learning_rate": 0.0001, + "loss": 1.7312, + "step": 6460 + }, + { + "epoch": 0.7421744873930274, + "grad_norm": 0.4654393792152405, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 6461 + }, + { + "epoch": 0.7422893573028545, + "grad_norm": 0.457107275724411, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 6462 + }, + { + "epoch": 0.7424042272126816, + "grad_norm": 0.46595972776412964, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 6463 + }, + { + "epoch": 0.7425190971225087, + "grad_norm": 0.43557512760162354, + "learning_rate": 0.0001, + "loss": 1.3404, + "step": 6464 + }, + { + "epoch": 0.7426339670323359, + "grad_norm": 0.4468023180961609, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 6465 + }, + { + "epoch": 0.742748836942163, + "grad_norm": 0.4448022246360779, + "learning_rate": 0.0001, + "loss": 1.6072, + "step": 6466 + }, + { + "epoch": 0.7428637068519901, + "grad_norm": 0.461955726146698, + "learning_rate": 0.0001, + "loss": 1.6381, + "step": 6467 + }, + { + "epoch": 0.7429785767618172, + "grad_norm": 0.47350478172302246, + "learning_rate": 0.0001, + "loss": 1.5773, + "step": 6468 + }, + { + "epoch": 0.7430934466716443, + "grad_norm": 0.506061851978302, + "learning_rate": 0.0001, + "loss": 1.9245, + "step": 6469 + }, + { + "epoch": 0.7432083165814715, + "grad_norm": 0.48322594165802, + "learning_rate": 0.0001, + "loss": 1.8228, + "step": 6470 + }, + { + "epoch": 0.7433231864912986, + "grad_norm": 0.4514816701412201, + "learning_rate": 0.0001, + "loss": 1.7976, + "step": 6471 + }, + { + "epoch": 0.7434380564011257, + "grad_norm": 0.45972806215286255, + "learning_rate": 0.0001, + "loss": 1.676, + "step": 6472 + }, + { + "epoch": 0.7435529263109528, + "grad_norm": 0.4653480350971222, + "learning_rate": 0.0001, + "loss": 1.7181, + "step": 6473 + }, + { + "epoch": 0.7436677962207799, + "grad_norm": 0.4564400315284729, + "learning_rate": 0.0001, + "loss": 1.4732, + "step": 6474 + }, + { + "epoch": 0.7437826661306071, + "grad_norm": 0.44110190868377686, + "learning_rate": 0.0001, + "loss": 1.4867, + "step": 6475 + }, + { + "epoch": 0.7438975360404342, + "grad_norm": 0.4426039159297943, + "learning_rate": 0.0001, + "loss": 1.5752, + "step": 6476 + }, + { + "epoch": 0.7440124059502613, + "grad_norm": 0.4515429735183716, + "learning_rate": 0.0001, + "loss": 1.6102, + "step": 6477 + }, + { + "epoch": 0.7441272758600884, + "grad_norm": 0.4343465268611908, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 6478 + }, + { + "epoch": 0.7442421457699155, + "grad_norm": 0.448831170797348, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 6479 + }, + { + "epoch": 0.7443570156797427, + "grad_norm": 0.45811012387275696, + "learning_rate": 0.0001, + "loss": 1.6288, + "step": 6480 + }, + { + "epoch": 0.7444718855895698, + "grad_norm": 0.48360949754714966, + "learning_rate": 0.0001, + "loss": 1.6076, + "step": 6481 + }, + { + "epoch": 0.7445867554993969, + "grad_norm": 0.4996730089187622, + "learning_rate": 0.0001, + "loss": 1.8236, + "step": 6482 + }, + { + "epoch": 0.744701625409224, + "grad_norm": 0.4542221426963806, + "learning_rate": 0.0001, + "loss": 1.6124, + "step": 6483 + }, + { + "epoch": 0.7448164953190511, + "grad_norm": 0.4285677671432495, + "learning_rate": 0.0001, + "loss": 1.374, + "step": 6484 + }, + { + "epoch": 0.7449313652288783, + "grad_norm": 0.48036250472068787, + "learning_rate": 0.0001, + "loss": 1.6905, + "step": 6485 + }, + { + "epoch": 0.7450462351387054, + "grad_norm": 0.4445507824420929, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 6486 + }, + { + "epoch": 0.7451611050485325, + "grad_norm": 0.4675546884536743, + "learning_rate": 0.0001, + "loss": 1.7367, + "step": 6487 + }, + { + "epoch": 0.7452759749583596, + "grad_norm": 0.4238932132720947, + "learning_rate": 0.0001, + "loss": 1.6086, + "step": 6488 + }, + { + "epoch": 0.7453908448681867, + "grad_norm": 0.4289308786392212, + "learning_rate": 0.0001, + "loss": 1.4649, + "step": 6489 + }, + { + "epoch": 0.7455057147780139, + "grad_norm": 0.449756383895874, + "learning_rate": 0.0001, + "loss": 1.6866, + "step": 6490 + }, + { + "epoch": 0.745620584687841, + "grad_norm": 0.4513997435569763, + "learning_rate": 0.0001, + "loss": 1.6018, + "step": 6491 + }, + { + "epoch": 0.7457354545976681, + "grad_norm": 0.49391379952430725, + "learning_rate": 0.0001, + "loss": 1.7308, + "step": 6492 + }, + { + "epoch": 0.7458503245074952, + "grad_norm": 0.45236408710479736, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 6493 + }, + { + "epoch": 0.7459651944173223, + "grad_norm": 0.466229110956192, + "learning_rate": 0.0001, + "loss": 1.7665, + "step": 6494 + }, + { + "epoch": 0.7460800643271495, + "grad_norm": 0.44140058755874634, + "learning_rate": 0.0001, + "loss": 1.572, + "step": 6495 + }, + { + "epoch": 0.7461949342369766, + "grad_norm": 0.4668871760368347, + "learning_rate": 0.0001, + "loss": 1.7177, + "step": 6496 + }, + { + "epoch": 0.7463098041468037, + "grad_norm": 0.43868231773376465, + "learning_rate": 0.0001, + "loss": 1.5234, + "step": 6497 + }, + { + "epoch": 0.7464246740566308, + "grad_norm": 0.517144501209259, + "learning_rate": 0.0001, + "loss": 1.7245, + "step": 6498 + }, + { + "epoch": 0.7465395439664579, + "grad_norm": 0.4234006404876709, + "learning_rate": 0.0001, + "loss": 1.4977, + "step": 6499 + }, + { + "epoch": 0.7466544138762851, + "grad_norm": 0.44216203689575195, + "learning_rate": 0.0001, + "loss": 1.549, + "step": 6500 + }, + { + "epoch": 0.7467692837861122, + "grad_norm": 0.43986350297927856, + "learning_rate": 0.0001, + "loss": 1.6273, + "step": 6501 + }, + { + "epoch": 0.7468841536959393, + "grad_norm": 0.4326637387275696, + "learning_rate": 0.0001, + "loss": 1.4611, + "step": 6502 + }, + { + "epoch": 0.7469990236057664, + "grad_norm": 0.43953758478164673, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 6503 + }, + { + "epoch": 0.7471138935155935, + "grad_norm": 0.44682297110557556, + "learning_rate": 0.0001, + "loss": 1.5408, + "step": 6504 + }, + { + "epoch": 0.7472287634254207, + "grad_norm": 0.4634048342704773, + "learning_rate": 0.0001, + "loss": 1.68, + "step": 6505 + }, + { + "epoch": 0.7473436333352478, + "grad_norm": 0.43683484196662903, + "learning_rate": 0.0001, + "loss": 1.6397, + "step": 6506 + }, + { + "epoch": 0.7474585032450749, + "grad_norm": 0.42723357677459717, + "learning_rate": 0.0001, + "loss": 1.4604, + "step": 6507 + }, + { + "epoch": 0.747573373154902, + "grad_norm": 0.49617254734039307, + "learning_rate": 0.0001, + "loss": 1.8581, + "step": 6508 + }, + { + "epoch": 0.7476882430647291, + "grad_norm": 0.46171995997428894, + "learning_rate": 0.0001, + "loss": 1.4629, + "step": 6509 + }, + { + "epoch": 0.7478031129745563, + "grad_norm": 0.4606563150882721, + "learning_rate": 0.0001, + "loss": 1.4719, + "step": 6510 + }, + { + "epoch": 0.7479179828843834, + "grad_norm": 0.47616055607795715, + "learning_rate": 0.0001, + "loss": 1.7907, + "step": 6511 + }, + { + "epoch": 0.7480328527942106, + "grad_norm": 0.42046403884887695, + "learning_rate": 0.0001, + "loss": 1.5671, + "step": 6512 + }, + { + "epoch": 0.7481477227040377, + "grad_norm": 0.41847604513168335, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 6513 + }, + { + "epoch": 0.7482625926138649, + "grad_norm": 0.45703554153442383, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 6514 + }, + { + "epoch": 0.748377462523692, + "grad_norm": 0.40654999017715454, + "learning_rate": 0.0001, + "loss": 1.2961, + "step": 6515 + }, + { + "epoch": 0.7484923324335191, + "grad_norm": 0.4782845377922058, + "learning_rate": 0.0001, + "loss": 1.6385, + "step": 6516 + }, + { + "epoch": 0.7486072023433462, + "grad_norm": 0.4496891498565674, + "learning_rate": 0.0001, + "loss": 1.4721, + "step": 6517 + }, + { + "epoch": 0.7487220722531733, + "grad_norm": 0.4883591830730438, + "learning_rate": 0.0001, + "loss": 1.6914, + "step": 6518 + }, + { + "epoch": 0.7488369421630005, + "grad_norm": 0.5101348161697388, + "learning_rate": 0.0001, + "loss": 1.8276, + "step": 6519 + }, + { + "epoch": 0.7489518120728276, + "grad_norm": 0.47420623898506165, + "learning_rate": 0.0001, + "loss": 1.7383, + "step": 6520 + }, + { + "epoch": 0.7490666819826547, + "grad_norm": 0.544798731803894, + "learning_rate": 0.0001, + "loss": 1.2877, + "step": 6521 + }, + { + "epoch": 0.7491815518924818, + "grad_norm": 0.496698796749115, + "learning_rate": 0.0001, + "loss": 1.7508, + "step": 6522 + }, + { + "epoch": 0.7492964218023089, + "grad_norm": 0.42780181765556335, + "learning_rate": 0.0001, + "loss": 1.448, + "step": 6523 + }, + { + "epoch": 0.749411291712136, + "grad_norm": 0.4542038142681122, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 6524 + }, + { + "epoch": 0.7495261616219632, + "grad_norm": 0.4443798363208771, + "learning_rate": 0.0001, + "loss": 1.5453, + "step": 6525 + }, + { + "epoch": 0.7496410315317903, + "grad_norm": 0.4632874131202698, + "learning_rate": 0.0001, + "loss": 1.6545, + "step": 6526 + }, + { + "epoch": 0.7497559014416174, + "grad_norm": 0.4470142424106598, + "learning_rate": 0.0001, + "loss": 1.6075, + "step": 6527 + }, + { + "epoch": 0.7498707713514445, + "grad_norm": 0.4853494465351105, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 6528 + }, + { + "epoch": 0.7499856412612717, + "grad_norm": 0.43715977668762207, + "learning_rate": 0.0001, + "loss": 1.3894, + "step": 6529 + }, + { + "epoch": 0.7501005111710988, + "grad_norm": 0.43983104825019836, + "learning_rate": 0.0001, + "loss": 1.6225, + "step": 6530 + }, + { + "epoch": 0.7502153810809259, + "grad_norm": 0.4784538745880127, + "learning_rate": 0.0001, + "loss": 1.7794, + "step": 6531 + }, + { + "epoch": 0.750330250990753, + "grad_norm": 0.47763192653656006, + "learning_rate": 0.0001, + "loss": 1.85, + "step": 6532 + }, + { + "epoch": 0.7504451209005801, + "grad_norm": 0.4692215025424957, + "learning_rate": 0.0001, + "loss": 1.7338, + "step": 6533 + }, + { + "epoch": 0.7505599908104073, + "grad_norm": 0.4766184091567993, + "learning_rate": 0.0001, + "loss": 1.8508, + "step": 6534 + }, + { + "epoch": 0.7506748607202344, + "grad_norm": 0.44639328122138977, + "learning_rate": 0.0001, + "loss": 1.7465, + "step": 6535 + }, + { + "epoch": 0.7507897306300615, + "grad_norm": 0.4912661910057068, + "learning_rate": 0.0001, + "loss": 1.7199, + "step": 6536 + }, + { + "epoch": 0.7509046005398886, + "grad_norm": 0.4561523199081421, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 6537 + }, + { + "epoch": 0.7510194704497157, + "grad_norm": 0.44610196352005005, + "learning_rate": 0.0001, + "loss": 1.5681, + "step": 6538 + }, + { + "epoch": 0.7511343403595429, + "grad_norm": 0.46128877997398376, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 6539 + }, + { + "epoch": 0.75124921026937, + "grad_norm": 0.4640040397644043, + "learning_rate": 0.0001, + "loss": 1.8346, + "step": 6540 + }, + { + "epoch": 0.7513640801791971, + "grad_norm": 0.4831351339817047, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 6541 + }, + { + "epoch": 0.7514789500890242, + "grad_norm": 0.4846135377883911, + "learning_rate": 0.0001, + "loss": 1.727, + "step": 6542 + }, + { + "epoch": 0.7515938199988513, + "grad_norm": 0.40949901938438416, + "learning_rate": 0.0001, + "loss": 1.5282, + "step": 6543 + }, + { + "epoch": 0.7517086899086785, + "grad_norm": 0.4737204909324646, + "learning_rate": 0.0001, + "loss": 1.766, + "step": 6544 + }, + { + "epoch": 0.7518235598185056, + "grad_norm": 0.4291594922542572, + "learning_rate": 0.0001, + "loss": 1.6355, + "step": 6545 + }, + { + "epoch": 0.7519384297283327, + "grad_norm": 0.4513239562511444, + "learning_rate": 0.0001, + "loss": 1.5654, + "step": 6546 + }, + { + "epoch": 0.7520532996381598, + "grad_norm": 0.42415329813957214, + "learning_rate": 0.0001, + "loss": 1.5541, + "step": 6547 + }, + { + "epoch": 0.7521681695479869, + "grad_norm": 0.4564039707183838, + "learning_rate": 0.0001, + "loss": 1.2685, + "step": 6548 + }, + { + "epoch": 0.752283039457814, + "grad_norm": 0.4450725018978119, + "learning_rate": 0.0001, + "loss": 1.5905, + "step": 6549 + }, + { + "epoch": 0.7523979093676412, + "grad_norm": 0.4227917492389679, + "learning_rate": 0.0001, + "loss": 1.448, + "step": 6550 + }, + { + "epoch": 0.7525127792774683, + "grad_norm": 0.4478033483028412, + "learning_rate": 0.0001, + "loss": 1.6344, + "step": 6551 + }, + { + "epoch": 0.7526276491872954, + "grad_norm": 0.45714229345321655, + "learning_rate": 0.0001, + "loss": 1.7176, + "step": 6552 + }, + { + "epoch": 0.7527425190971225, + "grad_norm": 0.4998722970485687, + "learning_rate": 0.0001, + "loss": 1.4318, + "step": 6553 + }, + { + "epoch": 0.7528573890069497, + "grad_norm": 0.49186933040618896, + "learning_rate": 0.0001, + "loss": 1.6897, + "step": 6554 + }, + { + "epoch": 0.7529722589167768, + "grad_norm": 0.5109425187110901, + "learning_rate": 0.0001, + "loss": 1.7841, + "step": 6555 + }, + { + "epoch": 0.7530871288266039, + "grad_norm": 0.436103492975235, + "learning_rate": 0.0001, + "loss": 1.5522, + "step": 6556 + }, + { + "epoch": 0.753201998736431, + "grad_norm": 0.46560850739479065, + "learning_rate": 0.0001, + "loss": 1.4743, + "step": 6557 + }, + { + "epoch": 0.7533168686462581, + "grad_norm": 0.47097107768058777, + "learning_rate": 0.0001, + "loss": 1.5025, + "step": 6558 + }, + { + "epoch": 0.7534317385560853, + "grad_norm": 0.47928592562675476, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 6559 + }, + { + "epoch": 0.7535466084659124, + "grad_norm": 0.52452552318573, + "learning_rate": 0.0001, + "loss": 1.6683, + "step": 6560 + }, + { + "epoch": 0.7536614783757395, + "grad_norm": 0.47707682847976685, + "learning_rate": 0.0001, + "loss": 1.7117, + "step": 6561 + }, + { + "epoch": 0.7537763482855666, + "grad_norm": 0.4742587208747864, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 6562 + }, + { + "epoch": 0.7538912181953937, + "grad_norm": 0.4345073103904724, + "learning_rate": 0.0001, + "loss": 1.4673, + "step": 6563 + }, + { + "epoch": 0.7540060881052209, + "grad_norm": 0.4827091693878174, + "learning_rate": 0.0001, + "loss": 1.731, + "step": 6564 + }, + { + "epoch": 0.754120958015048, + "grad_norm": 0.4398217797279358, + "learning_rate": 0.0001, + "loss": 1.5551, + "step": 6565 + }, + { + "epoch": 0.7542358279248751, + "grad_norm": 0.4701884090900421, + "learning_rate": 0.0001, + "loss": 1.811, + "step": 6566 + }, + { + "epoch": 0.7543506978347022, + "grad_norm": 0.41458040475845337, + "learning_rate": 0.0001, + "loss": 1.1869, + "step": 6567 + }, + { + "epoch": 0.7544655677445293, + "grad_norm": 0.4266148507595062, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 6568 + }, + { + "epoch": 0.7545804376543565, + "grad_norm": 0.441137433052063, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 6569 + }, + { + "epoch": 0.7546953075641836, + "grad_norm": 0.511440098285675, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 6570 + }, + { + "epoch": 0.7548101774740107, + "grad_norm": 0.4327561855316162, + "learning_rate": 0.0001, + "loss": 1.3331, + "step": 6571 + }, + { + "epoch": 0.7549250473838378, + "grad_norm": 0.4539497494697571, + "learning_rate": 0.0001, + "loss": 1.3839, + "step": 6572 + }, + { + "epoch": 0.7550399172936649, + "grad_norm": 0.47245991230010986, + "learning_rate": 0.0001, + "loss": 1.5165, + "step": 6573 + }, + { + "epoch": 0.755154787203492, + "grad_norm": 0.46419230103492737, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 6574 + }, + { + "epoch": 0.7552696571133192, + "grad_norm": 0.47439756989479065, + "learning_rate": 0.0001, + "loss": 1.5447, + "step": 6575 + }, + { + "epoch": 0.7553845270231463, + "grad_norm": 0.4236104488372803, + "learning_rate": 0.0001, + "loss": 1.3438, + "step": 6576 + }, + { + "epoch": 0.7554993969329734, + "grad_norm": 0.4503575563430786, + "learning_rate": 0.0001, + "loss": 1.6844, + "step": 6577 + }, + { + "epoch": 0.7556142668428005, + "grad_norm": 0.4399665594100952, + "learning_rate": 0.0001, + "loss": 1.5789, + "step": 6578 + }, + { + "epoch": 0.7557291367526277, + "grad_norm": 0.45208460092544556, + "learning_rate": 0.0001, + "loss": 1.6652, + "step": 6579 + }, + { + "epoch": 0.7558440066624548, + "grad_norm": 0.48191729187965393, + "learning_rate": 0.0001, + "loss": 1.7206, + "step": 6580 + }, + { + "epoch": 0.7559588765722819, + "grad_norm": 0.496855765581131, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 6581 + }, + { + "epoch": 0.756073746482109, + "grad_norm": 0.4180924594402313, + "learning_rate": 0.0001, + "loss": 1.4212, + "step": 6582 + }, + { + "epoch": 0.7561886163919361, + "grad_norm": 0.4331256151199341, + "learning_rate": 0.0001, + "loss": 1.3518, + "step": 6583 + }, + { + "epoch": 0.7563034863017633, + "grad_norm": 0.47106215357780457, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 6584 + }, + { + "epoch": 0.7564183562115904, + "grad_norm": 0.4699764549732208, + "learning_rate": 0.0001, + "loss": 1.3998, + "step": 6585 + }, + { + "epoch": 0.7565332261214175, + "grad_norm": 0.44458329677581787, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 6586 + }, + { + "epoch": 0.7566480960312446, + "grad_norm": 0.46415066719055176, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 6587 + }, + { + "epoch": 0.7567629659410717, + "grad_norm": 0.44048815965652466, + "learning_rate": 0.0001, + "loss": 1.4584, + "step": 6588 + }, + { + "epoch": 0.7568778358508989, + "grad_norm": 0.43758389353752136, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 6589 + }, + { + "epoch": 0.756992705760726, + "grad_norm": 0.44562289118766785, + "learning_rate": 0.0001, + "loss": 1.5985, + "step": 6590 + }, + { + "epoch": 0.7571075756705531, + "grad_norm": 0.4680253267288208, + "learning_rate": 0.0001, + "loss": 1.6759, + "step": 6591 + }, + { + "epoch": 0.7572224455803802, + "grad_norm": 0.5020065307617188, + "learning_rate": 0.0001, + "loss": 1.7335, + "step": 6592 + }, + { + "epoch": 0.7573373154902073, + "grad_norm": 0.42748236656188965, + "learning_rate": 0.0001, + "loss": 1.4555, + "step": 6593 + }, + { + "epoch": 0.7574521854000345, + "grad_norm": 0.4277678430080414, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 6594 + }, + { + "epoch": 0.7575670553098616, + "grad_norm": 0.4632684886455536, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 6595 + }, + { + "epoch": 0.7576819252196887, + "grad_norm": 0.5025344491004944, + "learning_rate": 0.0001, + "loss": 1.7728, + "step": 6596 + }, + { + "epoch": 0.7577967951295158, + "grad_norm": 0.45161494612693787, + "learning_rate": 0.0001, + "loss": 1.6224, + "step": 6597 + }, + { + "epoch": 0.7579116650393429, + "grad_norm": 0.45215389132499695, + "learning_rate": 0.0001, + "loss": 1.659, + "step": 6598 + }, + { + "epoch": 0.75802653494917, + "grad_norm": 0.4214898347854614, + "learning_rate": 0.0001, + "loss": 1.4783, + "step": 6599 + }, + { + "epoch": 0.7581414048589972, + "grad_norm": 0.5195730924606323, + "learning_rate": 0.0001, + "loss": 1.8058, + "step": 6600 + }, + { + "epoch": 0.7582562747688243, + "grad_norm": 0.48730143904685974, + "learning_rate": 0.0001, + "loss": 1.8334, + "step": 6601 + }, + { + "epoch": 0.7583711446786514, + "grad_norm": 0.5312708020210266, + "learning_rate": 0.0001, + "loss": 1.8008, + "step": 6602 + }, + { + "epoch": 0.7584860145884785, + "grad_norm": 0.44341936707496643, + "learning_rate": 0.0001, + "loss": 1.6025, + "step": 6603 + }, + { + "epoch": 0.7586008844983056, + "grad_norm": 0.4990461468696594, + "learning_rate": 0.0001, + "loss": 1.7039, + "step": 6604 + }, + { + "epoch": 0.7587157544081328, + "grad_norm": 0.43165311217308044, + "learning_rate": 0.0001, + "loss": 1.6646, + "step": 6605 + }, + { + "epoch": 0.7588306243179599, + "grad_norm": 0.4335373640060425, + "learning_rate": 0.0001, + "loss": 1.4298, + "step": 6606 + }, + { + "epoch": 0.758945494227787, + "grad_norm": 0.4470045566558838, + "learning_rate": 0.0001, + "loss": 1.3655, + "step": 6607 + }, + { + "epoch": 0.7590603641376141, + "grad_norm": 0.46403536200523376, + "learning_rate": 0.0001, + "loss": 1.4718, + "step": 6608 + }, + { + "epoch": 0.7591752340474412, + "grad_norm": 0.49644750356674194, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 6609 + }, + { + "epoch": 0.7592901039572684, + "grad_norm": 0.5031653046607971, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 6610 + }, + { + "epoch": 0.7594049738670955, + "grad_norm": 0.46444204449653625, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 6611 + }, + { + "epoch": 0.7595198437769226, + "grad_norm": 0.4190670847892761, + "learning_rate": 0.0001, + "loss": 1.3953, + "step": 6612 + }, + { + "epoch": 0.7596347136867497, + "grad_norm": 0.4325488209724426, + "learning_rate": 0.0001, + "loss": 1.5455, + "step": 6613 + }, + { + "epoch": 0.7597495835965768, + "grad_norm": 0.4146260917186737, + "learning_rate": 0.0001, + "loss": 1.4447, + "step": 6614 + }, + { + "epoch": 0.759864453506404, + "grad_norm": 0.4315129816532135, + "learning_rate": 0.0001, + "loss": 1.5806, + "step": 6615 + }, + { + "epoch": 0.7599793234162311, + "grad_norm": 0.4837382733821869, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 6616 + }, + { + "epoch": 0.7600941933260582, + "grad_norm": 0.5264213681221008, + "learning_rate": 0.0001, + "loss": 1.7784, + "step": 6617 + }, + { + "epoch": 0.7602090632358853, + "grad_norm": 0.4192154109477997, + "learning_rate": 0.0001, + "loss": 1.49, + "step": 6618 + }, + { + "epoch": 0.7603239331457124, + "grad_norm": 0.4764638841152191, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 6619 + }, + { + "epoch": 0.7604388030555396, + "grad_norm": 0.4535200893878937, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 6620 + }, + { + "epoch": 0.7605536729653667, + "grad_norm": 0.40182626247406006, + "learning_rate": 0.0001, + "loss": 1.2703, + "step": 6621 + }, + { + "epoch": 0.7606685428751938, + "grad_norm": 0.4054408669471741, + "learning_rate": 0.0001, + "loss": 1.4678, + "step": 6622 + }, + { + "epoch": 0.7607834127850209, + "grad_norm": 0.4554063081741333, + "learning_rate": 0.0001, + "loss": 1.7263, + "step": 6623 + }, + { + "epoch": 0.760898282694848, + "grad_norm": 0.447591632604599, + "learning_rate": 0.0001, + "loss": 1.75, + "step": 6624 + }, + { + "epoch": 0.7610131526046752, + "grad_norm": 0.4537143409252167, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 6625 + }, + { + "epoch": 0.7611280225145023, + "grad_norm": 0.47372350096702576, + "learning_rate": 0.0001, + "loss": 1.5821, + "step": 6626 + }, + { + "epoch": 0.7612428924243294, + "grad_norm": 0.47313424944877625, + "learning_rate": 0.0001, + "loss": 1.7578, + "step": 6627 + }, + { + "epoch": 0.7613577623341565, + "grad_norm": 0.4805237054824829, + "learning_rate": 0.0001, + "loss": 1.6613, + "step": 6628 + }, + { + "epoch": 0.7614726322439836, + "grad_norm": 0.4734886884689331, + "learning_rate": 0.0001, + "loss": 1.7557, + "step": 6629 + }, + { + "epoch": 0.7615875021538108, + "grad_norm": 0.46872133016586304, + "learning_rate": 0.0001, + "loss": 1.7054, + "step": 6630 + }, + { + "epoch": 0.7617023720636379, + "grad_norm": 0.44141799211502075, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 6631 + }, + { + "epoch": 0.761817241973465, + "grad_norm": 0.4713442623615265, + "learning_rate": 0.0001, + "loss": 1.7963, + "step": 6632 + }, + { + "epoch": 0.7619321118832921, + "grad_norm": 0.46353405714035034, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 6633 + }, + { + "epoch": 0.7620469817931192, + "grad_norm": 0.9744009971618652, + "learning_rate": 0.0001, + "loss": 1.63, + "step": 6634 + }, + { + "epoch": 0.7621618517029464, + "grad_norm": 0.41723188757896423, + "learning_rate": 0.0001, + "loss": 1.5305, + "step": 6635 + }, + { + "epoch": 0.7622767216127735, + "grad_norm": 0.47938770055770874, + "learning_rate": 0.0001, + "loss": 1.7722, + "step": 6636 + }, + { + "epoch": 0.7623915915226006, + "grad_norm": 0.46202394366264343, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 6637 + }, + { + "epoch": 0.7625064614324277, + "grad_norm": 0.4409821033477783, + "learning_rate": 0.0001, + "loss": 1.3921, + "step": 6638 + }, + { + "epoch": 0.7626213313422548, + "grad_norm": 0.4694823920726776, + "learning_rate": 0.0001, + "loss": 1.6386, + "step": 6639 + }, + { + "epoch": 0.762736201252082, + "grad_norm": 0.45864009857177734, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 6640 + }, + { + "epoch": 0.7628510711619091, + "grad_norm": 0.46855515241622925, + "learning_rate": 0.0001, + "loss": 1.4683, + "step": 6641 + }, + { + "epoch": 0.7629659410717362, + "grad_norm": 0.498154878616333, + "learning_rate": 0.0001, + "loss": 1.5162, + "step": 6642 + }, + { + "epoch": 0.7630808109815633, + "grad_norm": 0.4623854160308838, + "learning_rate": 0.0001, + "loss": 1.5181, + "step": 6643 + }, + { + "epoch": 0.7631956808913904, + "grad_norm": 0.42440950870513916, + "learning_rate": 0.0001, + "loss": 1.6539, + "step": 6644 + }, + { + "epoch": 0.7633105508012176, + "grad_norm": 0.4813312888145447, + "learning_rate": 0.0001, + "loss": 1.5837, + "step": 6645 + }, + { + "epoch": 0.7634254207110447, + "grad_norm": 0.430618554353714, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 6646 + }, + { + "epoch": 0.7635402906208718, + "grad_norm": 0.4468959867954254, + "learning_rate": 0.0001, + "loss": 1.5506, + "step": 6647 + }, + { + "epoch": 0.7636551605306989, + "grad_norm": 0.45413529872894287, + "learning_rate": 0.0001, + "loss": 1.5858, + "step": 6648 + }, + { + "epoch": 0.763770030440526, + "grad_norm": 0.4446607530117035, + "learning_rate": 0.0001, + "loss": 1.5391, + "step": 6649 + }, + { + "epoch": 0.7638849003503533, + "grad_norm": 0.4520895779132843, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 6650 + }, + { + "epoch": 0.7639997702601804, + "grad_norm": 0.46558019518852234, + "learning_rate": 0.0001, + "loss": 1.6506, + "step": 6651 + }, + { + "epoch": 0.7641146401700075, + "grad_norm": 0.501394510269165, + "learning_rate": 0.0001, + "loss": 1.8724, + "step": 6652 + }, + { + "epoch": 0.7642295100798346, + "grad_norm": 0.47188153862953186, + "learning_rate": 0.0001, + "loss": 1.7557, + "step": 6653 + }, + { + "epoch": 0.7643443799896618, + "grad_norm": 0.42527419328689575, + "learning_rate": 0.0001, + "loss": 1.4982, + "step": 6654 + }, + { + "epoch": 0.7644592498994889, + "grad_norm": 0.4962664246559143, + "learning_rate": 0.0001, + "loss": 1.7203, + "step": 6655 + }, + { + "epoch": 0.764574119809316, + "grad_norm": 0.4442030191421509, + "learning_rate": 0.0001, + "loss": 1.5981, + "step": 6656 + }, + { + "epoch": 0.7646889897191431, + "grad_norm": 0.47021228075027466, + "learning_rate": 0.0001, + "loss": 1.7917, + "step": 6657 + }, + { + "epoch": 0.7648038596289702, + "grad_norm": 0.4421970546245575, + "learning_rate": 0.0001, + "loss": 1.5717, + "step": 6658 + }, + { + "epoch": 0.7649187295387974, + "grad_norm": 0.45684003829956055, + "learning_rate": 0.0001, + "loss": 1.6662, + "step": 6659 + }, + { + "epoch": 0.7650335994486245, + "grad_norm": 0.47243228554725647, + "learning_rate": 0.0001, + "loss": 1.6116, + "step": 6660 + }, + { + "epoch": 0.7651484693584516, + "grad_norm": 0.44212606549263, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 6661 + }, + { + "epoch": 0.7652633392682787, + "grad_norm": 0.48998787999153137, + "learning_rate": 0.0001, + "loss": 1.7869, + "step": 6662 + }, + { + "epoch": 0.7653782091781058, + "grad_norm": 0.4589942693710327, + "learning_rate": 0.0001, + "loss": 1.5954, + "step": 6663 + }, + { + "epoch": 0.765493079087933, + "grad_norm": 0.45436352491378784, + "learning_rate": 0.0001, + "loss": 1.7321, + "step": 6664 + }, + { + "epoch": 0.7656079489977601, + "grad_norm": 0.45719271898269653, + "learning_rate": 0.0001, + "loss": 1.3269, + "step": 6665 + }, + { + "epoch": 0.7657228189075872, + "grad_norm": 0.4515266716480255, + "learning_rate": 0.0001, + "loss": 1.5696, + "step": 6666 + }, + { + "epoch": 0.7658376888174143, + "grad_norm": 0.4383704662322998, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 6667 + }, + { + "epoch": 0.7659525587272414, + "grad_norm": 0.5026112198829651, + "learning_rate": 0.0001, + "loss": 1.6132, + "step": 6668 + }, + { + "epoch": 0.7660674286370686, + "grad_norm": 0.4402121305465698, + "learning_rate": 0.0001, + "loss": 1.4645, + "step": 6669 + }, + { + "epoch": 0.7661822985468957, + "grad_norm": 0.4881230294704437, + "learning_rate": 0.0001, + "loss": 1.7567, + "step": 6670 + }, + { + "epoch": 0.7662971684567228, + "grad_norm": 0.454129159450531, + "learning_rate": 0.0001, + "loss": 1.5974, + "step": 6671 + }, + { + "epoch": 0.7664120383665499, + "grad_norm": 0.45725277066230774, + "learning_rate": 0.0001, + "loss": 1.3238, + "step": 6672 + }, + { + "epoch": 0.766526908276377, + "grad_norm": 0.4708629250526428, + "learning_rate": 0.0001, + "loss": 1.5594, + "step": 6673 + }, + { + "epoch": 0.7666417781862042, + "grad_norm": 0.4380055367946625, + "learning_rate": 0.0001, + "loss": 1.6742, + "step": 6674 + }, + { + "epoch": 0.7667566480960313, + "grad_norm": 0.45258665084838867, + "learning_rate": 0.0001, + "loss": 1.4825, + "step": 6675 + }, + { + "epoch": 0.7668715180058584, + "grad_norm": 0.4417783319950104, + "learning_rate": 0.0001, + "loss": 1.469, + "step": 6676 + }, + { + "epoch": 0.7669863879156855, + "grad_norm": 0.5079810619354248, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 6677 + }, + { + "epoch": 0.7671012578255126, + "grad_norm": 0.4944427013397217, + "learning_rate": 0.0001, + "loss": 1.5304, + "step": 6678 + }, + { + "epoch": 0.7672161277353398, + "grad_norm": 0.4718817472457886, + "learning_rate": 0.0001, + "loss": 1.6147, + "step": 6679 + }, + { + "epoch": 0.7673309976451669, + "grad_norm": 0.4739345610141754, + "learning_rate": 0.0001, + "loss": 1.765, + "step": 6680 + }, + { + "epoch": 0.767445867554994, + "grad_norm": 0.46249300241470337, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 6681 + }, + { + "epoch": 0.7675607374648211, + "grad_norm": 0.5330554246902466, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 6682 + }, + { + "epoch": 0.7676756073746482, + "grad_norm": 0.4298166334629059, + "learning_rate": 0.0001, + "loss": 1.4984, + "step": 6683 + }, + { + "epoch": 0.7677904772844754, + "grad_norm": 0.47553664445877075, + "learning_rate": 0.0001, + "loss": 1.6892, + "step": 6684 + }, + { + "epoch": 0.7679053471943025, + "grad_norm": 0.4601770341396332, + "learning_rate": 0.0001, + "loss": 1.7258, + "step": 6685 + }, + { + "epoch": 0.7680202171041296, + "grad_norm": 0.4717545509338379, + "learning_rate": 0.0001, + "loss": 1.7111, + "step": 6686 + }, + { + "epoch": 0.7681350870139567, + "grad_norm": 0.47477278113365173, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 6687 + }, + { + "epoch": 0.7682499569237838, + "grad_norm": 0.45599865913391113, + "learning_rate": 0.0001, + "loss": 1.6069, + "step": 6688 + }, + { + "epoch": 0.768364826833611, + "grad_norm": 0.43916571140289307, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 6689 + }, + { + "epoch": 0.7684796967434381, + "grad_norm": 0.4506683945655823, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 6690 + }, + { + "epoch": 0.7685945666532652, + "grad_norm": 0.42485401034355164, + "learning_rate": 0.0001, + "loss": 1.4915, + "step": 6691 + }, + { + "epoch": 0.7687094365630923, + "grad_norm": 0.47712936997413635, + "learning_rate": 0.0001, + "loss": 1.6449, + "step": 6692 + }, + { + "epoch": 0.7688243064729194, + "grad_norm": 0.4386729896068573, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 6693 + }, + { + "epoch": 0.7689391763827466, + "grad_norm": 0.5089852213859558, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 6694 + }, + { + "epoch": 0.7690540462925737, + "grad_norm": 0.4199332594871521, + "learning_rate": 0.0001, + "loss": 1.4757, + "step": 6695 + }, + { + "epoch": 0.7691689162024008, + "grad_norm": 0.5055813789367676, + "learning_rate": 0.0001, + "loss": 1.7663, + "step": 6696 + }, + { + "epoch": 0.7692837861122279, + "grad_norm": 0.4748409390449524, + "learning_rate": 0.0001, + "loss": 1.5431, + "step": 6697 + }, + { + "epoch": 0.769398656022055, + "grad_norm": 0.452891081571579, + "learning_rate": 0.0001, + "loss": 1.4932, + "step": 6698 + }, + { + "epoch": 0.7695135259318822, + "grad_norm": 0.4543799161911011, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 6699 + }, + { + "epoch": 0.7696283958417093, + "grad_norm": 0.47071510553359985, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 6700 + }, + { + "epoch": 0.7697432657515364, + "grad_norm": 0.46207287907600403, + "learning_rate": 0.0001, + "loss": 1.5552, + "step": 6701 + }, + { + "epoch": 0.7698581356613635, + "grad_norm": 0.4610616862773895, + "learning_rate": 0.0001, + "loss": 1.4635, + "step": 6702 + }, + { + "epoch": 0.7699730055711906, + "grad_norm": 0.46330535411834717, + "learning_rate": 0.0001, + "loss": 1.5052, + "step": 6703 + }, + { + "epoch": 0.7700878754810178, + "grad_norm": 0.4601458013057709, + "learning_rate": 0.0001, + "loss": 1.4925, + "step": 6704 + }, + { + "epoch": 0.7702027453908449, + "grad_norm": 0.4191734790802002, + "learning_rate": 0.0001, + "loss": 1.3989, + "step": 6705 + }, + { + "epoch": 0.770317615300672, + "grad_norm": 0.4650634229183197, + "learning_rate": 0.0001, + "loss": 1.6411, + "step": 6706 + }, + { + "epoch": 0.7704324852104991, + "grad_norm": 0.4624280035495758, + "learning_rate": 0.0001, + "loss": 1.6782, + "step": 6707 + }, + { + "epoch": 0.7705473551203262, + "grad_norm": 0.45528537034988403, + "learning_rate": 0.0001, + "loss": 1.7724, + "step": 6708 + }, + { + "epoch": 0.7706622250301534, + "grad_norm": 0.49074849486351013, + "learning_rate": 0.0001, + "loss": 1.5958, + "step": 6709 + }, + { + "epoch": 0.7707770949399805, + "grad_norm": 0.4072690010070801, + "learning_rate": 0.0001, + "loss": 1.5553, + "step": 6710 + }, + { + "epoch": 0.7708919648498076, + "grad_norm": 0.5691601634025574, + "learning_rate": 0.0001, + "loss": 1.4397, + "step": 6711 + }, + { + "epoch": 0.7710068347596347, + "grad_norm": 0.4546988308429718, + "learning_rate": 0.0001, + "loss": 1.5887, + "step": 6712 + }, + { + "epoch": 0.7711217046694618, + "grad_norm": 0.49612048268318176, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 6713 + }, + { + "epoch": 0.771236574579289, + "grad_norm": 0.5100909471511841, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 6714 + }, + { + "epoch": 0.7713514444891161, + "grad_norm": 0.4859296679496765, + "learning_rate": 0.0001, + "loss": 1.6302, + "step": 6715 + }, + { + "epoch": 0.7714663143989432, + "grad_norm": 0.45790478587150574, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 6716 + }, + { + "epoch": 0.7715811843087703, + "grad_norm": 0.4487425982952118, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 6717 + }, + { + "epoch": 0.7716960542185974, + "grad_norm": 0.49326303601264954, + "learning_rate": 0.0001, + "loss": 1.7071, + "step": 6718 + }, + { + "epoch": 0.7718109241284246, + "grad_norm": 0.43098387122154236, + "learning_rate": 0.0001, + "loss": 1.6099, + "step": 6719 + }, + { + "epoch": 0.7719257940382517, + "grad_norm": 0.4224907159805298, + "learning_rate": 0.0001, + "loss": 1.4381, + "step": 6720 + }, + { + "epoch": 0.7720406639480788, + "grad_norm": 0.4463758170604706, + "learning_rate": 0.0001, + "loss": 1.6655, + "step": 6721 + }, + { + "epoch": 0.7721555338579059, + "grad_norm": 0.45395219326019287, + "learning_rate": 0.0001, + "loss": 1.7321, + "step": 6722 + }, + { + "epoch": 0.772270403767733, + "grad_norm": 0.443327933549881, + "learning_rate": 0.0001, + "loss": 1.6938, + "step": 6723 + }, + { + "epoch": 0.7723852736775602, + "grad_norm": 0.44689181447029114, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 6724 + }, + { + "epoch": 0.7725001435873873, + "grad_norm": 0.4609023928642273, + "learning_rate": 0.0001, + "loss": 1.7214, + "step": 6725 + }, + { + "epoch": 0.7726150134972144, + "grad_norm": 0.42757725715637207, + "learning_rate": 0.0001, + "loss": 1.298, + "step": 6726 + }, + { + "epoch": 0.7727298834070415, + "grad_norm": 0.43794259428977966, + "learning_rate": 0.0001, + "loss": 1.6358, + "step": 6727 + }, + { + "epoch": 0.7728447533168686, + "grad_norm": 0.4543568193912506, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 6728 + }, + { + "epoch": 0.7729596232266958, + "grad_norm": 0.4287487864494324, + "learning_rate": 0.0001, + "loss": 1.4149, + "step": 6729 + }, + { + "epoch": 0.7730744931365229, + "grad_norm": 0.44154518842697144, + "learning_rate": 0.0001, + "loss": 1.4262, + "step": 6730 + }, + { + "epoch": 0.77318936304635, + "grad_norm": 0.44655200839042664, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 6731 + }, + { + "epoch": 0.7733042329561771, + "grad_norm": 0.5005887150764465, + "learning_rate": 0.0001, + "loss": 1.4248, + "step": 6732 + }, + { + "epoch": 0.7734191028660042, + "grad_norm": 0.5025423169136047, + "learning_rate": 0.0001, + "loss": 1.7576, + "step": 6733 + }, + { + "epoch": 0.7735339727758314, + "grad_norm": 0.5430256128311157, + "learning_rate": 0.0001, + "loss": 1.7347, + "step": 6734 + }, + { + "epoch": 0.7736488426856585, + "grad_norm": 0.4566929340362549, + "learning_rate": 0.0001, + "loss": 1.4515, + "step": 6735 + }, + { + "epoch": 0.7737637125954856, + "grad_norm": 0.45113489031791687, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 6736 + }, + { + "epoch": 0.7738785825053127, + "grad_norm": 0.44897618889808655, + "learning_rate": 0.0001, + "loss": 1.6176, + "step": 6737 + }, + { + "epoch": 0.7739934524151398, + "grad_norm": 0.4536812901496887, + "learning_rate": 0.0001, + "loss": 1.6545, + "step": 6738 + }, + { + "epoch": 0.774108322324967, + "grad_norm": 0.4436149299144745, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 6739 + }, + { + "epoch": 0.7742231922347941, + "grad_norm": 0.4435299336910248, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 6740 + }, + { + "epoch": 0.7743380621446212, + "grad_norm": 0.44554266333580017, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 6741 + }, + { + "epoch": 0.7744529320544483, + "grad_norm": 0.45656824111938477, + "learning_rate": 0.0001, + "loss": 1.539, + "step": 6742 + }, + { + "epoch": 0.7745678019642754, + "grad_norm": 0.4641306698322296, + "learning_rate": 0.0001, + "loss": 1.6619, + "step": 6743 + }, + { + "epoch": 0.7746826718741026, + "grad_norm": 0.48745623230934143, + "learning_rate": 0.0001, + "loss": 1.603, + "step": 6744 + }, + { + "epoch": 0.7747975417839297, + "grad_norm": 0.4215722978115082, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 6745 + }, + { + "epoch": 0.7749124116937568, + "grad_norm": 0.49982690811157227, + "learning_rate": 0.0001, + "loss": 1.6318, + "step": 6746 + }, + { + "epoch": 0.7750272816035839, + "grad_norm": 0.5177478194236755, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 6747 + }, + { + "epoch": 0.775142151513411, + "grad_norm": 0.4642798900604248, + "learning_rate": 0.0001, + "loss": 1.7246, + "step": 6748 + }, + { + "epoch": 0.7752570214232382, + "grad_norm": 0.5008178949356079, + "learning_rate": 0.0001, + "loss": 1.5999, + "step": 6749 + }, + { + "epoch": 0.7753718913330653, + "grad_norm": 0.4520767033100128, + "learning_rate": 0.0001, + "loss": 1.4705, + "step": 6750 + }, + { + "epoch": 0.7754867612428924, + "grad_norm": 0.4959637224674225, + "learning_rate": 0.0001, + "loss": 1.6124, + "step": 6751 + }, + { + "epoch": 0.7756016311527195, + "grad_norm": 0.4342813491821289, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 6752 + }, + { + "epoch": 0.7757165010625466, + "grad_norm": 0.45691919326782227, + "learning_rate": 0.0001, + "loss": 1.7955, + "step": 6753 + }, + { + "epoch": 0.7758313709723738, + "grad_norm": 0.44725337624549866, + "learning_rate": 0.0001, + "loss": 1.4242, + "step": 6754 + }, + { + "epoch": 0.7759462408822009, + "grad_norm": 0.4812532365322113, + "learning_rate": 0.0001, + "loss": 1.6946, + "step": 6755 + }, + { + "epoch": 0.776061110792028, + "grad_norm": 0.40750646591186523, + "learning_rate": 0.0001, + "loss": 1.5516, + "step": 6756 + }, + { + "epoch": 0.7761759807018551, + "grad_norm": 0.482294499874115, + "learning_rate": 0.0001, + "loss": 1.6497, + "step": 6757 + }, + { + "epoch": 0.7762908506116822, + "grad_norm": 0.4344806969165802, + "learning_rate": 0.0001, + "loss": 1.4736, + "step": 6758 + }, + { + "epoch": 0.7764057205215094, + "grad_norm": 0.46780574321746826, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 6759 + }, + { + "epoch": 0.7765205904313365, + "grad_norm": 0.4722866714000702, + "learning_rate": 0.0001, + "loss": 1.8075, + "step": 6760 + }, + { + "epoch": 0.7766354603411636, + "grad_norm": 0.45282799005508423, + "learning_rate": 0.0001, + "loss": 1.5938, + "step": 6761 + }, + { + "epoch": 0.7767503302509907, + "grad_norm": 0.4654501974582672, + "learning_rate": 0.0001, + "loss": 1.7371, + "step": 6762 + }, + { + "epoch": 0.7768652001608178, + "grad_norm": 0.46721896529197693, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 6763 + }, + { + "epoch": 0.776980070070645, + "grad_norm": 0.47334960103034973, + "learning_rate": 0.0001, + "loss": 1.6024, + "step": 6764 + }, + { + "epoch": 0.7770949399804721, + "grad_norm": 0.46868133544921875, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 6765 + }, + { + "epoch": 0.7772098098902992, + "grad_norm": 0.45955634117126465, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 6766 + }, + { + "epoch": 0.7773246798001263, + "grad_norm": 0.43872690200805664, + "learning_rate": 0.0001, + "loss": 1.3424, + "step": 6767 + }, + { + "epoch": 0.7774395497099534, + "grad_norm": 0.5430575609207153, + "learning_rate": 0.0001, + "loss": 1.8176, + "step": 6768 + }, + { + "epoch": 0.7775544196197806, + "grad_norm": 0.4751816689968109, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 6769 + }, + { + "epoch": 0.7776692895296077, + "grad_norm": 0.42308947443962097, + "learning_rate": 0.0001, + "loss": 1.4672, + "step": 6770 + }, + { + "epoch": 0.7777841594394348, + "grad_norm": 0.4343721866607666, + "learning_rate": 0.0001, + "loss": 1.5106, + "step": 6771 + }, + { + "epoch": 0.7778990293492619, + "grad_norm": 0.4403552711009979, + "learning_rate": 0.0001, + "loss": 1.5326, + "step": 6772 + }, + { + "epoch": 0.778013899259089, + "grad_norm": 0.46488016843795776, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 6773 + }, + { + "epoch": 0.7781287691689162, + "grad_norm": 0.43037131428718567, + "learning_rate": 0.0001, + "loss": 1.5796, + "step": 6774 + }, + { + "epoch": 0.7782436390787433, + "grad_norm": 0.42041462659835815, + "learning_rate": 0.0001, + "loss": 1.4634, + "step": 6775 + }, + { + "epoch": 0.7783585089885704, + "grad_norm": 0.46670305728912354, + "learning_rate": 0.0001, + "loss": 1.7338, + "step": 6776 + }, + { + "epoch": 0.7784733788983975, + "grad_norm": 0.44566023349761963, + "learning_rate": 0.0001, + "loss": 1.4879, + "step": 6777 + }, + { + "epoch": 0.7785882488082246, + "grad_norm": 0.46468743681907654, + "learning_rate": 0.0001, + "loss": 1.4851, + "step": 6778 + }, + { + "epoch": 0.7787031187180518, + "grad_norm": 0.4546216130256653, + "learning_rate": 0.0001, + "loss": 1.4767, + "step": 6779 + }, + { + "epoch": 0.7788179886278789, + "grad_norm": 0.41460397839546204, + "learning_rate": 0.0001, + "loss": 1.5568, + "step": 6780 + }, + { + "epoch": 0.778932858537706, + "grad_norm": 0.43776392936706543, + "learning_rate": 0.0001, + "loss": 1.5138, + "step": 6781 + }, + { + "epoch": 0.7790477284475331, + "grad_norm": 0.42901432514190674, + "learning_rate": 0.0001, + "loss": 1.4523, + "step": 6782 + }, + { + "epoch": 0.7791625983573602, + "grad_norm": 0.4392737150192261, + "learning_rate": 0.0001, + "loss": 1.5261, + "step": 6783 + }, + { + "epoch": 0.7792774682671874, + "grad_norm": 0.5367296934127808, + "learning_rate": 0.0001, + "loss": 1.7535, + "step": 6784 + }, + { + "epoch": 0.7793923381770145, + "grad_norm": 0.48000702261924744, + "learning_rate": 0.0001, + "loss": 1.6611, + "step": 6785 + }, + { + "epoch": 0.7795072080868416, + "grad_norm": 0.42784345149993896, + "learning_rate": 0.0001, + "loss": 1.5054, + "step": 6786 + }, + { + "epoch": 0.7796220779966688, + "grad_norm": 0.4323700964450836, + "learning_rate": 0.0001, + "loss": 1.5182, + "step": 6787 + }, + { + "epoch": 0.779736947906496, + "grad_norm": 0.46200308203697205, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 6788 + }, + { + "epoch": 0.7798518178163231, + "grad_norm": 0.4896979033946991, + "learning_rate": 0.0001, + "loss": 1.698, + "step": 6789 + }, + { + "epoch": 0.7799666877261502, + "grad_norm": 0.4582618474960327, + "learning_rate": 0.0001, + "loss": 1.4937, + "step": 6790 + }, + { + "epoch": 0.7800815576359773, + "grad_norm": 0.45183730125427246, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 6791 + }, + { + "epoch": 0.7801964275458044, + "grad_norm": 0.45246532559394836, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 6792 + }, + { + "epoch": 0.7803112974556315, + "grad_norm": 0.453540176153183, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 6793 + }, + { + "epoch": 0.7804261673654587, + "grad_norm": 0.460021436214447, + "learning_rate": 0.0001, + "loss": 1.5578, + "step": 6794 + }, + { + "epoch": 0.7805410372752858, + "grad_norm": 0.5325375199317932, + "learning_rate": 0.0001, + "loss": 1.7254, + "step": 6795 + }, + { + "epoch": 0.7806559071851129, + "grad_norm": 0.41828301548957825, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 6796 + }, + { + "epoch": 0.78077077709494, + "grad_norm": 0.4506258964538574, + "learning_rate": 0.0001, + "loss": 1.587, + "step": 6797 + }, + { + "epoch": 0.7808856470047671, + "grad_norm": 0.4368896484375, + "learning_rate": 0.0001, + "loss": 1.5261, + "step": 6798 + }, + { + "epoch": 0.7810005169145943, + "grad_norm": 0.4336557388305664, + "learning_rate": 0.0001, + "loss": 1.4341, + "step": 6799 + }, + { + "epoch": 0.7811153868244214, + "grad_norm": 0.47496965527534485, + "learning_rate": 0.0001, + "loss": 1.7202, + "step": 6800 + }, + { + "epoch": 0.7812302567342485, + "grad_norm": 0.44038236141204834, + "learning_rate": 0.0001, + "loss": 1.7002, + "step": 6801 + }, + { + "epoch": 0.7813451266440756, + "grad_norm": 0.44530773162841797, + "learning_rate": 0.0001, + "loss": 1.668, + "step": 6802 + }, + { + "epoch": 0.7814599965539027, + "grad_norm": 0.4356937110424042, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 6803 + }, + { + "epoch": 0.7815748664637299, + "grad_norm": 0.4739866256713867, + "learning_rate": 0.0001, + "loss": 1.6232, + "step": 6804 + }, + { + "epoch": 0.781689736373557, + "grad_norm": 0.5034109950065613, + "learning_rate": 0.0001, + "loss": 1.7754, + "step": 6805 + }, + { + "epoch": 0.7818046062833841, + "grad_norm": 0.46401265263557434, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 6806 + }, + { + "epoch": 0.7819194761932112, + "grad_norm": 0.44969433546066284, + "learning_rate": 0.0001, + "loss": 1.6893, + "step": 6807 + }, + { + "epoch": 0.7820343461030383, + "grad_norm": 0.48960667848587036, + "learning_rate": 0.0001, + "loss": 1.8962, + "step": 6808 + }, + { + "epoch": 0.7821492160128655, + "grad_norm": 0.41496121883392334, + "learning_rate": 0.0001, + "loss": 1.3764, + "step": 6809 + }, + { + "epoch": 0.7822640859226926, + "grad_norm": 0.4606899917125702, + "learning_rate": 0.0001, + "loss": 1.6955, + "step": 6810 + }, + { + "epoch": 0.7823789558325197, + "grad_norm": 0.5153933763504028, + "learning_rate": 0.0001, + "loss": 1.8612, + "step": 6811 + }, + { + "epoch": 0.7824938257423468, + "grad_norm": 0.4672335684299469, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 6812 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 0.4316304326057434, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 6813 + }, + { + "epoch": 0.7827235655620011, + "grad_norm": 0.4348050355911255, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 6814 + }, + { + "epoch": 0.7828384354718282, + "grad_norm": 0.44443196058273315, + "learning_rate": 0.0001, + "loss": 1.6767, + "step": 6815 + }, + { + "epoch": 0.7829533053816553, + "grad_norm": 0.45539575815200806, + "learning_rate": 0.0001, + "loss": 1.6111, + "step": 6816 + }, + { + "epoch": 0.7830681752914824, + "grad_norm": 0.42358124256134033, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 6817 + }, + { + "epoch": 0.7831830452013095, + "grad_norm": 0.444414347410202, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 6818 + }, + { + "epoch": 0.7832979151111367, + "grad_norm": 0.45530837774276733, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 6819 + }, + { + "epoch": 0.7834127850209638, + "grad_norm": 0.4563850462436676, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 6820 + }, + { + "epoch": 0.7835276549307909, + "grad_norm": 0.43416640162467957, + "learning_rate": 0.0001, + "loss": 1.3742, + "step": 6821 + }, + { + "epoch": 0.783642524840618, + "grad_norm": 0.44997450709342957, + "learning_rate": 0.0001, + "loss": 1.6559, + "step": 6822 + }, + { + "epoch": 0.7837573947504451, + "grad_norm": 0.4280833303928375, + "learning_rate": 0.0001, + "loss": 1.4376, + "step": 6823 + }, + { + "epoch": 0.7838722646602723, + "grad_norm": 0.4517759680747986, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 6824 + }, + { + "epoch": 0.7839871345700994, + "grad_norm": 0.45241299271583557, + "learning_rate": 0.0001, + "loss": 1.3445, + "step": 6825 + }, + { + "epoch": 0.7841020044799265, + "grad_norm": 0.4801664352416992, + "learning_rate": 0.0001, + "loss": 1.7256, + "step": 6826 + }, + { + "epoch": 0.7842168743897536, + "grad_norm": 0.44268131256103516, + "learning_rate": 0.0001, + "loss": 1.7158, + "step": 6827 + }, + { + "epoch": 0.7843317442995807, + "grad_norm": 0.46175646781921387, + "learning_rate": 0.0001, + "loss": 1.7005, + "step": 6828 + }, + { + "epoch": 0.7844466142094079, + "grad_norm": 0.4507410228252411, + "learning_rate": 0.0001, + "loss": 1.415, + "step": 6829 + }, + { + "epoch": 0.784561484119235, + "grad_norm": 0.42673787474632263, + "learning_rate": 0.0001, + "loss": 1.4747, + "step": 6830 + }, + { + "epoch": 0.7846763540290621, + "grad_norm": 0.42640364170074463, + "learning_rate": 0.0001, + "loss": 1.3069, + "step": 6831 + }, + { + "epoch": 0.7847912239388892, + "grad_norm": 0.46200641989707947, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 6832 + }, + { + "epoch": 0.7849060938487163, + "grad_norm": 0.43748313188552856, + "learning_rate": 0.0001, + "loss": 1.5343, + "step": 6833 + }, + { + "epoch": 0.7850209637585435, + "grad_norm": 0.49261534214019775, + "learning_rate": 0.0001, + "loss": 1.9485, + "step": 6834 + }, + { + "epoch": 0.7851358336683706, + "grad_norm": 0.4890846908092499, + "learning_rate": 0.0001, + "loss": 1.6535, + "step": 6835 + }, + { + "epoch": 0.7852507035781977, + "grad_norm": 0.4594542384147644, + "learning_rate": 0.0001, + "loss": 1.5565, + "step": 6836 + }, + { + "epoch": 0.7853655734880248, + "grad_norm": 0.434871107339859, + "learning_rate": 0.0001, + "loss": 1.5575, + "step": 6837 + }, + { + "epoch": 0.785480443397852, + "grad_norm": 0.45254603028297424, + "learning_rate": 0.0001, + "loss": 1.6604, + "step": 6838 + }, + { + "epoch": 0.7855953133076791, + "grad_norm": 0.46842509508132935, + "learning_rate": 0.0001, + "loss": 1.5599, + "step": 6839 + }, + { + "epoch": 0.7857101832175062, + "grad_norm": 0.45516836643218994, + "learning_rate": 0.0001, + "loss": 1.533, + "step": 6840 + }, + { + "epoch": 0.7858250531273333, + "grad_norm": 0.4427262246608734, + "learning_rate": 0.0001, + "loss": 1.671, + "step": 6841 + }, + { + "epoch": 0.7859399230371604, + "grad_norm": 0.4520769417285919, + "learning_rate": 0.0001, + "loss": 1.6729, + "step": 6842 + }, + { + "epoch": 0.7860547929469875, + "grad_norm": 0.5243545770645142, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 6843 + }, + { + "epoch": 0.7861696628568147, + "grad_norm": 0.4461134374141693, + "learning_rate": 0.0001, + "loss": 1.4884, + "step": 6844 + }, + { + "epoch": 0.7862845327666418, + "grad_norm": 0.48180684447288513, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 6845 + }, + { + "epoch": 0.7863994026764689, + "grad_norm": 0.4266572594642639, + "learning_rate": 0.0001, + "loss": 1.4144, + "step": 6846 + }, + { + "epoch": 0.786514272586296, + "grad_norm": 0.43107786774635315, + "learning_rate": 0.0001, + "loss": 1.5917, + "step": 6847 + }, + { + "epoch": 0.7866291424961231, + "grad_norm": 0.49343347549438477, + "learning_rate": 0.0001, + "loss": 1.6641, + "step": 6848 + }, + { + "epoch": 0.7867440124059503, + "grad_norm": 0.4486045837402344, + "learning_rate": 0.0001, + "loss": 1.2793, + "step": 6849 + }, + { + "epoch": 0.7868588823157774, + "grad_norm": 0.4293416440486908, + "learning_rate": 0.0001, + "loss": 1.5555, + "step": 6850 + }, + { + "epoch": 0.7869737522256045, + "grad_norm": 0.48061510920524597, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 6851 + }, + { + "epoch": 0.7870886221354316, + "grad_norm": 0.4739622175693512, + "learning_rate": 0.0001, + "loss": 1.7332, + "step": 6852 + }, + { + "epoch": 0.7872034920452587, + "grad_norm": 0.4450550675392151, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 6853 + }, + { + "epoch": 0.7873183619550859, + "grad_norm": 0.4692426025867462, + "learning_rate": 0.0001, + "loss": 1.5579, + "step": 6854 + }, + { + "epoch": 0.787433231864913, + "grad_norm": 0.4459279775619507, + "learning_rate": 0.0001, + "loss": 1.4608, + "step": 6855 + }, + { + "epoch": 0.7875481017747401, + "grad_norm": 0.44502994418144226, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 6856 + }, + { + "epoch": 0.7876629716845672, + "grad_norm": 0.4621850550174713, + "learning_rate": 0.0001, + "loss": 1.5366, + "step": 6857 + }, + { + "epoch": 0.7877778415943943, + "grad_norm": 0.40475985407829285, + "learning_rate": 0.0001, + "loss": 1.3404, + "step": 6858 + }, + { + "epoch": 0.7878927115042215, + "grad_norm": 0.47660937905311584, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 6859 + }, + { + "epoch": 0.7880075814140486, + "grad_norm": 0.44972842931747437, + "learning_rate": 0.0001, + "loss": 1.6736, + "step": 6860 + }, + { + "epoch": 0.7881224513238757, + "grad_norm": 0.454998642206192, + "learning_rate": 0.0001, + "loss": 1.5207, + "step": 6861 + }, + { + "epoch": 0.7882373212337028, + "grad_norm": 0.4434056580066681, + "learning_rate": 0.0001, + "loss": 1.5593, + "step": 6862 + }, + { + "epoch": 0.78835219114353, + "grad_norm": 0.4378054141998291, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 6863 + }, + { + "epoch": 0.7884670610533571, + "grad_norm": 0.43698400259017944, + "learning_rate": 0.0001, + "loss": 1.7174, + "step": 6864 + }, + { + "epoch": 0.7885819309631842, + "grad_norm": 0.4555806815624237, + "learning_rate": 0.0001, + "loss": 1.606, + "step": 6865 + }, + { + "epoch": 0.7886968008730113, + "grad_norm": 0.44483011960983276, + "learning_rate": 0.0001, + "loss": 1.6285, + "step": 6866 + }, + { + "epoch": 0.7888116707828384, + "grad_norm": 0.42852985858917236, + "learning_rate": 0.0001, + "loss": 1.5767, + "step": 6867 + }, + { + "epoch": 0.7889265406926655, + "grad_norm": 0.4609838128089905, + "learning_rate": 0.0001, + "loss": 1.6478, + "step": 6868 + }, + { + "epoch": 0.7890414106024927, + "grad_norm": 0.45633402466773987, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 6869 + }, + { + "epoch": 0.7891562805123198, + "grad_norm": 0.44641777873039246, + "learning_rate": 0.0001, + "loss": 1.4307, + "step": 6870 + }, + { + "epoch": 0.7892711504221469, + "grad_norm": 0.45838481187820435, + "learning_rate": 0.0001, + "loss": 1.7874, + "step": 6871 + }, + { + "epoch": 0.789386020331974, + "grad_norm": 0.44452112913131714, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 6872 + }, + { + "epoch": 0.7895008902418011, + "grad_norm": 0.44366639852523804, + "learning_rate": 0.0001, + "loss": 1.6561, + "step": 6873 + }, + { + "epoch": 0.7896157601516283, + "grad_norm": 0.44662636518478394, + "learning_rate": 0.0001, + "loss": 1.5177, + "step": 6874 + }, + { + "epoch": 0.7897306300614554, + "grad_norm": 0.46313855051994324, + "learning_rate": 0.0001, + "loss": 1.6924, + "step": 6875 + }, + { + "epoch": 0.7898454999712825, + "grad_norm": 0.4705411195755005, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 6876 + }, + { + "epoch": 0.7899603698811096, + "grad_norm": 0.4569123387336731, + "learning_rate": 0.0001, + "loss": 1.617, + "step": 6877 + }, + { + "epoch": 0.7900752397909367, + "grad_norm": 0.5129233002662659, + "learning_rate": 0.0001, + "loss": 1.9261, + "step": 6878 + }, + { + "epoch": 0.7901901097007639, + "grad_norm": 0.4295032322406769, + "learning_rate": 0.0001, + "loss": 1.4843, + "step": 6879 + }, + { + "epoch": 0.790304979610591, + "grad_norm": 0.44028350710868835, + "learning_rate": 0.0001, + "loss": 1.7061, + "step": 6880 + }, + { + "epoch": 0.7904198495204181, + "grad_norm": 0.48637375235557556, + "learning_rate": 0.0001, + "loss": 1.6503, + "step": 6881 + }, + { + "epoch": 0.7905347194302452, + "grad_norm": 0.4660567045211792, + "learning_rate": 0.0001, + "loss": 1.6903, + "step": 6882 + }, + { + "epoch": 0.7906495893400723, + "grad_norm": 0.4239864945411682, + "learning_rate": 0.0001, + "loss": 1.5828, + "step": 6883 + }, + { + "epoch": 0.7907644592498995, + "grad_norm": 0.5081626772880554, + "learning_rate": 0.0001, + "loss": 1.8917, + "step": 6884 + }, + { + "epoch": 0.7908793291597266, + "grad_norm": 0.47426649928092957, + "learning_rate": 0.0001, + "loss": 1.7486, + "step": 6885 + }, + { + "epoch": 0.7909941990695537, + "grad_norm": 0.47770363092422485, + "learning_rate": 0.0001, + "loss": 1.5376, + "step": 6886 + }, + { + "epoch": 0.7911090689793808, + "grad_norm": 0.44490906596183777, + "learning_rate": 0.0001, + "loss": 1.6119, + "step": 6887 + }, + { + "epoch": 0.791223938889208, + "grad_norm": 0.4341287612915039, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 6888 + }, + { + "epoch": 0.7913388087990351, + "grad_norm": 0.46785807609558105, + "learning_rate": 0.0001, + "loss": 1.6912, + "step": 6889 + }, + { + "epoch": 0.7914536787088622, + "grad_norm": 0.4538176655769348, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 6890 + }, + { + "epoch": 0.7915685486186893, + "grad_norm": 0.4823910593986511, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 6891 + }, + { + "epoch": 0.7916834185285164, + "grad_norm": 0.49208977818489075, + "learning_rate": 0.0001, + "loss": 1.7127, + "step": 6892 + }, + { + "epoch": 0.7917982884383435, + "grad_norm": 0.6321043968200684, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 6893 + }, + { + "epoch": 0.7919131583481707, + "grad_norm": 0.43280404806137085, + "learning_rate": 0.0001, + "loss": 1.5839, + "step": 6894 + }, + { + "epoch": 0.7920280282579978, + "grad_norm": 0.5009095072746277, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 6895 + }, + { + "epoch": 0.7921428981678249, + "grad_norm": 0.4481852650642395, + "learning_rate": 0.0001, + "loss": 1.6166, + "step": 6896 + }, + { + "epoch": 0.792257768077652, + "grad_norm": 0.4593367278575897, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 6897 + }, + { + "epoch": 0.7923726379874791, + "grad_norm": 0.5545166730880737, + "learning_rate": 0.0001, + "loss": 1.3038, + "step": 6898 + }, + { + "epoch": 0.7924875078973063, + "grad_norm": 0.4565151631832123, + "learning_rate": 0.0001, + "loss": 1.5986, + "step": 6899 + }, + { + "epoch": 0.7926023778071334, + "grad_norm": 0.45123347640037537, + "learning_rate": 0.0001, + "loss": 1.7472, + "step": 6900 + }, + { + "epoch": 0.7927172477169605, + "grad_norm": 0.46333909034729004, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 6901 + }, + { + "epoch": 0.7928321176267876, + "grad_norm": 0.4783911108970642, + "learning_rate": 0.0001, + "loss": 1.5615, + "step": 6902 + }, + { + "epoch": 0.7929469875366147, + "grad_norm": 0.48153024911880493, + "learning_rate": 0.0001, + "loss": 1.6846, + "step": 6903 + }, + { + "epoch": 0.7930618574464419, + "grad_norm": 0.45357227325439453, + "learning_rate": 0.0001, + "loss": 1.6658, + "step": 6904 + }, + { + "epoch": 0.793176727356269, + "grad_norm": 0.44956380128860474, + "learning_rate": 0.0001, + "loss": 1.6623, + "step": 6905 + }, + { + "epoch": 0.7932915972660961, + "grad_norm": 0.4557205140590668, + "learning_rate": 0.0001, + "loss": 1.4921, + "step": 6906 + }, + { + "epoch": 0.7934064671759232, + "grad_norm": 0.45987358689308167, + "learning_rate": 0.0001, + "loss": 1.6181, + "step": 6907 + }, + { + "epoch": 0.7935213370857503, + "grad_norm": 0.4671670198440552, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 6908 + }, + { + "epoch": 0.7936362069955775, + "grad_norm": 0.4789045453071594, + "learning_rate": 0.0001, + "loss": 1.545, + "step": 6909 + }, + { + "epoch": 0.7937510769054046, + "grad_norm": 0.4139240086078644, + "learning_rate": 0.0001, + "loss": 1.4065, + "step": 6910 + }, + { + "epoch": 0.7938659468152317, + "grad_norm": 0.43465399742126465, + "learning_rate": 0.0001, + "loss": 1.4924, + "step": 6911 + }, + { + "epoch": 0.7939808167250588, + "grad_norm": 0.45922383666038513, + "learning_rate": 0.0001, + "loss": 1.5714, + "step": 6912 + }, + { + "epoch": 0.7940956866348859, + "grad_norm": 0.5036603808403015, + "learning_rate": 0.0001, + "loss": 1.7516, + "step": 6913 + }, + { + "epoch": 0.7942105565447131, + "grad_norm": 0.4896206855773926, + "learning_rate": 0.0001, + "loss": 1.6954, + "step": 6914 + }, + { + "epoch": 0.7943254264545402, + "grad_norm": 0.4837943911552429, + "learning_rate": 0.0001, + "loss": 1.7452, + "step": 6915 + }, + { + "epoch": 0.7944402963643673, + "grad_norm": 0.4519433081150055, + "learning_rate": 0.0001, + "loss": 1.5967, + "step": 6916 + }, + { + "epoch": 0.7945551662741944, + "grad_norm": 0.45447391271591187, + "learning_rate": 0.0001, + "loss": 1.5263, + "step": 6917 + }, + { + "epoch": 0.7946700361840215, + "grad_norm": 0.4731905460357666, + "learning_rate": 0.0001, + "loss": 1.8278, + "step": 6918 + }, + { + "epoch": 0.7947849060938487, + "grad_norm": 0.44193151593208313, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 6919 + }, + { + "epoch": 0.7948997760036758, + "grad_norm": 0.472493439912796, + "learning_rate": 0.0001, + "loss": 1.6412, + "step": 6920 + }, + { + "epoch": 0.7950146459135029, + "grad_norm": 0.46209755539894104, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 6921 + }, + { + "epoch": 0.79512951582333, + "grad_norm": 0.41559138894081116, + "learning_rate": 0.0001, + "loss": 1.6852, + "step": 6922 + }, + { + "epoch": 0.7952443857331571, + "grad_norm": 0.44188040494918823, + "learning_rate": 0.0001, + "loss": 1.5008, + "step": 6923 + }, + { + "epoch": 0.7953592556429844, + "grad_norm": 0.4449104070663452, + "learning_rate": 0.0001, + "loss": 1.5437, + "step": 6924 + }, + { + "epoch": 0.7954741255528115, + "grad_norm": 0.45927226543426514, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 6925 + }, + { + "epoch": 0.7955889954626386, + "grad_norm": 0.532455563545227, + "learning_rate": 0.0001, + "loss": 1.7761, + "step": 6926 + }, + { + "epoch": 0.7957038653724657, + "grad_norm": 0.5386255979537964, + "learning_rate": 0.0001, + "loss": 1.7247, + "step": 6927 + }, + { + "epoch": 0.7958187352822929, + "grad_norm": 0.449303537607193, + "learning_rate": 0.0001, + "loss": 1.4939, + "step": 6928 + }, + { + "epoch": 0.79593360519212, + "grad_norm": 0.43787631392478943, + "learning_rate": 0.0001, + "loss": 1.5544, + "step": 6929 + }, + { + "epoch": 0.7960484751019471, + "grad_norm": 0.4425432085990906, + "learning_rate": 0.0001, + "loss": 1.5128, + "step": 6930 + }, + { + "epoch": 0.7961633450117742, + "grad_norm": 0.47082197666168213, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 6931 + }, + { + "epoch": 0.7962782149216013, + "grad_norm": 0.42969945073127747, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 6932 + }, + { + "epoch": 0.7963930848314285, + "grad_norm": 0.4878128468990326, + "learning_rate": 0.0001, + "loss": 1.4099, + "step": 6933 + }, + { + "epoch": 0.7965079547412556, + "grad_norm": 0.4496504068374634, + "learning_rate": 0.0001, + "loss": 1.4885, + "step": 6934 + }, + { + "epoch": 0.7966228246510827, + "grad_norm": 0.48936036229133606, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 6935 + }, + { + "epoch": 0.7967376945609098, + "grad_norm": 0.45953816175460815, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 6936 + }, + { + "epoch": 0.7968525644707369, + "grad_norm": 0.4707304537296295, + "learning_rate": 0.0001, + "loss": 1.5218, + "step": 6937 + }, + { + "epoch": 0.796967434380564, + "grad_norm": 0.4913303256034851, + "learning_rate": 0.0001, + "loss": 1.6678, + "step": 6938 + }, + { + "epoch": 0.7970823042903912, + "grad_norm": 0.4540485739707947, + "learning_rate": 0.0001, + "loss": 1.5723, + "step": 6939 + }, + { + "epoch": 0.7971971742002183, + "grad_norm": 0.4464291036128998, + "learning_rate": 0.0001, + "loss": 1.5943, + "step": 6940 + }, + { + "epoch": 0.7973120441100454, + "grad_norm": 0.4318457245826721, + "learning_rate": 0.0001, + "loss": 1.5276, + "step": 6941 + }, + { + "epoch": 0.7974269140198725, + "grad_norm": 0.47109347581863403, + "learning_rate": 0.0001, + "loss": 1.5672, + "step": 6942 + }, + { + "epoch": 0.7975417839296997, + "grad_norm": 0.47364145517349243, + "learning_rate": 0.0001, + "loss": 1.7189, + "step": 6943 + }, + { + "epoch": 0.7976566538395268, + "grad_norm": 0.5263649821281433, + "learning_rate": 0.0001, + "loss": 1.3896, + "step": 6944 + }, + { + "epoch": 0.7977715237493539, + "grad_norm": 0.45865774154663086, + "learning_rate": 0.0001, + "loss": 1.4919, + "step": 6945 + }, + { + "epoch": 0.797886393659181, + "grad_norm": 0.4584158658981323, + "learning_rate": 0.0001, + "loss": 1.7354, + "step": 6946 + }, + { + "epoch": 0.7980012635690081, + "grad_norm": 0.4830566346645355, + "learning_rate": 0.0001, + "loss": 1.525, + "step": 6947 + }, + { + "epoch": 0.7981161334788353, + "grad_norm": 0.4785566031932831, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 6948 + }, + { + "epoch": 0.7982310033886624, + "grad_norm": 0.4663696587085724, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 6949 + }, + { + "epoch": 0.7983458732984895, + "grad_norm": 0.48032259941101074, + "learning_rate": 0.0001, + "loss": 1.3853, + "step": 6950 + }, + { + "epoch": 0.7984607432083166, + "grad_norm": 0.4568888545036316, + "learning_rate": 0.0001, + "loss": 1.6627, + "step": 6951 + }, + { + "epoch": 0.7985756131181437, + "grad_norm": 0.458404541015625, + "learning_rate": 0.0001, + "loss": 1.5422, + "step": 6952 + }, + { + "epoch": 0.7986904830279709, + "grad_norm": 0.45047512650489807, + "learning_rate": 0.0001, + "loss": 1.5767, + "step": 6953 + }, + { + "epoch": 0.798805352937798, + "grad_norm": 0.4487355053424835, + "learning_rate": 0.0001, + "loss": 1.5318, + "step": 6954 + }, + { + "epoch": 0.7989202228476251, + "grad_norm": 0.46647393703460693, + "learning_rate": 0.0001, + "loss": 1.5918, + "step": 6955 + }, + { + "epoch": 0.7990350927574522, + "grad_norm": 0.4552063047885895, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 6956 + }, + { + "epoch": 0.7991499626672793, + "grad_norm": 0.5072106719017029, + "learning_rate": 0.0001, + "loss": 1.8092, + "step": 6957 + }, + { + "epoch": 0.7992648325771065, + "grad_norm": 0.4884167015552521, + "learning_rate": 0.0001, + "loss": 1.6688, + "step": 6958 + }, + { + "epoch": 0.7993797024869336, + "grad_norm": 0.4894075095653534, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 6959 + }, + { + "epoch": 0.7994945723967607, + "grad_norm": 0.5430724024772644, + "learning_rate": 0.0001, + "loss": 1.4763, + "step": 6960 + }, + { + "epoch": 0.7996094423065878, + "grad_norm": 0.49636295437812805, + "learning_rate": 0.0001, + "loss": 1.7136, + "step": 6961 + }, + { + "epoch": 0.7997243122164149, + "grad_norm": 0.46621257066726685, + "learning_rate": 0.0001, + "loss": 1.7219, + "step": 6962 + }, + { + "epoch": 0.799839182126242, + "grad_norm": 0.4584997296333313, + "learning_rate": 0.0001, + "loss": 1.6187, + "step": 6963 + }, + { + "epoch": 0.7999540520360692, + "grad_norm": 0.4953577518463135, + "learning_rate": 0.0001, + "loss": 1.744, + "step": 6964 + }, + { + "epoch": 0.8000689219458963, + "grad_norm": 0.5007856488227844, + "learning_rate": 0.0001, + "loss": 1.6076, + "step": 6965 + }, + { + "epoch": 0.8001837918557234, + "grad_norm": 0.46211037039756775, + "learning_rate": 0.0001, + "loss": 1.6078, + "step": 6966 + }, + { + "epoch": 0.8002986617655505, + "grad_norm": 0.47474467754364014, + "learning_rate": 0.0001, + "loss": 1.7794, + "step": 6967 + }, + { + "epoch": 0.8004135316753777, + "grad_norm": 0.4621082544326782, + "learning_rate": 0.0001, + "loss": 1.6307, + "step": 6968 + }, + { + "epoch": 0.8005284015852048, + "grad_norm": 0.4746955335140228, + "learning_rate": 0.0001, + "loss": 1.6833, + "step": 6969 + }, + { + "epoch": 0.8006432714950319, + "grad_norm": 0.44209524989128113, + "learning_rate": 0.0001, + "loss": 1.7345, + "step": 6970 + }, + { + "epoch": 0.800758141404859, + "grad_norm": 0.48901766538619995, + "learning_rate": 0.0001, + "loss": 1.672, + "step": 6971 + }, + { + "epoch": 0.8008730113146861, + "grad_norm": 0.5113202333450317, + "learning_rate": 0.0001, + "loss": 1.5684, + "step": 6972 + }, + { + "epoch": 0.8009878812245133, + "grad_norm": 0.4625761806964874, + "learning_rate": 0.0001, + "loss": 1.6895, + "step": 6973 + }, + { + "epoch": 0.8011027511343404, + "grad_norm": 0.4590431749820709, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 6974 + }, + { + "epoch": 0.8012176210441675, + "grad_norm": 0.4370403289794922, + "learning_rate": 0.0001, + "loss": 1.5681, + "step": 6975 + }, + { + "epoch": 0.8013324909539946, + "grad_norm": 0.4700826406478882, + "learning_rate": 0.0001, + "loss": 1.6091, + "step": 6976 + }, + { + "epoch": 0.8014473608638217, + "grad_norm": 0.5367459058761597, + "learning_rate": 0.0001, + "loss": 1.7318, + "step": 6977 + }, + { + "epoch": 0.8015622307736489, + "grad_norm": 0.4377821087837219, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 6978 + }, + { + "epoch": 0.801677100683476, + "grad_norm": 0.45518580079078674, + "learning_rate": 0.0001, + "loss": 1.5111, + "step": 6979 + }, + { + "epoch": 0.8017919705933031, + "grad_norm": 0.4376772344112396, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 6980 + }, + { + "epoch": 0.8019068405031302, + "grad_norm": 0.45350295305252075, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 6981 + }, + { + "epoch": 0.8020217104129573, + "grad_norm": 0.4758756458759308, + "learning_rate": 0.0001, + "loss": 1.7442, + "step": 6982 + }, + { + "epoch": 0.8021365803227845, + "grad_norm": 0.43972933292388916, + "learning_rate": 0.0001, + "loss": 1.5419, + "step": 6983 + }, + { + "epoch": 0.8022514502326116, + "grad_norm": 0.45926135778427124, + "learning_rate": 0.0001, + "loss": 1.7564, + "step": 6984 + }, + { + "epoch": 0.8023663201424387, + "grad_norm": 0.46066513657569885, + "learning_rate": 0.0001, + "loss": 1.5823, + "step": 6985 + }, + { + "epoch": 0.8024811900522658, + "grad_norm": 0.4452419579029083, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 6986 + }, + { + "epoch": 0.8025960599620929, + "grad_norm": 0.44738146662712097, + "learning_rate": 0.0001, + "loss": 1.5042, + "step": 6987 + }, + { + "epoch": 0.80271092987192, + "grad_norm": 0.4505269229412079, + "learning_rate": 0.0001, + "loss": 1.5, + "step": 6988 + }, + { + "epoch": 0.8028257997817472, + "grad_norm": 0.4531947672367096, + "learning_rate": 0.0001, + "loss": 1.6508, + "step": 6989 + }, + { + "epoch": 0.8029406696915743, + "grad_norm": 0.5062926411628723, + "learning_rate": 0.0001, + "loss": 1.8187, + "step": 6990 + }, + { + "epoch": 0.8030555396014014, + "grad_norm": 0.472827285528183, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 6991 + }, + { + "epoch": 0.8031704095112285, + "grad_norm": 0.5587745308876038, + "learning_rate": 0.0001, + "loss": 1.7311, + "step": 6992 + }, + { + "epoch": 0.8032852794210557, + "grad_norm": 0.43462514877319336, + "learning_rate": 0.0001, + "loss": 1.3869, + "step": 6993 + }, + { + "epoch": 0.8034001493308828, + "grad_norm": 0.45932313799858093, + "learning_rate": 0.0001, + "loss": 1.7735, + "step": 6994 + }, + { + "epoch": 0.8035150192407099, + "grad_norm": 0.47527042031288147, + "learning_rate": 0.0001, + "loss": 1.7342, + "step": 6995 + }, + { + "epoch": 0.803629889150537, + "grad_norm": 0.4415908455848694, + "learning_rate": 0.0001, + "loss": 1.4238, + "step": 6996 + }, + { + "epoch": 0.8037447590603641, + "grad_norm": 0.4424525499343872, + "learning_rate": 0.0001, + "loss": 1.8068, + "step": 6997 + }, + { + "epoch": 0.8038596289701913, + "grad_norm": 0.45409083366394043, + "learning_rate": 0.0001, + "loss": 1.4999, + "step": 6998 + }, + { + "epoch": 0.8039744988800184, + "grad_norm": 0.5033005475997925, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 6999 + }, + { + "epoch": 0.8040893687898455, + "grad_norm": 0.4428901970386505, + "learning_rate": 0.0001, + "loss": 1.5321, + "step": 7000 + }, + { + "epoch": 0.8042042386996726, + "grad_norm": 0.4608984589576721, + "learning_rate": 0.0001, + "loss": 1.5062, + "step": 7001 + }, + { + "epoch": 0.8043191086094997, + "grad_norm": 0.47772228717803955, + "learning_rate": 0.0001, + "loss": 1.602, + "step": 7002 + }, + { + "epoch": 0.8044339785193269, + "grad_norm": 0.4590427577495575, + "learning_rate": 0.0001, + "loss": 1.7222, + "step": 7003 + }, + { + "epoch": 0.804548848429154, + "grad_norm": 0.5008092522621155, + "learning_rate": 0.0001, + "loss": 1.8124, + "step": 7004 + }, + { + "epoch": 0.8046637183389811, + "grad_norm": 0.428852915763855, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 7005 + }, + { + "epoch": 0.8047785882488082, + "grad_norm": 0.46282047033309937, + "learning_rate": 0.0001, + "loss": 1.6544, + "step": 7006 + }, + { + "epoch": 0.8048934581586353, + "grad_norm": 0.4332880675792694, + "learning_rate": 0.0001, + "loss": 1.796, + "step": 7007 + }, + { + "epoch": 0.8050083280684625, + "grad_norm": 0.4656613767147064, + "learning_rate": 0.0001, + "loss": 1.723, + "step": 7008 + }, + { + "epoch": 0.8051231979782896, + "grad_norm": 0.4826306104660034, + "learning_rate": 0.0001, + "loss": 1.8894, + "step": 7009 + }, + { + "epoch": 0.8052380678881167, + "grad_norm": 0.4590745270252228, + "learning_rate": 0.0001, + "loss": 1.7591, + "step": 7010 + }, + { + "epoch": 0.8053529377979438, + "grad_norm": 0.48546114563941956, + "learning_rate": 0.0001, + "loss": 1.7585, + "step": 7011 + }, + { + "epoch": 0.8054678077077709, + "grad_norm": 0.4384916424751282, + "learning_rate": 0.0001, + "loss": 1.6979, + "step": 7012 + }, + { + "epoch": 0.805582677617598, + "grad_norm": 0.47794288396835327, + "learning_rate": 0.0001, + "loss": 1.7491, + "step": 7013 + }, + { + "epoch": 0.8056975475274252, + "grad_norm": 0.4551166892051697, + "learning_rate": 0.0001, + "loss": 1.6755, + "step": 7014 + }, + { + "epoch": 0.8058124174372523, + "grad_norm": 0.4861622452735901, + "learning_rate": 0.0001, + "loss": 1.5717, + "step": 7015 + }, + { + "epoch": 0.8059272873470794, + "grad_norm": 0.4588170051574707, + "learning_rate": 0.0001, + "loss": 1.4308, + "step": 7016 + }, + { + "epoch": 0.8060421572569065, + "grad_norm": 0.46102410554885864, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 7017 + }, + { + "epoch": 0.8061570271667337, + "grad_norm": 0.44939014315605164, + "learning_rate": 0.0001, + "loss": 1.6842, + "step": 7018 + }, + { + "epoch": 0.8062718970765608, + "grad_norm": 0.4488585293292999, + "learning_rate": 0.0001, + "loss": 1.485, + "step": 7019 + }, + { + "epoch": 0.8063867669863879, + "grad_norm": 0.4329974055290222, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 7020 + }, + { + "epoch": 0.806501636896215, + "grad_norm": 0.4898880124092102, + "learning_rate": 0.0001, + "loss": 1.7087, + "step": 7021 + }, + { + "epoch": 0.8066165068060421, + "grad_norm": 0.4246252477169037, + "learning_rate": 0.0001, + "loss": 1.4405, + "step": 7022 + }, + { + "epoch": 0.8067313767158693, + "grad_norm": 0.49754396080970764, + "learning_rate": 0.0001, + "loss": 1.4626, + "step": 7023 + }, + { + "epoch": 0.8068462466256964, + "grad_norm": 0.4949999749660492, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 7024 + }, + { + "epoch": 0.8069611165355235, + "grad_norm": 0.4603281617164612, + "learning_rate": 0.0001, + "loss": 1.5919, + "step": 7025 + }, + { + "epoch": 0.8070759864453506, + "grad_norm": 0.4442635476589203, + "learning_rate": 0.0001, + "loss": 1.6615, + "step": 7026 + }, + { + "epoch": 0.8071908563551777, + "grad_norm": 0.46228450536727905, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 7027 + }, + { + "epoch": 0.8073057262650049, + "grad_norm": 0.46300187706947327, + "learning_rate": 0.0001, + "loss": 1.5931, + "step": 7028 + }, + { + "epoch": 0.807420596174832, + "grad_norm": 0.4652291536331177, + "learning_rate": 0.0001, + "loss": 1.7684, + "step": 7029 + }, + { + "epoch": 0.8075354660846591, + "grad_norm": 0.4570204019546509, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 7030 + }, + { + "epoch": 0.8076503359944862, + "grad_norm": 0.453274667263031, + "learning_rate": 0.0001, + "loss": 1.5297, + "step": 7031 + }, + { + "epoch": 0.8077652059043133, + "grad_norm": 0.4465121030807495, + "learning_rate": 0.0001, + "loss": 1.4167, + "step": 7032 + }, + { + "epoch": 0.8078800758141405, + "grad_norm": 0.4628574550151825, + "learning_rate": 0.0001, + "loss": 1.5498, + "step": 7033 + }, + { + "epoch": 0.8079949457239676, + "grad_norm": 0.4569374918937683, + "learning_rate": 0.0001, + "loss": 1.4265, + "step": 7034 + }, + { + "epoch": 0.8081098156337947, + "grad_norm": 0.43578875064849854, + "learning_rate": 0.0001, + "loss": 1.4849, + "step": 7035 + }, + { + "epoch": 0.8082246855436218, + "grad_norm": 0.45690855383872986, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 7036 + }, + { + "epoch": 0.8083395554534489, + "grad_norm": 0.4643135666847229, + "learning_rate": 0.0001, + "loss": 1.6545, + "step": 7037 + }, + { + "epoch": 0.808454425363276, + "grad_norm": 0.4422778785228729, + "learning_rate": 0.0001, + "loss": 1.4945, + "step": 7038 + }, + { + "epoch": 0.8085692952731032, + "grad_norm": 0.5053475499153137, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 7039 + }, + { + "epoch": 0.8086841651829303, + "grad_norm": 0.4230154752731323, + "learning_rate": 0.0001, + "loss": 1.4409, + "step": 7040 + }, + { + "epoch": 0.8087990350927574, + "grad_norm": 0.45723220705986023, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 7041 + }, + { + "epoch": 0.8089139050025845, + "grad_norm": 0.43414244055747986, + "learning_rate": 0.0001, + "loss": 1.4774, + "step": 7042 + }, + { + "epoch": 0.8090287749124117, + "grad_norm": 0.45168161392211914, + "learning_rate": 0.0001, + "loss": 1.468, + "step": 7043 + }, + { + "epoch": 0.8091436448222388, + "grad_norm": 0.4413895308971405, + "learning_rate": 0.0001, + "loss": 1.5116, + "step": 7044 + }, + { + "epoch": 0.8092585147320659, + "grad_norm": 0.45228877663612366, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 7045 + }, + { + "epoch": 0.809373384641893, + "grad_norm": 0.4586316645145416, + "learning_rate": 0.0001, + "loss": 1.6476, + "step": 7046 + }, + { + "epoch": 0.8094882545517201, + "grad_norm": 0.4477645456790924, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 7047 + }, + { + "epoch": 0.8096031244615473, + "grad_norm": 0.5138921737670898, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 7048 + }, + { + "epoch": 0.8097179943713744, + "grad_norm": 0.4743337333202362, + "learning_rate": 0.0001, + "loss": 1.8095, + "step": 7049 + }, + { + "epoch": 0.8098328642812015, + "grad_norm": 0.47952887415885925, + "learning_rate": 0.0001, + "loss": 1.8992, + "step": 7050 + }, + { + "epoch": 0.8099477341910286, + "grad_norm": 0.4406343102455139, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 7051 + }, + { + "epoch": 0.8100626041008557, + "grad_norm": 0.5044456720352173, + "learning_rate": 0.0001, + "loss": 1.6264, + "step": 7052 + }, + { + "epoch": 0.8101774740106829, + "grad_norm": 0.45316067337989807, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 7053 + }, + { + "epoch": 0.81029234392051, + "grad_norm": 0.4341808259487152, + "learning_rate": 0.0001, + "loss": 1.3805, + "step": 7054 + }, + { + "epoch": 0.8104072138303371, + "grad_norm": 0.44615742564201355, + "learning_rate": 0.0001, + "loss": 1.577, + "step": 7055 + }, + { + "epoch": 0.8105220837401642, + "grad_norm": 0.5084645748138428, + "learning_rate": 0.0001, + "loss": 1.4708, + "step": 7056 + }, + { + "epoch": 0.8106369536499913, + "grad_norm": 0.4395124912261963, + "learning_rate": 0.0001, + "loss": 1.4493, + "step": 7057 + }, + { + "epoch": 0.8107518235598185, + "grad_norm": 0.4524141550064087, + "learning_rate": 0.0001, + "loss": 1.5445, + "step": 7058 + }, + { + "epoch": 0.8108666934696456, + "grad_norm": 0.4532511532306671, + "learning_rate": 0.0001, + "loss": 1.6722, + "step": 7059 + }, + { + "epoch": 0.8109815633794727, + "grad_norm": 0.45477429032325745, + "learning_rate": 0.0001, + "loss": 1.5978, + "step": 7060 + }, + { + "epoch": 0.8110964332892999, + "grad_norm": 0.4670192003250122, + "learning_rate": 0.0001, + "loss": 1.7155, + "step": 7061 + }, + { + "epoch": 0.811211303199127, + "grad_norm": 0.47830730676651, + "learning_rate": 0.0001, + "loss": 1.6898, + "step": 7062 + }, + { + "epoch": 0.8113261731089542, + "grad_norm": 0.4505847692489624, + "learning_rate": 0.0001, + "loss": 1.5469, + "step": 7063 + }, + { + "epoch": 0.8114410430187813, + "grad_norm": 0.44358721375465393, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 7064 + }, + { + "epoch": 0.8115559129286084, + "grad_norm": 0.45430463552474976, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 7065 + }, + { + "epoch": 0.8116707828384355, + "grad_norm": 0.40954452753067017, + "learning_rate": 0.0001, + "loss": 1.4936, + "step": 7066 + }, + { + "epoch": 0.8117856527482626, + "grad_norm": 0.4465819001197815, + "learning_rate": 0.0001, + "loss": 1.641, + "step": 7067 + }, + { + "epoch": 0.8119005226580898, + "grad_norm": 0.45103660225868225, + "learning_rate": 0.0001, + "loss": 1.5577, + "step": 7068 + }, + { + "epoch": 0.8120153925679169, + "grad_norm": 0.44100210070610046, + "learning_rate": 0.0001, + "loss": 1.5393, + "step": 7069 + }, + { + "epoch": 0.812130262477744, + "grad_norm": 0.45234718918800354, + "learning_rate": 0.0001, + "loss": 1.5397, + "step": 7070 + }, + { + "epoch": 0.8122451323875711, + "grad_norm": 0.4711310565471649, + "learning_rate": 0.0001, + "loss": 1.6381, + "step": 7071 + }, + { + "epoch": 0.8123600022973982, + "grad_norm": 0.4617573618888855, + "learning_rate": 0.0001, + "loss": 1.6191, + "step": 7072 + }, + { + "epoch": 0.8124748722072254, + "grad_norm": 0.4681065082550049, + "learning_rate": 0.0001, + "loss": 1.3769, + "step": 7073 + }, + { + "epoch": 0.8125897421170525, + "grad_norm": 0.500937819480896, + "learning_rate": 0.0001, + "loss": 1.8512, + "step": 7074 + }, + { + "epoch": 0.8127046120268796, + "grad_norm": 0.5059450268745422, + "learning_rate": 0.0001, + "loss": 1.6661, + "step": 7075 + }, + { + "epoch": 0.8128194819367067, + "grad_norm": 0.49339231848716736, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 7076 + }, + { + "epoch": 0.8129343518465338, + "grad_norm": 0.4886321723461151, + "learning_rate": 0.0001, + "loss": 1.6388, + "step": 7077 + }, + { + "epoch": 0.813049221756361, + "grad_norm": 0.4459485113620758, + "learning_rate": 0.0001, + "loss": 1.5529, + "step": 7078 + }, + { + "epoch": 0.8131640916661881, + "grad_norm": 0.45102250576019287, + "learning_rate": 0.0001, + "loss": 1.6559, + "step": 7079 + }, + { + "epoch": 0.8132789615760152, + "grad_norm": 0.46558091044425964, + "learning_rate": 0.0001, + "loss": 1.4895, + "step": 7080 + }, + { + "epoch": 0.8133938314858423, + "grad_norm": 0.45526906847953796, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 7081 + }, + { + "epoch": 0.8135087013956694, + "grad_norm": 0.44742417335510254, + "learning_rate": 0.0001, + "loss": 1.6513, + "step": 7082 + }, + { + "epoch": 0.8136235713054966, + "grad_norm": 0.44720160961151123, + "learning_rate": 0.0001, + "loss": 1.5228, + "step": 7083 + }, + { + "epoch": 0.8137384412153237, + "grad_norm": 0.4373728632926941, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 7084 + }, + { + "epoch": 0.8138533111251508, + "grad_norm": 0.4450926184654236, + "learning_rate": 0.0001, + "loss": 1.4912, + "step": 7085 + }, + { + "epoch": 0.8139681810349779, + "grad_norm": 0.44571229815483093, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 7086 + }, + { + "epoch": 0.814083050944805, + "grad_norm": 0.49900081753730774, + "learning_rate": 0.0001, + "loss": 1.5591, + "step": 7087 + }, + { + "epoch": 0.8141979208546322, + "grad_norm": 0.440686970949173, + "learning_rate": 0.0001, + "loss": 1.6891, + "step": 7088 + }, + { + "epoch": 0.8143127907644593, + "grad_norm": 0.5091794729232788, + "learning_rate": 0.0001, + "loss": 1.7744, + "step": 7089 + }, + { + "epoch": 0.8144276606742864, + "grad_norm": 0.4285080134868622, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 7090 + }, + { + "epoch": 0.8145425305841135, + "grad_norm": 0.43077725172042847, + "learning_rate": 0.0001, + "loss": 1.4802, + "step": 7091 + }, + { + "epoch": 0.8146574004939406, + "grad_norm": 0.46352505683898926, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 7092 + }, + { + "epoch": 0.8147722704037678, + "grad_norm": 0.44118359684944153, + "learning_rate": 0.0001, + "loss": 1.6337, + "step": 7093 + }, + { + "epoch": 0.8148871403135949, + "grad_norm": 0.45531347393989563, + "learning_rate": 0.0001, + "loss": 1.6527, + "step": 7094 + }, + { + "epoch": 0.815002010223422, + "grad_norm": 0.5839067697525024, + "learning_rate": 0.0001, + "loss": 1.6389, + "step": 7095 + }, + { + "epoch": 0.8151168801332491, + "grad_norm": 0.4553830623626709, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 7096 + }, + { + "epoch": 0.8152317500430762, + "grad_norm": 0.4333423972129822, + "learning_rate": 0.0001, + "loss": 1.5816, + "step": 7097 + }, + { + "epoch": 0.8153466199529034, + "grad_norm": 0.4156707525253296, + "learning_rate": 0.0001, + "loss": 1.2689, + "step": 7098 + }, + { + "epoch": 0.8154614898627305, + "grad_norm": 0.5249226093292236, + "learning_rate": 0.0001, + "loss": 1.8285, + "step": 7099 + }, + { + "epoch": 0.8155763597725576, + "grad_norm": 0.45831164717674255, + "learning_rate": 0.0001, + "loss": 1.5238, + "step": 7100 + }, + { + "epoch": 0.8156912296823847, + "grad_norm": 0.5419580340385437, + "learning_rate": 0.0001, + "loss": 1.9262, + "step": 7101 + }, + { + "epoch": 0.8158060995922118, + "grad_norm": 0.46825796365737915, + "learning_rate": 0.0001, + "loss": 1.5954, + "step": 7102 + }, + { + "epoch": 0.815920969502039, + "grad_norm": 0.47406765818595886, + "learning_rate": 0.0001, + "loss": 1.5404, + "step": 7103 + }, + { + "epoch": 0.8160358394118661, + "grad_norm": 0.49613794684410095, + "learning_rate": 0.0001, + "loss": 1.7463, + "step": 7104 + }, + { + "epoch": 0.8161507093216932, + "grad_norm": 0.4632255733013153, + "learning_rate": 0.0001, + "loss": 1.4529, + "step": 7105 + }, + { + "epoch": 0.8162655792315203, + "grad_norm": 0.45750927925109863, + "learning_rate": 0.0001, + "loss": 1.6081, + "step": 7106 + }, + { + "epoch": 0.8163804491413474, + "grad_norm": 0.46657127141952515, + "learning_rate": 0.0001, + "loss": 1.7844, + "step": 7107 + }, + { + "epoch": 0.8164953190511746, + "grad_norm": 0.47650468349456787, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 7108 + }, + { + "epoch": 0.8166101889610017, + "grad_norm": 0.4252217411994934, + "learning_rate": 0.0001, + "loss": 1.5233, + "step": 7109 + }, + { + "epoch": 0.8167250588708288, + "grad_norm": 0.45260146260261536, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 7110 + }, + { + "epoch": 0.8168399287806559, + "grad_norm": 0.43198126554489136, + "learning_rate": 0.0001, + "loss": 1.5063, + "step": 7111 + }, + { + "epoch": 0.816954798690483, + "grad_norm": 0.48374143242836, + "learning_rate": 0.0001, + "loss": 1.72, + "step": 7112 + }, + { + "epoch": 0.8170696686003102, + "grad_norm": 0.5113547444343567, + "learning_rate": 0.0001, + "loss": 1.7583, + "step": 7113 + }, + { + "epoch": 0.8171845385101373, + "grad_norm": 0.46327292919158936, + "learning_rate": 0.0001, + "loss": 1.6309, + "step": 7114 + }, + { + "epoch": 0.8172994084199644, + "grad_norm": 0.4762055575847626, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 7115 + }, + { + "epoch": 0.8174142783297915, + "grad_norm": 0.5080848932266235, + "learning_rate": 0.0001, + "loss": 1.7955, + "step": 7116 + }, + { + "epoch": 0.8175291482396186, + "grad_norm": 0.47820234298706055, + "learning_rate": 0.0001, + "loss": 1.736, + "step": 7117 + }, + { + "epoch": 0.8176440181494458, + "grad_norm": 0.4423477351665497, + "learning_rate": 0.0001, + "loss": 1.5467, + "step": 7118 + }, + { + "epoch": 0.8177588880592729, + "grad_norm": 0.4471362829208374, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 7119 + }, + { + "epoch": 0.8178737579691, + "grad_norm": 0.4536069631576538, + "learning_rate": 0.0001, + "loss": 1.6112, + "step": 7120 + }, + { + "epoch": 0.8179886278789271, + "grad_norm": 0.4437789022922516, + "learning_rate": 0.0001, + "loss": 1.4918, + "step": 7121 + }, + { + "epoch": 0.8181034977887542, + "grad_norm": 0.4352627992630005, + "learning_rate": 0.0001, + "loss": 1.5588, + "step": 7122 + }, + { + "epoch": 0.8182183676985814, + "grad_norm": 0.4951383173465729, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 7123 + }, + { + "epoch": 0.8183332376084085, + "grad_norm": 0.47660359740257263, + "learning_rate": 0.0001, + "loss": 1.649, + "step": 7124 + }, + { + "epoch": 0.8184481075182356, + "grad_norm": 0.4454366862773895, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 7125 + }, + { + "epoch": 0.8185629774280627, + "grad_norm": 0.47320306301116943, + "learning_rate": 0.0001, + "loss": 1.8041, + "step": 7126 + }, + { + "epoch": 0.8186778473378898, + "grad_norm": 0.4817422330379486, + "learning_rate": 0.0001, + "loss": 1.6996, + "step": 7127 + }, + { + "epoch": 0.818792717247717, + "grad_norm": 0.4612285792827606, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 7128 + }, + { + "epoch": 0.8189075871575441, + "grad_norm": 0.4710875153541565, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 7129 + }, + { + "epoch": 0.8190224570673712, + "grad_norm": 0.4853847324848175, + "learning_rate": 0.0001, + "loss": 1.7546, + "step": 7130 + }, + { + "epoch": 0.8191373269771983, + "grad_norm": 0.46727731823921204, + "learning_rate": 0.0001, + "loss": 1.7467, + "step": 7131 + }, + { + "epoch": 0.8192521968870254, + "grad_norm": 0.44645458459854126, + "learning_rate": 0.0001, + "loss": 1.5363, + "step": 7132 + }, + { + "epoch": 0.8193670667968526, + "grad_norm": 0.4743711054325104, + "learning_rate": 0.0001, + "loss": 1.7297, + "step": 7133 + }, + { + "epoch": 0.8194819367066797, + "grad_norm": 0.45293739438056946, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 7134 + }, + { + "epoch": 0.8195968066165068, + "grad_norm": 0.46416231989860535, + "learning_rate": 0.0001, + "loss": 1.6761, + "step": 7135 + }, + { + "epoch": 0.8197116765263339, + "grad_norm": 0.45859238505363464, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 7136 + }, + { + "epoch": 0.819826546436161, + "grad_norm": 0.45095375180244446, + "learning_rate": 0.0001, + "loss": 1.5282, + "step": 7137 + }, + { + "epoch": 0.8199414163459882, + "grad_norm": 0.5300037860870361, + "learning_rate": 0.0001, + "loss": 1.827, + "step": 7138 + }, + { + "epoch": 0.8200562862558153, + "grad_norm": 0.4848960340023041, + "learning_rate": 0.0001, + "loss": 1.4755, + "step": 7139 + }, + { + "epoch": 0.8201711561656424, + "grad_norm": 0.5703202486038208, + "learning_rate": 0.0001, + "loss": 1.2951, + "step": 7140 + }, + { + "epoch": 0.8202860260754695, + "grad_norm": 0.44500285387039185, + "learning_rate": 0.0001, + "loss": 1.5112, + "step": 7141 + }, + { + "epoch": 0.8204008959852966, + "grad_norm": 0.4689328968524933, + "learning_rate": 0.0001, + "loss": 1.4309, + "step": 7142 + }, + { + "epoch": 0.8205157658951238, + "grad_norm": 0.486074298620224, + "learning_rate": 0.0001, + "loss": 1.5857, + "step": 7143 + }, + { + "epoch": 0.8206306358049509, + "grad_norm": 0.4206257462501526, + "learning_rate": 0.0001, + "loss": 1.334, + "step": 7144 + }, + { + "epoch": 0.820745505714778, + "grad_norm": 0.4606837332248688, + "learning_rate": 0.0001, + "loss": 1.5509, + "step": 7145 + }, + { + "epoch": 0.8208603756246051, + "grad_norm": 0.5066717267036438, + "learning_rate": 0.0001, + "loss": 1.8185, + "step": 7146 + }, + { + "epoch": 0.8209752455344322, + "grad_norm": 0.46547627449035645, + "learning_rate": 0.0001, + "loss": 1.5673, + "step": 7147 + }, + { + "epoch": 0.8210901154442594, + "grad_norm": 0.4761962294578552, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 7148 + }, + { + "epoch": 0.8212049853540865, + "grad_norm": 0.459431916475296, + "learning_rate": 0.0001, + "loss": 1.66, + "step": 7149 + }, + { + "epoch": 0.8213198552639136, + "grad_norm": 0.4470955729484558, + "learning_rate": 0.0001, + "loss": 1.6065, + "step": 7150 + }, + { + "epoch": 0.8214347251737407, + "grad_norm": 0.48236003518104553, + "learning_rate": 0.0001, + "loss": 1.8159, + "step": 7151 + }, + { + "epoch": 0.8215495950835678, + "grad_norm": 0.46525686979293823, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 7152 + }, + { + "epoch": 0.821664464993395, + "grad_norm": 0.41665059328079224, + "learning_rate": 0.0001, + "loss": 1.4355, + "step": 7153 + }, + { + "epoch": 0.8217793349032221, + "grad_norm": 0.518457293510437, + "learning_rate": 0.0001, + "loss": 1.83, + "step": 7154 + }, + { + "epoch": 0.8218942048130492, + "grad_norm": 0.4431942105293274, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 7155 + }, + { + "epoch": 0.8220090747228763, + "grad_norm": 0.5172891616821289, + "learning_rate": 0.0001, + "loss": 1.6801, + "step": 7156 + }, + { + "epoch": 0.8221239446327034, + "grad_norm": 0.4258945882320404, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 7157 + }, + { + "epoch": 0.8222388145425306, + "grad_norm": 0.4604533016681671, + "learning_rate": 0.0001, + "loss": 1.5509, + "step": 7158 + }, + { + "epoch": 0.8223536844523577, + "grad_norm": 0.4496094584465027, + "learning_rate": 0.0001, + "loss": 1.4679, + "step": 7159 + }, + { + "epoch": 0.8224685543621848, + "grad_norm": 0.4688582420349121, + "learning_rate": 0.0001, + "loss": 1.668, + "step": 7160 + }, + { + "epoch": 0.8225834242720119, + "grad_norm": 0.47711849212646484, + "learning_rate": 0.0001, + "loss": 1.574, + "step": 7161 + }, + { + "epoch": 0.822698294181839, + "grad_norm": 0.45683956146240234, + "learning_rate": 0.0001, + "loss": 1.5046, + "step": 7162 + }, + { + "epoch": 0.8228131640916662, + "grad_norm": 0.437517374753952, + "learning_rate": 0.0001, + "loss": 1.4831, + "step": 7163 + }, + { + "epoch": 0.8229280340014933, + "grad_norm": 0.4502145051956177, + "learning_rate": 0.0001, + "loss": 1.5689, + "step": 7164 + }, + { + "epoch": 0.8230429039113204, + "grad_norm": 0.4960477650165558, + "learning_rate": 0.0001, + "loss": 1.6825, + "step": 7165 + }, + { + "epoch": 0.8231577738211475, + "grad_norm": 0.4727974534034729, + "learning_rate": 0.0001, + "loss": 1.5416, + "step": 7166 + }, + { + "epoch": 0.8232726437309746, + "grad_norm": 0.4566989541053772, + "learning_rate": 0.0001, + "loss": 1.5257, + "step": 7167 + }, + { + "epoch": 0.8233875136408018, + "grad_norm": 0.49063435196876526, + "learning_rate": 0.0001, + "loss": 1.576, + "step": 7168 + }, + { + "epoch": 0.8235023835506289, + "grad_norm": 0.4589118957519531, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 7169 + }, + { + "epoch": 0.823617253460456, + "grad_norm": 0.46506285667419434, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 7170 + }, + { + "epoch": 0.8237321233702831, + "grad_norm": 0.4581180810928345, + "learning_rate": 0.0001, + "loss": 1.6783, + "step": 7171 + }, + { + "epoch": 0.8238469932801102, + "grad_norm": 0.47631680965423584, + "learning_rate": 0.0001, + "loss": 1.6033, + "step": 7172 + }, + { + "epoch": 0.8239618631899374, + "grad_norm": 0.4644998013973236, + "learning_rate": 0.0001, + "loss": 1.6591, + "step": 7173 + }, + { + "epoch": 0.8240767330997645, + "grad_norm": 0.43831029534339905, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 7174 + }, + { + "epoch": 0.8241916030095916, + "grad_norm": 0.45947766304016113, + "learning_rate": 0.0001, + "loss": 1.6604, + "step": 7175 + }, + { + "epoch": 0.8243064729194187, + "grad_norm": 0.44559556245803833, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 7176 + }, + { + "epoch": 0.8244213428292458, + "grad_norm": 0.43358561396598816, + "learning_rate": 0.0001, + "loss": 1.5983, + "step": 7177 + }, + { + "epoch": 0.824536212739073, + "grad_norm": 0.4433882534503937, + "learning_rate": 0.0001, + "loss": 1.5811, + "step": 7178 + }, + { + "epoch": 0.8246510826489001, + "grad_norm": 0.48557913303375244, + "learning_rate": 0.0001, + "loss": 1.6611, + "step": 7179 + }, + { + "epoch": 0.8247659525587272, + "grad_norm": 0.4322436451911926, + "learning_rate": 0.0001, + "loss": 1.5646, + "step": 7180 + }, + { + "epoch": 0.8248808224685543, + "grad_norm": 0.4723256230354309, + "learning_rate": 0.0001, + "loss": 1.6417, + "step": 7181 + }, + { + "epoch": 0.8249956923783814, + "grad_norm": 0.4498218894004822, + "learning_rate": 0.0001, + "loss": 1.69, + "step": 7182 + }, + { + "epoch": 0.8251105622882086, + "grad_norm": 0.44787004590034485, + "learning_rate": 0.0001, + "loss": 1.8015, + "step": 7183 + }, + { + "epoch": 0.8252254321980357, + "grad_norm": 0.4494984745979309, + "learning_rate": 0.0001, + "loss": 1.4992, + "step": 7184 + }, + { + "epoch": 0.8253403021078628, + "grad_norm": 0.4542940557003021, + "learning_rate": 0.0001, + "loss": 1.5179, + "step": 7185 + }, + { + "epoch": 0.8254551720176899, + "grad_norm": 0.4573206901550293, + "learning_rate": 0.0001, + "loss": 1.731, + "step": 7186 + }, + { + "epoch": 0.825570041927517, + "grad_norm": 0.455084890127182, + "learning_rate": 0.0001, + "loss": 1.7468, + "step": 7187 + }, + { + "epoch": 0.8256849118373442, + "grad_norm": 0.4983527958393097, + "learning_rate": 0.0001, + "loss": 1.5253, + "step": 7188 + }, + { + "epoch": 0.8257997817471713, + "grad_norm": 0.457497239112854, + "learning_rate": 0.0001, + "loss": 1.6493, + "step": 7189 + }, + { + "epoch": 0.8259146516569984, + "grad_norm": 0.485844224691391, + "learning_rate": 0.0001, + "loss": 1.7502, + "step": 7190 + }, + { + "epoch": 0.8260295215668255, + "grad_norm": 0.47317448258399963, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 7191 + }, + { + "epoch": 0.8261443914766526, + "grad_norm": 0.4314495921134949, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 7192 + }, + { + "epoch": 0.8262592613864798, + "grad_norm": 0.45822349190711975, + "learning_rate": 0.0001, + "loss": 1.667, + "step": 7193 + }, + { + "epoch": 0.8263741312963069, + "grad_norm": 0.4706357419490814, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 7194 + }, + { + "epoch": 0.826489001206134, + "grad_norm": 0.45781227946281433, + "learning_rate": 0.0001, + "loss": 1.4011, + "step": 7195 + }, + { + "epoch": 0.8266038711159611, + "grad_norm": 0.43155431747436523, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 7196 + }, + { + "epoch": 0.8267187410257882, + "grad_norm": 0.4829701781272888, + "learning_rate": 0.0001, + "loss": 1.6697, + "step": 7197 + }, + { + "epoch": 0.8268336109356155, + "grad_norm": 0.4185773432254791, + "learning_rate": 0.0001, + "loss": 1.4531, + "step": 7198 + }, + { + "epoch": 0.8269484808454426, + "grad_norm": 0.4704040288925171, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 7199 + }, + { + "epoch": 0.8270633507552697, + "grad_norm": 0.41344597935676575, + "learning_rate": 0.0001, + "loss": 1.4317, + "step": 7200 + }, + { + "epoch": 0.8271782206650968, + "grad_norm": 0.4467063546180725, + "learning_rate": 0.0001, + "loss": 1.3732, + "step": 7201 + }, + { + "epoch": 0.827293090574924, + "grad_norm": 0.4785305857658386, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 7202 + }, + { + "epoch": 0.8274079604847511, + "grad_norm": 0.4496002793312073, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 7203 + }, + { + "epoch": 0.8275228303945782, + "grad_norm": 0.45911693572998047, + "learning_rate": 0.0001, + "loss": 1.3529, + "step": 7204 + }, + { + "epoch": 0.8276377003044053, + "grad_norm": 0.4438045918941498, + "learning_rate": 0.0001, + "loss": 1.2912, + "step": 7205 + }, + { + "epoch": 0.8277525702142324, + "grad_norm": 0.4709499776363373, + "learning_rate": 0.0001, + "loss": 1.5936, + "step": 7206 + }, + { + "epoch": 0.8278674401240596, + "grad_norm": 0.4676428437232971, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 7207 + }, + { + "epoch": 0.8279823100338867, + "grad_norm": 0.5031962394714355, + "learning_rate": 0.0001, + "loss": 1.5998, + "step": 7208 + }, + { + "epoch": 0.8280971799437138, + "grad_norm": 0.49766260385513306, + "learning_rate": 0.0001, + "loss": 1.4382, + "step": 7209 + }, + { + "epoch": 0.8282120498535409, + "grad_norm": 0.5159705877304077, + "learning_rate": 0.0001, + "loss": 1.8591, + "step": 7210 + }, + { + "epoch": 0.828326919763368, + "grad_norm": 0.47950395941734314, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 7211 + }, + { + "epoch": 0.8284417896731952, + "grad_norm": 0.4581819772720337, + "learning_rate": 0.0001, + "loss": 1.6142, + "step": 7212 + }, + { + "epoch": 0.8285566595830223, + "grad_norm": 0.4366266429424286, + "learning_rate": 0.0001, + "loss": 1.6161, + "step": 7213 + }, + { + "epoch": 0.8286715294928494, + "grad_norm": 0.48760971426963806, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 7214 + }, + { + "epoch": 0.8287863994026765, + "grad_norm": 0.4329768419265747, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 7215 + }, + { + "epoch": 0.8289012693125036, + "grad_norm": 0.44413167238235474, + "learning_rate": 0.0001, + "loss": 1.5943, + "step": 7216 + }, + { + "epoch": 0.8290161392223308, + "grad_norm": 0.4911179840564728, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 7217 + }, + { + "epoch": 0.8291310091321579, + "grad_norm": 0.4415479302406311, + "learning_rate": 0.0001, + "loss": 1.5761, + "step": 7218 + }, + { + "epoch": 0.829245879041985, + "grad_norm": 0.5084326863288879, + "learning_rate": 0.0001, + "loss": 1.7074, + "step": 7219 + }, + { + "epoch": 0.8293607489518121, + "grad_norm": 0.48137006163597107, + "learning_rate": 0.0001, + "loss": 1.7523, + "step": 7220 + }, + { + "epoch": 0.8294756188616392, + "grad_norm": 0.4507937431335449, + "learning_rate": 0.0001, + "loss": 1.6697, + "step": 7221 + }, + { + "epoch": 0.8295904887714664, + "grad_norm": 0.4717406630516052, + "learning_rate": 0.0001, + "loss": 1.754, + "step": 7222 + }, + { + "epoch": 0.8297053586812935, + "grad_norm": 0.4703536927700043, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 7223 + }, + { + "epoch": 0.8298202285911206, + "grad_norm": 0.41335755586624146, + "learning_rate": 0.0001, + "loss": 1.4745, + "step": 7224 + }, + { + "epoch": 0.8299350985009477, + "grad_norm": 0.47170180082321167, + "learning_rate": 0.0001, + "loss": 1.5615, + "step": 7225 + }, + { + "epoch": 0.8300499684107748, + "grad_norm": 0.44753673672676086, + "learning_rate": 0.0001, + "loss": 1.3859, + "step": 7226 + }, + { + "epoch": 0.830164838320602, + "grad_norm": 0.44213739037513733, + "learning_rate": 0.0001, + "loss": 1.3913, + "step": 7227 + }, + { + "epoch": 0.8302797082304291, + "grad_norm": 0.46415868401527405, + "learning_rate": 0.0001, + "loss": 1.7794, + "step": 7228 + }, + { + "epoch": 0.8303945781402562, + "grad_norm": 0.47127893567085266, + "learning_rate": 0.0001, + "loss": 1.6631, + "step": 7229 + }, + { + "epoch": 0.8305094480500833, + "grad_norm": 0.48462972044944763, + "learning_rate": 0.0001, + "loss": 1.643, + "step": 7230 + }, + { + "epoch": 0.8306243179599104, + "grad_norm": 0.43438246846199036, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 7231 + }, + { + "epoch": 0.8307391878697375, + "grad_norm": 0.44734856486320496, + "learning_rate": 0.0001, + "loss": 1.5771, + "step": 7232 + }, + { + "epoch": 0.8308540577795647, + "grad_norm": 0.4758615493774414, + "learning_rate": 0.0001, + "loss": 1.5675, + "step": 7233 + }, + { + "epoch": 0.8309689276893918, + "grad_norm": 0.5029293298721313, + "learning_rate": 0.0001, + "loss": 1.7695, + "step": 7234 + }, + { + "epoch": 0.8310837975992189, + "grad_norm": 0.4502490162849426, + "learning_rate": 0.0001, + "loss": 1.4654, + "step": 7235 + }, + { + "epoch": 0.831198667509046, + "grad_norm": 0.4694804251194, + "learning_rate": 0.0001, + "loss": 1.5624, + "step": 7236 + }, + { + "epoch": 0.8313135374188731, + "grad_norm": 0.4628579616546631, + "learning_rate": 0.0001, + "loss": 1.4772, + "step": 7237 + }, + { + "epoch": 0.8314284073287003, + "grad_norm": 0.40973448753356934, + "learning_rate": 0.0001, + "loss": 1.3017, + "step": 7238 + }, + { + "epoch": 0.8315432772385274, + "grad_norm": 0.48314985632896423, + "learning_rate": 0.0001, + "loss": 1.6824, + "step": 7239 + }, + { + "epoch": 0.8316581471483545, + "grad_norm": 0.43235471844673157, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 7240 + }, + { + "epoch": 0.8317730170581816, + "grad_norm": 0.45430463552474976, + "learning_rate": 0.0001, + "loss": 1.5663, + "step": 7241 + }, + { + "epoch": 0.8318878869680087, + "grad_norm": 0.4984131455421448, + "learning_rate": 0.0001, + "loss": 1.5334, + "step": 7242 + }, + { + "epoch": 0.8320027568778359, + "grad_norm": 0.46940264105796814, + "learning_rate": 0.0001, + "loss": 1.6658, + "step": 7243 + }, + { + "epoch": 0.832117626787663, + "grad_norm": 0.47424784302711487, + "learning_rate": 0.0001, + "loss": 1.4645, + "step": 7244 + }, + { + "epoch": 0.8322324966974901, + "grad_norm": 0.5137829780578613, + "learning_rate": 0.0001, + "loss": 1.6511, + "step": 7245 + }, + { + "epoch": 0.8323473666073172, + "grad_norm": 0.5260666012763977, + "learning_rate": 0.0001, + "loss": 1.7015, + "step": 7246 + }, + { + "epoch": 0.8324622365171443, + "grad_norm": 0.47583386301994324, + "learning_rate": 0.0001, + "loss": 1.504, + "step": 7247 + }, + { + "epoch": 0.8325771064269715, + "grad_norm": 0.4737547039985657, + "learning_rate": 0.0001, + "loss": 1.7114, + "step": 7248 + }, + { + "epoch": 0.8326919763367986, + "grad_norm": 0.49073606729507446, + "learning_rate": 0.0001, + "loss": 1.6678, + "step": 7249 + }, + { + "epoch": 0.8328068462466257, + "grad_norm": 0.4550478756427765, + "learning_rate": 0.0001, + "loss": 1.366, + "step": 7250 + }, + { + "epoch": 0.8329217161564528, + "grad_norm": 0.45904481410980225, + "learning_rate": 0.0001, + "loss": 1.4628, + "step": 7251 + }, + { + "epoch": 0.83303658606628, + "grad_norm": 0.4898592233657837, + "learning_rate": 0.0001, + "loss": 1.7453, + "step": 7252 + }, + { + "epoch": 0.8331514559761071, + "grad_norm": 0.46044033765792847, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 7253 + }, + { + "epoch": 0.8332663258859342, + "grad_norm": 0.4777674376964569, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 7254 + }, + { + "epoch": 0.8333811957957613, + "grad_norm": 0.44753503799438477, + "learning_rate": 0.0001, + "loss": 1.729, + "step": 7255 + }, + { + "epoch": 0.8334960657055884, + "grad_norm": 0.45856595039367676, + "learning_rate": 0.0001, + "loss": 1.5738, + "step": 7256 + }, + { + "epoch": 0.8336109356154155, + "grad_norm": 0.4056832790374756, + "learning_rate": 0.0001, + "loss": 1.2812, + "step": 7257 + }, + { + "epoch": 0.8337258055252427, + "grad_norm": 0.48094066977500916, + "learning_rate": 0.0001, + "loss": 1.5232, + "step": 7258 + }, + { + "epoch": 0.8338406754350698, + "grad_norm": 0.4848513603210449, + "learning_rate": 0.0001, + "loss": 1.6777, + "step": 7259 + }, + { + "epoch": 0.8339555453448969, + "grad_norm": 0.4800076484680176, + "learning_rate": 0.0001, + "loss": 1.4396, + "step": 7260 + }, + { + "epoch": 0.834070415254724, + "grad_norm": 0.47761738300323486, + "learning_rate": 0.0001, + "loss": 1.5938, + "step": 7261 + }, + { + "epoch": 0.8341852851645511, + "grad_norm": 0.4880167245864868, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 7262 + }, + { + "epoch": 0.8343001550743783, + "grad_norm": 0.464358925819397, + "learning_rate": 0.0001, + "loss": 1.7559, + "step": 7263 + }, + { + "epoch": 0.8344150249842054, + "grad_norm": 0.4656590521335602, + "learning_rate": 0.0001, + "loss": 1.576, + "step": 7264 + }, + { + "epoch": 0.8345298948940325, + "grad_norm": 0.4818393290042877, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 7265 + }, + { + "epoch": 0.8346447648038596, + "grad_norm": 0.4994739294052124, + "learning_rate": 0.0001, + "loss": 1.4481, + "step": 7266 + }, + { + "epoch": 0.8347596347136867, + "grad_norm": 0.49908795952796936, + "learning_rate": 0.0001, + "loss": 1.6915, + "step": 7267 + }, + { + "epoch": 0.8348745046235139, + "grad_norm": 0.5030030608177185, + "learning_rate": 0.0001, + "loss": 1.7213, + "step": 7268 + }, + { + "epoch": 0.834989374533341, + "grad_norm": 0.4775954484939575, + "learning_rate": 0.0001, + "loss": 1.7861, + "step": 7269 + }, + { + "epoch": 0.8351042444431681, + "grad_norm": 0.41004353761672974, + "learning_rate": 0.0001, + "loss": 1.4602, + "step": 7270 + }, + { + "epoch": 0.8352191143529952, + "grad_norm": 0.49906715750694275, + "learning_rate": 0.0001, + "loss": 1.5855, + "step": 7271 + }, + { + "epoch": 0.8353339842628223, + "grad_norm": 0.4458923637866974, + "learning_rate": 0.0001, + "loss": 1.7002, + "step": 7272 + }, + { + "epoch": 0.8354488541726495, + "grad_norm": 0.4532411992549896, + "learning_rate": 0.0001, + "loss": 1.6273, + "step": 7273 + }, + { + "epoch": 0.8355637240824766, + "grad_norm": 0.4611133933067322, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 7274 + }, + { + "epoch": 0.8356785939923037, + "grad_norm": 0.4394255578517914, + "learning_rate": 0.0001, + "loss": 1.4816, + "step": 7275 + }, + { + "epoch": 0.8357934639021308, + "grad_norm": 0.4390329420566559, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 7276 + }, + { + "epoch": 0.835908333811958, + "grad_norm": 0.4387739300727844, + "learning_rate": 0.0001, + "loss": 1.5974, + "step": 7277 + }, + { + "epoch": 0.8360232037217851, + "grad_norm": 0.44605275988578796, + "learning_rate": 0.0001, + "loss": 1.6068, + "step": 7278 + }, + { + "epoch": 0.8361380736316122, + "grad_norm": 0.4986502528190613, + "learning_rate": 0.0001, + "loss": 1.4895, + "step": 7279 + }, + { + "epoch": 0.8362529435414393, + "grad_norm": 0.4799439013004303, + "learning_rate": 0.0001, + "loss": 1.7177, + "step": 7280 + }, + { + "epoch": 0.8363678134512664, + "grad_norm": 0.4725569784641266, + "learning_rate": 0.0001, + "loss": 1.6727, + "step": 7281 + }, + { + "epoch": 0.8364826833610935, + "grad_norm": 0.46974942088127136, + "learning_rate": 0.0001, + "loss": 1.5882, + "step": 7282 + }, + { + "epoch": 0.8365975532709207, + "grad_norm": 0.4600738286972046, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 7283 + }, + { + "epoch": 0.8367124231807478, + "grad_norm": 0.516070544719696, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 7284 + }, + { + "epoch": 0.8368272930905749, + "grad_norm": 0.44913583993911743, + "learning_rate": 0.0001, + "loss": 1.4985, + "step": 7285 + }, + { + "epoch": 0.836942163000402, + "grad_norm": 0.4450875222682953, + "learning_rate": 0.0001, + "loss": 1.4021, + "step": 7286 + }, + { + "epoch": 0.8370570329102291, + "grad_norm": 0.45671719312667847, + "learning_rate": 0.0001, + "loss": 1.4193, + "step": 7287 + }, + { + "epoch": 0.8371719028200563, + "grad_norm": 0.4811131954193115, + "learning_rate": 0.0001, + "loss": 1.588, + "step": 7288 + }, + { + "epoch": 0.8372867727298834, + "grad_norm": 0.473019540309906, + "learning_rate": 0.0001, + "loss": 1.6143, + "step": 7289 + }, + { + "epoch": 0.8374016426397105, + "grad_norm": 0.4964619576931, + "learning_rate": 0.0001, + "loss": 1.4574, + "step": 7290 + }, + { + "epoch": 0.8375165125495376, + "grad_norm": 0.43981707096099854, + "learning_rate": 0.0001, + "loss": 1.5759, + "step": 7291 + }, + { + "epoch": 0.8376313824593647, + "grad_norm": 0.47483888268470764, + "learning_rate": 0.0001, + "loss": 1.7885, + "step": 7292 + }, + { + "epoch": 0.8377462523691919, + "grad_norm": 0.46889838576316833, + "learning_rate": 0.0001, + "loss": 1.4536, + "step": 7293 + }, + { + "epoch": 0.837861122279019, + "grad_norm": 0.46966785192489624, + "learning_rate": 0.0001, + "loss": 1.4182, + "step": 7294 + }, + { + "epoch": 0.8379759921888461, + "grad_norm": 0.4754125475883484, + "learning_rate": 0.0001, + "loss": 1.8329, + "step": 7295 + }, + { + "epoch": 0.8380908620986732, + "grad_norm": 0.44175228476524353, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 7296 + }, + { + "epoch": 0.8382057320085003, + "grad_norm": 0.45148342847824097, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 7297 + }, + { + "epoch": 0.8383206019183275, + "grad_norm": 0.46137553453445435, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 7298 + }, + { + "epoch": 0.8384354718281546, + "grad_norm": 0.49679750204086304, + "learning_rate": 0.0001, + "loss": 1.646, + "step": 7299 + }, + { + "epoch": 0.8385503417379817, + "grad_norm": 0.44846269488334656, + "learning_rate": 0.0001, + "loss": 1.5111, + "step": 7300 + }, + { + "epoch": 0.8386652116478088, + "grad_norm": 0.43875476717948914, + "learning_rate": 0.0001, + "loss": 1.5015, + "step": 7301 + }, + { + "epoch": 0.838780081557636, + "grad_norm": 0.44996675848960876, + "learning_rate": 0.0001, + "loss": 1.5054, + "step": 7302 + }, + { + "epoch": 0.8388949514674631, + "grad_norm": 0.47677183151245117, + "learning_rate": 0.0001, + "loss": 1.7262, + "step": 7303 + }, + { + "epoch": 0.8390098213772902, + "grad_norm": 0.4702812433242798, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 7304 + }, + { + "epoch": 0.8391246912871173, + "grad_norm": 0.4662913978099823, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 7305 + }, + { + "epoch": 0.8392395611969444, + "grad_norm": 0.5055838227272034, + "learning_rate": 0.0001, + "loss": 1.747, + "step": 7306 + }, + { + "epoch": 0.8393544311067715, + "grad_norm": 0.47500503063201904, + "learning_rate": 0.0001, + "loss": 1.6744, + "step": 7307 + }, + { + "epoch": 0.8394693010165987, + "grad_norm": 0.48270219564437866, + "learning_rate": 0.0001, + "loss": 1.7157, + "step": 7308 + }, + { + "epoch": 0.8395841709264258, + "grad_norm": 0.48767292499542236, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 7309 + }, + { + "epoch": 0.8396990408362529, + "grad_norm": 0.46618691086769104, + "learning_rate": 0.0001, + "loss": 1.4696, + "step": 7310 + }, + { + "epoch": 0.83981391074608, + "grad_norm": 0.4929366111755371, + "learning_rate": 0.0001, + "loss": 1.683, + "step": 7311 + }, + { + "epoch": 0.8399287806559071, + "grad_norm": 0.4555191397666931, + "learning_rate": 0.0001, + "loss": 1.549, + "step": 7312 + }, + { + "epoch": 0.8400436505657343, + "grad_norm": 0.4752098321914673, + "learning_rate": 0.0001, + "loss": 1.8156, + "step": 7313 + }, + { + "epoch": 0.8401585204755614, + "grad_norm": 0.4447777271270752, + "learning_rate": 0.0001, + "loss": 1.5551, + "step": 7314 + }, + { + "epoch": 0.8402733903853885, + "grad_norm": 0.46375584602355957, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 7315 + }, + { + "epoch": 0.8403882602952156, + "grad_norm": 0.5160993933677673, + "learning_rate": 0.0001, + "loss": 1.7724, + "step": 7316 + }, + { + "epoch": 0.8405031302050427, + "grad_norm": 0.49875614047050476, + "learning_rate": 0.0001, + "loss": 1.7911, + "step": 7317 + }, + { + "epoch": 0.8406180001148699, + "grad_norm": 0.47104793787002563, + "learning_rate": 0.0001, + "loss": 1.5512, + "step": 7318 + }, + { + "epoch": 0.840732870024697, + "grad_norm": 0.47585365176200867, + "learning_rate": 0.0001, + "loss": 1.4807, + "step": 7319 + }, + { + "epoch": 0.8408477399345241, + "grad_norm": 0.48287901282310486, + "learning_rate": 0.0001, + "loss": 1.4909, + "step": 7320 + }, + { + "epoch": 0.8409626098443512, + "grad_norm": 0.4639113247394562, + "learning_rate": 0.0001, + "loss": 1.6005, + "step": 7321 + }, + { + "epoch": 0.8410774797541783, + "grad_norm": 0.4889727830886841, + "learning_rate": 0.0001, + "loss": 1.8168, + "step": 7322 + }, + { + "epoch": 0.8411923496640055, + "grad_norm": 0.44564002752304077, + "learning_rate": 0.0001, + "loss": 1.4868, + "step": 7323 + }, + { + "epoch": 0.8413072195738326, + "grad_norm": 0.4678811728954315, + "learning_rate": 0.0001, + "loss": 1.5586, + "step": 7324 + }, + { + "epoch": 0.8414220894836597, + "grad_norm": 0.483021080493927, + "learning_rate": 0.0001, + "loss": 1.7088, + "step": 7325 + }, + { + "epoch": 0.8415369593934868, + "grad_norm": 0.47490793466567993, + "learning_rate": 0.0001, + "loss": 1.6228, + "step": 7326 + }, + { + "epoch": 0.841651829303314, + "grad_norm": 0.4489779770374298, + "learning_rate": 0.0001, + "loss": 1.5528, + "step": 7327 + }, + { + "epoch": 0.8417666992131411, + "grad_norm": 0.4593144953250885, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 7328 + }, + { + "epoch": 0.8418815691229682, + "grad_norm": 0.4318476617336273, + "learning_rate": 0.0001, + "loss": 1.458, + "step": 7329 + }, + { + "epoch": 0.8419964390327953, + "grad_norm": 0.44436419010162354, + "learning_rate": 0.0001, + "loss": 1.4058, + "step": 7330 + }, + { + "epoch": 0.8421113089426224, + "grad_norm": 0.4526481032371521, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 7331 + }, + { + "epoch": 0.8422261788524495, + "grad_norm": 0.4639897644519806, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 7332 + }, + { + "epoch": 0.8423410487622767, + "grad_norm": 0.4710429012775421, + "learning_rate": 0.0001, + "loss": 1.7472, + "step": 7333 + }, + { + "epoch": 0.8424559186721038, + "grad_norm": 0.4736747443675995, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 7334 + }, + { + "epoch": 0.842570788581931, + "grad_norm": 0.46228286623954773, + "learning_rate": 0.0001, + "loss": 1.4623, + "step": 7335 + }, + { + "epoch": 0.8426856584917581, + "grad_norm": 0.5162138938903809, + "learning_rate": 0.0001, + "loss": 1.8545, + "step": 7336 + }, + { + "epoch": 0.8428005284015853, + "grad_norm": 0.4684411585330963, + "learning_rate": 0.0001, + "loss": 1.6929, + "step": 7337 + }, + { + "epoch": 0.8429153983114124, + "grad_norm": 0.45652058720588684, + "learning_rate": 0.0001, + "loss": 1.4637, + "step": 7338 + }, + { + "epoch": 0.8430302682212395, + "grad_norm": 0.49784135818481445, + "learning_rate": 0.0001, + "loss": 1.491, + "step": 7339 + }, + { + "epoch": 0.8431451381310666, + "grad_norm": 0.5035806894302368, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 7340 + }, + { + "epoch": 0.8432600080408937, + "grad_norm": 0.4898698925971985, + "learning_rate": 0.0001, + "loss": 1.7416, + "step": 7341 + }, + { + "epoch": 0.8433748779507209, + "grad_norm": 0.4696720242500305, + "learning_rate": 0.0001, + "loss": 1.5175, + "step": 7342 + }, + { + "epoch": 0.843489747860548, + "grad_norm": 0.4923301339149475, + "learning_rate": 0.0001, + "loss": 1.7249, + "step": 7343 + }, + { + "epoch": 0.8436046177703751, + "grad_norm": 0.49450981616973877, + "learning_rate": 0.0001, + "loss": 1.7339, + "step": 7344 + }, + { + "epoch": 0.8437194876802022, + "grad_norm": 0.43973812460899353, + "learning_rate": 0.0001, + "loss": 1.4611, + "step": 7345 + }, + { + "epoch": 0.8438343575900293, + "grad_norm": 0.48189517855644226, + "learning_rate": 0.0001, + "loss": 1.6952, + "step": 7346 + }, + { + "epoch": 0.8439492274998565, + "grad_norm": 0.4575080871582031, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 7347 + }, + { + "epoch": 0.8440640974096836, + "grad_norm": 0.4627358019351959, + "learning_rate": 0.0001, + "loss": 1.4716, + "step": 7348 + }, + { + "epoch": 0.8441789673195107, + "grad_norm": 0.4325619041919708, + "learning_rate": 0.0001, + "loss": 1.5157, + "step": 7349 + }, + { + "epoch": 0.8442938372293378, + "grad_norm": 0.4869388937950134, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 7350 + }, + { + "epoch": 0.8444087071391649, + "grad_norm": 0.4528372585773468, + "learning_rate": 0.0001, + "loss": 1.6697, + "step": 7351 + }, + { + "epoch": 0.8445235770489921, + "grad_norm": 0.45322051644325256, + "learning_rate": 0.0001, + "loss": 1.7075, + "step": 7352 + }, + { + "epoch": 0.8446384469588192, + "grad_norm": 0.43234097957611084, + "learning_rate": 0.0001, + "loss": 1.4102, + "step": 7353 + }, + { + "epoch": 0.8447533168686463, + "grad_norm": 0.4526819586753845, + "learning_rate": 0.0001, + "loss": 1.5617, + "step": 7354 + }, + { + "epoch": 0.8448681867784734, + "grad_norm": 0.46750757098197937, + "learning_rate": 0.0001, + "loss": 1.8057, + "step": 7355 + }, + { + "epoch": 0.8449830566883005, + "grad_norm": 0.47405919432640076, + "learning_rate": 0.0001, + "loss": 1.6563, + "step": 7356 + }, + { + "epoch": 0.8450979265981277, + "grad_norm": 0.4665440320968628, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 7357 + }, + { + "epoch": 0.8452127965079548, + "grad_norm": 0.4736506938934326, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 7358 + }, + { + "epoch": 0.8453276664177819, + "grad_norm": 0.4538653492927551, + "learning_rate": 0.0001, + "loss": 1.7792, + "step": 7359 + }, + { + "epoch": 0.845442536327609, + "grad_norm": 0.4535897672176361, + "learning_rate": 0.0001, + "loss": 1.5042, + "step": 7360 + }, + { + "epoch": 0.8455574062374361, + "grad_norm": 0.5242683291435242, + "learning_rate": 0.0001, + "loss": 1.6937, + "step": 7361 + }, + { + "epoch": 0.8456722761472633, + "grad_norm": 0.4639322757720947, + "learning_rate": 0.0001, + "loss": 1.6695, + "step": 7362 + }, + { + "epoch": 0.8457871460570904, + "grad_norm": 0.44305261969566345, + "learning_rate": 0.0001, + "loss": 1.3519, + "step": 7363 + }, + { + "epoch": 0.8459020159669175, + "grad_norm": 0.4527410864830017, + "learning_rate": 0.0001, + "loss": 1.584, + "step": 7364 + }, + { + "epoch": 0.8460168858767446, + "grad_norm": 0.48342829942703247, + "learning_rate": 0.0001, + "loss": 1.5043, + "step": 7365 + }, + { + "epoch": 0.8461317557865717, + "grad_norm": 0.46347376704216003, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 7366 + }, + { + "epoch": 0.8462466256963989, + "grad_norm": 0.45847606658935547, + "learning_rate": 0.0001, + "loss": 1.5473, + "step": 7367 + }, + { + "epoch": 0.846361495606226, + "grad_norm": 0.4818578362464905, + "learning_rate": 0.0001, + "loss": 1.5684, + "step": 7368 + }, + { + "epoch": 0.8464763655160531, + "grad_norm": 0.48799648880958557, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 7369 + }, + { + "epoch": 0.8465912354258802, + "grad_norm": 0.4803750813007355, + "learning_rate": 0.0001, + "loss": 1.4806, + "step": 7370 + }, + { + "epoch": 0.8467061053357073, + "grad_norm": 0.48348790407180786, + "learning_rate": 0.0001, + "loss": 1.6172, + "step": 7371 + }, + { + "epoch": 0.8468209752455345, + "grad_norm": 0.4604859948158264, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 7372 + }, + { + "epoch": 0.8469358451553616, + "grad_norm": 0.459789901971817, + "learning_rate": 0.0001, + "loss": 1.4266, + "step": 7373 + }, + { + "epoch": 0.8470507150651887, + "grad_norm": 0.5160319805145264, + "learning_rate": 0.0001, + "loss": 1.7858, + "step": 7374 + }, + { + "epoch": 0.8471655849750158, + "grad_norm": 0.4354211688041687, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 7375 + }, + { + "epoch": 0.8472804548848429, + "grad_norm": 0.4287918508052826, + "learning_rate": 0.0001, + "loss": 1.5123, + "step": 7376 + }, + { + "epoch": 0.8473953247946701, + "grad_norm": 0.4390510320663452, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 7377 + }, + { + "epoch": 0.8475101947044972, + "grad_norm": 0.4523385465145111, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 7378 + }, + { + "epoch": 0.8476250646143243, + "grad_norm": 0.4572320878505707, + "learning_rate": 0.0001, + "loss": 1.1648, + "step": 7379 + }, + { + "epoch": 0.8477399345241514, + "grad_norm": 0.5559184551239014, + "learning_rate": 0.0001, + "loss": 1.3973, + "step": 7380 + }, + { + "epoch": 0.8478548044339785, + "grad_norm": 0.49197232723236084, + "learning_rate": 0.0001, + "loss": 1.7199, + "step": 7381 + }, + { + "epoch": 0.8479696743438057, + "grad_norm": 0.49532270431518555, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 7382 + }, + { + "epoch": 0.8480845442536328, + "grad_norm": 0.49762824177742004, + "learning_rate": 0.0001, + "loss": 1.6974, + "step": 7383 + }, + { + "epoch": 0.8481994141634599, + "grad_norm": 0.473664790391922, + "learning_rate": 0.0001, + "loss": 1.6332, + "step": 7384 + }, + { + "epoch": 0.848314284073287, + "grad_norm": 0.48429468274116516, + "learning_rate": 0.0001, + "loss": 1.4807, + "step": 7385 + }, + { + "epoch": 0.8484291539831141, + "grad_norm": 0.4701521396636963, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 7386 + }, + { + "epoch": 0.8485440238929413, + "grad_norm": 0.43333899974823, + "learning_rate": 0.0001, + "loss": 1.5189, + "step": 7387 + }, + { + "epoch": 0.8486588938027684, + "grad_norm": 0.47169727087020874, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 7388 + }, + { + "epoch": 0.8487737637125955, + "grad_norm": 0.4842875003814697, + "learning_rate": 0.0001, + "loss": 1.7324, + "step": 7389 + }, + { + "epoch": 0.8488886336224226, + "grad_norm": 0.49503180384635925, + "learning_rate": 0.0001, + "loss": 1.707, + "step": 7390 + }, + { + "epoch": 0.8490035035322497, + "grad_norm": 0.44051480293273926, + "learning_rate": 0.0001, + "loss": 1.5385, + "step": 7391 + }, + { + "epoch": 0.8491183734420769, + "grad_norm": 0.4651011824607849, + "learning_rate": 0.0001, + "loss": 1.4514, + "step": 7392 + }, + { + "epoch": 0.849233243351904, + "grad_norm": 0.4789915978908539, + "learning_rate": 0.0001, + "loss": 1.6536, + "step": 7393 + }, + { + "epoch": 0.8493481132617311, + "grad_norm": 0.44147029519081116, + "learning_rate": 0.0001, + "loss": 1.6246, + "step": 7394 + }, + { + "epoch": 0.8494629831715582, + "grad_norm": 0.42696404457092285, + "learning_rate": 0.0001, + "loss": 1.3676, + "step": 7395 + }, + { + "epoch": 0.8495778530813853, + "grad_norm": 0.4485602080821991, + "learning_rate": 0.0001, + "loss": 1.602, + "step": 7396 + }, + { + "epoch": 0.8496927229912125, + "grad_norm": 0.46463295817375183, + "learning_rate": 0.0001, + "loss": 1.6137, + "step": 7397 + }, + { + "epoch": 0.8498075929010396, + "grad_norm": 0.43345364928245544, + "learning_rate": 0.0001, + "loss": 1.5218, + "step": 7398 + }, + { + "epoch": 0.8499224628108667, + "grad_norm": 0.4817748963832855, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 7399 + }, + { + "epoch": 0.8500373327206938, + "grad_norm": 0.48344627022743225, + "learning_rate": 0.0001, + "loss": 1.434, + "step": 7400 + }, + { + "epoch": 0.8501522026305209, + "grad_norm": 0.4697633683681488, + "learning_rate": 0.0001, + "loss": 1.5238, + "step": 7401 + }, + { + "epoch": 0.8502670725403481, + "grad_norm": 0.4468742609024048, + "learning_rate": 0.0001, + "loss": 1.4694, + "step": 7402 + }, + { + "epoch": 0.8503819424501752, + "grad_norm": 0.4675341546535492, + "learning_rate": 0.0001, + "loss": 1.5938, + "step": 7403 + }, + { + "epoch": 0.8504968123600023, + "grad_norm": 0.43257632851600647, + "learning_rate": 0.0001, + "loss": 1.3002, + "step": 7404 + }, + { + "epoch": 0.8506116822698294, + "grad_norm": 0.4668475389480591, + "learning_rate": 0.0001, + "loss": 1.6644, + "step": 7405 + }, + { + "epoch": 0.8507265521796565, + "grad_norm": 0.4785885512828827, + "learning_rate": 0.0001, + "loss": 1.5659, + "step": 7406 + }, + { + "epoch": 0.8508414220894837, + "grad_norm": 0.43829622864723206, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 7407 + }, + { + "epoch": 0.8509562919993108, + "grad_norm": 0.5236424207687378, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 7408 + }, + { + "epoch": 0.8510711619091379, + "grad_norm": 0.45531997084617615, + "learning_rate": 0.0001, + "loss": 1.6721, + "step": 7409 + }, + { + "epoch": 0.851186031818965, + "grad_norm": 0.5072128176689148, + "learning_rate": 0.0001, + "loss": 1.5845, + "step": 7410 + }, + { + "epoch": 0.8513009017287921, + "grad_norm": 0.4630776643753052, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 7411 + }, + { + "epoch": 0.8514157716386193, + "grad_norm": 0.4900180697441101, + "learning_rate": 0.0001, + "loss": 1.7094, + "step": 7412 + }, + { + "epoch": 0.8515306415484464, + "grad_norm": 0.46314537525177, + "learning_rate": 0.0001, + "loss": 1.4107, + "step": 7413 + }, + { + "epoch": 0.8516455114582735, + "grad_norm": 0.5156530141830444, + "learning_rate": 0.0001, + "loss": 1.5988, + "step": 7414 + }, + { + "epoch": 0.8517603813681006, + "grad_norm": 0.5086135268211365, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 7415 + }, + { + "epoch": 0.8518752512779277, + "grad_norm": 0.48410969972610474, + "learning_rate": 0.0001, + "loss": 1.7056, + "step": 7416 + }, + { + "epoch": 0.8519901211877549, + "grad_norm": 0.5622872114181519, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 7417 + }, + { + "epoch": 0.852104991097582, + "grad_norm": 0.4555276036262512, + "learning_rate": 0.0001, + "loss": 1.5397, + "step": 7418 + }, + { + "epoch": 0.8522198610074091, + "grad_norm": 0.46399348974227905, + "learning_rate": 0.0001, + "loss": 1.6051, + "step": 7419 + }, + { + "epoch": 0.8523347309172362, + "grad_norm": 0.43579867482185364, + "learning_rate": 0.0001, + "loss": 1.3899, + "step": 7420 + }, + { + "epoch": 0.8524496008270633, + "grad_norm": 0.47155770659446716, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 7421 + }, + { + "epoch": 0.8525644707368905, + "grad_norm": 0.46017301082611084, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 7422 + }, + { + "epoch": 0.8526793406467176, + "grad_norm": 0.4408453404903412, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 7423 + }, + { + "epoch": 0.8527942105565447, + "grad_norm": 0.475716769695282, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 7424 + }, + { + "epoch": 0.8529090804663718, + "grad_norm": 0.4536953270435333, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 7425 + }, + { + "epoch": 0.8530239503761989, + "grad_norm": 0.476300448179245, + "learning_rate": 0.0001, + "loss": 1.6174, + "step": 7426 + }, + { + "epoch": 0.8531388202860261, + "grad_norm": 0.42572006583213806, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 7427 + }, + { + "epoch": 0.8532536901958532, + "grad_norm": 0.5320248007774353, + "learning_rate": 0.0001, + "loss": 1.6911, + "step": 7428 + }, + { + "epoch": 0.8533685601056803, + "grad_norm": 0.46820348501205444, + "learning_rate": 0.0001, + "loss": 1.5416, + "step": 7429 + }, + { + "epoch": 0.8534834300155074, + "grad_norm": 0.4351903796195984, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 7430 + }, + { + "epoch": 0.8535982999253345, + "grad_norm": 0.47440531849861145, + "learning_rate": 0.0001, + "loss": 1.4344, + "step": 7431 + }, + { + "epoch": 0.8537131698351617, + "grad_norm": 0.43947550654411316, + "learning_rate": 0.0001, + "loss": 1.5196, + "step": 7432 + }, + { + "epoch": 0.8538280397449888, + "grad_norm": 0.4965689182281494, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 7433 + }, + { + "epoch": 0.8539429096548159, + "grad_norm": 0.4858873188495636, + "learning_rate": 0.0001, + "loss": 1.8044, + "step": 7434 + }, + { + "epoch": 0.854057779564643, + "grad_norm": 0.4513735771179199, + "learning_rate": 0.0001, + "loss": 1.4199, + "step": 7435 + }, + { + "epoch": 0.8541726494744701, + "grad_norm": 0.4994846284389496, + "learning_rate": 0.0001, + "loss": 1.943, + "step": 7436 + }, + { + "epoch": 0.8542875193842973, + "grad_norm": 0.45225298404693604, + "learning_rate": 0.0001, + "loss": 1.6063, + "step": 7437 + }, + { + "epoch": 0.8544023892941244, + "grad_norm": 0.445266991853714, + "learning_rate": 0.0001, + "loss": 1.4866, + "step": 7438 + }, + { + "epoch": 0.8545172592039515, + "grad_norm": 0.4278365969657898, + "learning_rate": 0.0001, + "loss": 1.617, + "step": 7439 + }, + { + "epoch": 0.8546321291137786, + "grad_norm": 0.4820537865161896, + "learning_rate": 0.0001, + "loss": 1.7703, + "step": 7440 + }, + { + "epoch": 0.8547469990236057, + "grad_norm": 0.4873339533805847, + "learning_rate": 0.0001, + "loss": 1.7428, + "step": 7441 + }, + { + "epoch": 0.8548618689334329, + "grad_norm": 0.44816693663597107, + "learning_rate": 0.0001, + "loss": 1.6484, + "step": 7442 + }, + { + "epoch": 0.85497673884326, + "grad_norm": 0.47395050525665283, + "learning_rate": 0.0001, + "loss": 1.5569, + "step": 7443 + }, + { + "epoch": 0.8550916087530871, + "grad_norm": 0.42696264386177063, + "learning_rate": 0.0001, + "loss": 1.5543, + "step": 7444 + }, + { + "epoch": 0.8552064786629142, + "grad_norm": 0.5623756647109985, + "learning_rate": 0.0001, + "loss": 2.051, + "step": 7445 + }, + { + "epoch": 0.8553213485727413, + "grad_norm": 0.5125777721405029, + "learning_rate": 0.0001, + "loss": 1.7589, + "step": 7446 + }, + { + "epoch": 0.8554362184825685, + "grad_norm": 0.4917384684085846, + "learning_rate": 0.0001, + "loss": 1.8042, + "step": 7447 + }, + { + "epoch": 0.8555510883923956, + "grad_norm": 0.4971913695335388, + "learning_rate": 0.0001, + "loss": 1.6491, + "step": 7448 + }, + { + "epoch": 0.8556659583022227, + "grad_norm": 0.4413685202598572, + "learning_rate": 0.0001, + "loss": 1.4241, + "step": 7449 + }, + { + "epoch": 0.8557808282120498, + "grad_norm": 0.5352091789245605, + "learning_rate": 0.0001, + "loss": 1.9268, + "step": 7450 + }, + { + "epoch": 0.8558956981218769, + "grad_norm": 0.5074852705001831, + "learning_rate": 0.0001, + "loss": 1.7119, + "step": 7451 + }, + { + "epoch": 0.8560105680317041, + "grad_norm": 0.45686012506484985, + "learning_rate": 0.0001, + "loss": 1.5553, + "step": 7452 + }, + { + "epoch": 0.8561254379415312, + "grad_norm": 0.48405084013938904, + "learning_rate": 0.0001, + "loss": 1.792, + "step": 7453 + }, + { + "epoch": 0.8562403078513583, + "grad_norm": 0.5095300674438477, + "learning_rate": 0.0001, + "loss": 1.7128, + "step": 7454 + }, + { + "epoch": 0.8563551777611854, + "grad_norm": 0.49315640330314636, + "learning_rate": 0.0001, + "loss": 1.4333, + "step": 7455 + }, + { + "epoch": 0.8564700476710125, + "grad_norm": 0.44458267092704773, + "learning_rate": 0.0001, + "loss": 1.606, + "step": 7456 + }, + { + "epoch": 0.8565849175808397, + "grad_norm": 0.47778499126434326, + "learning_rate": 0.0001, + "loss": 1.7333, + "step": 7457 + }, + { + "epoch": 0.8566997874906668, + "grad_norm": 0.47323766350746155, + "learning_rate": 0.0001, + "loss": 1.4791, + "step": 7458 + }, + { + "epoch": 0.8568146574004939, + "grad_norm": 0.4731180965900421, + "learning_rate": 0.0001, + "loss": 1.6042, + "step": 7459 + }, + { + "epoch": 0.856929527310321, + "grad_norm": 0.4616163671016693, + "learning_rate": 0.0001, + "loss": 1.7287, + "step": 7460 + }, + { + "epoch": 0.8570443972201481, + "grad_norm": 0.4829119145870209, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 7461 + }, + { + "epoch": 0.8571592671299753, + "grad_norm": 0.5113245844841003, + "learning_rate": 0.0001, + "loss": 1.5072, + "step": 7462 + }, + { + "epoch": 0.8572741370398024, + "grad_norm": 0.45658257603645325, + "learning_rate": 0.0001, + "loss": 1.5559, + "step": 7463 + }, + { + "epoch": 0.8573890069496295, + "grad_norm": 0.44155845046043396, + "learning_rate": 0.0001, + "loss": 1.5001, + "step": 7464 + }, + { + "epoch": 0.8575038768594566, + "grad_norm": 0.46995145082473755, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 7465 + }, + { + "epoch": 0.8576187467692837, + "grad_norm": 0.43981465697288513, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 7466 + }, + { + "epoch": 0.8577336166791109, + "grad_norm": 0.4353700280189514, + "learning_rate": 0.0001, + "loss": 1.5173, + "step": 7467 + }, + { + "epoch": 0.857848486588938, + "grad_norm": 0.4613707959651947, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 7468 + }, + { + "epoch": 0.8579633564987651, + "grad_norm": 0.43150594830513, + "learning_rate": 0.0001, + "loss": 1.3483, + "step": 7469 + }, + { + "epoch": 0.8580782264085922, + "grad_norm": 0.49536994099617004, + "learning_rate": 0.0001, + "loss": 1.6696, + "step": 7470 + }, + { + "epoch": 0.8581930963184193, + "grad_norm": 0.4285619258880615, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 7471 + }, + { + "epoch": 0.8583079662282466, + "grad_norm": 0.5003694891929626, + "learning_rate": 0.0001, + "loss": 1.5725, + "step": 7472 + }, + { + "epoch": 0.8584228361380737, + "grad_norm": 0.4515581429004669, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 7473 + }, + { + "epoch": 0.8585377060479008, + "grad_norm": 0.4796769320964813, + "learning_rate": 0.0001, + "loss": 1.4985, + "step": 7474 + }, + { + "epoch": 0.8586525759577279, + "grad_norm": 0.5017609596252441, + "learning_rate": 0.0001, + "loss": 1.7879, + "step": 7475 + }, + { + "epoch": 0.858767445867555, + "grad_norm": 0.4531811475753784, + "learning_rate": 0.0001, + "loss": 1.5423, + "step": 7476 + }, + { + "epoch": 0.8588823157773822, + "grad_norm": 0.4460836946964264, + "learning_rate": 0.0001, + "loss": 1.456, + "step": 7477 + }, + { + "epoch": 0.8589971856872093, + "grad_norm": 0.4407133162021637, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 7478 + }, + { + "epoch": 0.8591120555970364, + "grad_norm": 0.4690662622451782, + "learning_rate": 0.0001, + "loss": 1.6899, + "step": 7479 + }, + { + "epoch": 0.8592269255068635, + "grad_norm": 0.43073776364326477, + "learning_rate": 0.0001, + "loss": 1.3295, + "step": 7480 + }, + { + "epoch": 0.8593417954166906, + "grad_norm": 0.45200130343437195, + "learning_rate": 0.0001, + "loss": 1.435, + "step": 7481 + }, + { + "epoch": 0.8594566653265178, + "grad_norm": 0.45768219232559204, + "learning_rate": 0.0001, + "loss": 1.5326, + "step": 7482 + }, + { + "epoch": 0.8595715352363449, + "grad_norm": 0.4337681829929352, + "learning_rate": 0.0001, + "loss": 1.5591, + "step": 7483 + }, + { + "epoch": 0.859686405146172, + "grad_norm": 0.44868743419647217, + "learning_rate": 0.0001, + "loss": 1.5495, + "step": 7484 + }, + { + "epoch": 0.8598012750559991, + "grad_norm": 0.4528716206550598, + "learning_rate": 0.0001, + "loss": 1.5656, + "step": 7485 + }, + { + "epoch": 0.8599161449658262, + "grad_norm": 0.4922735095024109, + "learning_rate": 0.0001, + "loss": 1.6894, + "step": 7486 + }, + { + "epoch": 0.8600310148756534, + "grad_norm": 0.46525344252586365, + "learning_rate": 0.0001, + "loss": 1.5649, + "step": 7487 + }, + { + "epoch": 0.8601458847854805, + "grad_norm": 0.46007898449897766, + "learning_rate": 0.0001, + "loss": 1.6705, + "step": 7488 + }, + { + "epoch": 0.8602607546953076, + "grad_norm": 0.4645099341869354, + "learning_rate": 0.0001, + "loss": 1.5782, + "step": 7489 + }, + { + "epoch": 0.8603756246051347, + "grad_norm": 0.46048229932785034, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 7490 + }, + { + "epoch": 0.8604904945149618, + "grad_norm": 0.4628632366657257, + "learning_rate": 0.0001, + "loss": 1.6688, + "step": 7491 + }, + { + "epoch": 0.860605364424789, + "grad_norm": 0.5050228238105774, + "learning_rate": 0.0001, + "loss": 1.712, + "step": 7492 + }, + { + "epoch": 0.8607202343346161, + "grad_norm": 0.4507612884044647, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 7493 + }, + { + "epoch": 0.8608351042444432, + "grad_norm": 0.4406948983669281, + "learning_rate": 0.0001, + "loss": 1.5542, + "step": 7494 + }, + { + "epoch": 0.8609499741542703, + "grad_norm": 0.4973854124546051, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 7495 + }, + { + "epoch": 0.8610648440640974, + "grad_norm": 0.4488707482814789, + "learning_rate": 0.0001, + "loss": 1.4153, + "step": 7496 + }, + { + "epoch": 0.8611797139739246, + "grad_norm": 0.4815550744533539, + "learning_rate": 0.0001, + "loss": 1.6676, + "step": 7497 + }, + { + "epoch": 0.8612945838837517, + "grad_norm": 0.4687705338001251, + "learning_rate": 0.0001, + "loss": 1.3591, + "step": 7498 + }, + { + "epoch": 0.8614094537935788, + "grad_norm": 0.46637317538261414, + "learning_rate": 0.0001, + "loss": 1.6578, + "step": 7499 + }, + { + "epoch": 0.8615243237034059, + "grad_norm": 0.4513280987739563, + "learning_rate": 0.0001, + "loss": 1.5599, + "step": 7500 + }, + { + "epoch": 0.861639193613233, + "grad_norm": 0.4474887549877167, + "learning_rate": 0.0001, + "loss": 1.3628, + "step": 7501 + }, + { + "epoch": 0.8617540635230602, + "grad_norm": 0.4975503087043762, + "learning_rate": 0.0001, + "loss": 1.5146, + "step": 7502 + }, + { + "epoch": 0.8618689334328873, + "grad_norm": 0.5043979287147522, + "learning_rate": 0.0001, + "loss": 1.5227, + "step": 7503 + }, + { + "epoch": 0.8619838033427144, + "grad_norm": 0.4594765305519104, + "learning_rate": 0.0001, + "loss": 1.702, + "step": 7504 + }, + { + "epoch": 0.8620986732525415, + "grad_norm": 0.46996229887008667, + "learning_rate": 0.0001, + "loss": 1.714, + "step": 7505 + }, + { + "epoch": 0.8622135431623686, + "grad_norm": 0.45425885915756226, + "learning_rate": 0.0001, + "loss": 1.6545, + "step": 7506 + }, + { + "epoch": 0.8623284130721958, + "grad_norm": 0.48939937353134155, + "learning_rate": 0.0001, + "loss": 1.611, + "step": 7507 + }, + { + "epoch": 0.8624432829820229, + "grad_norm": 0.44552651047706604, + "learning_rate": 0.0001, + "loss": 1.5571, + "step": 7508 + }, + { + "epoch": 0.86255815289185, + "grad_norm": 0.4959718883037567, + "learning_rate": 0.0001, + "loss": 1.6482, + "step": 7509 + }, + { + "epoch": 0.8626730228016771, + "grad_norm": 0.4864218533039093, + "learning_rate": 0.0001, + "loss": 1.7558, + "step": 7510 + }, + { + "epoch": 0.8627878927115042, + "grad_norm": 0.4416675865650177, + "learning_rate": 0.0001, + "loss": 1.686, + "step": 7511 + }, + { + "epoch": 0.8629027626213314, + "grad_norm": 0.46253499388694763, + "learning_rate": 0.0001, + "loss": 1.5942, + "step": 7512 + }, + { + "epoch": 0.8630176325311585, + "grad_norm": 0.47551655769348145, + "learning_rate": 0.0001, + "loss": 1.7636, + "step": 7513 + }, + { + "epoch": 0.8631325024409856, + "grad_norm": 0.48232796788215637, + "learning_rate": 0.0001, + "loss": 1.6967, + "step": 7514 + }, + { + "epoch": 0.8632473723508127, + "grad_norm": 0.4513819217681885, + "learning_rate": 0.0001, + "loss": 1.5229, + "step": 7515 + }, + { + "epoch": 0.8633622422606398, + "grad_norm": 0.4784489572048187, + "learning_rate": 0.0001, + "loss": 1.7298, + "step": 7516 + }, + { + "epoch": 0.863477112170467, + "grad_norm": 0.4839509427547455, + "learning_rate": 0.0001, + "loss": 1.8396, + "step": 7517 + }, + { + "epoch": 0.8635919820802941, + "grad_norm": 0.5013167858123779, + "learning_rate": 0.0001, + "loss": 1.5833, + "step": 7518 + }, + { + "epoch": 0.8637068519901212, + "grad_norm": 0.43714308738708496, + "learning_rate": 0.0001, + "loss": 1.4933, + "step": 7519 + }, + { + "epoch": 0.8638217218999483, + "grad_norm": 0.45084357261657715, + "learning_rate": 0.0001, + "loss": 1.6041, + "step": 7520 + }, + { + "epoch": 0.8639365918097754, + "grad_norm": 0.4939959943294525, + "learning_rate": 0.0001, + "loss": 1.769, + "step": 7521 + }, + { + "epoch": 0.8640514617196026, + "grad_norm": 0.46642765402793884, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 7522 + }, + { + "epoch": 0.8641663316294297, + "grad_norm": 0.4534401595592499, + "learning_rate": 0.0001, + "loss": 1.582, + "step": 7523 + }, + { + "epoch": 0.8642812015392568, + "grad_norm": 0.43686237931251526, + "learning_rate": 0.0001, + "loss": 1.5726, + "step": 7524 + }, + { + "epoch": 0.8643960714490839, + "grad_norm": 0.4657284915447235, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 7525 + }, + { + "epoch": 0.864510941358911, + "grad_norm": 0.5079863667488098, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 7526 + }, + { + "epoch": 0.8646258112687382, + "grad_norm": 0.4670884311199188, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 7527 + }, + { + "epoch": 0.8647406811785653, + "grad_norm": 0.4581589102745056, + "learning_rate": 0.0001, + "loss": 1.3903, + "step": 7528 + }, + { + "epoch": 0.8648555510883924, + "grad_norm": 0.4376249313354492, + "learning_rate": 0.0001, + "loss": 1.5475, + "step": 7529 + }, + { + "epoch": 0.8649704209982195, + "grad_norm": 0.46135491132736206, + "learning_rate": 0.0001, + "loss": 1.5266, + "step": 7530 + }, + { + "epoch": 0.8650852909080466, + "grad_norm": 0.492291659116745, + "learning_rate": 0.0001, + "loss": 1.713, + "step": 7531 + }, + { + "epoch": 0.8652001608178738, + "grad_norm": 0.4598917067050934, + "learning_rate": 0.0001, + "loss": 1.3503, + "step": 7532 + }, + { + "epoch": 0.8653150307277009, + "grad_norm": 0.4626048505306244, + "learning_rate": 0.0001, + "loss": 1.4571, + "step": 7533 + }, + { + "epoch": 0.865429900637528, + "grad_norm": 0.49662452936172485, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 7534 + }, + { + "epoch": 0.8655447705473551, + "grad_norm": 0.4794904291629791, + "learning_rate": 0.0001, + "loss": 1.5125, + "step": 7535 + }, + { + "epoch": 0.8656596404571822, + "grad_norm": 0.4706816077232361, + "learning_rate": 0.0001, + "loss": 1.4572, + "step": 7536 + }, + { + "epoch": 0.8657745103670094, + "grad_norm": 0.4907233417034149, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 7537 + }, + { + "epoch": 0.8658893802768365, + "grad_norm": 0.46914729475975037, + "learning_rate": 0.0001, + "loss": 1.5293, + "step": 7538 + }, + { + "epoch": 0.8660042501866636, + "grad_norm": 0.47738802433013916, + "learning_rate": 0.0001, + "loss": 1.6708, + "step": 7539 + }, + { + "epoch": 0.8661191200964907, + "grad_norm": 0.47731783986091614, + "learning_rate": 0.0001, + "loss": 1.5712, + "step": 7540 + }, + { + "epoch": 0.8662339900063178, + "grad_norm": 0.4860245883464813, + "learning_rate": 0.0001, + "loss": 1.4923, + "step": 7541 + }, + { + "epoch": 0.866348859916145, + "grad_norm": 0.4608958065509796, + "learning_rate": 0.0001, + "loss": 1.4598, + "step": 7542 + }, + { + "epoch": 0.8664637298259721, + "grad_norm": 0.4907558262348175, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 7543 + }, + { + "epoch": 0.8665785997357992, + "grad_norm": 0.460859477519989, + "learning_rate": 0.0001, + "loss": 1.7288, + "step": 7544 + }, + { + "epoch": 0.8666934696456263, + "grad_norm": 0.5079466104507446, + "learning_rate": 0.0001, + "loss": 1.7155, + "step": 7545 + }, + { + "epoch": 0.8668083395554534, + "grad_norm": 0.4723812937736511, + "learning_rate": 0.0001, + "loss": 1.7198, + "step": 7546 + }, + { + "epoch": 0.8669232094652806, + "grad_norm": 0.457302063703537, + "learning_rate": 0.0001, + "loss": 1.5175, + "step": 7547 + }, + { + "epoch": 0.8670380793751077, + "grad_norm": 0.44479089975357056, + "learning_rate": 0.0001, + "loss": 1.5455, + "step": 7548 + }, + { + "epoch": 0.8671529492849348, + "grad_norm": 0.4218999147415161, + "learning_rate": 0.0001, + "loss": 1.5437, + "step": 7549 + }, + { + "epoch": 0.8672678191947619, + "grad_norm": 0.44676443934440613, + "learning_rate": 0.0001, + "loss": 1.6445, + "step": 7550 + }, + { + "epoch": 0.867382689104589, + "grad_norm": 0.4901319444179535, + "learning_rate": 0.0001, + "loss": 1.7671, + "step": 7551 + }, + { + "epoch": 0.8674975590144162, + "grad_norm": 0.4453258216381073, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 7552 + }, + { + "epoch": 0.8676124289242433, + "grad_norm": 0.4534637928009033, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 7553 + }, + { + "epoch": 0.8677272988340704, + "grad_norm": 0.46283942461013794, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 7554 + }, + { + "epoch": 0.8678421687438975, + "grad_norm": 0.5271000266075134, + "learning_rate": 0.0001, + "loss": 1.6204, + "step": 7555 + }, + { + "epoch": 0.8679570386537246, + "grad_norm": 0.4478399157524109, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 7556 + }, + { + "epoch": 0.8680719085635518, + "grad_norm": 0.4519982933998108, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 7557 + }, + { + "epoch": 0.8681867784733789, + "grad_norm": 0.49382635951042175, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 7558 + }, + { + "epoch": 0.868301648383206, + "grad_norm": 0.4385705888271332, + "learning_rate": 0.0001, + "loss": 1.4204, + "step": 7559 + }, + { + "epoch": 0.8684165182930331, + "grad_norm": 0.42191818356513977, + "learning_rate": 0.0001, + "loss": 1.3775, + "step": 7560 + }, + { + "epoch": 0.8685313882028602, + "grad_norm": 0.48729297518730164, + "learning_rate": 0.0001, + "loss": 1.5923, + "step": 7561 + }, + { + "epoch": 0.8686462581126874, + "grad_norm": 0.45936107635498047, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 7562 + }, + { + "epoch": 0.8687611280225145, + "grad_norm": 0.4694010317325592, + "learning_rate": 0.0001, + "loss": 1.6691, + "step": 7563 + }, + { + "epoch": 0.8688759979323416, + "grad_norm": 0.4324677288532257, + "learning_rate": 0.0001, + "loss": 1.5969, + "step": 7564 + }, + { + "epoch": 0.8689908678421687, + "grad_norm": 0.47414863109588623, + "learning_rate": 0.0001, + "loss": 1.6054, + "step": 7565 + }, + { + "epoch": 0.8691057377519958, + "grad_norm": 0.46548882126808167, + "learning_rate": 0.0001, + "loss": 1.3367, + "step": 7566 + }, + { + "epoch": 0.869220607661823, + "grad_norm": 0.44881516695022583, + "learning_rate": 0.0001, + "loss": 1.4306, + "step": 7567 + }, + { + "epoch": 0.8693354775716501, + "grad_norm": 0.475082129240036, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 7568 + }, + { + "epoch": 0.8694503474814772, + "grad_norm": 0.4312398135662079, + "learning_rate": 0.0001, + "loss": 1.6606, + "step": 7569 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.4381447434425354, + "learning_rate": 0.0001, + "loss": 1.4735, + "step": 7570 + }, + { + "epoch": 0.8696800873011314, + "grad_norm": 0.42871126532554626, + "learning_rate": 0.0001, + "loss": 1.4752, + "step": 7571 + }, + { + "epoch": 0.8697949572109586, + "grad_norm": 0.44468954205513, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 7572 + }, + { + "epoch": 0.8699098271207857, + "grad_norm": 0.49432846903800964, + "learning_rate": 0.0001, + "loss": 1.7286, + "step": 7573 + }, + { + "epoch": 0.8700246970306128, + "grad_norm": 0.4997519552707672, + "learning_rate": 0.0001, + "loss": 1.4884, + "step": 7574 + }, + { + "epoch": 0.8701395669404399, + "grad_norm": 0.47391921281814575, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 7575 + }, + { + "epoch": 0.870254436850267, + "grad_norm": 0.49615180492401123, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 7576 + }, + { + "epoch": 0.8703693067600942, + "grad_norm": 0.5302203297615051, + "learning_rate": 0.0001, + "loss": 1.6529, + "step": 7577 + }, + { + "epoch": 0.8704841766699213, + "grad_norm": 0.512974739074707, + "learning_rate": 0.0001, + "loss": 1.8474, + "step": 7578 + }, + { + "epoch": 0.8705990465797484, + "grad_norm": 0.5273780226707458, + "learning_rate": 0.0001, + "loss": 1.7045, + "step": 7579 + }, + { + "epoch": 0.8707139164895755, + "grad_norm": 0.45488354563713074, + "learning_rate": 0.0001, + "loss": 1.7054, + "step": 7580 + }, + { + "epoch": 0.8708287863994026, + "grad_norm": 0.4844851791858673, + "learning_rate": 0.0001, + "loss": 1.7121, + "step": 7581 + }, + { + "epoch": 0.8709436563092298, + "grad_norm": 0.47248736023902893, + "learning_rate": 0.0001, + "loss": 1.7882, + "step": 7582 + }, + { + "epoch": 0.8710585262190569, + "grad_norm": 0.4401882588863373, + "learning_rate": 0.0001, + "loss": 1.5392, + "step": 7583 + }, + { + "epoch": 0.871173396128884, + "grad_norm": 0.46107155084609985, + "learning_rate": 0.0001, + "loss": 1.5845, + "step": 7584 + }, + { + "epoch": 0.8712882660387111, + "grad_norm": 0.44512903690338135, + "learning_rate": 0.0001, + "loss": 1.4246, + "step": 7585 + }, + { + "epoch": 0.8714031359485382, + "grad_norm": 0.5119422078132629, + "learning_rate": 0.0001, + "loss": 1.6568, + "step": 7586 + }, + { + "epoch": 0.8715180058583654, + "grad_norm": 0.5448932647705078, + "learning_rate": 0.0001, + "loss": 1.6187, + "step": 7587 + }, + { + "epoch": 0.8716328757681925, + "grad_norm": 0.4820195138454437, + "learning_rate": 0.0001, + "loss": 1.5244, + "step": 7588 + }, + { + "epoch": 0.8717477456780196, + "grad_norm": 0.46764081716537476, + "learning_rate": 0.0001, + "loss": 1.6459, + "step": 7589 + }, + { + "epoch": 0.8718626155878467, + "grad_norm": 0.5043097734451294, + "learning_rate": 0.0001, + "loss": 1.7252, + "step": 7590 + }, + { + "epoch": 0.8719774854976738, + "grad_norm": 0.48596030473709106, + "learning_rate": 0.0001, + "loss": 1.6176, + "step": 7591 + }, + { + "epoch": 0.872092355407501, + "grad_norm": 0.4883681535720825, + "learning_rate": 0.0001, + "loss": 1.7058, + "step": 7592 + }, + { + "epoch": 0.8722072253173281, + "grad_norm": 0.4625040888786316, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 7593 + }, + { + "epoch": 0.8723220952271552, + "grad_norm": 0.4826609194278717, + "learning_rate": 0.0001, + "loss": 1.4658, + "step": 7594 + }, + { + "epoch": 0.8724369651369823, + "grad_norm": 0.4496923089027405, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 7595 + }, + { + "epoch": 0.8725518350468094, + "grad_norm": 0.5118072032928467, + "learning_rate": 0.0001, + "loss": 1.8564, + "step": 7596 + }, + { + "epoch": 0.8726667049566366, + "grad_norm": 0.4654580056667328, + "learning_rate": 0.0001, + "loss": 1.4611, + "step": 7597 + }, + { + "epoch": 0.8727815748664637, + "grad_norm": 0.42999371886253357, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 7598 + }, + { + "epoch": 0.8728964447762908, + "grad_norm": 0.5076425671577454, + "learning_rate": 0.0001, + "loss": 1.7994, + "step": 7599 + }, + { + "epoch": 0.8730113146861179, + "grad_norm": 0.4634290933609009, + "learning_rate": 0.0001, + "loss": 1.7437, + "step": 7600 + }, + { + "epoch": 0.873126184595945, + "grad_norm": 0.455967515707016, + "learning_rate": 0.0001, + "loss": 1.6326, + "step": 7601 + }, + { + "epoch": 0.8732410545057722, + "grad_norm": 0.5265469551086426, + "learning_rate": 0.0001, + "loss": 1.828, + "step": 7602 + }, + { + "epoch": 0.8733559244155993, + "grad_norm": 0.44739046692848206, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 7603 + }, + { + "epoch": 0.8734707943254264, + "grad_norm": 0.47437113523483276, + "learning_rate": 0.0001, + "loss": 1.7686, + "step": 7604 + }, + { + "epoch": 0.8735856642352535, + "grad_norm": 0.47178056836128235, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 7605 + }, + { + "epoch": 0.8737005341450806, + "grad_norm": 0.47476136684417725, + "learning_rate": 0.0001, + "loss": 1.6839, + "step": 7606 + }, + { + "epoch": 0.8738154040549078, + "grad_norm": 0.47085344791412354, + "learning_rate": 0.0001, + "loss": 1.6793, + "step": 7607 + }, + { + "epoch": 0.8739302739647349, + "grad_norm": 0.48475953936576843, + "learning_rate": 0.0001, + "loss": 1.7614, + "step": 7608 + }, + { + "epoch": 0.8740451438745621, + "grad_norm": 0.46257898211479187, + "learning_rate": 0.0001, + "loss": 1.5595, + "step": 7609 + }, + { + "epoch": 0.8741600137843892, + "grad_norm": 0.4603593051433563, + "learning_rate": 0.0001, + "loss": 1.6879, + "step": 7610 + }, + { + "epoch": 0.8742748836942164, + "grad_norm": 0.46927449107170105, + "learning_rate": 0.0001, + "loss": 1.7635, + "step": 7611 + }, + { + "epoch": 0.8743897536040435, + "grad_norm": 0.48620933294296265, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 7612 + }, + { + "epoch": 0.8745046235138706, + "grad_norm": 0.4888293445110321, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 7613 + }, + { + "epoch": 0.8746194934236977, + "grad_norm": 0.45221349596977234, + "learning_rate": 0.0001, + "loss": 1.4829, + "step": 7614 + }, + { + "epoch": 0.8747343633335248, + "grad_norm": 0.4384780824184418, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 7615 + }, + { + "epoch": 0.874849233243352, + "grad_norm": 0.463351309299469, + "learning_rate": 0.0001, + "loss": 1.2614, + "step": 7616 + }, + { + "epoch": 0.8749641031531791, + "grad_norm": 0.5050379037857056, + "learning_rate": 0.0001, + "loss": 1.7645, + "step": 7617 + }, + { + "epoch": 0.8750789730630062, + "grad_norm": 0.4900285005569458, + "learning_rate": 0.0001, + "loss": 1.668, + "step": 7618 + }, + { + "epoch": 0.8751938429728333, + "grad_norm": 0.4964447319507599, + "learning_rate": 0.0001, + "loss": 1.7461, + "step": 7619 + }, + { + "epoch": 0.8753087128826604, + "grad_norm": 0.4623139798641205, + "learning_rate": 0.0001, + "loss": 1.5311, + "step": 7620 + }, + { + "epoch": 0.8754235827924876, + "grad_norm": 0.47225624322891235, + "learning_rate": 0.0001, + "loss": 1.6197, + "step": 7621 + }, + { + "epoch": 0.8755384527023147, + "grad_norm": 0.49223390221595764, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 7622 + }, + { + "epoch": 0.8756533226121418, + "grad_norm": 0.5095483064651489, + "learning_rate": 0.0001, + "loss": 1.5173, + "step": 7623 + }, + { + "epoch": 0.8757681925219689, + "grad_norm": 0.4903635084629059, + "learning_rate": 0.0001, + "loss": 1.7643, + "step": 7624 + }, + { + "epoch": 0.875883062431796, + "grad_norm": 0.4946781396865845, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 7625 + }, + { + "epoch": 0.8759979323416232, + "grad_norm": 0.4905765652656555, + "learning_rate": 0.0001, + "loss": 1.5622, + "step": 7626 + }, + { + "epoch": 0.8761128022514503, + "grad_norm": 0.505204975605011, + "learning_rate": 0.0001, + "loss": 1.7406, + "step": 7627 + }, + { + "epoch": 0.8762276721612774, + "grad_norm": 0.533305823802948, + "learning_rate": 0.0001, + "loss": 1.4833, + "step": 7628 + }, + { + "epoch": 0.8763425420711045, + "grad_norm": 0.47964945435523987, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 7629 + }, + { + "epoch": 0.8764574119809316, + "grad_norm": 0.4678000211715698, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 7630 + }, + { + "epoch": 0.8765722818907588, + "grad_norm": 0.47018638253211975, + "learning_rate": 0.0001, + "loss": 1.489, + "step": 7631 + }, + { + "epoch": 0.8766871518005859, + "grad_norm": 0.4635443091392517, + "learning_rate": 0.0001, + "loss": 1.4924, + "step": 7632 + }, + { + "epoch": 0.876802021710413, + "grad_norm": 0.46582552790641785, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 7633 + }, + { + "epoch": 0.8769168916202401, + "grad_norm": 0.47066211700439453, + "learning_rate": 0.0001, + "loss": 1.6757, + "step": 7634 + }, + { + "epoch": 0.8770317615300672, + "grad_norm": 0.569146990776062, + "learning_rate": 0.0001, + "loss": 1.7014, + "step": 7635 + }, + { + "epoch": 0.8771466314398944, + "grad_norm": 0.48655346035957336, + "learning_rate": 0.0001, + "loss": 1.5346, + "step": 7636 + }, + { + "epoch": 0.8772615013497215, + "grad_norm": 0.4453171491622925, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 7637 + }, + { + "epoch": 0.8773763712595486, + "grad_norm": 0.4952543079853058, + "learning_rate": 0.0001, + "loss": 1.6746, + "step": 7638 + }, + { + "epoch": 0.8774912411693757, + "grad_norm": 0.45405057072639465, + "learning_rate": 0.0001, + "loss": 1.5223, + "step": 7639 + }, + { + "epoch": 0.8776061110792028, + "grad_norm": 0.46041983366012573, + "learning_rate": 0.0001, + "loss": 1.5733, + "step": 7640 + }, + { + "epoch": 0.87772098098903, + "grad_norm": 0.4734715223312378, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 7641 + }, + { + "epoch": 0.8778358508988571, + "grad_norm": 0.4629979431629181, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 7642 + }, + { + "epoch": 0.8779507208086842, + "grad_norm": 0.47290873527526855, + "learning_rate": 0.0001, + "loss": 1.524, + "step": 7643 + }, + { + "epoch": 0.8780655907185113, + "grad_norm": 0.4647415578365326, + "learning_rate": 0.0001, + "loss": 1.5595, + "step": 7644 + }, + { + "epoch": 0.8781804606283384, + "grad_norm": 0.5042547583580017, + "learning_rate": 0.0001, + "loss": 1.7437, + "step": 7645 + }, + { + "epoch": 0.8782953305381656, + "grad_norm": 0.4914029836654663, + "learning_rate": 0.0001, + "loss": 1.8578, + "step": 7646 + }, + { + "epoch": 0.8784102004479927, + "grad_norm": 0.5158650875091553, + "learning_rate": 0.0001, + "loss": 1.6133, + "step": 7647 + }, + { + "epoch": 0.8785250703578198, + "grad_norm": 0.4820328950881958, + "learning_rate": 0.0001, + "loss": 1.5127, + "step": 7648 + }, + { + "epoch": 0.8786399402676469, + "grad_norm": 0.4517586827278137, + "learning_rate": 0.0001, + "loss": 1.4968, + "step": 7649 + }, + { + "epoch": 0.878754810177474, + "grad_norm": 0.45039236545562744, + "learning_rate": 0.0001, + "loss": 1.539, + "step": 7650 + }, + { + "epoch": 0.8788696800873012, + "grad_norm": 0.4908886253833771, + "learning_rate": 0.0001, + "loss": 1.5477, + "step": 7651 + }, + { + "epoch": 0.8789845499971283, + "grad_norm": 0.4798978865146637, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 7652 + }, + { + "epoch": 0.8790994199069554, + "grad_norm": 0.4604727327823639, + "learning_rate": 0.0001, + "loss": 1.6227, + "step": 7653 + }, + { + "epoch": 0.8792142898167825, + "grad_norm": 0.5215774774551392, + "learning_rate": 0.0001, + "loss": 1.6924, + "step": 7654 + }, + { + "epoch": 0.8793291597266096, + "grad_norm": 0.5219174027442932, + "learning_rate": 0.0001, + "loss": 1.8248, + "step": 7655 + }, + { + "epoch": 0.8794440296364368, + "grad_norm": 0.46888914704322815, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 7656 + }, + { + "epoch": 0.8795588995462639, + "grad_norm": 0.49896708130836487, + "learning_rate": 0.0001, + "loss": 1.679, + "step": 7657 + }, + { + "epoch": 0.879673769456091, + "grad_norm": 0.4882703423500061, + "learning_rate": 0.0001, + "loss": 1.7798, + "step": 7658 + }, + { + "epoch": 0.8797886393659181, + "grad_norm": 0.4401630461215973, + "learning_rate": 0.0001, + "loss": 1.5311, + "step": 7659 + }, + { + "epoch": 0.8799035092757452, + "grad_norm": 0.46130862832069397, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 7660 + }, + { + "epoch": 0.8800183791855724, + "grad_norm": 0.4488151967525482, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 7661 + }, + { + "epoch": 0.8801332490953995, + "grad_norm": 0.47853875160217285, + "learning_rate": 0.0001, + "loss": 1.5023, + "step": 7662 + }, + { + "epoch": 0.8802481190052266, + "grad_norm": 0.4806554615497589, + "learning_rate": 0.0001, + "loss": 1.5578, + "step": 7663 + }, + { + "epoch": 0.8803629889150537, + "grad_norm": 0.44891056418418884, + "learning_rate": 0.0001, + "loss": 1.4271, + "step": 7664 + }, + { + "epoch": 0.8804778588248808, + "grad_norm": 0.4926340579986572, + "learning_rate": 0.0001, + "loss": 1.6632, + "step": 7665 + }, + { + "epoch": 0.880592728734708, + "grad_norm": 0.47855570912361145, + "learning_rate": 0.0001, + "loss": 1.6053, + "step": 7666 + }, + { + "epoch": 0.8807075986445351, + "grad_norm": 0.47924453020095825, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 7667 + }, + { + "epoch": 0.8808224685543622, + "grad_norm": 0.45178931951522827, + "learning_rate": 0.0001, + "loss": 1.3517, + "step": 7668 + }, + { + "epoch": 0.8809373384641893, + "grad_norm": 0.4857328534126282, + "learning_rate": 0.0001, + "loss": 1.5011, + "step": 7669 + }, + { + "epoch": 0.8810522083740164, + "grad_norm": 0.5388020277023315, + "learning_rate": 0.0001, + "loss": 1.7677, + "step": 7670 + }, + { + "epoch": 0.8811670782838436, + "grad_norm": 0.4225468039512634, + "learning_rate": 0.0001, + "loss": 1.3599, + "step": 7671 + }, + { + "epoch": 0.8812819481936707, + "grad_norm": 0.4660169780254364, + "learning_rate": 0.0001, + "loss": 1.641, + "step": 7672 + }, + { + "epoch": 0.8813968181034978, + "grad_norm": 0.47213736176490784, + "learning_rate": 0.0001, + "loss": 1.5267, + "step": 7673 + }, + { + "epoch": 0.8815116880133249, + "grad_norm": 0.4642762541770935, + "learning_rate": 0.0001, + "loss": 1.3589, + "step": 7674 + }, + { + "epoch": 0.881626557923152, + "grad_norm": 0.47581663727760315, + "learning_rate": 0.0001, + "loss": 1.475, + "step": 7675 + }, + { + "epoch": 0.8817414278329792, + "grad_norm": 0.4813084900379181, + "learning_rate": 0.0001, + "loss": 1.426, + "step": 7676 + }, + { + "epoch": 0.8818562977428063, + "grad_norm": 0.46025902032852173, + "learning_rate": 0.0001, + "loss": 1.5892, + "step": 7677 + }, + { + "epoch": 0.8819711676526334, + "grad_norm": 0.5673764944076538, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 7678 + }, + { + "epoch": 0.8820860375624605, + "grad_norm": 0.4987919330596924, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 7679 + }, + { + "epoch": 0.8822009074722876, + "grad_norm": 0.4518562853336334, + "learning_rate": 0.0001, + "loss": 1.3551, + "step": 7680 + }, + { + "epoch": 0.8823157773821148, + "grad_norm": 0.486644446849823, + "learning_rate": 0.0001, + "loss": 1.5322, + "step": 7681 + }, + { + "epoch": 0.8824306472919419, + "grad_norm": 0.5359245538711548, + "learning_rate": 0.0001, + "loss": 1.8432, + "step": 7682 + }, + { + "epoch": 0.882545517201769, + "grad_norm": 0.4623430371284485, + "learning_rate": 0.0001, + "loss": 1.5393, + "step": 7683 + }, + { + "epoch": 0.8826603871115961, + "grad_norm": 0.47618022561073303, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 7684 + }, + { + "epoch": 0.8827752570214232, + "grad_norm": 0.5313566327095032, + "learning_rate": 0.0001, + "loss": 1.8855, + "step": 7685 + }, + { + "epoch": 0.8828901269312504, + "grad_norm": 0.4783889055252075, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 7686 + }, + { + "epoch": 0.8830049968410775, + "grad_norm": 0.45966142416000366, + "learning_rate": 0.0001, + "loss": 1.5857, + "step": 7687 + }, + { + "epoch": 0.8831198667509046, + "grad_norm": 0.47145649790763855, + "learning_rate": 0.0001, + "loss": 1.7645, + "step": 7688 + }, + { + "epoch": 0.8832347366607317, + "grad_norm": 0.4668165445327759, + "learning_rate": 0.0001, + "loss": 1.6133, + "step": 7689 + }, + { + "epoch": 0.8833496065705588, + "grad_norm": 0.44585132598876953, + "learning_rate": 0.0001, + "loss": 1.3982, + "step": 7690 + }, + { + "epoch": 0.883464476480386, + "grad_norm": 0.49249133467674255, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 7691 + }, + { + "epoch": 0.8835793463902131, + "grad_norm": 0.4700791835784912, + "learning_rate": 0.0001, + "loss": 1.3881, + "step": 7692 + }, + { + "epoch": 0.8836942163000402, + "grad_norm": 0.44664838910102844, + "learning_rate": 0.0001, + "loss": 1.5379, + "step": 7693 + }, + { + "epoch": 0.8838090862098673, + "grad_norm": 0.4859354496002197, + "learning_rate": 0.0001, + "loss": 1.6673, + "step": 7694 + }, + { + "epoch": 0.8839239561196944, + "grad_norm": 0.4571913480758667, + "learning_rate": 0.0001, + "loss": 1.5796, + "step": 7695 + }, + { + "epoch": 0.8840388260295216, + "grad_norm": 0.4279733896255493, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 7696 + }, + { + "epoch": 0.8841536959393487, + "grad_norm": 0.4860191345214844, + "learning_rate": 0.0001, + "loss": 1.827, + "step": 7697 + }, + { + "epoch": 0.8842685658491758, + "grad_norm": 0.4540635049343109, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 7698 + }, + { + "epoch": 0.8843834357590029, + "grad_norm": 0.5169913172721863, + "learning_rate": 0.0001, + "loss": 1.6006, + "step": 7699 + }, + { + "epoch": 0.88449830566883, + "grad_norm": 0.4786296784877777, + "learning_rate": 0.0001, + "loss": 1.5426, + "step": 7700 + }, + { + "epoch": 0.8846131755786572, + "grad_norm": 0.4788622558116913, + "learning_rate": 0.0001, + "loss": 1.4655, + "step": 7701 + }, + { + "epoch": 0.8847280454884843, + "grad_norm": 0.4708092212677002, + "learning_rate": 0.0001, + "loss": 1.6054, + "step": 7702 + }, + { + "epoch": 0.8848429153983114, + "grad_norm": 0.49218055605888367, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 7703 + }, + { + "epoch": 0.8849577853081385, + "grad_norm": 0.4458230137825012, + "learning_rate": 0.0001, + "loss": 1.5672, + "step": 7704 + }, + { + "epoch": 0.8850726552179656, + "grad_norm": 0.46966198086738586, + "learning_rate": 0.0001, + "loss": 1.6702, + "step": 7705 + }, + { + "epoch": 0.8851875251277928, + "grad_norm": 0.5042508244514465, + "learning_rate": 0.0001, + "loss": 1.6373, + "step": 7706 + }, + { + "epoch": 0.8853023950376199, + "grad_norm": 0.5178636312484741, + "learning_rate": 0.0001, + "loss": 1.8095, + "step": 7707 + }, + { + "epoch": 0.885417264947447, + "grad_norm": 0.5226283073425293, + "learning_rate": 0.0001, + "loss": 1.8296, + "step": 7708 + }, + { + "epoch": 0.8855321348572741, + "grad_norm": 0.4413732886314392, + "learning_rate": 0.0001, + "loss": 1.4009, + "step": 7709 + }, + { + "epoch": 0.8856470047671012, + "grad_norm": 0.44534966349601746, + "learning_rate": 0.0001, + "loss": 1.5705, + "step": 7710 + }, + { + "epoch": 0.8857618746769284, + "grad_norm": 0.49753180146217346, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 7711 + }, + { + "epoch": 0.8858767445867555, + "grad_norm": 0.4720037281513214, + "learning_rate": 0.0001, + "loss": 1.5619, + "step": 7712 + }, + { + "epoch": 0.8859916144965826, + "grad_norm": 0.5182157754898071, + "learning_rate": 0.0001, + "loss": 1.6232, + "step": 7713 + }, + { + "epoch": 0.8861064844064097, + "grad_norm": 0.47871580719947815, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 7714 + }, + { + "epoch": 0.8862213543162368, + "grad_norm": 0.508307158946991, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 7715 + }, + { + "epoch": 0.886336224226064, + "grad_norm": 0.47485652565956116, + "learning_rate": 0.0001, + "loss": 1.5988, + "step": 7716 + }, + { + "epoch": 0.8864510941358911, + "grad_norm": 0.5017088651657104, + "learning_rate": 0.0001, + "loss": 1.6983, + "step": 7717 + }, + { + "epoch": 0.8865659640457182, + "grad_norm": 0.46508511900901794, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 7718 + }, + { + "epoch": 0.8866808339555453, + "grad_norm": 0.4940198063850403, + "learning_rate": 0.0001, + "loss": 1.6522, + "step": 7719 + }, + { + "epoch": 0.8867957038653724, + "grad_norm": 0.508063554763794, + "learning_rate": 0.0001, + "loss": 1.7217, + "step": 7720 + }, + { + "epoch": 0.8869105737751996, + "grad_norm": 0.5674510598182678, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 7721 + }, + { + "epoch": 0.8870254436850267, + "grad_norm": 0.47527050971984863, + "learning_rate": 0.0001, + "loss": 1.4227, + "step": 7722 + }, + { + "epoch": 0.8871403135948538, + "grad_norm": 0.45663002133369446, + "learning_rate": 0.0001, + "loss": 1.4982, + "step": 7723 + }, + { + "epoch": 0.8872551835046809, + "grad_norm": 0.4941721558570862, + "learning_rate": 0.0001, + "loss": 1.7747, + "step": 7724 + }, + { + "epoch": 0.887370053414508, + "grad_norm": 0.43632128834724426, + "learning_rate": 0.0001, + "loss": 1.5104, + "step": 7725 + }, + { + "epoch": 0.8874849233243352, + "grad_norm": 0.4669651389122009, + "learning_rate": 0.0001, + "loss": 1.5774, + "step": 7726 + }, + { + "epoch": 0.8875997932341623, + "grad_norm": 0.5105260014533997, + "learning_rate": 0.0001, + "loss": 1.7566, + "step": 7727 + }, + { + "epoch": 0.8877146631439894, + "grad_norm": 0.44809389114379883, + "learning_rate": 0.0001, + "loss": 1.4762, + "step": 7728 + }, + { + "epoch": 0.8878295330538165, + "grad_norm": 0.4762137234210968, + "learning_rate": 0.0001, + "loss": 1.7521, + "step": 7729 + }, + { + "epoch": 0.8879444029636436, + "grad_norm": 0.45027822256088257, + "learning_rate": 0.0001, + "loss": 1.4717, + "step": 7730 + }, + { + "epoch": 0.8880592728734708, + "grad_norm": 0.4824502170085907, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 7731 + }, + { + "epoch": 0.8881741427832979, + "grad_norm": 0.4795003831386566, + "learning_rate": 0.0001, + "loss": 1.5373, + "step": 7732 + }, + { + "epoch": 0.888289012693125, + "grad_norm": 0.5017417073249817, + "learning_rate": 0.0001, + "loss": 1.6815, + "step": 7733 + }, + { + "epoch": 0.8884038826029521, + "grad_norm": 0.44462695717811584, + "learning_rate": 0.0001, + "loss": 1.4145, + "step": 7734 + }, + { + "epoch": 0.8885187525127792, + "grad_norm": 0.4432191550731659, + "learning_rate": 0.0001, + "loss": 1.4622, + "step": 7735 + }, + { + "epoch": 0.8886336224226064, + "grad_norm": 0.4664211571216583, + "learning_rate": 0.0001, + "loss": 1.6906, + "step": 7736 + }, + { + "epoch": 0.8887484923324335, + "grad_norm": 0.4574565589427948, + "learning_rate": 0.0001, + "loss": 1.515, + "step": 7737 + }, + { + "epoch": 0.8888633622422606, + "grad_norm": 0.44210490584373474, + "learning_rate": 0.0001, + "loss": 1.4454, + "step": 7738 + }, + { + "epoch": 0.8889782321520877, + "grad_norm": 0.4957959055900574, + "learning_rate": 0.0001, + "loss": 1.7905, + "step": 7739 + }, + { + "epoch": 0.8890931020619148, + "grad_norm": 0.43715575337409973, + "learning_rate": 0.0001, + "loss": 1.4055, + "step": 7740 + }, + { + "epoch": 0.889207971971742, + "grad_norm": 0.4835394024848938, + "learning_rate": 0.0001, + "loss": 1.7379, + "step": 7741 + }, + { + "epoch": 0.8893228418815691, + "grad_norm": 0.4787488281726837, + "learning_rate": 0.0001, + "loss": 1.6368, + "step": 7742 + }, + { + "epoch": 0.8894377117913962, + "grad_norm": 0.4879782199859619, + "learning_rate": 0.0001, + "loss": 1.5455, + "step": 7743 + }, + { + "epoch": 0.8895525817012233, + "grad_norm": 0.4805709719657898, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 7744 + }, + { + "epoch": 0.8896674516110504, + "grad_norm": 0.4606061577796936, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 7745 + }, + { + "epoch": 0.8897823215208777, + "grad_norm": 0.5064094662666321, + "learning_rate": 0.0001, + "loss": 1.322, + "step": 7746 + }, + { + "epoch": 0.8898971914307048, + "grad_norm": 0.45881563425064087, + "learning_rate": 0.0001, + "loss": 1.5164, + "step": 7747 + }, + { + "epoch": 0.8900120613405319, + "grad_norm": 0.48323944211006165, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 7748 + }, + { + "epoch": 0.890126931250359, + "grad_norm": 0.49248838424682617, + "learning_rate": 0.0001, + "loss": 1.5483, + "step": 7749 + }, + { + "epoch": 0.8902418011601861, + "grad_norm": 0.46828705072402954, + "learning_rate": 0.0001, + "loss": 1.422, + "step": 7750 + }, + { + "epoch": 0.8903566710700133, + "grad_norm": 0.45508071780204773, + "learning_rate": 0.0001, + "loss": 1.5604, + "step": 7751 + }, + { + "epoch": 0.8904715409798404, + "grad_norm": 0.4920658469200134, + "learning_rate": 0.0001, + "loss": 1.4792, + "step": 7752 + }, + { + "epoch": 0.8905864108896675, + "grad_norm": 0.46648716926574707, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 7753 + }, + { + "epoch": 0.8907012807994946, + "grad_norm": 0.4534609019756317, + "learning_rate": 0.0001, + "loss": 1.3492, + "step": 7754 + }, + { + "epoch": 0.8908161507093217, + "grad_norm": 0.5462976098060608, + "learning_rate": 0.0001, + "loss": 1.2098, + "step": 7755 + }, + { + "epoch": 0.8909310206191489, + "grad_norm": 0.4742743670940399, + "learning_rate": 0.0001, + "loss": 1.6341, + "step": 7756 + }, + { + "epoch": 0.891045890528976, + "grad_norm": 0.46202293038368225, + "learning_rate": 0.0001, + "loss": 1.5176, + "step": 7757 + }, + { + "epoch": 0.8911607604388031, + "grad_norm": 0.45886507630348206, + "learning_rate": 0.0001, + "loss": 1.4584, + "step": 7758 + }, + { + "epoch": 0.8912756303486302, + "grad_norm": 0.4991988241672516, + "learning_rate": 0.0001, + "loss": 1.6096, + "step": 7759 + }, + { + "epoch": 0.8913905002584573, + "grad_norm": 0.47904571890830994, + "learning_rate": 0.0001, + "loss": 1.7234, + "step": 7760 + }, + { + "epoch": 0.8915053701682845, + "grad_norm": 0.48273780941963196, + "learning_rate": 0.0001, + "loss": 1.6437, + "step": 7761 + }, + { + "epoch": 0.8916202400781116, + "grad_norm": 0.45888492465019226, + "learning_rate": 0.0001, + "loss": 1.5468, + "step": 7762 + }, + { + "epoch": 0.8917351099879387, + "grad_norm": 0.485304057598114, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 7763 + }, + { + "epoch": 0.8918499798977658, + "grad_norm": 0.49057838320732117, + "learning_rate": 0.0001, + "loss": 1.62, + "step": 7764 + }, + { + "epoch": 0.8919648498075929, + "grad_norm": 0.503322958946228, + "learning_rate": 0.0001, + "loss": 1.4175, + "step": 7765 + }, + { + "epoch": 0.8920797197174201, + "grad_norm": 0.4755953550338745, + "learning_rate": 0.0001, + "loss": 1.61, + "step": 7766 + }, + { + "epoch": 0.8921945896272472, + "grad_norm": 0.5407209396362305, + "learning_rate": 0.0001, + "loss": 1.8089, + "step": 7767 + }, + { + "epoch": 0.8923094595370743, + "grad_norm": 0.48187482357025146, + "learning_rate": 0.0001, + "loss": 1.5752, + "step": 7768 + }, + { + "epoch": 0.8924243294469014, + "grad_norm": 0.4826925992965698, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 7769 + }, + { + "epoch": 0.8925391993567285, + "grad_norm": 0.4857804775238037, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 7770 + }, + { + "epoch": 0.8926540692665557, + "grad_norm": 0.4642079472541809, + "learning_rate": 0.0001, + "loss": 1.4843, + "step": 7771 + }, + { + "epoch": 0.8927689391763828, + "grad_norm": 0.44618815183639526, + "learning_rate": 0.0001, + "loss": 1.3971, + "step": 7772 + }, + { + "epoch": 0.8928838090862099, + "grad_norm": 0.44455963373184204, + "learning_rate": 0.0001, + "loss": 1.3867, + "step": 7773 + }, + { + "epoch": 0.892998678996037, + "grad_norm": 0.46926748752593994, + "learning_rate": 0.0001, + "loss": 1.5882, + "step": 7774 + }, + { + "epoch": 0.8931135489058641, + "grad_norm": 0.5021077394485474, + "learning_rate": 0.0001, + "loss": 1.7976, + "step": 7775 + }, + { + "epoch": 0.8932284188156913, + "grad_norm": 0.4602797031402588, + "learning_rate": 0.0001, + "loss": 1.599, + "step": 7776 + }, + { + "epoch": 0.8933432887255184, + "grad_norm": 0.4697769582271576, + "learning_rate": 0.0001, + "loss": 1.4241, + "step": 7777 + }, + { + "epoch": 0.8934581586353455, + "grad_norm": 0.48710909485816956, + "learning_rate": 0.0001, + "loss": 1.4455, + "step": 7778 + }, + { + "epoch": 0.8935730285451726, + "grad_norm": 0.47241660952568054, + "learning_rate": 0.0001, + "loss": 1.6332, + "step": 7779 + }, + { + "epoch": 0.8936878984549997, + "grad_norm": 0.488765686750412, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 7780 + }, + { + "epoch": 0.8938027683648269, + "grad_norm": 0.45212939381599426, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 7781 + }, + { + "epoch": 0.893917638274654, + "grad_norm": 0.49566277861595154, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 7782 + }, + { + "epoch": 0.8940325081844811, + "grad_norm": 0.4758758544921875, + "learning_rate": 0.0001, + "loss": 1.5356, + "step": 7783 + }, + { + "epoch": 0.8941473780943082, + "grad_norm": 0.47846412658691406, + "learning_rate": 0.0001, + "loss": 1.409, + "step": 7784 + }, + { + "epoch": 0.8942622480041353, + "grad_norm": 0.465116947889328, + "learning_rate": 0.0001, + "loss": 1.6158, + "step": 7785 + }, + { + "epoch": 0.8943771179139625, + "grad_norm": 0.4536508321762085, + "learning_rate": 0.0001, + "loss": 1.7166, + "step": 7786 + }, + { + "epoch": 0.8944919878237896, + "grad_norm": 0.44384217262268066, + "learning_rate": 0.0001, + "loss": 1.5588, + "step": 7787 + }, + { + "epoch": 0.8946068577336167, + "grad_norm": 0.46297183632850647, + "learning_rate": 0.0001, + "loss": 1.4644, + "step": 7788 + }, + { + "epoch": 0.8947217276434438, + "grad_norm": 0.5043572783470154, + "learning_rate": 0.0001, + "loss": 1.6958, + "step": 7789 + }, + { + "epoch": 0.8948365975532709, + "grad_norm": 0.49215108156204224, + "learning_rate": 0.0001, + "loss": 1.7364, + "step": 7790 + }, + { + "epoch": 0.8949514674630981, + "grad_norm": 0.45499980449676514, + "learning_rate": 0.0001, + "loss": 1.5584, + "step": 7791 + }, + { + "epoch": 0.8950663373729252, + "grad_norm": 0.46621590852737427, + "learning_rate": 0.0001, + "loss": 1.6982, + "step": 7792 + }, + { + "epoch": 0.8951812072827523, + "grad_norm": 0.4747898578643799, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 7793 + }, + { + "epoch": 0.8952960771925794, + "grad_norm": 0.4650057256221771, + "learning_rate": 0.0001, + "loss": 1.4734, + "step": 7794 + }, + { + "epoch": 0.8954109471024065, + "grad_norm": 0.45124003291130066, + "learning_rate": 0.0001, + "loss": 1.6151, + "step": 7795 + }, + { + "epoch": 0.8955258170122337, + "grad_norm": 0.469590425491333, + "learning_rate": 0.0001, + "loss": 1.6978, + "step": 7796 + }, + { + "epoch": 0.8956406869220608, + "grad_norm": 0.4690435826778412, + "learning_rate": 0.0001, + "loss": 1.5853, + "step": 7797 + }, + { + "epoch": 0.8957555568318879, + "grad_norm": 0.48957929015159607, + "learning_rate": 0.0001, + "loss": 1.767, + "step": 7798 + }, + { + "epoch": 0.895870426741715, + "grad_norm": 0.459736168384552, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 7799 + }, + { + "epoch": 0.8959852966515421, + "grad_norm": 0.4722045063972473, + "learning_rate": 0.0001, + "loss": 1.8041, + "step": 7800 + }, + { + "epoch": 0.8961001665613693, + "grad_norm": 0.5619189143180847, + "learning_rate": 0.0001, + "loss": 1.6918, + "step": 7801 + }, + { + "epoch": 0.8962150364711964, + "grad_norm": 0.4744798243045807, + "learning_rate": 0.0001, + "loss": 1.7405, + "step": 7802 + }, + { + "epoch": 0.8963299063810235, + "grad_norm": 0.4761870801448822, + "learning_rate": 0.0001, + "loss": 1.5544, + "step": 7803 + }, + { + "epoch": 0.8964447762908506, + "grad_norm": 0.45697519183158875, + "learning_rate": 0.0001, + "loss": 1.5555, + "step": 7804 + }, + { + "epoch": 0.8965596462006777, + "grad_norm": 0.5395013689994812, + "learning_rate": 0.0001, + "loss": 1.8088, + "step": 7805 + }, + { + "epoch": 0.8966745161105049, + "grad_norm": 0.4655267000198364, + "learning_rate": 0.0001, + "loss": 1.5784, + "step": 7806 + }, + { + "epoch": 0.896789386020332, + "grad_norm": 0.4401511549949646, + "learning_rate": 0.0001, + "loss": 1.4877, + "step": 7807 + }, + { + "epoch": 0.8969042559301591, + "grad_norm": 0.5011132955551147, + "learning_rate": 0.0001, + "loss": 1.6072, + "step": 7808 + }, + { + "epoch": 0.8970191258399862, + "grad_norm": 0.4679315984249115, + "learning_rate": 0.0001, + "loss": 1.3788, + "step": 7809 + }, + { + "epoch": 0.8971339957498133, + "grad_norm": 0.487979918718338, + "learning_rate": 0.0001, + "loss": 1.5793, + "step": 7810 + }, + { + "epoch": 0.8972488656596405, + "grad_norm": 0.4689542353153229, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 7811 + }, + { + "epoch": 0.8973637355694676, + "grad_norm": 0.49013927578926086, + "learning_rate": 0.0001, + "loss": 1.8521, + "step": 7812 + }, + { + "epoch": 0.8974786054792947, + "grad_norm": 0.4543326497077942, + "learning_rate": 0.0001, + "loss": 1.5612, + "step": 7813 + }, + { + "epoch": 0.8975934753891218, + "grad_norm": 0.4831598103046417, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 7814 + }, + { + "epoch": 0.8977083452989489, + "grad_norm": 0.49325430393218994, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 7815 + }, + { + "epoch": 0.8978232152087761, + "grad_norm": 0.5205132365226746, + "learning_rate": 0.0001, + "loss": 1.5647, + "step": 7816 + }, + { + "epoch": 0.8979380851186032, + "grad_norm": 0.485248327255249, + "learning_rate": 0.0001, + "loss": 1.5955, + "step": 7817 + }, + { + "epoch": 0.8980529550284303, + "grad_norm": 0.467001736164093, + "learning_rate": 0.0001, + "loss": 1.5915, + "step": 7818 + }, + { + "epoch": 0.8981678249382574, + "grad_norm": 0.519828200340271, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 7819 + }, + { + "epoch": 0.8982826948480845, + "grad_norm": 0.4926901161670685, + "learning_rate": 0.0001, + "loss": 1.7966, + "step": 7820 + }, + { + "epoch": 0.8983975647579117, + "grad_norm": 0.4958095848560333, + "learning_rate": 0.0001, + "loss": 1.6828, + "step": 7821 + }, + { + "epoch": 0.8985124346677388, + "grad_norm": 0.4742107093334198, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 7822 + }, + { + "epoch": 0.8986273045775659, + "grad_norm": 0.48758435249328613, + "learning_rate": 0.0001, + "loss": 1.7207, + "step": 7823 + }, + { + "epoch": 0.898742174487393, + "grad_norm": 0.48550042510032654, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 7824 + }, + { + "epoch": 0.8988570443972201, + "grad_norm": 0.5684822201728821, + "learning_rate": 0.0001, + "loss": 2.0053, + "step": 7825 + }, + { + "epoch": 0.8989719143070473, + "grad_norm": 0.4416176378726959, + "learning_rate": 0.0001, + "loss": 1.4682, + "step": 7826 + }, + { + "epoch": 0.8990867842168744, + "grad_norm": 0.44723203778266907, + "learning_rate": 0.0001, + "loss": 1.493, + "step": 7827 + }, + { + "epoch": 0.8992016541267015, + "grad_norm": 0.4821613132953644, + "learning_rate": 0.0001, + "loss": 1.6573, + "step": 7828 + }, + { + "epoch": 0.8993165240365286, + "grad_norm": 0.4443539083003998, + "learning_rate": 0.0001, + "loss": 1.4682, + "step": 7829 + }, + { + "epoch": 0.8994313939463557, + "grad_norm": 0.5018233060836792, + "learning_rate": 0.0001, + "loss": 1.7302, + "step": 7830 + }, + { + "epoch": 0.8995462638561829, + "grad_norm": 0.4475822448730469, + "learning_rate": 0.0001, + "loss": 1.5244, + "step": 7831 + }, + { + "epoch": 0.89966113376601, + "grad_norm": 0.4100770652294159, + "learning_rate": 0.0001, + "loss": 1.3041, + "step": 7832 + }, + { + "epoch": 0.8997760036758371, + "grad_norm": 0.5303254723548889, + "learning_rate": 0.0001, + "loss": 1.9058, + "step": 7833 + }, + { + "epoch": 0.8998908735856642, + "grad_norm": 0.4729917645454407, + "learning_rate": 0.0001, + "loss": 1.4515, + "step": 7834 + }, + { + "epoch": 0.9000057434954913, + "grad_norm": 0.45676717162132263, + "learning_rate": 0.0001, + "loss": 1.4337, + "step": 7835 + }, + { + "epoch": 0.9001206134053185, + "grad_norm": 0.4548957943916321, + "learning_rate": 0.0001, + "loss": 1.5653, + "step": 7836 + }, + { + "epoch": 0.9002354833151456, + "grad_norm": 0.46776890754699707, + "learning_rate": 0.0001, + "loss": 1.5995, + "step": 7837 + }, + { + "epoch": 0.9003503532249727, + "grad_norm": 0.8313576579093933, + "learning_rate": 0.0001, + "loss": 1.6672, + "step": 7838 + }, + { + "epoch": 0.9004652231347998, + "grad_norm": 0.45772501826286316, + "learning_rate": 0.0001, + "loss": 1.5468, + "step": 7839 + }, + { + "epoch": 0.9005800930446269, + "grad_norm": 0.4540563225746155, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 7840 + }, + { + "epoch": 0.9006949629544541, + "grad_norm": 0.43928417563438416, + "learning_rate": 0.0001, + "loss": 1.4581, + "step": 7841 + }, + { + "epoch": 0.9008098328642812, + "grad_norm": 0.46283161640167236, + "learning_rate": 0.0001, + "loss": 1.4749, + "step": 7842 + }, + { + "epoch": 0.9009247027741083, + "grad_norm": 0.49827006459236145, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 7843 + }, + { + "epoch": 0.9010395726839354, + "grad_norm": 0.48653677105903625, + "learning_rate": 0.0001, + "loss": 1.5447, + "step": 7844 + }, + { + "epoch": 0.9011544425937625, + "grad_norm": 0.5136744976043701, + "learning_rate": 0.0001, + "loss": 1.5334, + "step": 7845 + }, + { + "epoch": 0.9012693125035897, + "grad_norm": 0.4694797694683075, + "learning_rate": 0.0001, + "loss": 1.5367, + "step": 7846 + }, + { + "epoch": 0.9013841824134168, + "grad_norm": 0.47298017144203186, + "learning_rate": 0.0001, + "loss": 1.4799, + "step": 7847 + }, + { + "epoch": 0.9014990523232439, + "grad_norm": 0.46906399726867676, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 7848 + }, + { + "epoch": 0.901613922233071, + "grad_norm": 0.473000168800354, + "learning_rate": 0.0001, + "loss": 1.6285, + "step": 7849 + }, + { + "epoch": 0.9017287921428981, + "grad_norm": 0.462663471698761, + "learning_rate": 0.0001, + "loss": 1.7218, + "step": 7850 + }, + { + "epoch": 0.9018436620527253, + "grad_norm": 0.48268601298332214, + "learning_rate": 0.0001, + "loss": 1.4941, + "step": 7851 + }, + { + "epoch": 0.9019585319625524, + "grad_norm": 0.4828319251537323, + "learning_rate": 0.0001, + "loss": 1.6787, + "step": 7852 + }, + { + "epoch": 0.9020734018723795, + "grad_norm": 0.5251330733299255, + "learning_rate": 0.0001, + "loss": 1.8425, + "step": 7853 + }, + { + "epoch": 0.9021882717822066, + "grad_norm": 0.478097140789032, + "learning_rate": 0.0001, + "loss": 1.4669, + "step": 7854 + }, + { + "epoch": 0.9023031416920337, + "grad_norm": 0.4791843891143799, + "learning_rate": 0.0001, + "loss": 1.4991, + "step": 7855 + }, + { + "epoch": 0.9024180116018609, + "grad_norm": 0.4416709244251251, + "learning_rate": 0.0001, + "loss": 1.2615, + "step": 7856 + }, + { + "epoch": 0.902532881511688, + "grad_norm": 0.4547010362148285, + "learning_rate": 0.0001, + "loss": 1.6578, + "step": 7857 + }, + { + "epoch": 0.9026477514215151, + "grad_norm": 0.47191575169563293, + "learning_rate": 0.0001, + "loss": 1.6771, + "step": 7858 + }, + { + "epoch": 0.9027626213313422, + "grad_norm": 0.4907677173614502, + "learning_rate": 0.0001, + "loss": 1.6336, + "step": 7859 + }, + { + "epoch": 0.9028774912411693, + "grad_norm": 0.4836212396621704, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 7860 + }, + { + "epoch": 0.9029923611509965, + "grad_norm": 0.46854549646377563, + "learning_rate": 0.0001, + "loss": 1.7531, + "step": 7861 + }, + { + "epoch": 0.9031072310608236, + "grad_norm": 0.45791977643966675, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 7862 + }, + { + "epoch": 0.9032221009706507, + "grad_norm": 0.4522063732147217, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 7863 + }, + { + "epoch": 0.9033369708804778, + "grad_norm": 0.4975827932357788, + "learning_rate": 0.0001, + "loss": 1.464, + "step": 7864 + }, + { + "epoch": 0.9034518407903049, + "grad_norm": 0.450082927942276, + "learning_rate": 0.0001, + "loss": 1.69, + "step": 7865 + }, + { + "epoch": 0.9035667107001321, + "grad_norm": 0.4922288656234741, + "learning_rate": 0.0001, + "loss": 1.6362, + "step": 7866 + }, + { + "epoch": 0.9036815806099592, + "grad_norm": 0.4563447833061218, + "learning_rate": 0.0001, + "loss": 1.6167, + "step": 7867 + }, + { + "epoch": 0.9037964505197863, + "grad_norm": 0.466033399105072, + "learning_rate": 0.0001, + "loss": 1.5822, + "step": 7868 + }, + { + "epoch": 0.9039113204296134, + "grad_norm": 0.48967087268829346, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 7869 + }, + { + "epoch": 0.9040261903394405, + "grad_norm": 0.4684128165245056, + "learning_rate": 0.0001, + "loss": 1.61, + "step": 7870 + }, + { + "epoch": 0.9041410602492677, + "grad_norm": 0.5109823942184448, + "learning_rate": 0.0001, + "loss": 1.6527, + "step": 7871 + }, + { + "epoch": 0.9042559301590948, + "grad_norm": 0.46000662446022034, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 7872 + }, + { + "epoch": 0.9043708000689219, + "grad_norm": 0.5103839039802551, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 7873 + }, + { + "epoch": 0.904485669978749, + "grad_norm": 0.47217777371406555, + "learning_rate": 0.0001, + "loss": 1.6014, + "step": 7874 + }, + { + "epoch": 0.9046005398885761, + "grad_norm": 0.5040363073348999, + "learning_rate": 0.0001, + "loss": 1.6295, + "step": 7875 + }, + { + "epoch": 0.9047154097984033, + "grad_norm": 0.4671007990837097, + "learning_rate": 0.0001, + "loss": 1.4647, + "step": 7876 + }, + { + "epoch": 0.9048302797082304, + "grad_norm": 0.467616468667984, + "learning_rate": 0.0001, + "loss": 1.5478, + "step": 7877 + }, + { + "epoch": 0.9049451496180575, + "grad_norm": 0.4528937339782715, + "learning_rate": 0.0001, + "loss": 1.4716, + "step": 7878 + }, + { + "epoch": 0.9050600195278846, + "grad_norm": 0.4835106432437897, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 7879 + }, + { + "epoch": 0.9051748894377117, + "grad_norm": 0.4774007201194763, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 7880 + }, + { + "epoch": 0.9052897593475389, + "grad_norm": 0.4768272340297699, + "learning_rate": 0.0001, + "loss": 1.4522, + "step": 7881 + }, + { + "epoch": 0.905404629257366, + "grad_norm": 0.4725984036922455, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 7882 + }, + { + "epoch": 0.9055194991671932, + "grad_norm": 0.44116008281707764, + "learning_rate": 0.0001, + "loss": 1.3919, + "step": 7883 + }, + { + "epoch": 0.9056343690770203, + "grad_norm": 0.4748861789703369, + "learning_rate": 0.0001, + "loss": 1.6583, + "step": 7884 + }, + { + "epoch": 0.9057492389868474, + "grad_norm": 0.4855034053325653, + "learning_rate": 0.0001, + "loss": 1.6794, + "step": 7885 + }, + { + "epoch": 0.9058641088966746, + "grad_norm": 0.4543818533420563, + "learning_rate": 0.0001, + "loss": 1.5825, + "step": 7886 + }, + { + "epoch": 0.9059789788065017, + "grad_norm": 0.529973566532135, + "learning_rate": 0.0001, + "loss": 1.9031, + "step": 7887 + }, + { + "epoch": 0.9060938487163288, + "grad_norm": 0.445414662361145, + "learning_rate": 0.0001, + "loss": 1.4344, + "step": 7888 + }, + { + "epoch": 0.9062087186261559, + "grad_norm": 0.5017836689949036, + "learning_rate": 0.0001, + "loss": 1.4926, + "step": 7889 + }, + { + "epoch": 0.906323588535983, + "grad_norm": 0.4460631012916565, + "learning_rate": 0.0001, + "loss": 1.4937, + "step": 7890 + }, + { + "epoch": 0.9064384584458102, + "grad_norm": 0.4223814904689789, + "learning_rate": 0.0001, + "loss": 1.2901, + "step": 7891 + }, + { + "epoch": 0.9065533283556373, + "grad_norm": 0.4735008478164673, + "learning_rate": 0.0001, + "loss": 1.6393, + "step": 7892 + }, + { + "epoch": 0.9066681982654644, + "grad_norm": 0.4891226589679718, + "learning_rate": 0.0001, + "loss": 1.6429, + "step": 7893 + }, + { + "epoch": 0.9067830681752915, + "grad_norm": 0.49529507756233215, + "learning_rate": 0.0001, + "loss": 1.4392, + "step": 7894 + }, + { + "epoch": 0.9068979380851186, + "grad_norm": 0.4905736446380615, + "learning_rate": 0.0001, + "loss": 1.5082, + "step": 7895 + }, + { + "epoch": 0.9070128079949458, + "grad_norm": 0.53058922290802, + "learning_rate": 0.0001, + "loss": 1.4193, + "step": 7896 + }, + { + "epoch": 0.9071276779047729, + "grad_norm": 0.42824816703796387, + "learning_rate": 0.0001, + "loss": 1.4727, + "step": 7897 + }, + { + "epoch": 0.9072425478146, + "grad_norm": 0.4656597673892975, + "learning_rate": 0.0001, + "loss": 1.5861, + "step": 7898 + }, + { + "epoch": 0.9073574177244271, + "grad_norm": 0.5175133347511292, + "learning_rate": 0.0001, + "loss": 1.7236, + "step": 7899 + }, + { + "epoch": 0.9074722876342542, + "grad_norm": 0.4708557426929474, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 7900 + }, + { + "epoch": 0.9075871575440814, + "grad_norm": 0.49528563022613525, + "learning_rate": 0.0001, + "loss": 1.5486, + "step": 7901 + }, + { + "epoch": 0.9077020274539085, + "grad_norm": 0.45954427123069763, + "learning_rate": 0.0001, + "loss": 1.4233, + "step": 7902 + }, + { + "epoch": 0.9078168973637356, + "grad_norm": 0.48827245831489563, + "learning_rate": 0.0001, + "loss": 1.5515, + "step": 7903 + }, + { + "epoch": 0.9079317672735627, + "grad_norm": 0.4947773218154907, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 7904 + }, + { + "epoch": 0.9080466371833898, + "grad_norm": 0.4761560261249542, + "learning_rate": 0.0001, + "loss": 1.5405, + "step": 7905 + }, + { + "epoch": 0.908161507093217, + "grad_norm": 0.48969805240631104, + "learning_rate": 0.0001, + "loss": 1.5753, + "step": 7906 + }, + { + "epoch": 0.9082763770030441, + "grad_norm": 0.49315011501312256, + "learning_rate": 0.0001, + "loss": 1.6573, + "step": 7907 + }, + { + "epoch": 0.9083912469128712, + "grad_norm": 0.4530597925186157, + "learning_rate": 0.0001, + "loss": 1.5737, + "step": 7908 + }, + { + "epoch": 0.9085061168226983, + "grad_norm": 0.4942728877067566, + "learning_rate": 0.0001, + "loss": 1.7064, + "step": 7909 + }, + { + "epoch": 0.9086209867325254, + "grad_norm": 0.44348907470703125, + "learning_rate": 0.0001, + "loss": 1.506, + "step": 7910 + }, + { + "epoch": 0.9087358566423526, + "grad_norm": 0.4613859951496124, + "learning_rate": 0.0001, + "loss": 1.5753, + "step": 7911 + }, + { + "epoch": 0.9088507265521797, + "grad_norm": 0.46956050395965576, + "learning_rate": 0.0001, + "loss": 1.2878, + "step": 7912 + }, + { + "epoch": 0.9089655964620068, + "grad_norm": 0.5553674697875977, + "learning_rate": 0.0001, + "loss": 1.8513, + "step": 7913 + }, + { + "epoch": 0.9090804663718339, + "grad_norm": 0.48194214701652527, + "learning_rate": 0.0001, + "loss": 1.3442, + "step": 7914 + }, + { + "epoch": 0.909195336281661, + "grad_norm": 0.4378836750984192, + "learning_rate": 0.0001, + "loss": 1.4797, + "step": 7915 + }, + { + "epoch": 0.9093102061914882, + "grad_norm": 0.4518606662750244, + "learning_rate": 0.0001, + "loss": 1.5698, + "step": 7916 + }, + { + "epoch": 0.9094250761013153, + "grad_norm": 0.46091845631599426, + "learning_rate": 0.0001, + "loss": 1.4956, + "step": 7917 + }, + { + "epoch": 0.9095399460111424, + "grad_norm": 0.447496235370636, + "learning_rate": 0.0001, + "loss": 1.5605, + "step": 7918 + }, + { + "epoch": 0.9096548159209695, + "grad_norm": 0.5359869599342346, + "learning_rate": 0.0001, + "loss": 1.6687, + "step": 7919 + }, + { + "epoch": 0.9097696858307966, + "grad_norm": 0.5245752930641174, + "learning_rate": 0.0001, + "loss": 1.5827, + "step": 7920 + }, + { + "epoch": 0.9098845557406238, + "grad_norm": 0.5289234519004822, + "learning_rate": 0.0001, + "loss": 1.6148, + "step": 7921 + }, + { + "epoch": 0.9099994256504509, + "grad_norm": 0.49240848422050476, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 7922 + }, + { + "epoch": 0.910114295560278, + "grad_norm": 0.46257832646369934, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 7923 + }, + { + "epoch": 0.9102291654701051, + "grad_norm": 0.5155847072601318, + "learning_rate": 0.0001, + "loss": 1.6578, + "step": 7924 + }, + { + "epoch": 0.9103440353799322, + "grad_norm": 0.5221872925758362, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 7925 + }, + { + "epoch": 0.9104589052897594, + "grad_norm": 0.4366224408149719, + "learning_rate": 0.0001, + "loss": 1.5788, + "step": 7926 + }, + { + "epoch": 0.9105737751995865, + "grad_norm": 0.48819804191589355, + "learning_rate": 0.0001, + "loss": 1.6978, + "step": 7927 + }, + { + "epoch": 0.9106886451094136, + "grad_norm": 0.4560621678829193, + "learning_rate": 0.0001, + "loss": 1.4342, + "step": 7928 + }, + { + "epoch": 0.9108035150192407, + "grad_norm": 0.48248445987701416, + "learning_rate": 0.0001, + "loss": 1.7123, + "step": 7929 + }, + { + "epoch": 0.9109183849290678, + "grad_norm": 0.4500480890274048, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 7930 + }, + { + "epoch": 0.911033254838895, + "grad_norm": 0.52225661277771, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 7931 + }, + { + "epoch": 0.9111481247487221, + "grad_norm": 0.5065485239028931, + "learning_rate": 0.0001, + "loss": 1.3653, + "step": 7932 + }, + { + "epoch": 0.9112629946585492, + "grad_norm": 0.5204269886016846, + "learning_rate": 0.0001, + "loss": 1.7194, + "step": 7933 + }, + { + "epoch": 0.9113778645683763, + "grad_norm": 0.5047922134399414, + "learning_rate": 0.0001, + "loss": 1.6934, + "step": 7934 + }, + { + "epoch": 0.9114927344782034, + "grad_norm": 0.4803232252597809, + "learning_rate": 0.0001, + "loss": 1.7358, + "step": 7935 + }, + { + "epoch": 0.9116076043880306, + "grad_norm": 0.5010977983474731, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 7936 + }, + { + "epoch": 0.9117224742978577, + "grad_norm": 0.4615200459957123, + "learning_rate": 0.0001, + "loss": 1.5255, + "step": 7937 + }, + { + "epoch": 0.9118373442076848, + "grad_norm": 0.4917146861553192, + "learning_rate": 0.0001, + "loss": 1.5949, + "step": 7938 + }, + { + "epoch": 0.9119522141175119, + "grad_norm": 0.45587408542633057, + "learning_rate": 0.0001, + "loss": 1.5541, + "step": 7939 + }, + { + "epoch": 0.912067084027339, + "grad_norm": 0.4592360556125641, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 7940 + }, + { + "epoch": 0.9121819539371662, + "grad_norm": 0.49223262071609497, + "learning_rate": 0.0001, + "loss": 1.6489, + "step": 7941 + }, + { + "epoch": 0.9122968238469933, + "grad_norm": 0.46011883020401, + "learning_rate": 0.0001, + "loss": 1.3733, + "step": 7942 + }, + { + "epoch": 0.9124116937568204, + "grad_norm": 0.49059078097343445, + "learning_rate": 0.0001, + "loss": 1.7346, + "step": 7943 + }, + { + "epoch": 0.9125265636666475, + "grad_norm": 0.472307026386261, + "learning_rate": 0.0001, + "loss": 1.5103, + "step": 7944 + }, + { + "epoch": 0.9126414335764746, + "grad_norm": 0.48213303089141846, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 7945 + }, + { + "epoch": 0.9127563034863018, + "grad_norm": 0.5104955434799194, + "learning_rate": 0.0001, + "loss": 1.8161, + "step": 7946 + }, + { + "epoch": 0.9128711733961289, + "grad_norm": 0.4811389446258545, + "learning_rate": 0.0001, + "loss": 1.5841, + "step": 7947 + }, + { + "epoch": 0.912986043305956, + "grad_norm": 0.4906313419342041, + "learning_rate": 0.0001, + "loss": 1.7777, + "step": 7948 + }, + { + "epoch": 0.9131009132157831, + "grad_norm": 0.45326316356658936, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 7949 + }, + { + "epoch": 0.9132157831256102, + "grad_norm": 0.41644978523254395, + "learning_rate": 0.0001, + "loss": 1.306, + "step": 7950 + }, + { + "epoch": 0.9133306530354374, + "grad_norm": 0.432559996843338, + "learning_rate": 0.0001, + "loss": 1.3399, + "step": 7951 + }, + { + "epoch": 0.9134455229452645, + "grad_norm": 0.4607470631599426, + "learning_rate": 0.0001, + "loss": 1.6326, + "step": 7952 + }, + { + "epoch": 0.9135603928550916, + "grad_norm": 0.5131176710128784, + "learning_rate": 0.0001, + "loss": 1.7356, + "step": 7953 + }, + { + "epoch": 0.9136752627649187, + "grad_norm": 0.4790054261684418, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 7954 + }, + { + "epoch": 0.9137901326747458, + "grad_norm": 0.4506865441799164, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 7955 + }, + { + "epoch": 0.913905002584573, + "grad_norm": 0.44508710503578186, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 7956 + }, + { + "epoch": 0.9140198724944001, + "grad_norm": 0.43661433458328247, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 7957 + }, + { + "epoch": 0.9141347424042272, + "grad_norm": 0.456387460231781, + "learning_rate": 0.0001, + "loss": 1.6312, + "step": 7958 + }, + { + "epoch": 0.9142496123140543, + "grad_norm": 0.5062097311019897, + "learning_rate": 0.0001, + "loss": 1.7662, + "step": 7959 + }, + { + "epoch": 0.9143644822238814, + "grad_norm": 0.4519118070602417, + "learning_rate": 0.0001, + "loss": 1.5057, + "step": 7960 + }, + { + "epoch": 0.9144793521337086, + "grad_norm": 0.4753338098526001, + "learning_rate": 0.0001, + "loss": 1.5305, + "step": 7961 + }, + { + "epoch": 0.9145942220435357, + "grad_norm": 0.44463813304901123, + "learning_rate": 0.0001, + "loss": 1.4873, + "step": 7962 + }, + { + "epoch": 0.9147090919533628, + "grad_norm": 0.4657713770866394, + "learning_rate": 0.0001, + "loss": 1.5608, + "step": 7963 + }, + { + "epoch": 0.9148239618631899, + "grad_norm": 0.47514888644218445, + "learning_rate": 0.0001, + "loss": 1.4934, + "step": 7964 + }, + { + "epoch": 0.914938831773017, + "grad_norm": 0.5030425786972046, + "learning_rate": 0.0001, + "loss": 1.6392, + "step": 7965 + }, + { + "epoch": 0.9150537016828442, + "grad_norm": 0.49183422327041626, + "learning_rate": 0.0001, + "loss": 1.6318, + "step": 7966 + }, + { + "epoch": 0.9151685715926713, + "grad_norm": 0.4909827709197998, + "learning_rate": 0.0001, + "loss": 1.607, + "step": 7967 + }, + { + "epoch": 0.9152834415024984, + "grad_norm": 0.5114017724990845, + "learning_rate": 0.0001, + "loss": 1.7051, + "step": 7968 + }, + { + "epoch": 0.9153983114123255, + "grad_norm": 0.5097264647483826, + "learning_rate": 0.0001, + "loss": 1.7322, + "step": 7969 + }, + { + "epoch": 0.9155131813221526, + "grad_norm": 0.4767652750015259, + "learning_rate": 0.0001, + "loss": 1.4866, + "step": 7970 + }, + { + "epoch": 0.9156280512319798, + "grad_norm": 0.4586257338523865, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 7971 + }, + { + "epoch": 0.9157429211418069, + "grad_norm": 0.4332074224948883, + "learning_rate": 0.0001, + "loss": 1.3948, + "step": 7972 + }, + { + "epoch": 0.915857791051634, + "grad_norm": 0.48241952061653137, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 7973 + }, + { + "epoch": 0.9159726609614611, + "grad_norm": 0.5874638557434082, + "learning_rate": 0.0001, + "loss": 1.895, + "step": 7974 + }, + { + "epoch": 0.9160875308712882, + "grad_norm": 0.4532858431339264, + "learning_rate": 0.0001, + "loss": 1.6298, + "step": 7975 + }, + { + "epoch": 0.9162024007811154, + "grad_norm": 0.47365015745162964, + "learning_rate": 0.0001, + "loss": 1.6582, + "step": 7976 + }, + { + "epoch": 0.9163172706909425, + "grad_norm": 0.5175490975379944, + "learning_rate": 0.0001, + "loss": 1.5293, + "step": 7977 + }, + { + "epoch": 0.9164321406007696, + "grad_norm": 0.47895553708076477, + "learning_rate": 0.0001, + "loss": 1.608, + "step": 7978 + }, + { + "epoch": 0.9165470105105967, + "grad_norm": 0.4826950132846832, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 7979 + }, + { + "epoch": 0.9166618804204238, + "grad_norm": 0.4933220148086548, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 7980 + }, + { + "epoch": 0.916776750330251, + "grad_norm": 0.47146233916282654, + "learning_rate": 0.0001, + "loss": 1.7157, + "step": 7981 + }, + { + "epoch": 0.9168916202400781, + "grad_norm": 0.4537650942802429, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 7982 + }, + { + "epoch": 0.9170064901499052, + "grad_norm": 0.5133910179138184, + "learning_rate": 0.0001, + "loss": 1.75, + "step": 7983 + }, + { + "epoch": 0.9171213600597323, + "grad_norm": 0.4584057927131653, + "learning_rate": 0.0001, + "loss": 1.5925, + "step": 7984 + }, + { + "epoch": 0.9172362299695594, + "grad_norm": 0.4383944571018219, + "learning_rate": 0.0001, + "loss": 1.4504, + "step": 7985 + }, + { + "epoch": 0.9173510998793866, + "grad_norm": 0.482051819562912, + "learning_rate": 0.0001, + "loss": 1.4079, + "step": 7986 + }, + { + "epoch": 0.9174659697892137, + "grad_norm": 0.4862101078033447, + "learning_rate": 0.0001, + "loss": 1.734, + "step": 7987 + }, + { + "epoch": 0.9175808396990408, + "grad_norm": 0.46888962388038635, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 7988 + }, + { + "epoch": 0.9176957096088679, + "grad_norm": 0.4773329198360443, + "learning_rate": 0.0001, + "loss": 1.5293, + "step": 7989 + }, + { + "epoch": 0.917810579518695, + "grad_norm": 0.5123506784439087, + "learning_rate": 0.0001, + "loss": 1.4992, + "step": 7990 + }, + { + "epoch": 0.9179254494285222, + "grad_norm": 0.4806014895439148, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 7991 + }, + { + "epoch": 0.9180403193383493, + "grad_norm": 0.45858410000801086, + "learning_rate": 0.0001, + "loss": 1.5983, + "step": 7992 + }, + { + "epoch": 0.9181551892481764, + "grad_norm": 0.5545932650566101, + "learning_rate": 0.0001, + "loss": 1.6907, + "step": 7993 + }, + { + "epoch": 0.9182700591580035, + "grad_norm": 0.4767701327800751, + "learning_rate": 0.0001, + "loss": 1.6551, + "step": 7994 + }, + { + "epoch": 0.9183849290678306, + "grad_norm": 0.5300646424293518, + "learning_rate": 0.0001, + "loss": 1.4472, + "step": 7995 + }, + { + "epoch": 0.9184997989776578, + "grad_norm": 0.4707062244415283, + "learning_rate": 0.0001, + "loss": 1.6684, + "step": 7996 + }, + { + "epoch": 0.9186146688874849, + "grad_norm": 0.47407713532447815, + "learning_rate": 0.0001, + "loss": 1.5561, + "step": 7997 + }, + { + "epoch": 0.918729538797312, + "grad_norm": 0.4721267521381378, + "learning_rate": 0.0001, + "loss": 1.6156, + "step": 7998 + }, + { + "epoch": 0.9188444087071391, + "grad_norm": 0.48417842388153076, + "learning_rate": 0.0001, + "loss": 1.5445, + "step": 7999 + }, + { + "epoch": 0.9189592786169662, + "grad_norm": 0.4994370639324188, + "learning_rate": 0.0001, + "loss": 1.786, + "step": 8000 + }, + { + "epoch": 0.9190741485267934, + "grad_norm": 0.5283456444740295, + "learning_rate": 0.0001, + "loss": 1.8096, + "step": 8001 + }, + { + "epoch": 0.9191890184366205, + "grad_norm": 0.4809178411960602, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 8002 + }, + { + "epoch": 0.9193038883464476, + "grad_norm": 0.5179066061973572, + "learning_rate": 0.0001, + "loss": 1.6071, + "step": 8003 + }, + { + "epoch": 0.9194187582562747, + "grad_norm": 0.5334687829017639, + "learning_rate": 0.0001, + "loss": 1.759, + "step": 8004 + }, + { + "epoch": 0.9195336281661018, + "grad_norm": 0.4858127236366272, + "learning_rate": 0.0001, + "loss": 1.6873, + "step": 8005 + }, + { + "epoch": 0.919648498075929, + "grad_norm": 0.47769853472709656, + "learning_rate": 0.0001, + "loss": 1.4748, + "step": 8006 + }, + { + "epoch": 0.9197633679857561, + "grad_norm": 0.5166919827461243, + "learning_rate": 0.0001, + "loss": 1.7652, + "step": 8007 + }, + { + "epoch": 0.9198782378955832, + "grad_norm": 0.45674726366996765, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 8008 + }, + { + "epoch": 0.9199931078054103, + "grad_norm": 0.45661020278930664, + "learning_rate": 0.0001, + "loss": 1.55, + "step": 8009 + }, + { + "epoch": 0.9201079777152374, + "grad_norm": 0.5638238787651062, + "learning_rate": 0.0001, + "loss": 1.9338, + "step": 8010 + }, + { + "epoch": 0.9202228476250646, + "grad_norm": 0.4804632067680359, + "learning_rate": 0.0001, + "loss": 1.695, + "step": 8011 + }, + { + "epoch": 0.9203377175348917, + "grad_norm": 0.45596975088119507, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 8012 + }, + { + "epoch": 0.9204525874447188, + "grad_norm": 0.47516369819641113, + "learning_rate": 0.0001, + "loss": 1.6081, + "step": 8013 + }, + { + "epoch": 0.9205674573545459, + "grad_norm": 0.47999808192253113, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 8014 + }, + { + "epoch": 0.920682327264373, + "grad_norm": 0.4690406918525696, + "learning_rate": 0.0001, + "loss": 1.6159, + "step": 8015 + }, + { + "epoch": 0.9207971971742002, + "grad_norm": 0.4539242088794708, + "learning_rate": 0.0001, + "loss": 1.598, + "step": 8016 + }, + { + "epoch": 0.9209120670840273, + "grad_norm": 0.4828774333000183, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 8017 + }, + { + "epoch": 0.9210269369938544, + "grad_norm": 0.46138039231300354, + "learning_rate": 0.0001, + "loss": 1.6745, + "step": 8018 + }, + { + "epoch": 0.9211418069036815, + "grad_norm": 0.5137259364128113, + "learning_rate": 0.0001, + "loss": 1.7913, + "step": 8019 + }, + { + "epoch": 0.9212566768135086, + "grad_norm": 0.4621741771697998, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 8020 + }, + { + "epoch": 0.9213715467233359, + "grad_norm": 0.4695769250392914, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 8021 + }, + { + "epoch": 0.921486416633163, + "grad_norm": 0.4522899389266968, + "learning_rate": 0.0001, + "loss": 1.6889, + "step": 8022 + }, + { + "epoch": 0.9216012865429901, + "grad_norm": 0.4740762710571289, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 8023 + }, + { + "epoch": 0.9217161564528172, + "grad_norm": 0.4992024302482605, + "learning_rate": 0.0001, + "loss": 1.5142, + "step": 8024 + }, + { + "epoch": 0.9218310263626444, + "grad_norm": 0.4674678444862366, + "learning_rate": 0.0001, + "loss": 1.6359, + "step": 8025 + }, + { + "epoch": 0.9219458962724715, + "grad_norm": 0.5612513422966003, + "learning_rate": 0.0001, + "loss": 1.715, + "step": 8026 + }, + { + "epoch": 0.9220607661822986, + "grad_norm": 0.4955098330974579, + "learning_rate": 0.0001, + "loss": 1.6685, + "step": 8027 + }, + { + "epoch": 0.9221756360921257, + "grad_norm": 0.47574183344841003, + "learning_rate": 0.0001, + "loss": 1.5452, + "step": 8028 + }, + { + "epoch": 0.9222905060019528, + "grad_norm": 0.4642239212989807, + "learning_rate": 0.0001, + "loss": 1.6651, + "step": 8029 + }, + { + "epoch": 0.92240537591178, + "grad_norm": 0.473645955324173, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 8030 + }, + { + "epoch": 0.9225202458216071, + "grad_norm": 0.4713420569896698, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 8031 + }, + { + "epoch": 0.9226351157314342, + "grad_norm": 0.44111305475234985, + "learning_rate": 0.0001, + "loss": 1.4193, + "step": 8032 + }, + { + "epoch": 0.9227499856412613, + "grad_norm": 0.4585154354572296, + "learning_rate": 0.0001, + "loss": 1.5419, + "step": 8033 + }, + { + "epoch": 0.9228648555510884, + "grad_norm": 0.5225629210472107, + "learning_rate": 0.0001, + "loss": 1.8615, + "step": 8034 + }, + { + "epoch": 0.9229797254609156, + "grad_norm": 0.43736404180526733, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 8035 + }, + { + "epoch": 0.9230945953707427, + "grad_norm": 0.4762333333492279, + "learning_rate": 0.0001, + "loss": 1.6124, + "step": 8036 + }, + { + "epoch": 0.9232094652805698, + "grad_norm": 0.47877663373947144, + "learning_rate": 0.0001, + "loss": 1.7119, + "step": 8037 + }, + { + "epoch": 0.9233243351903969, + "grad_norm": 0.47204989194869995, + "learning_rate": 0.0001, + "loss": 1.5251, + "step": 8038 + }, + { + "epoch": 0.923439205100224, + "grad_norm": 0.4415188133716583, + "learning_rate": 0.0001, + "loss": 1.6022, + "step": 8039 + }, + { + "epoch": 0.9235540750100512, + "grad_norm": 0.4474656879901886, + "learning_rate": 0.0001, + "loss": 1.4882, + "step": 8040 + }, + { + "epoch": 0.9236689449198783, + "grad_norm": 0.4732295274734497, + "learning_rate": 0.0001, + "loss": 1.3283, + "step": 8041 + }, + { + "epoch": 0.9237838148297054, + "grad_norm": 0.5105409026145935, + "learning_rate": 0.0001, + "loss": 1.7868, + "step": 8042 + }, + { + "epoch": 0.9238986847395325, + "grad_norm": 0.48805803060531616, + "learning_rate": 0.0001, + "loss": 1.3698, + "step": 8043 + }, + { + "epoch": 0.9240135546493596, + "grad_norm": 0.5286892652511597, + "learning_rate": 0.0001, + "loss": 1.5557, + "step": 8044 + }, + { + "epoch": 0.9241284245591868, + "grad_norm": 0.5003154873847961, + "learning_rate": 0.0001, + "loss": 1.5898, + "step": 8045 + }, + { + "epoch": 0.9242432944690139, + "grad_norm": 0.503452718257904, + "learning_rate": 0.0001, + "loss": 1.2122, + "step": 8046 + }, + { + "epoch": 0.924358164378841, + "grad_norm": 0.46282196044921875, + "learning_rate": 0.0001, + "loss": 1.4744, + "step": 8047 + }, + { + "epoch": 0.9244730342886681, + "grad_norm": 0.48527947068214417, + "learning_rate": 0.0001, + "loss": 1.7213, + "step": 8048 + }, + { + "epoch": 0.9245879041984952, + "grad_norm": 0.5114087462425232, + "learning_rate": 0.0001, + "loss": 1.8104, + "step": 8049 + }, + { + "epoch": 0.9247027741083224, + "grad_norm": 0.5129145979881287, + "learning_rate": 0.0001, + "loss": 1.7043, + "step": 8050 + }, + { + "epoch": 0.9248176440181495, + "grad_norm": 0.4763910174369812, + "learning_rate": 0.0001, + "loss": 1.7121, + "step": 8051 + }, + { + "epoch": 0.9249325139279766, + "grad_norm": 0.47218838334083557, + "learning_rate": 0.0001, + "loss": 1.5667, + "step": 8052 + }, + { + "epoch": 0.9250473838378037, + "grad_norm": 0.48110297322273254, + "learning_rate": 0.0001, + "loss": 1.4001, + "step": 8053 + }, + { + "epoch": 0.9251622537476308, + "grad_norm": 0.4787732660770416, + "learning_rate": 0.0001, + "loss": 1.6857, + "step": 8054 + }, + { + "epoch": 0.925277123657458, + "grad_norm": 0.48069849610328674, + "learning_rate": 0.0001, + "loss": 1.7946, + "step": 8055 + }, + { + "epoch": 0.9253919935672851, + "grad_norm": 0.47537073493003845, + "learning_rate": 0.0001, + "loss": 1.6928, + "step": 8056 + }, + { + "epoch": 0.9255068634771122, + "grad_norm": 0.5298829078674316, + "learning_rate": 0.0001, + "loss": 1.76, + "step": 8057 + }, + { + "epoch": 0.9256217333869393, + "grad_norm": 0.5032019019126892, + "learning_rate": 0.0001, + "loss": 1.8279, + "step": 8058 + }, + { + "epoch": 0.9257366032967664, + "grad_norm": 0.45885175466537476, + "learning_rate": 0.0001, + "loss": 1.5709, + "step": 8059 + }, + { + "epoch": 0.9258514732065936, + "grad_norm": 0.5285278558731079, + "learning_rate": 0.0001, + "loss": 1.8198, + "step": 8060 + }, + { + "epoch": 0.9259663431164207, + "grad_norm": 0.4382495880126953, + "learning_rate": 0.0001, + "loss": 1.3852, + "step": 8061 + }, + { + "epoch": 0.9260812130262478, + "grad_norm": 0.47816163301467896, + "learning_rate": 0.0001, + "loss": 1.7725, + "step": 8062 + }, + { + "epoch": 0.9261960829360749, + "grad_norm": 0.4544413685798645, + "learning_rate": 0.0001, + "loss": 1.4782, + "step": 8063 + }, + { + "epoch": 0.926310952845902, + "grad_norm": 0.46362125873565674, + "learning_rate": 0.0001, + "loss": 1.583, + "step": 8064 + }, + { + "epoch": 0.9264258227557292, + "grad_norm": 0.4957447052001953, + "learning_rate": 0.0001, + "loss": 1.5227, + "step": 8065 + }, + { + "epoch": 0.9265406926655563, + "grad_norm": 0.4854178726673126, + "learning_rate": 0.0001, + "loss": 1.7782, + "step": 8066 + }, + { + "epoch": 0.9266555625753834, + "grad_norm": 0.4702478051185608, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 8067 + }, + { + "epoch": 0.9267704324852105, + "grad_norm": 0.5195414423942566, + "learning_rate": 0.0001, + "loss": 1.8032, + "step": 8068 + }, + { + "epoch": 0.9268853023950376, + "grad_norm": 0.45996007323265076, + "learning_rate": 0.0001, + "loss": 1.3316, + "step": 8069 + }, + { + "epoch": 0.9270001723048648, + "grad_norm": 0.482707679271698, + "learning_rate": 0.0001, + "loss": 1.7089, + "step": 8070 + }, + { + "epoch": 0.9271150422146919, + "grad_norm": 0.4624692499637604, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 8071 + }, + { + "epoch": 0.927229912124519, + "grad_norm": 0.46416664123535156, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 8072 + }, + { + "epoch": 0.9273447820343461, + "grad_norm": 0.4962540566921234, + "learning_rate": 0.0001, + "loss": 1.6515, + "step": 8073 + }, + { + "epoch": 0.9274596519441732, + "grad_norm": 0.4407590627670288, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 8074 + }, + { + "epoch": 0.9275745218540004, + "grad_norm": 0.4883404076099396, + "learning_rate": 0.0001, + "loss": 1.5755, + "step": 8075 + }, + { + "epoch": 0.9276893917638275, + "grad_norm": 0.46453699469566345, + "learning_rate": 0.0001, + "loss": 1.4894, + "step": 8076 + }, + { + "epoch": 0.9278042616736546, + "grad_norm": 0.45241397619247437, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 8077 + }, + { + "epoch": 0.9279191315834817, + "grad_norm": 0.48242220282554626, + "learning_rate": 0.0001, + "loss": 1.5236, + "step": 8078 + }, + { + "epoch": 0.9280340014933088, + "grad_norm": 0.546755313873291, + "learning_rate": 0.0001, + "loss": 1.9116, + "step": 8079 + }, + { + "epoch": 0.928148871403136, + "grad_norm": 0.4613960087299347, + "learning_rate": 0.0001, + "loss": 1.5672, + "step": 8080 + }, + { + "epoch": 0.9282637413129631, + "grad_norm": 0.47544580698013306, + "learning_rate": 0.0001, + "loss": 1.5371, + "step": 8081 + }, + { + "epoch": 0.9283786112227902, + "grad_norm": 0.5122746825218201, + "learning_rate": 0.0001, + "loss": 1.686, + "step": 8082 + }, + { + "epoch": 0.9284934811326173, + "grad_norm": 0.47522497177124023, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 8083 + }, + { + "epoch": 0.9286083510424444, + "grad_norm": 0.46367111802101135, + "learning_rate": 0.0001, + "loss": 1.4283, + "step": 8084 + }, + { + "epoch": 0.9287232209522716, + "grad_norm": 0.5942444205284119, + "learning_rate": 0.0001, + "loss": 1.5347, + "step": 8085 + }, + { + "epoch": 0.9288380908620987, + "grad_norm": 0.46138957142829895, + "learning_rate": 0.0001, + "loss": 1.7417, + "step": 8086 + }, + { + "epoch": 0.9289529607719258, + "grad_norm": 0.5159793496131897, + "learning_rate": 0.0001, + "loss": 1.757, + "step": 8087 + }, + { + "epoch": 0.9290678306817529, + "grad_norm": 0.48674991726875305, + "learning_rate": 0.0001, + "loss": 1.617, + "step": 8088 + }, + { + "epoch": 0.92918270059158, + "grad_norm": 0.5087023973464966, + "learning_rate": 0.0001, + "loss": 1.8192, + "step": 8089 + }, + { + "epoch": 0.9292975705014072, + "grad_norm": 0.5132076740264893, + "learning_rate": 0.0001, + "loss": 1.6679, + "step": 8090 + }, + { + "epoch": 0.9294124404112343, + "grad_norm": 0.4683862030506134, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 8091 + }, + { + "epoch": 0.9295273103210614, + "grad_norm": 0.5067598819732666, + "learning_rate": 0.0001, + "loss": 1.6037, + "step": 8092 + }, + { + "epoch": 0.9296421802308885, + "grad_norm": 0.4644726514816284, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 8093 + }, + { + "epoch": 0.9297570501407156, + "grad_norm": 0.44829845428466797, + "learning_rate": 0.0001, + "loss": 1.43, + "step": 8094 + }, + { + "epoch": 0.9298719200505428, + "grad_norm": 0.5093761682510376, + "learning_rate": 0.0001, + "loss": 1.6939, + "step": 8095 + }, + { + "epoch": 0.9299867899603699, + "grad_norm": 0.4932906925678253, + "learning_rate": 0.0001, + "loss": 1.7515, + "step": 8096 + }, + { + "epoch": 0.930101659870197, + "grad_norm": 0.4766685366630554, + "learning_rate": 0.0001, + "loss": 1.6851, + "step": 8097 + }, + { + "epoch": 0.9302165297800241, + "grad_norm": 0.46401628851890564, + "learning_rate": 0.0001, + "loss": 1.5306, + "step": 8098 + }, + { + "epoch": 0.9303313996898512, + "grad_norm": 0.45654991269111633, + "learning_rate": 0.0001, + "loss": 1.4563, + "step": 8099 + }, + { + "epoch": 0.9304462695996784, + "grad_norm": 0.46206626296043396, + "learning_rate": 0.0001, + "loss": 1.3484, + "step": 8100 + }, + { + "epoch": 0.9305611395095055, + "grad_norm": 0.4578942060470581, + "learning_rate": 0.0001, + "loss": 1.3981, + "step": 8101 + }, + { + "epoch": 0.9306760094193326, + "grad_norm": 0.4928850531578064, + "learning_rate": 0.0001, + "loss": 1.5062, + "step": 8102 + }, + { + "epoch": 0.9307908793291597, + "grad_norm": 0.46304240822792053, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 8103 + }, + { + "epoch": 0.9309057492389868, + "grad_norm": 0.43924397230148315, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 8104 + }, + { + "epoch": 0.931020619148814, + "grad_norm": 0.5079434514045715, + "learning_rate": 0.0001, + "loss": 1.6138, + "step": 8105 + }, + { + "epoch": 0.9311354890586411, + "grad_norm": 0.4720153510570526, + "learning_rate": 0.0001, + "loss": 1.7547, + "step": 8106 + }, + { + "epoch": 0.9312503589684682, + "grad_norm": 0.4631122648715973, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 8107 + }, + { + "epoch": 0.9313652288782953, + "grad_norm": 0.4567249119281769, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 8108 + }, + { + "epoch": 0.9314800987881224, + "grad_norm": 0.49746426939964294, + "learning_rate": 0.0001, + "loss": 1.6227, + "step": 8109 + }, + { + "epoch": 0.9315949686979496, + "grad_norm": 0.47081831097602844, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 8110 + }, + { + "epoch": 0.9317098386077767, + "grad_norm": 0.5293380618095398, + "learning_rate": 0.0001, + "loss": 1.6673, + "step": 8111 + }, + { + "epoch": 0.9318247085176038, + "grad_norm": 0.47562193870544434, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 8112 + }, + { + "epoch": 0.9319395784274309, + "grad_norm": 0.5127376317977905, + "learning_rate": 0.0001, + "loss": 1.6752, + "step": 8113 + }, + { + "epoch": 0.932054448337258, + "grad_norm": 0.47655048966407776, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 8114 + }, + { + "epoch": 0.9321693182470852, + "grad_norm": 0.4715765416622162, + "learning_rate": 0.0001, + "loss": 1.6537, + "step": 8115 + }, + { + "epoch": 0.9322841881569123, + "grad_norm": 0.4677276313304901, + "learning_rate": 0.0001, + "loss": 1.5214, + "step": 8116 + }, + { + "epoch": 0.9323990580667394, + "grad_norm": 0.4286912977695465, + "learning_rate": 0.0001, + "loss": 1.4507, + "step": 8117 + }, + { + "epoch": 0.9325139279765665, + "grad_norm": 0.4614866375923157, + "learning_rate": 0.0001, + "loss": 1.6722, + "step": 8118 + }, + { + "epoch": 0.9326287978863936, + "grad_norm": 0.44791609048843384, + "learning_rate": 0.0001, + "loss": 1.5154, + "step": 8119 + }, + { + "epoch": 0.9327436677962208, + "grad_norm": 0.49002981185913086, + "learning_rate": 0.0001, + "loss": 1.7432, + "step": 8120 + }, + { + "epoch": 0.9328585377060479, + "grad_norm": 0.4595596194267273, + "learning_rate": 0.0001, + "loss": 1.6082, + "step": 8121 + }, + { + "epoch": 0.932973407615875, + "grad_norm": 0.47388172149658203, + "learning_rate": 0.0001, + "loss": 1.4072, + "step": 8122 + }, + { + "epoch": 0.9330882775257021, + "grad_norm": 0.5274661183357239, + "learning_rate": 0.0001, + "loss": 1.7443, + "step": 8123 + }, + { + "epoch": 0.9332031474355292, + "grad_norm": 0.5174868702888489, + "learning_rate": 0.0001, + "loss": 1.7705, + "step": 8124 + }, + { + "epoch": 0.9333180173453564, + "grad_norm": 0.5129168629646301, + "learning_rate": 0.0001, + "loss": 1.6475, + "step": 8125 + }, + { + "epoch": 0.9334328872551835, + "grad_norm": 0.4941151440143585, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 8126 + }, + { + "epoch": 0.9335477571650106, + "grad_norm": 0.4885686933994293, + "learning_rate": 0.0001, + "loss": 1.7374, + "step": 8127 + }, + { + "epoch": 0.9336626270748377, + "grad_norm": 0.47207513451576233, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 8128 + }, + { + "epoch": 0.9337774969846648, + "grad_norm": 0.5630714893341064, + "learning_rate": 0.0001, + "loss": 2.0647, + "step": 8129 + }, + { + "epoch": 0.933892366894492, + "grad_norm": 0.47757160663604736, + "learning_rate": 0.0001, + "loss": 1.6114, + "step": 8130 + }, + { + "epoch": 0.9340072368043191, + "grad_norm": 0.4331253170967102, + "learning_rate": 0.0001, + "loss": 1.4655, + "step": 8131 + }, + { + "epoch": 0.9341221067141462, + "grad_norm": 0.48484155535697937, + "learning_rate": 0.0001, + "loss": 1.6523, + "step": 8132 + }, + { + "epoch": 0.9342369766239733, + "grad_norm": 0.47229698300361633, + "learning_rate": 0.0001, + "loss": 1.7429, + "step": 8133 + }, + { + "epoch": 0.9343518465338004, + "grad_norm": 0.45240116119384766, + "learning_rate": 0.0001, + "loss": 1.3286, + "step": 8134 + }, + { + "epoch": 0.9344667164436276, + "grad_norm": 0.4636879563331604, + "learning_rate": 0.0001, + "loss": 1.5574, + "step": 8135 + }, + { + "epoch": 0.9345815863534547, + "grad_norm": 0.4439462721347809, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 8136 + }, + { + "epoch": 0.9346964562632818, + "grad_norm": 0.46923813223838806, + "learning_rate": 0.0001, + "loss": 1.3393, + "step": 8137 + }, + { + "epoch": 0.9348113261731089, + "grad_norm": 0.47300711274147034, + "learning_rate": 0.0001, + "loss": 1.6769, + "step": 8138 + }, + { + "epoch": 0.934926196082936, + "grad_norm": 0.5104739665985107, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 8139 + }, + { + "epoch": 0.9350410659927632, + "grad_norm": 0.4453759491443634, + "learning_rate": 0.0001, + "loss": 1.5457, + "step": 8140 + }, + { + "epoch": 0.9351559359025903, + "grad_norm": 0.46000081300735474, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 8141 + }, + { + "epoch": 0.9352708058124174, + "grad_norm": 0.4699609577655792, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 8142 + }, + { + "epoch": 0.9353856757222445, + "grad_norm": 0.4497356414794922, + "learning_rate": 0.0001, + "loss": 1.6447, + "step": 8143 + }, + { + "epoch": 0.9355005456320716, + "grad_norm": 0.4631606638431549, + "learning_rate": 0.0001, + "loss": 1.7243, + "step": 8144 + }, + { + "epoch": 0.9356154155418988, + "grad_norm": 0.4655974805355072, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 8145 + }, + { + "epoch": 0.9357302854517259, + "grad_norm": 0.464070588350296, + "learning_rate": 0.0001, + "loss": 1.5003, + "step": 8146 + }, + { + "epoch": 0.935845155361553, + "grad_norm": 0.5652233362197876, + "learning_rate": 0.0001, + "loss": 1.5475, + "step": 8147 + }, + { + "epoch": 0.9359600252713801, + "grad_norm": 0.47151249647140503, + "learning_rate": 0.0001, + "loss": 1.5032, + "step": 8148 + }, + { + "epoch": 0.9360748951812072, + "grad_norm": 0.4593266248703003, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 8149 + }, + { + "epoch": 0.9361897650910344, + "grad_norm": 0.491393119096756, + "learning_rate": 0.0001, + "loss": 1.6612, + "step": 8150 + }, + { + "epoch": 0.9363046350008615, + "grad_norm": 0.5595842003822327, + "learning_rate": 0.0001, + "loss": 1.4467, + "step": 8151 + }, + { + "epoch": 0.9364195049106886, + "grad_norm": 0.46376144886016846, + "learning_rate": 0.0001, + "loss": 1.5592, + "step": 8152 + }, + { + "epoch": 0.9365343748205157, + "grad_norm": 0.4688231348991394, + "learning_rate": 0.0001, + "loss": 1.5557, + "step": 8153 + }, + { + "epoch": 0.9366492447303428, + "grad_norm": 0.45528239011764526, + "learning_rate": 0.0001, + "loss": 1.5417, + "step": 8154 + }, + { + "epoch": 0.93676411464017, + "grad_norm": 0.48385414481163025, + "learning_rate": 0.0001, + "loss": 1.4563, + "step": 8155 + }, + { + "epoch": 0.9368789845499971, + "grad_norm": 0.4678395688533783, + "learning_rate": 0.0001, + "loss": 1.5949, + "step": 8156 + }, + { + "epoch": 0.9369938544598242, + "grad_norm": 0.5076170563697815, + "learning_rate": 0.0001, + "loss": 1.61, + "step": 8157 + }, + { + "epoch": 0.9371087243696514, + "grad_norm": 0.4750152826309204, + "learning_rate": 0.0001, + "loss": 1.4804, + "step": 8158 + }, + { + "epoch": 0.9372235942794785, + "grad_norm": 0.4693584740161896, + "learning_rate": 0.0001, + "loss": 1.5668, + "step": 8159 + }, + { + "epoch": 0.9373384641893057, + "grad_norm": 0.45886656641960144, + "learning_rate": 0.0001, + "loss": 1.7251, + "step": 8160 + }, + { + "epoch": 0.9374533340991328, + "grad_norm": 0.554453432559967, + "learning_rate": 0.0001, + "loss": 1.9527, + "step": 8161 + }, + { + "epoch": 0.9375682040089599, + "grad_norm": 0.46930959820747375, + "learning_rate": 0.0001, + "loss": 1.4046, + "step": 8162 + }, + { + "epoch": 0.937683073918787, + "grad_norm": 0.44557827711105347, + "learning_rate": 0.0001, + "loss": 1.4906, + "step": 8163 + }, + { + "epoch": 0.9377979438286141, + "grad_norm": 0.4786651134490967, + "learning_rate": 0.0001, + "loss": 1.7044, + "step": 8164 + }, + { + "epoch": 0.9379128137384413, + "grad_norm": 0.47346630692481995, + "learning_rate": 0.0001, + "loss": 1.384, + "step": 8165 + }, + { + "epoch": 0.9380276836482684, + "grad_norm": 0.4756239056587219, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 8166 + }, + { + "epoch": 0.9381425535580955, + "grad_norm": 0.4796939194202423, + "learning_rate": 0.0001, + "loss": 1.6085, + "step": 8167 + }, + { + "epoch": 0.9382574234679226, + "grad_norm": 0.4924411475658417, + "learning_rate": 0.0001, + "loss": 1.5232, + "step": 8168 + }, + { + "epoch": 0.9383722933777497, + "grad_norm": 0.4914194941520691, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 8169 + }, + { + "epoch": 0.9384871632875769, + "grad_norm": 0.46854540705680847, + "learning_rate": 0.0001, + "loss": 1.4879, + "step": 8170 + }, + { + "epoch": 0.938602033197404, + "grad_norm": 0.46316492557525635, + "learning_rate": 0.0001, + "loss": 1.523, + "step": 8171 + }, + { + "epoch": 0.9387169031072311, + "grad_norm": 0.4873157739639282, + "learning_rate": 0.0001, + "loss": 1.5733, + "step": 8172 + }, + { + "epoch": 0.9388317730170582, + "grad_norm": 0.4700479805469513, + "learning_rate": 0.0001, + "loss": 1.4223, + "step": 8173 + }, + { + "epoch": 0.9389466429268853, + "grad_norm": 0.5385305285453796, + "learning_rate": 0.0001, + "loss": 1.821, + "step": 8174 + }, + { + "epoch": 0.9390615128367125, + "grad_norm": 0.48141562938690186, + "learning_rate": 0.0001, + "loss": 1.4552, + "step": 8175 + }, + { + "epoch": 0.9391763827465396, + "grad_norm": 0.4608069360256195, + "learning_rate": 0.0001, + "loss": 1.521, + "step": 8176 + }, + { + "epoch": 0.9392912526563667, + "grad_norm": 0.4807290732860565, + "learning_rate": 0.0001, + "loss": 1.4567, + "step": 8177 + }, + { + "epoch": 0.9394061225661938, + "grad_norm": 0.5841659307479858, + "learning_rate": 0.0001, + "loss": 1.8741, + "step": 8178 + }, + { + "epoch": 0.9395209924760209, + "grad_norm": 0.4911169111728668, + "learning_rate": 0.0001, + "loss": 1.6685, + "step": 8179 + }, + { + "epoch": 0.9396358623858481, + "grad_norm": 0.4507903754711151, + "learning_rate": 0.0001, + "loss": 1.5333, + "step": 8180 + }, + { + "epoch": 0.9397507322956752, + "grad_norm": 0.4615688621997833, + "learning_rate": 0.0001, + "loss": 1.6094, + "step": 8181 + }, + { + "epoch": 0.9398656022055023, + "grad_norm": 0.5235313773155212, + "learning_rate": 0.0001, + "loss": 1.6907, + "step": 8182 + }, + { + "epoch": 0.9399804721153294, + "grad_norm": 0.4749388098716736, + "learning_rate": 0.0001, + "loss": 1.4103, + "step": 8183 + }, + { + "epoch": 0.9400953420251565, + "grad_norm": 0.49427589774131775, + "learning_rate": 0.0001, + "loss": 1.7691, + "step": 8184 + }, + { + "epoch": 0.9402102119349837, + "grad_norm": 0.4776000678539276, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 8185 + }, + { + "epoch": 0.9403250818448108, + "grad_norm": 0.47408178448677063, + "learning_rate": 0.0001, + "loss": 1.4462, + "step": 8186 + }, + { + "epoch": 0.9404399517546379, + "grad_norm": 0.5133025646209717, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 8187 + }, + { + "epoch": 0.940554821664465, + "grad_norm": 0.45842912793159485, + "learning_rate": 0.0001, + "loss": 1.6081, + "step": 8188 + }, + { + "epoch": 0.9406696915742921, + "grad_norm": 0.4748247563838959, + "learning_rate": 0.0001, + "loss": 1.4791, + "step": 8189 + }, + { + "epoch": 0.9407845614841193, + "grad_norm": 0.46958670020103455, + "learning_rate": 0.0001, + "loss": 1.597, + "step": 8190 + }, + { + "epoch": 0.9408994313939464, + "grad_norm": 0.48225289583206177, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 8191 + }, + { + "epoch": 0.9410143013037735, + "grad_norm": 0.5528295636177063, + "learning_rate": 0.0001, + "loss": 1.6373, + "step": 8192 + }, + { + "epoch": 0.9411291712136006, + "grad_norm": 0.5050978064537048, + "learning_rate": 0.0001, + "loss": 1.702, + "step": 8193 + }, + { + "epoch": 0.9412440411234277, + "grad_norm": 0.4624803066253662, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 8194 + }, + { + "epoch": 0.9413589110332549, + "grad_norm": 0.5099507570266724, + "learning_rate": 0.0001, + "loss": 1.6849, + "step": 8195 + }, + { + "epoch": 0.941473780943082, + "grad_norm": 0.4514469504356384, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 8196 + }, + { + "epoch": 0.9415886508529091, + "grad_norm": 0.5006936192512512, + "learning_rate": 0.0001, + "loss": 1.5678, + "step": 8197 + }, + { + "epoch": 0.9417035207627362, + "grad_norm": 0.5389548540115356, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 8198 + }, + { + "epoch": 0.9418183906725633, + "grad_norm": 0.46252140402793884, + "learning_rate": 0.0001, + "loss": 1.5236, + "step": 8199 + }, + { + "epoch": 0.9419332605823905, + "grad_norm": 0.46771207451820374, + "learning_rate": 0.0001, + "loss": 1.3591, + "step": 8200 + }, + { + "epoch": 0.9420481304922176, + "grad_norm": 0.492191880941391, + "learning_rate": 0.0001, + "loss": 1.4178, + "step": 8201 + }, + { + "epoch": 0.9421630004020447, + "grad_norm": 0.4770967960357666, + "learning_rate": 0.0001, + "loss": 1.7547, + "step": 8202 + }, + { + "epoch": 0.9422778703118718, + "grad_norm": 0.5047717094421387, + "learning_rate": 0.0001, + "loss": 1.6103, + "step": 8203 + }, + { + "epoch": 0.9423927402216989, + "grad_norm": 0.47742435336112976, + "learning_rate": 0.0001, + "loss": 1.5417, + "step": 8204 + }, + { + "epoch": 0.9425076101315261, + "grad_norm": 0.5914652943611145, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 8205 + }, + { + "epoch": 0.9426224800413532, + "grad_norm": 0.4261223077774048, + "learning_rate": 0.0001, + "loss": 1.3505, + "step": 8206 + }, + { + "epoch": 0.9427373499511803, + "grad_norm": 0.47556623816490173, + "learning_rate": 0.0001, + "loss": 1.4745, + "step": 8207 + }, + { + "epoch": 0.9428522198610074, + "grad_norm": 0.4878052771091461, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 8208 + }, + { + "epoch": 0.9429670897708345, + "grad_norm": 0.44221365451812744, + "learning_rate": 0.0001, + "loss": 1.3707, + "step": 8209 + }, + { + "epoch": 0.9430819596806617, + "grad_norm": 0.47154805064201355, + "learning_rate": 0.0001, + "loss": 1.5895, + "step": 8210 + }, + { + "epoch": 0.9431968295904888, + "grad_norm": 0.5039512515068054, + "learning_rate": 0.0001, + "loss": 1.7088, + "step": 8211 + }, + { + "epoch": 0.9433116995003159, + "grad_norm": 0.45005252957344055, + "learning_rate": 0.0001, + "loss": 1.4901, + "step": 8212 + }, + { + "epoch": 0.943426569410143, + "grad_norm": 0.47104600071907043, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 8213 + }, + { + "epoch": 0.9435414393199701, + "grad_norm": 0.4666808843612671, + "learning_rate": 0.0001, + "loss": 1.5343, + "step": 8214 + }, + { + "epoch": 0.9436563092297973, + "grad_norm": 0.510503351688385, + "learning_rate": 0.0001, + "loss": 1.6839, + "step": 8215 + }, + { + "epoch": 0.9437711791396244, + "grad_norm": 0.4818345606327057, + "learning_rate": 0.0001, + "loss": 1.4745, + "step": 8216 + }, + { + "epoch": 0.9438860490494515, + "grad_norm": 0.4899197518825531, + "learning_rate": 0.0001, + "loss": 1.5966, + "step": 8217 + }, + { + "epoch": 0.9440009189592786, + "grad_norm": 0.5139290690422058, + "learning_rate": 0.0001, + "loss": 1.406, + "step": 8218 + }, + { + "epoch": 0.9441157888691057, + "grad_norm": 0.4770418107509613, + "learning_rate": 0.0001, + "loss": 1.4606, + "step": 8219 + }, + { + "epoch": 0.9442306587789329, + "grad_norm": 0.617544949054718, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 8220 + }, + { + "epoch": 0.94434552868876, + "grad_norm": 0.49356672167778015, + "learning_rate": 0.0001, + "loss": 1.7148, + "step": 8221 + }, + { + "epoch": 0.9444603985985871, + "grad_norm": 0.4679422974586487, + "learning_rate": 0.0001, + "loss": 1.5843, + "step": 8222 + }, + { + "epoch": 0.9445752685084142, + "grad_norm": 0.5478677749633789, + "learning_rate": 0.0001, + "loss": 1.8256, + "step": 8223 + }, + { + "epoch": 0.9446901384182413, + "grad_norm": 0.4296529293060303, + "learning_rate": 0.0001, + "loss": 1.416, + "step": 8224 + }, + { + "epoch": 0.9448050083280685, + "grad_norm": 0.4777142405509949, + "learning_rate": 0.0001, + "loss": 1.4079, + "step": 8225 + }, + { + "epoch": 0.9449198782378956, + "grad_norm": 0.5253008604049683, + "learning_rate": 0.0001, + "loss": 1.6611, + "step": 8226 + }, + { + "epoch": 0.9450347481477227, + "grad_norm": 0.4972079396247864, + "learning_rate": 0.0001, + "loss": 1.6474, + "step": 8227 + }, + { + "epoch": 0.9451496180575498, + "grad_norm": 0.4560241997241974, + "learning_rate": 0.0001, + "loss": 1.4919, + "step": 8228 + }, + { + "epoch": 0.9452644879673769, + "grad_norm": 0.4682483673095703, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 8229 + }, + { + "epoch": 0.9453793578772041, + "grad_norm": 0.4736382067203522, + "learning_rate": 0.0001, + "loss": 1.3865, + "step": 8230 + }, + { + "epoch": 0.9454942277870312, + "grad_norm": 0.4748673737049103, + "learning_rate": 0.0001, + "loss": 1.427, + "step": 8231 + }, + { + "epoch": 0.9456090976968583, + "grad_norm": 0.45922762155532837, + "learning_rate": 0.0001, + "loss": 1.3151, + "step": 8232 + }, + { + "epoch": 0.9457239676066854, + "grad_norm": 0.47256186604499817, + "learning_rate": 0.0001, + "loss": 1.4456, + "step": 8233 + }, + { + "epoch": 0.9458388375165125, + "grad_norm": 0.46213608980178833, + "learning_rate": 0.0001, + "loss": 1.4308, + "step": 8234 + }, + { + "epoch": 0.9459537074263397, + "grad_norm": 0.4915864169597626, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 8235 + }, + { + "epoch": 0.9460685773361668, + "grad_norm": 0.4443720579147339, + "learning_rate": 0.0001, + "loss": 1.5858, + "step": 8236 + }, + { + "epoch": 0.9461834472459939, + "grad_norm": 0.4873688519001007, + "learning_rate": 0.0001, + "loss": 1.7889, + "step": 8237 + }, + { + "epoch": 0.946298317155821, + "grad_norm": 0.49472931027412415, + "learning_rate": 0.0001, + "loss": 1.6489, + "step": 8238 + }, + { + "epoch": 0.9464131870656481, + "grad_norm": 0.4404892921447754, + "learning_rate": 0.0001, + "loss": 1.3307, + "step": 8239 + }, + { + "epoch": 0.9465280569754753, + "grad_norm": 0.48074471950531006, + "learning_rate": 0.0001, + "loss": 1.6203, + "step": 8240 + }, + { + "epoch": 0.9466429268853024, + "grad_norm": 0.5083596110343933, + "learning_rate": 0.0001, + "loss": 1.8559, + "step": 8241 + }, + { + "epoch": 0.9467577967951295, + "grad_norm": 0.4485703110694885, + "learning_rate": 0.0001, + "loss": 1.5608, + "step": 8242 + }, + { + "epoch": 0.9468726667049566, + "grad_norm": 0.48620954155921936, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 8243 + }, + { + "epoch": 0.9469875366147837, + "grad_norm": 0.4664001166820526, + "learning_rate": 0.0001, + "loss": 1.5831, + "step": 8244 + }, + { + "epoch": 0.9471024065246109, + "grad_norm": 0.4595847427845001, + "learning_rate": 0.0001, + "loss": 1.4782, + "step": 8245 + }, + { + "epoch": 0.947217276434438, + "grad_norm": 0.5062698721885681, + "learning_rate": 0.0001, + "loss": 1.5421, + "step": 8246 + }, + { + "epoch": 0.9473321463442651, + "grad_norm": 0.44905591011047363, + "learning_rate": 0.0001, + "loss": 1.5338, + "step": 8247 + }, + { + "epoch": 0.9474470162540922, + "grad_norm": 0.47497665882110596, + "learning_rate": 0.0001, + "loss": 1.4712, + "step": 8248 + }, + { + "epoch": 0.9475618861639193, + "grad_norm": 0.48770657181739807, + "learning_rate": 0.0001, + "loss": 1.43, + "step": 8249 + }, + { + "epoch": 0.9476767560737465, + "grad_norm": 0.4931548833847046, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 8250 + }, + { + "epoch": 0.9477916259835736, + "grad_norm": 0.47381991147994995, + "learning_rate": 0.0001, + "loss": 1.7137, + "step": 8251 + }, + { + "epoch": 0.9479064958934007, + "grad_norm": 0.5158901810646057, + "learning_rate": 0.0001, + "loss": 1.6038, + "step": 8252 + }, + { + "epoch": 0.9480213658032278, + "grad_norm": 0.48369431495666504, + "learning_rate": 0.0001, + "loss": 1.6776, + "step": 8253 + }, + { + "epoch": 0.9481362357130549, + "grad_norm": 0.5041664838790894, + "learning_rate": 0.0001, + "loss": 1.6957, + "step": 8254 + }, + { + "epoch": 0.9482511056228821, + "grad_norm": 0.46258240938186646, + "learning_rate": 0.0001, + "loss": 1.4937, + "step": 8255 + }, + { + "epoch": 0.9483659755327092, + "grad_norm": 0.4723878800868988, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 8256 + }, + { + "epoch": 0.9484808454425363, + "grad_norm": 0.45941516757011414, + "learning_rate": 0.0001, + "loss": 1.6875, + "step": 8257 + }, + { + "epoch": 0.9485957153523634, + "grad_norm": 0.4574301540851593, + "learning_rate": 0.0001, + "loss": 1.5827, + "step": 8258 + }, + { + "epoch": 0.9487105852621905, + "grad_norm": 0.4727140963077545, + "learning_rate": 0.0001, + "loss": 1.6921, + "step": 8259 + }, + { + "epoch": 0.9488254551720177, + "grad_norm": 0.46204841136932373, + "learning_rate": 0.0001, + "loss": 1.4919, + "step": 8260 + }, + { + "epoch": 0.9489403250818448, + "grad_norm": 0.48473307490348816, + "learning_rate": 0.0001, + "loss": 1.3974, + "step": 8261 + }, + { + "epoch": 0.9490551949916719, + "grad_norm": 0.4569971561431885, + "learning_rate": 0.0001, + "loss": 1.7005, + "step": 8262 + }, + { + "epoch": 0.949170064901499, + "grad_norm": 0.4910833239555359, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 8263 + }, + { + "epoch": 0.9492849348113261, + "grad_norm": 0.469138503074646, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 8264 + }, + { + "epoch": 0.9493998047211533, + "grad_norm": 0.4757539927959442, + "learning_rate": 0.0001, + "loss": 1.5641, + "step": 8265 + }, + { + "epoch": 0.9495146746309804, + "grad_norm": 0.4612511992454529, + "learning_rate": 0.0001, + "loss": 1.6176, + "step": 8266 + }, + { + "epoch": 0.9496295445408075, + "grad_norm": 0.5166419744491577, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 8267 + }, + { + "epoch": 0.9497444144506346, + "grad_norm": 0.451444149017334, + "learning_rate": 0.0001, + "loss": 1.5371, + "step": 8268 + }, + { + "epoch": 0.9498592843604617, + "grad_norm": 0.4882412850856781, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 8269 + }, + { + "epoch": 0.9499741542702889, + "grad_norm": 0.5102916359901428, + "learning_rate": 0.0001, + "loss": 1.6798, + "step": 8270 + }, + { + "epoch": 0.950089024180116, + "grad_norm": 0.46115705370903015, + "learning_rate": 0.0001, + "loss": 1.2963, + "step": 8271 + }, + { + "epoch": 0.9502038940899431, + "grad_norm": 0.5228103399276733, + "learning_rate": 0.0001, + "loss": 1.4118, + "step": 8272 + }, + { + "epoch": 0.9503187639997702, + "grad_norm": 0.5342441201210022, + "learning_rate": 0.0001, + "loss": 1.7615, + "step": 8273 + }, + { + "epoch": 0.9504336339095973, + "grad_norm": 0.4900018274784088, + "learning_rate": 0.0001, + "loss": 1.3293, + "step": 8274 + }, + { + "epoch": 0.9505485038194245, + "grad_norm": 0.5222525000572205, + "learning_rate": 0.0001, + "loss": 1.6752, + "step": 8275 + }, + { + "epoch": 0.9506633737292516, + "grad_norm": 0.5265666842460632, + "learning_rate": 0.0001, + "loss": 1.7776, + "step": 8276 + }, + { + "epoch": 0.9507782436390787, + "grad_norm": 0.4796416759490967, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 8277 + }, + { + "epoch": 0.9508931135489058, + "grad_norm": 0.4717714488506317, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 8278 + }, + { + "epoch": 0.9510079834587329, + "grad_norm": 0.5052562952041626, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 8279 + }, + { + "epoch": 0.9511228533685601, + "grad_norm": 0.45067209005355835, + "learning_rate": 0.0001, + "loss": 1.5466, + "step": 8280 + }, + { + "epoch": 0.9512377232783872, + "grad_norm": 0.527546226978302, + "learning_rate": 0.0001, + "loss": 1.4335, + "step": 8281 + }, + { + "epoch": 0.9513525931882143, + "grad_norm": 0.46170440316200256, + "learning_rate": 0.0001, + "loss": 1.3969, + "step": 8282 + }, + { + "epoch": 0.9514674630980414, + "grad_norm": 0.5153254270553589, + "learning_rate": 0.0001, + "loss": 1.6541, + "step": 8283 + }, + { + "epoch": 0.9515823330078685, + "grad_norm": 0.48110654950141907, + "learning_rate": 0.0001, + "loss": 1.5888, + "step": 8284 + }, + { + "epoch": 0.9516972029176957, + "grad_norm": 0.5225505828857422, + "learning_rate": 0.0001, + "loss": 1.7728, + "step": 8285 + }, + { + "epoch": 0.9518120728275228, + "grad_norm": 0.4549621343612671, + "learning_rate": 0.0001, + "loss": 1.4923, + "step": 8286 + }, + { + "epoch": 0.9519269427373499, + "grad_norm": 0.5324844121932983, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 8287 + }, + { + "epoch": 0.952041812647177, + "grad_norm": 0.4963549077510834, + "learning_rate": 0.0001, + "loss": 1.5002, + "step": 8288 + }, + { + "epoch": 0.9521566825570041, + "grad_norm": 0.460531085729599, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 8289 + }, + { + "epoch": 0.9522715524668313, + "grad_norm": 0.4704626798629761, + "learning_rate": 0.0001, + "loss": 1.7415, + "step": 8290 + }, + { + "epoch": 0.9523864223766584, + "grad_norm": 0.4891350269317627, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 8291 + }, + { + "epoch": 0.9525012922864855, + "grad_norm": 0.4812767207622528, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 8292 + }, + { + "epoch": 0.9526161621963126, + "grad_norm": 0.47794899344444275, + "learning_rate": 0.0001, + "loss": 1.586, + "step": 8293 + }, + { + "epoch": 0.9527310321061397, + "grad_norm": 0.49231424927711487, + "learning_rate": 0.0001, + "loss": 1.4125, + "step": 8294 + }, + { + "epoch": 0.952845902015967, + "grad_norm": 0.47585782408714294, + "learning_rate": 0.0001, + "loss": 1.5914, + "step": 8295 + }, + { + "epoch": 0.9529607719257941, + "grad_norm": 0.5035213828086853, + "learning_rate": 0.0001, + "loss": 1.6766, + "step": 8296 + }, + { + "epoch": 0.9530756418356212, + "grad_norm": 0.5115920305252075, + "learning_rate": 0.0001, + "loss": 1.4955, + "step": 8297 + }, + { + "epoch": 0.9531905117454483, + "grad_norm": 0.5224997401237488, + "learning_rate": 0.0001, + "loss": 1.6931, + "step": 8298 + }, + { + "epoch": 0.9533053816552755, + "grad_norm": 0.5272790789604187, + "learning_rate": 0.0001, + "loss": 1.584, + "step": 8299 + }, + { + "epoch": 0.9534202515651026, + "grad_norm": 0.516810953617096, + "learning_rate": 0.0001, + "loss": 1.7744, + "step": 8300 + }, + { + "epoch": 0.9535351214749297, + "grad_norm": 0.46272823214530945, + "learning_rate": 0.0001, + "loss": 1.3943, + "step": 8301 + }, + { + "epoch": 0.9536499913847568, + "grad_norm": 0.5371290445327759, + "learning_rate": 0.0001, + "loss": 1.5101, + "step": 8302 + }, + { + "epoch": 0.9537648612945839, + "grad_norm": 0.5088568329811096, + "learning_rate": 0.0001, + "loss": 1.7429, + "step": 8303 + }, + { + "epoch": 0.953879731204411, + "grad_norm": 0.4637158215045929, + "learning_rate": 0.0001, + "loss": 1.5429, + "step": 8304 + }, + { + "epoch": 0.9539946011142382, + "grad_norm": 0.44325053691864014, + "learning_rate": 0.0001, + "loss": 1.521, + "step": 8305 + }, + { + "epoch": 0.9541094710240653, + "grad_norm": 0.4635404944419861, + "learning_rate": 0.0001, + "loss": 1.4581, + "step": 8306 + }, + { + "epoch": 0.9542243409338924, + "grad_norm": 0.5078271627426147, + "learning_rate": 0.0001, + "loss": 1.7131, + "step": 8307 + }, + { + "epoch": 0.9543392108437195, + "grad_norm": 0.5453723669052124, + "learning_rate": 0.0001, + "loss": 1.6502, + "step": 8308 + }, + { + "epoch": 0.9544540807535467, + "grad_norm": 0.4599123001098633, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 8309 + }, + { + "epoch": 0.9545689506633738, + "grad_norm": 0.5290499329566956, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 8310 + }, + { + "epoch": 0.9546838205732009, + "grad_norm": 0.44906917214393616, + "learning_rate": 0.0001, + "loss": 1.5134, + "step": 8311 + }, + { + "epoch": 0.954798690483028, + "grad_norm": 0.44247668981552124, + "learning_rate": 0.0001, + "loss": 1.4854, + "step": 8312 + }, + { + "epoch": 0.9549135603928551, + "grad_norm": 0.49097177386283875, + "learning_rate": 0.0001, + "loss": 1.3212, + "step": 8313 + }, + { + "epoch": 0.9550284303026823, + "grad_norm": 0.47003665566444397, + "learning_rate": 0.0001, + "loss": 1.5748, + "step": 8314 + }, + { + "epoch": 0.9551433002125094, + "grad_norm": 0.4841088056564331, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 8315 + }, + { + "epoch": 0.9552581701223365, + "grad_norm": 0.47243985533714294, + "learning_rate": 0.0001, + "loss": 1.4881, + "step": 8316 + }, + { + "epoch": 0.9553730400321636, + "grad_norm": 0.47702085971832275, + "learning_rate": 0.0001, + "loss": 1.4967, + "step": 8317 + }, + { + "epoch": 0.9554879099419907, + "grad_norm": 0.45515045523643494, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 8318 + }, + { + "epoch": 0.9556027798518179, + "grad_norm": 0.5219390392303467, + "learning_rate": 0.0001, + "loss": 1.7652, + "step": 8319 + }, + { + "epoch": 0.955717649761645, + "grad_norm": 0.5401485562324524, + "learning_rate": 0.0001, + "loss": 1.8357, + "step": 8320 + }, + { + "epoch": 0.9558325196714721, + "grad_norm": 0.4558376669883728, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 8321 + }, + { + "epoch": 0.9559473895812992, + "grad_norm": 0.4861636459827423, + "learning_rate": 0.0001, + "loss": 1.6804, + "step": 8322 + }, + { + "epoch": 0.9560622594911263, + "grad_norm": 0.4567401707172394, + "learning_rate": 0.0001, + "loss": 1.5903, + "step": 8323 + }, + { + "epoch": 0.9561771294009535, + "grad_norm": 0.5179786086082458, + "learning_rate": 0.0001, + "loss": 1.8311, + "step": 8324 + }, + { + "epoch": 0.9562919993107806, + "grad_norm": 0.45889440178871155, + "learning_rate": 0.0001, + "loss": 1.5582, + "step": 8325 + }, + { + "epoch": 0.9564068692206077, + "grad_norm": 0.4716584086418152, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 8326 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.43330761790275574, + "learning_rate": 0.0001, + "loss": 1.2869, + "step": 8327 + }, + { + "epoch": 0.9566366090402619, + "grad_norm": 0.4397355914115906, + "learning_rate": 0.0001, + "loss": 1.5012, + "step": 8328 + }, + { + "epoch": 0.956751478950089, + "grad_norm": 0.4772343933582306, + "learning_rate": 0.0001, + "loss": 1.3369, + "step": 8329 + }, + { + "epoch": 0.9568663488599162, + "grad_norm": 0.5061171054840088, + "learning_rate": 0.0001, + "loss": 1.7989, + "step": 8330 + }, + { + "epoch": 0.9569812187697433, + "grad_norm": 0.46907833218574524, + "learning_rate": 0.0001, + "loss": 1.5958, + "step": 8331 + }, + { + "epoch": 0.9570960886795704, + "grad_norm": 0.5048026442527771, + "learning_rate": 0.0001, + "loss": 1.5469, + "step": 8332 + }, + { + "epoch": 0.9572109585893975, + "grad_norm": 0.4772169888019562, + "learning_rate": 0.0001, + "loss": 1.4803, + "step": 8333 + }, + { + "epoch": 0.9573258284992247, + "grad_norm": 0.47201335430145264, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 8334 + }, + { + "epoch": 0.9574406984090518, + "grad_norm": 0.49312329292297363, + "learning_rate": 0.0001, + "loss": 1.5597, + "step": 8335 + }, + { + "epoch": 0.9575555683188789, + "grad_norm": 0.4100249111652374, + "learning_rate": 0.0001, + "loss": 1.3143, + "step": 8336 + }, + { + "epoch": 0.957670438228706, + "grad_norm": 0.5461440086364746, + "learning_rate": 0.0001, + "loss": 1.7893, + "step": 8337 + }, + { + "epoch": 0.9577853081385331, + "grad_norm": 0.47124072909355164, + "learning_rate": 0.0001, + "loss": 1.3658, + "step": 8338 + }, + { + "epoch": 0.9579001780483603, + "grad_norm": 0.4608875811100006, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 8339 + }, + { + "epoch": 0.9580150479581874, + "grad_norm": 0.49988433718681335, + "learning_rate": 0.0001, + "loss": 1.6149, + "step": 8340 + }, + { + "epoch": 0.9581299178680145, + "grad_norm": 0.47735628485679626, + "learning_rate": 0.0001, + "loss": 1.5887, + "step": 8341 + }, + { + "epoch": 0.9582447877778416, + "grad_norm": 0.5070236921310425, + "learning_rate": 0.0001, + "loss": 1.4978, + "step": 8342 + }, + { + "epoch": 0.9583596576876687, + "grad_norm": 0.5401067137718201, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 8343 + }, + { + "epoch": 0.9584745275974959, + "grad_norm": 0.45689040422439575, + "learning_rate": 0.0001, + "loss": 1.6352, + "step": 8344 + }, + { + "epoch": 0.958589397507323, + "grad_norm": 0.4751141667366028, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 8345 + }, + { + "epoch": 0.9587042674171501, + "grad_norm": 0.5322357416152954, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 8346 + }, + { + "epoch": 0.9588191373269772, + "grad_norm": 0.472920298576355, + "learning_rate": 0.0001, + "loss": 1.4187, + "step": 8347 + }, + { + "epoch": 0.9589340072368043, + "grad_norm": 0.5017462372779846, + "learning_rate": 0.0001, + "loss": 1.7655, + "step": 8348 + }, + { + "epoch": 0.9590488771466315, + "grad_norm": 0.4727727770805359, + "learning_rate": 0.0001, + "loss": 1.5264, + "step": 8349 + }, + { + "epoch": 0.9591637470564586, + "grad_norm": 0.45175325870513916, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 8350 + }, + { + "epoch": 0.9592786169662857, + "grad_norm": 0.500636100769043, + "learning_rate": 0.0001, + "loss": 1.7486, + "step": 8351 + }, + { + "epoch": 0.9593934868761128, + "grad_norm": 0.44014236330986023, + "learning_rate": 0.0001, + "loss": 1.3911, + "step": 8352 + }, + { + "epoch": 0.9595083567859399, + "grad_norm": 0.4677266478538513, + "learning_rate": 0.0001, + "loss": 1.8267, + "step": 8353 + }, + { + "epoch": 0.959623226695767, + "grad_norm": 0.4543503522872925, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 8354 + }, + { + "epoch": 0.9597380966055942, + "grad_norm": 0.48893970251083374, + "learning_rate": 0.0001, + "loss": 1.6525, + "step": 8355 + }, + { + "epoch": 0.9598529665154213, + "grad_norm": 0.4677874445915222, + "learning_rate": 0.0001, + "loss": 1.6743, + "step": 8356 + }, + { + "epoch": 0.9599678364252484, + "grad_norm": 0.46414315700531006, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 8357 + }, + { + "epoch": 0.9600827063350755, + "grad_norm": 0.46315401792526245, + "learning_rate": 0.0001, + "loss": 1.5053, + "step": 8358 + }, + { + "epoch": 0.9601975762449027, + "grad_norm": 0.4876864552497864, + "learning_rate": 0.0001, + "loss": 1.5766, + "step": 8359 + }, + { + "epoch": 0.9603124461547298, + "grad_norm": 0.4578522741794586, + "learning_rate": 0.0001, + "loss": 1.5981, + "step": 8360 + }, + { + "epoch": 0.9604273160645569, + "grad_norm": 0.48835068941116333, + "learning_rate": 0.0001, + "loss": 1.7534, + "step": 8361 + }, + { + "epoch": 0.960542185974384, + "grad_norm": 0.4757782518863678, + "learning_rate": 0.0001, + "loss": 1.4454, + "step": 8362 + }, + { + "epoch": 0.9606570558842111, + "grad_norm": 0.45514336228370667, + "learning_rate": 0.0001, + "loss": 1.476, + "step": 8363 + }, + { + "epoch": 0.9607719257940383, + "grad_norm": 0.4917604625225067, + "learning_rate": 0.0001, + "loss": 1.6986, + "step": 8364 + }, + { + "epoch": 0.9608867957038654, + "grad_norm": 0.4693104326725006, + "learning_rate": 0.0001, + "loss": 1.4697, + "step": 8365 + }, + { + "epoch": 0.9610016656136925, + "grad_norm": 0.4452751874923706, + "learning_rate": 0.0001, + "loss": 1.5288, + "step": 8366 + }, + { + "epoch": 0.9611165355235196, + "grad_norm": 0.4996024966239929, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 8367 + }, + { + "epoch": 0.9612314054333467, + "grad_norm": 0.4728527069091797, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 8368 + }, + { + "epoch": 0.9613462753431739, + "grad_norm": 0.5003929734230042, + "learning_rate": 0.0001, + "loss": 1.709, + "step": 8369 + }, + { + "epoch": 0.961461145253001, + "grad_norm": 0.49857136607170105, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 8370 + }, + { + "epoch": 0.9615760151628281, + "grad_norm": 0.49390268325805664, + "learning_rate": 0.0001, + "loss": 1.5569, + "step": 8371 + }, + { + "epoch": 0.9616908850726552, + "grad_norm": 0.5034048557281494, + "learning_rate": 0.0001, + "loss": 1.5221, + "step": 8372 + }, + { + "epoch": 0.9618057549824823, + "grad_norm": 0.45734164118766785, + "learning_rate": 0.0001, + "loss": 1.4694, + "step": 8373 + }, + { + "epoch": 0.9619206248923095, + "grad_norm": 0.47848743200302124, + "learning_rate": 0.0001, + "loss": 1.6597, + "step": 8374 + }, + { + "epoch": 0.9620354948021366, + "grad_norm": 0.4872235357761383, + "learning_rate": 0.0001, + "loss": 1.6839, + "step": 8375 + }, + { + "epoch": 0.9621503647119637, + "grad_norm": 0.491102933883667, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 8376 + }, + { + "epoch": 0.9622652346217908, + "grad_norm": 0.4527552127838135, + "learning_rate": 0.0001, + "loss": 1.3692, + "step": 8377 + }, + { + "epoch": 0.9623801045316179, + "grad_norm": 0.4430198073387146, + "learning_rate": 0.0001, + "loss": 1.5954, + "step": 8378 + }, + { + "epoch": 0.962494974441445, + "grad_norm": 0.4682437479496002, + "learning_rate": 0.0001, + "loss": 1.5791, + "step": 8379 + }, + { + "epoch": 0.9626098443512722, + "grad_norm": 0.4858921766281128, + "learning_rate": 0.0001, + "loss": 1.4802, + "step": 8380 + }, + { + "epoch": 0.9627247142610993, + "grad_norm": 0.4844062626361847, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 8381 + }, + { + "epoch": 0.9628395841709264, + "grad_norm": 0.5199929475784302, + "learning_rate": 0.0001, + "loss": 1.7057, + "step": 8382 + }, + { + "epoch": 0.9629544540807535, + "grad_norm": 0.49841341376304626, + "learning_rate": 0.0001, + "loss": 1.5898, + "step": 8383 + }, + { + "epoch": 0.9630693239905807, + "grad_norm": 0.5616093873977661, + "learning_rate": 0.0001, + "loss": 1.9157, + "step": 8384 + }, + { + "epoch": 0.9631841939004078, + "grad_norm": 0.45158112049102783, + "learning_rate": 0.0001, + "loss": 1.5281, + "step": 8385 + }, + { + "epoch": 0.9632990638102349, + "grad_norm": 0.5046762228012085, + "learning_rate": 0.0001, + "loss": 1.6964, + "step": 8386 + }, + { + "epoch": 0.963413933720062, + "grad_norm": 0.5029253959655762, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 8387 + }, + { + "epoch": 0.9635288036298891, + "grad_norm": 0.48361513018608093, + "learning_rate": 0.0001, + "loss": 1.7588, + "step": 8388 + }, + { + "epoch": 0.9636436735397163, + "grad_norm": 0.4966965317726135, + "learning_rate": 0.0001, + "loss": 1.5243, + "step": 8389 + }, + { + "epoch": 0.9637585434495434, + "grad_norm": 0.4773005545139313, + "learning_rate": 0.0001, + "loss": 1.6763, + "step": 8390 + }, + { + "epoch": 0.9638734133593705, + "grad_norm": 0.4541994035243988, + "learning_rate": 0.0001, + "loss": 1.5686, + "step": 8391 + }, + { + "epoch": 0.9639882832691976, + "grad_norm": 0.43786925077438354, + "learning_rate": 0.0001, + "loss": 1.3373, + "step": 8392 + }, + { + "epoch": 0.9641031531790247, + "grad_norm": 0.5206244587898254, + "learning_rate": 0.0001, + "loss": 1.61, + "step": 8393 + }, + { + "epoch": 0.9642180230888518, + "grad_norm": 0.5168501138687134, + "learning_rate": 0.0001, + "loss": 1.7587, + "step": 8394 + }, + { + "epoch": 0.964332892998679, + "grad_norm": 0.48464956879615784, + "learning_rate": 0.0001, + "loss": 1.7185, + "step": 8395 + }, + { + "epoch": 0.9644477629085061, + "grad_norm": 0.4584466814994812, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 8396 + }, + { + "epoch": 0.9645626328183332, + "grad_norm": 0.49749574065208435, + "learning_rate": 0.0001, + "loss": 1.785, + "step": 8397 + }, + { + "epoch": 0.9646775027281603, + "grad_norm": 0.4844261407852173, + "learning_rate": 0.0001, + "loss": 1.4182, + "step": 8398 + }, + { + "epoch": 0.9647923726379874, + "grad_norm": 0.4621402323246002, + "learning_rate": 0.0001, + "loss": 1.5379, + "step": 8399 + }, + { + "epoch": 0.9649072425478146, + "grad_norm": 0.4868880808353424, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 8400 + }, + { + "epoch": 0.9650221124576417, + "grad_norm": 0.4497327506542206, + "learning_rate": 0.0001, + "loss": 1.5056, + "step": 8401 + }, + { + "epoch": 0.9651369823674688, + "grad_norm": 0.5474899411201477, + "learning_rate": 0.0001, + "loss": 1.8234, + "step": 8402 + }, + { + "epoch": 0.9652518522772959, + "grad_norm": 0.46715471148490906, + "learning_rate": 0.0001, + "loss": 1.3831, + "step": 8403 + }, + { + "epoch": 0.965366722187123, + "grad_norm": 0.5017038583755493, + "learning_rate": 0.0001, + "loss": 1.5066, + "step": 8404 + }, + { + "epoch": 0.9654815920969502, + "grad_norm": 0.5196341872215271, + "learning_rate": 0.0001, + "loss": 1.8523, + "step": 8405 + }, + { + "epoch": 0.9655964620067773, + "grad_norm": 0.4290624260902405, + "learning_rate": 0.0001, + "loss": 1.3917, + "step": 8406 + }, + { + "epoch": 0.9657113319166044, + "grad_norm": 0.5225144028663635, + "learning_rate": 0.0001, + "loss": 1.7594, + "step": 8407 + }, + { + "epoch": 0.9658262018264315, + "grad_norm": 0.44787734746932983, + "learning_rate": 0.0001, + "loss": 1.5222, + "step": 8408 + }, + { + "epoch": 0.9659410717362586, + "grad_norm": 0.4654848575592041, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 8409 + }, + { + "epoch": 0.9660559416460858, + "grad_norm": 0.49902090430259705, + "learning_rate": 0.0001, + "loss": 1.7194, + "step": 8410 + }, + { + "epoch": 0.9661708115559129, + "grad_norm": 0.5180195569992065, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 8411 + }, + { + "epoch": 0.96628568146574, + "grad_norm": 0.4926517903804779, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 8412 + }, + { + "epoch": 0.9664005513755671, + "grad_norm": 0.4600144028663635, + "learning_rate": 0.0001, + "loss": 1.5446, + "step": 8413 + }, + { + "epoch": 0.9665154212853942, + "grad_norm": 0.4665214717388153, + "learning_rate": 0.0001, + "loss": 1.6127, + "step": 8414 + }, + { + "epoch": 0.9666302911952214, + "grad_norm": 0.47527435421943665, + "learning_rate": 0.0001, + "loss": 1.5603, + "step": 8415 + }, + { + "epoch": 0.9667451611050485, + "grad_norm": 0.4811752140522003, + "learning_rate": 0.0001, + "loss": 1.4612, + "step": 8416 + }, + { + "epoch": 0.9668600310148756, + "grad_norm": 0.4516500234603882, + "learning_rate": 0.0001, + "loss": 1.4649, + "step": 8417 + }, + { + "epoch": 0.9669749009247027, + "grad_norm": 0.5186557769775391, + "learning_rate": 0.0001, + "loss": 1.7303, + "step": 8418 + }, + { + "epoch": 0.9670897708345298, + "grad_norm": 0.4529307782649994, + "learning_rate": 0.0001, + "loss": 1.4181, + "step": 8419 + }, + { + "epoch": 0.967204640744357, + "grad_norm": 0.4979994297027588, + "learning_rate": 0.0001, + "loss": 1.6673, + "step": 8420 + }, + { + "epoch": 0.9673195106541841, + "grad_norm": 0.4783253073692322, + "learning_rate": 0.0001, + "loss": 1.5552, + "step": 8421 + }, + { + "epoch": 0.9674343805640112, + "grad_norm": 0.4694981276988983, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 8422 + }, + { + "epoch": 0.9675492504738383, + "grad_norm": 0.4916779398918152, + "learning_rate": 0.0001, + "loss": 1.7472, + "step": 8423 + }, + { + "epoch": 0.9676641203836654, + "grad_norm": 0.42479321360588074, + "learning_rate": 0.0001, + "loss": 1.3967, + "step": 8424 + }, + { + "epoch": 0.9677789902934926, + "grad_norm": 0.5579563975334167, + "learning_rate": 0.0001, + "loss": 1.8852, + "step": 8425 + }, + { + "epoch": 0.9678938602033197, + "grad_norm": 0.5487393736839294, + "learning_rate": 0.0001, + "loss": 1.7019, + "step": 8426 + }, + { + "epoch": 0.9680087301131468, + "grad_norm": 0.46243569254875183, + "learning_rate": 0.0001, + "loss": 1.5185, + "step": 8427 + }, + { + "epoch": 0.9681236000229739, + "grad_norm": 0.468492716550827, + "learning_rate": 0.0001, + "loss": 1.5824, + "step": 8428 + }, + { + "epoch": 0.968238469932801, + "grad_norm": 0.4968058466911316, + "learning_rate": 0.0001, + "loss": 1.586, + "step": 8429 + }, + { + "epoch": 0.9683533398426282, + "grad_norm": 0.44949015974998474, + "learning_rate": 0.0001, + "loss": 1.5882, + "step": 8430 + }, + { + "epoch": 0.9684682097524553, + "grad_norm": 0.46042609214782715, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 8431 + }, + { + "epoch": 0.9685830796622825, + "grad_norm": 0.48226192593574524, + "learning_rate": 0.0001, + "loss": 1.6218, + "step": 8432 + }, + { + "epoch": 0.9686979495721096, + "grad_norm": 0.4611875116825104, + "learning_rate": 0.0001, + "loss": 1.4546, + "step": 8433 + }, + { + "epoch": 0.9688128194819368, + "grad_norm": 0.49678662419319153, + "learning_rate": 0.0001, + "loss": 1.6045, + "step": 8434 + }, + { + "epoch": 0.9689276893917639, + "grad_norm": 0.4989873170852661, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 8435 + }, + { + "epoch": 0.969042559301591, + "grad_norm": 0.49776941537857056, + "learning_rate": 0.0001, + "loss": 1.6435, + "step": 8436 + }, + { + "epoch": 0.9691574292114181, + "grad_norm": 0.4603618085384369, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 8437 + }, + { + "epoch": 0.9692722991212452, + "grad_norm": 0.503463625907898, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 8438 + }, + { + "epoch": 0.9693871690310724, + "grad_norm": 0.520193874835968, + "learning_rate": 0.0001, + "loss": 1.7394, + "step": 8439 + }, + { + "epoch": 0.9695020389408995, + "grad_norm": 0.4809243679046631, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 8440 + }, + { + "epoch": 0.9696169088507266, + "grad_norm": 0.49442121386528015, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 8441 + }, + { + "epoch": 0.9697317787605537, + "grad_norm": 0.43582677841186523, + "learning_rate": 0.0001, + "loss": 1.5189, + "step": 8442 + }, + { + "epoch": 0.9698466486703808, + "grad_norm": 0.4740058481693268, + "learning_rate": 0.0001, + "loss": 1.6358, + "step": 8443 + }, + { + "epoch": 0.969961518580208, + "grad_norm": 0.4624651372432709, + "learning_rate": 0.0001, + "loss": 1.6075, + "step": 8444 + }, + { + "epoch": 0.9700763884900351, + "grad_norm": 0.5144423842430115, + "learning_rate": 0.0001, + "loss": 1.7943, + "step": 8445 + }, + { + "epoch": 0.9701912583998622, + "grad_norm": 0.4762701392173767, + "learning_rate": 0.0001, + "loss": 1.4843, + "step": 8446 + }, + { + "epoch": 0.9703061283096893, + "grad_norm": 0.46587637066841125, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 8447 + }, + { + "epoch": 0.9704209982195164, + "grad_norm": 0.516172468662262, + "learning_rate": 0.0001, + "loss": 1.4831, + "step": 8448 + }, + { + "epoch": 0.9705358681293436, + "grad_norm": 0.4806175231933594, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 8449 + }, + { + "epoch": 0.9706507380391707, + "grad_norm": 0.4877374768257141, + "learning_rate": 0.0001, + "loss": 1.6528, + "step": 8450 + }, + { + "epoch": 0.9707656079489978, + "grad_norm": 0.5047240257263184, + "learning_rate": 0.0001, + "loss": 1.7531, + "step": 8451 + }, + { + "epoch": 0.9708804778588249, + "grad_norm": 0.4985336363315582, + "learning_rate": 0.0001, + "loss": 1.8455, + "step": 8452 + }, + { + "epoch": 0.970995347768652, + "grad_norm": 0.5148325562477112, + "learning_rate": 0.0001, + "loss": 1.6977, + "step": 8453 + }, + { + "epoch": 0.9711102176784792, + "grad_norm": 0.4783836901187897, + "learning_rate": 0.0001, + "loss": 1.7402, + "step": 8454 + }, + { + "epoch": 0.9712250875883063, + "grad_norm": 0.5134006142616272, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 8455 + }, + { + "epoch": 0.9713399574981334, + "grad_norm": 0.4870409071445465, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 8456 + }, + { + "epoch": 0.9714548274079605, + "grad_norm": 0.47401508688926697, + "learning_rate": 0.0001, + "loss": 1.7521, + "step": 8457 + }, + { + "epoch": 0.9715696973177876, + "grad_norm": 0.46967458724975586, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 8458 + }, + { + "epoch": 0.9716845672276148, + "grad_norm": 0.5170222520828247, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 8459 + }, + { + "epoch": 0.9717994371374419, + "grad_norm": 0.503593921661377, + "learning_rate": 0.0001, + "loss": 1.5592, + "step": 8460 + }, + { + "epoch": 0.971914307047269, + "grad_norm": 0.47926294803619385, + "learning_rate": 0.0001, + "loss": 1.5149, + "step": 8461 + }, + { + "epoch": 0.9720291769570961, + "grad_norm": 0.48252803087234497, + "learning_rate": 0.0001, + "loss": 1.7432, + "step": 8462 + }, + { + "epoch": 0.9721440468669232, + "grad_norm": 0.5037051439285278, + "learning_rate": 0.0001, + "loss": 1.7139, + "step": 8463 + }, + { + "epoch": 0.9722589167767504, + "grad_norm": 0.46094417572021484, + "learning_rate": 0.0001, + "loss": 1.406, + "step": 8464 + }, + { + "epoch": 0.9723737866865775, + "grad_norm": 0.5147666335105896, + "learning_rate": 0.0001, + "loss": 1.6715, + "step": 8465 + }, + { + "epoch": 0.9724886565964046, + "grad_norm": 0.4747392237186432, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 8466 + }, + { + "epoch": 0.9726035265062317, + "grad_norm": 0.5164050459861755, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 8467 + }, + { + "epoch": 0.9727183964160588, + "grad_norm": 0.46309229731559753, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 8468 + }, + { + "epoch": 0.972833266325886, + "grad_norm": 0.4927702248096466, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 8469 + }, + { + "epoch": 0.9729481362357131, + "grad_norm": 0.5026853084564209, + "learning_rate": 0.0001, + "loss": 1.5314, + "step": 8470 + }, + { + "epoch": 0.9730630061455402, + "grad_norm": 0.5030810832977295, + "learning_rate": 0.0001, + "loss": 1.7262, + "step": 8471 + }, + { + "epoch": 0.9731778760553673, + "grad_norm": 0.4909353256225586, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 8472 + }, + { + "epoch": 0.9732927459651944, + "grad_norm": 0.4842395484447479, + "learning_rate": 0.0001, + "loss": 1.6103, + "step": 8473 + }, + { + "epoch": 0.9734076158750216, + "grad_norm": 0.44747745990753174, + "learning_rate": 0.0001, + "loss": 1.4004, + "step": 8474 + }, + { + "epoch": 0.9735224857848487, + "grad_norm": 0.4720572233200073, + "learning_rate": 0.0001, + "loss": 1.4371, + "step": 8475 + }, + { + "epoch": 0.9736373556946758, + "grad_norm": 0.5061703324317932, + "learning_rate": 0.0001, + "loss": 1.5303, + "step": 8476 + }, + { + "epoch": 0.9737522256045029, + "grad_norm": 0.46352124214172363, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 8477 + }, + { + "epoch": 0.97386709551433, + "grad_norm": 0.4936666488647461, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 8478 + }, + { + "epoch": 0.9739819654241572, + "grad_norm": 0.4917965829372406, + "learning_rate": 0.0001, + "loss": 1.545, + "step": 8479 + }, + { + "epoch": 0.9740968353339843, + "grad_norm": 0.5103937387466431, + "learning_rate": 0.0001, + "loss": 1.7674, + "step": 8480 + }, + { + "epoch": 0.9742117052438114, + "grad_norm": 0.44612032175064087, + "learning_rate": 0.0001, + "loss": 1.3708, + "step": 8481 + }, + { + "epoch": 0.9743265751536385, + "grad_norm": 0.48888033628463745, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 8482 + }, + { + "epoch": 0.9744414450634656, + "grad_norm": 0.49829840660095215, + "learning_rate": 0.0001, + "loss": 1.7156, + "step": 8483 + }, + { + "epoch": 0.9745563149732928, + "grad_norm": 0.4825059473514557, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 8484 + }, + { + "epoch": 0.9746711848831199, + "grad_norm": 0.44610595703125, + "learning_rate": 0.0001, + "loss": 1.3814, + "step": 8485 + }, + { + "epoch": 0.974786054792947, + "grad_norm": 0.5563763976097107, + "learning_rate": 0.0001, + "loss": 1.7003, + "step": 8486 + }, + { + "epoch": 0.9749009247027741, + "grad_norm": 0.4703843295574188, + "learning_rate": 0.0001, + "loss": 1.4677, + "step": 8487 + }, + { + "epoch": 0.9750157946126012, + "grad_norm": 0.524733304977417, + "learning_rate": 0.0001, + "loss": 1.6185, + "step": 8488 + }, + { + "epoch": 0.9751306645224284, + "grad_norm": 0.5146064758300781, + "learning_rate": 0.0001, + "loss": 1.8297, + "step": 8489 + }, + { + "epoch": 0.9752455344322555, + "grad_norm": 0.432230681180954, + "learning_rate": 0.0001, + "loss": 1.5052, + "step": 8490 + }, + { + "epoch": 0.9753604043420826, + "grad_norm": 0.4458877742290497, + "learning_rate": 0.0001, + "loss": 1.5611, + "step": 8491 + }, + { + "epoch": 0.9754752742519097, + "grad_norm": 0.474324107170105, + "learning_rate": 0.0001, + "loss": 1.7137, + "step": 8492 + }, + { + "epoch": 0.9755901441617368, + "grad_norm": 0.4817248582839966, + "learning_rate": 0.0001, + "loss": 1.4067, + "step": 8493 + }, + { + "epoch": 0.975705014071564, + "grad_norm": 0.46746066212654114, + "learning_rate": 0.0001, + "loss": 1.5686, + "step": 8494 + }, + { + "epoch": 0.9758198839813911, + "grad_norm": 0.4995744824409485, + "learning_rate": 0.0001, + "loss": 1.6563, + "step": 8495 + }, + { + "epoch": 0.9759347538912182, + "grad_norm": 0.4750157594680786, + "learning_rate": 0.0001, + "loss": 1.4601, + "step": 8496 + }, + { + "epoch": 0.9760496238010453, + "grad_norm": 0.474306583404541, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 8497 + }, + { + "epoch": 0.9761644937108724, + "grad_norm": 0.494228720664978, + "learning_rate": 0.0001, + "loss": 1.4225, + "step": 8498 + }, + { + "epoch": 0.9762793636206996, + "grad_norm": 0.4527890384197235, + "learning_rate": 0.0001, + "loss": 1.5073, + "step": 8499 + }, + { + "epoch": 0.9763942335305267, + "grad_norm": 0.4580267071723938, + "learning_rate": 0.0001, + "loss": 1.5774, + "step": 8500 + }, + { + "epoch": 0.9765091034403538, + "grad_norm": 0.5154402852058411, + "learning_rate": 0.0001, + "loss": 1.6121, + "step": 8501 + }, + { + "epoch": 0.9766239733501809, + "grad_norm": 0.4666356146335602, + "learning_rate": 0.0001, + "loss": 1.6026, + "step": 8502 + }, + { + "epoch": 0.976738843260008, + "grad_norm": 0.4806033670902252, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 8503 + }, + { + "epoch": 0.9768537131698352, + "grad_norm": 0.49173009395599365, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 8504 + }, + { + "epoch": 0.9769685830796623, + "grad_norm": 0.528270959854126, + "learning_rate": 0.0001, + "loss": 1.6054, + "step": 8505 + }, + { + "epoch": 0.9770834529894894, + "grad_norm": 0.4734746217727661, + "learning_rate": 0.0001, + "loss": 1.7124, + "step": 8506 + }, + { + "epoch": 0.9771983228993165, + "grad_norm": 0.44804245233535767, + "learning_rate": 0.0001, + "loss": 1.4199, + "step": 8507 + }, + { + "epoch": 0.9773131928091436, + "grad_norm": 0.4997158944606781, + "learning_rate": 0.0001, + "loss": 1.5853, + "step": 8508 + }, + { + "epoch": 0.9774280627189708, + "grad_norm": 0.5285566449165344, + "learning_rate": 0.0001, + "loss": 1.5682, + "step": 8509 + }, + { + "epoch": 0.9775429326287979, + "grad_norm": 0.5019384026527405, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 8510 + }, + { + "epoch": 0.977657802538625, + "grad_norm": 0.49325329065322876, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 8511 + }, + { + "epoch": 0.9777726724484521, + "grad_norm": 0.439513236284256, + "learning_rate": 0.0001, + "loss": 1.3171, + "step": 8512 + }, + { + "epoch": 0.9778875423582792, + "grad_norm": 0.4921404719352722, + "learning_rate": 0.0001, + "loss": 1.6964, + "step": 8513 + }, + { + "epoch": 0.9780024122681064, + "grad_norm": 0.48236212134361267, + "learning_rate": 0.0001, + "loss": 1.3529, + "step": 8514 + }, + { + "epoch": 0.9781172821779335, + "grad_norm": 0.437320739030838, + "learning_rate": 0.0001, + "loss": 1.345, + "step": 8515 + }, + { + "epoch": 0.9782321520877606, + "grad_norm": 0.48497122526168823, + "learning_rate": 0.0001, + "loss": 1.2823, + "step": 8516 + }, + { + "epoch": 0.9783470219975877, + "grad_norm": 0.48262646794319153, + "learning_rate": 0.0001, + "loss": 1.6981, + "step": 8517 + }, + { + "epoch": 0.9784618919074148, + "grad_norm": 0.48424267768859863, + "learning_rate": 0.0001, + "loss": 1.5371, + "step": 8518 + }, + { + "epoch": 0.978576761817242, + "grad_norm": 0.48815423250198364, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 8519 + }, + { + "epoch": 0.9786916317270691, + "grad_norm": 0.48007214069366455, + "learning_rate": 0.0001, + "loss": 1.696, + "step": 8520 + }, + { + "epoch": 0.9788065016368962, + "grad_norm": 0.501133918762207, + "learning_rate": 0.0001, + "loss": 1.6893, + "step": 8521 + }, + { + "epoch": 0.9789213715467233, + "grad_norm": 0.48664984107017517, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 8522 + }, + { + "epoch": 0.9790362414565504, + "grad_norm": 0.47685080766677856, + "learning_rate": 0.0001, + "loss": 1.5173, + "step": 8523 + }, + { + "epoch": 0.9791511113663776, + "grad_norm": 0.458021342754364, + "learning_rate": 0.0001, + "loss": 1.3507, + "step": 8524 + }, + { + "epoch": 0.9792659812762047, + "grad_norm": 0.48686715960502625, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 8525 + }, + { + "epoch": 0.9793808511860318, + "grad_norm": 0.4461478590965271, + "learning_rate": 0.0001, + "loss": 1.4896, + "step": 8526 + }, + { + "epoch": 0.9794957210958589, + "grad_norm": 0.47489675879478455, + "learning_rate": 0.0001, + "loss": 1.691, + "step": 8527 + }, + { + "epoch": 0.979610591005686, + "grad_norm": 0.4845513105392456, + "learning_rate": 0.0001, + "loss": 1.5867, + "step": 8528 + }, + { + "epoch": 0.9797254609155132, + "grad_norm": 0.4592495858669281, + "learning_rate": 0.0001, + "loss": 1.5356, + "step": 8529 + }, + { + "epoch": 0.9798403308253403, + "grad_norm": 0.4863679111003876, + "learning_rate": 0.0001, + "loss": 1.5116, + "step": 8530 + }, + { + "epoch": 0.9799552007351674, + "grad_norm": 0.47153112292289734, + "learning_rate": 0.0001, + "loss": 1.5476, + "step": 8531 + }, + { + "epoch": 0.9800700706449945, + "grad_norm": 0.4806155562400818, + "learning_rate": 0.0001, + "loss": 1.3655, + "step": 8532 + }, + { + "epoch": 0.9801849405548216, + "grad_norm": 0.5179005265235901, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 8533 + }, + { + "epoch": 0.9802998104646488, + "grad_norm": 0.5735377073287964, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 8534 + }, + { + "epoch": 0.9804146803744759, + "grad_norm": 0.47526344656944275, + "learning_rate": 0.0001, + "loss": 1.5431, + "step": 8535 + }, + { + "epoch": 0.980529550284303, + "grad_norm": 0.4913672208786011, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 8536 + }, + { + "epoch": 0.9806444201941301, + "grad_norm": 0.5402122735977173, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 8537 + }, + { + "epoch": 0.9807592901039572, + "grad_norm": 0.4758612811565399, + "learning_rate": 0.0001, + "loss": 1.6862, + "step": 8538 + }, + { + "epoch": 0.9808741600137844, + "grad_norm": 0.46299320459365845, + "learning_rate": 0.0001, + "loss": 1.5098, + "step": 8539 + }, + { + "epoch": 0.9809890299236115, + "grad_norm": 0.48221555352211, + "learning_rate": 0.0001, + "loss": 1.5331, + "step": 8540 + }, + { + "epoch": 0.9811038998334386, + "grad_norm": 0.47534555196762085, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 8541 + }, + { + "epoch": 0.9812187697432657, + "grad_norm": 0.44301724433898926, + "learning_rate": 0.0001, + "loss": 1.4897, + "step": 8542 + }, + { + "epoch": 0.9813336396530928, + "grad_norm": 0.5158907771110535, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 8543 + }, + { + "epoch": 0.98144850956292, + "grad_norm": 0.5194290280342102, + "learning_rate": 0.0001, + "loss": 1.4948, + "step": 8544 + }, + { + "epoch": 0.9815633794727471, + "grad_norm": 0.5136659741401672, + "learning_rate": 0.0001, + "loss": 1.6842, + "step": 8545 + }, + { + "epoch": 0.9816782493825742, + "grad_norm": 0.5167720913887024, + "learning_rate": 0.0001, + "loss": 1.844, + "step": 8546 + }, + { + "epoch": 0.9817931192924013, + "grad_norm": 0.45328572392463684, + "learning_rate": 0.0001, + "loss": 1.513, + "step": 8547 + }, + { + "epoch": 0.9819079892022284, + "grad_norm": 0.4788568317890167, + "learning_rate": 0.0001, + "loss": 1.6795, + "step": 8548 + }, + { + "epoch": 0.9820228591120556, + "grad_norm": 0.4906943440437317, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 8549 + }, + { + "epoch": 0.9821377290218827, + "grad_norm": 0.4383232295513153, + "learning_rate": 0.0001, + "loss": 1.3974, + "step": 8550 + }, + { + "epoch": 0.9822525989317098, + "grad_norm": 0.480825275182724, + "learning_rate": 0.0001, + "loss": 1.5138, + "step": 8551 + }, + { + "epoch": 0.9823674688415369, + "grad_norm": 0.47409677505493164, + "learning_rate": 0.0001, + "loss": 1.5389, + "step": 8552 + }, + { + "epoch": 0.982482338751364, + "grad_norm": 0.4691976010799408, + "learning_rate": 0.0001, + "loss": 1.4925, + "step": 8553 + }, + { + "epoch": 0.9825972086611912, + "grad_norm": 0.48811331391334534, + "learning_rate": 0.0001, + "loss": 1.3919, + "step": 8554 + }, + { + "epoch": 0.9827120785710183, + "grad_norm": 0.5102879405021667, + "learning_rate": 0.0001, + "loss": 1.7138, + "step": 8555 + }, + { + "epoch": 0.9828269484808454, + "grad_norm": 0.4797075092792511, + "learning_rate": 0.0001, + "loss": 1.4157, + "step": 8556 + }, + { + "epoch": 0.9829418183906725, + "grad_norm": 0.4744819104671478, + "learning_rate": 0.0001, + "loss": 1.54, + "step": 8557 + }, + { + "epoch": 0.9830566883004996, + "grad_norm": 0.5046994090080261, + "learning_rate": 0.0001, + "loss": 1.5037, + "step": 8558 + }, + { + "epoch": 0.9831715582103268, + "grad_norm": 0.49901989102363586, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 8559 + }, + { + "epoch": 0.9832864281201539, + "grad_norm": 0.49144455790519714, + "learning_rate": 0.0001, + "loss": 1.6678, + "step": 8560 + }, + { + "epoch": 0.983401298029981, + "grad_norm": 0.47116243839263916, + "learning_rate": 0.0001, + "loss": 1.5554, + "step": 8561 + }, + { + "epoch": 0.9835161679398081, + "grad_norm": 0.5342057347297668, + "learning_rate": 0.0001, + "loss": 1.7248, + "step": 8562 + }, + { + "epoch": 0.9836310378496352, + "grad_norm": 0.4303724467754364, + "learning_rate": 0.0001, + "loss": 1.3374, + "step": 8563 + }, + { + "epoch": 0.9837459077594624, + "grad_norm": 0.5251704454421997, + "learning_rate": 0.0001, + "loss": 1.572, + "step": 8564 + }, + { + "epoch": 0.9838607776692895, + "grad_norm": 0.4820622503757477, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 8565 + }, + { + "epoch": 0.9839756475791166, + "grad_norm": 0.4786358177661896, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 8566 + }, + { + "epoch": 0.9840905174889437, + "grad_norm": 0.508850634098053, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 8567 + }, + { + "epoch": 0.9842053873987708, + "grad_norm": 0.5014883875846863, + "learning_rate": 0.0001, + "loss": 1.7386, + "step": 8568 + }, + { + "epoch": 0.9843202573085981, + "grad_norm": 0.48899659514427185, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 8569 + }, + { + "epoch": 0.9844351272184252, + "grad_norm": 0.5107044577598572, + "learning_rate": 0.0001, + "loss": 1.8143, + "step": 8570 + }, + { + "epoch": 0.9845499971282523, + "grad_norm": 0.5331456065177917, + "learning_rate": 0.0001, + "loss": 1.5942, + "step": 8571 + }, + { + "epoch": 0.9846648670380794, + "grad_norm": 0.46554502844810486, + "learning_rate": 0.0001, + "loss": 1.4281, + "step": 8572 + }, + { + "epoch": 0.9847797369479065, + "grad_norm": 0.45582613348960876, + "learning_rate": 0.0001, + "loss": 1.512, + "step": 8573 + }, + { + "epoch": 0.9848946068577337, + "grad_norm": 0.5047944188117981, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 8574 + }, + { + "epoch": 0.9850094767675608, + "grad_norm": 0.4860698878765106, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 8575 + }, + { + "epoch": 0.9851243466773879, + "grad_norm": 0.5398543477058411, + "learning_rate": 0.0001, + "loss": 1.6691, + "step": 8576 + }, + { + "epoch": 0.985239216587215, + "grad_norm": 0.4853983521461487, + "learning_rate": 0.0001, + "loss": 1.4137, + "step": 8577 + }, + { + "epoch": 0.9853540864970421, + "grad_norm": 0.47747766971588135, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 8578 + }, + { + "epoch": 0.9854689564068693, + "grad_norm": 0.5553230047225952, + "learning_rate": 0.0001, + "loss": 1.9805, + "step": 8579 + }, + { + "epoch": 0.9855838263166964, + "grad_norm": 0.5310031771659851, + "learning_rate": 0.0001, + "loss": 1.5902, + "step": 8580 + }, + { + "epoch": 0.9856986962265235, + "grad_norm": 0.46402546763420105, + "learning_rate": 0.0001, + "loss": 1.3974, + "step": 8581 + }, + { + "epoch": 0.9858135661363506, + "grad_norm": 0.4859161674976349, + "learning_rate": 0.0001, + "loss": 1.7247, + "step": 8582 + }, + { + "epoch": 0.9859284360461777, + "grad_norm": 0.47045204043388367, + "learning_rate": 0.0001, + "loss": 1.3744, + "step": 8583 + }, + { + "epoch": 0.9860433059560049, + "grad_norm": 0.525871992111206, + "learning_rate": 0.0001, + "loss": 1.5243, + "step": 8584 + }, + { + "epoch": 0.986158175865832, + "grad_norm": 0.47140932083129883, + "learning_rate": 0.0001, + "loss": 1.5403, + "step": 8585 + }, + { + "epoch": 0.9862730457756591, + "grad_norm": 0.4621630311012268, + "learning_rate": 0.0001, + "loss": 1.4708, + "step": 8586 + }, + { + "epoch": 0.9863879156854862, + "grad_norm": 0.4546113610267639, + "learning_rate": 0.0001, + "loss": 1.5826, + "step": 8587 + }, + { + "epoch": 0.9865027855953133, + "grad_norm": 0.47744020819664, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 8588 + }, + { + "epoch": 0.9866176555051405, + "grad_norm": 0.45344898104667664, + "learning_rate": 0.0001, + "loss": 1.3902, + "step": 8589 + }, + { + "epoch": 0.9867325254149676, + "grad_norm": 0.5098491907119751, + "learning_rate": 0.0001, + "loss": 1.5117, + "step": 8590 + }, + { + "epoch": 0.9868473953247947, + "grad_norm": 0.5069968700408936, + "learning_rate": 0.0001, + "loss": 1.4687, + "step": 8591 + }, + { + "epoch": 0.9869622652346218, + "grad_norm": 0.45141881704330444, + "learning_rate": 0.0001, + "loss": 1.5025, + "step": 8592 + }, + { + "epoch": 0.987077135144449, + "grad_norm": 0.49101027846336365, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 8593 + }, + { + "epoch": 0.9871920050542761, + "grad_norm": 0.4652627408504486, + "learning_rate": 0.0001, + "loss": 1.6014, + "step": 8594 + }, + { + "epoch": 0.9873068749641032, + "grad_norm": 0.4782879650592804, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 8595 + }, + { + "epoch": 0.9874217448739303, + "grad_norm": 0.4874049425125122, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 8596 + }, + { + "epoch": 0.9875366147837574, + "grad_norm": 0.5173846483230591, + "learning_rate": 0.0001, + "loss": 1.6132, + "step": 8597 + }, + { + "epoch": 0.9876514846935845, + "grad_norm": 0.520719051361084, + "learning_rate": 0.0001, + "loss": 1.6545, + "step": 8598 + }, + { + "epoch": 0.9877663546034117, + "grad_norm": 0.5008309483528137, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 8599 + }, + { + "epoch": 0.9878812245132388, + "grad_norm": 0.491559237241745, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 8600 + }, + { + "epoch": 0.9879960944230659, + "grad_norm": 0.5189545750617981, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 8601 + }, + { + "epoch": 0.988110964332893, + "grad_norm": 0.4896295964717865, + "learning_rate": 0.0001, + "loss": 1.4129, + "step": 8602 + }, + { + "epoch": 0.9882258342427201, + "grad_norm": 0.47650331258773804, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 8603 + }, + { + "epoch": 0.9883407041525473, + "grad_norm": 0.46054142713546753, + "learning_rate": 0.0001, + "loss": 1.4276, + "step": 8604 + }, + { + "epoch": 0.9884555740623744, + "grad_norm": 0.5123742818832397, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 8605 + }, + { + "epoch": 0.9885704439722015, + "grad_norm": 0.48375701904296875, + "learning_rate": 0.0001, + "loss": 1.3923, + "step": 8606 + }, + { + "epoch": 0.9886853138820286, + "grad_norm": 0.5176060795783997, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 8607 + }, + { + "epoch": 0.9888001837918557, + "grad_norm": 0.5452961921691895, + "learning_rate": 0.0001, + "loss": 1.7097, + "step": 8608 + }, + { + "epoch": 0.9889150537016829, + "grad_norm": 0.5161251425743103, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 8609 + }, + { + "epoch": 0.98902992361151, + "grad_norm": 0.47460034489631653, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 8610 + }, + { + "epoch": 0.9891447935213371, + "grad_norm": 0.46245843172073364, + "learning_rate": 0.0001, + "loss": 1.2765, + "step": 8611 + }, + { + "epoch": 0.9892596634311642, + "grad_norm": 0.46704110503196716, + "learning_rate": 0.0001, + "loss": 1.5735, + "step": 8612 + }, + { + "epoch": 0.9893745333409913, + "grad_norm": 0.585635781288147, + "learning_rate": 0.0001, + "loss": 1.4845, + "step": 8613 + }, + { + "epoch": 0.9894894032508185, + "grad_norm": 0.4674919843673706, + "learning_rate": 0.0001, + "loss": 1.6238, + "step": 8614 + }, + { + "epoch": 0.9896042731606456, + "grad_norm": 0.4823983609676361, + "learning_rate": 0.0001, + "loss": 1.5698, + "step": 8615 + }, + { + "epoch": 0.9897191430704727, + "grad_norm": 0.5025752186775208, + "learning_rate": 0.0001, + "loss": 1.5845, + "step": 8616 + }, + { + "epoch": 0.9898340129802998, + "grad_norm": 0.5351320505142212, + "learning_rate": 0.0001, + "loss": 1.7404, + "step": 8617 + }, + { + "epoch": 0.989948882890127, + "grad_norm": 0.4923611283302307, + "learning_rate": 0.0001, + "loss": 1.4485, + "step": 8618 + }, + { + "epoch": 0.9900637527999541, + "grad_norm": 0.4848472476005554, + "learning_rate": 0.0001, + "loss": 1.5072, + "step": 8619 + }, + { + "epoch": 0.9901786227097812, + "grad_norm": 0.5281382203102112, + "learning_rate": 0.0001, + "loss": 1.6906, + "step": 8620 + }, + { + "epoch": 0.9902934926196083, + "grad_norm": 0.4649568498134613, + "learning_rate": 0.0001, + "loss": 1.5553, + "step": 8621 + }, + { + "epoch": 0.9904083625294354, + "grad_norm": 0.4958634674549103, + "learning_rate": 0.0001, + "loss": 1.6043, + "step": 8622 + }, + { + "epoch": 0.9905232324392625, + "grad_norm": 0.501117467880249, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 8623 + }, + { + "epoch": 0.9906381023490897, + "grad_norm": 0.4949086308479309, + "learning_rate": 0.0001, + "loss": 1.7568, + "step": 8624 + }, + { + "epoch": 0.9907529722589168, + "grad_norm": 0.46853989362716675, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 8625 + }, + { + "epoch": 0.9908678421687439, + "grad_norm": 0.47322335839271545, + "learning_rate": 0.0001, + "loss": 1.5358, + "step": 8626 + }, + { + "epoch": 0.990982712078571, + "grad_norm": 0.5503798127174377, + "learning_rate": 0.0001, + "loss": 1.7705, + "step": 8627 + }, + { + "epoch": 0.9910975819883981, + "grad_norm": 0.48998063802719116, + "learning_rate": 0.0001, + "loss": 1.7953, + "step": 8628 + }, + { + "epoch": 0.9912124518982253, + "grad_norm": 0.4801502525806427, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 8629 + }, + { + "epoch": 0.9913273218080524, + "grad_norm": 0.5271478295326233, + "learning_rate": 0.0001, + "loss": 1.618, + "step": 8630 + }, + { + "epoch": 0.9914421917178795, + "grad_norm": 0.501660943031311, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 8631 + }, + { + "epoch": 0.9915570616277066, + "grad_norm": 0.47359898686408997, + "learning_rate": 0.0001, + "loss": 1.5764, + "step": 8632 + }, + { + "epoch": 0.9916719315375337, + "grad_norm": 0.49344560503959656, + "learning_rate": 0.0001, + "loss": 1.3197, + "step": 8633 + }, + { + "epoch": 0.9917868014473609, + "grad_norm": 0.4800715446472168, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 8634 + }, + { + "epoch": 0.991901671357188, + "grad_norm": 0.49181854724884033, + "learning_rate": 0.0001, + "loss": 1.6261, + "step": 8635 + }, + { + "epoch": 0.9920165412670151, + "grad_norm": 0.4867425560951233, + "learning_rate": 0.0001, + "loss": 1.5423, + "step": 8636 + }, + { + "epoch": 0.9921314111768422, + "grad_norm": 0.5553606152534485, + "learning_rate": 0.0001, + "loss": 1.8589, + "step": 8637 + }, + { + "epoch": 0.9922462810866693, + "grad_norm": 0.4396592080593109, + "learning_rate": 0.0001, + "loss": 1.3746, + "step": 8638 + }, + { + "epoch": 0.9923611509964965, + "grad_norm": 0.5199365019798279, + "learning_rate": 0.0001, + "loss": 1.7913, + "step": 8639 + }, + { + "epoch": 0.9924760209063236, + "grad_norm": 0.5626462697982788, + "learning_rate": 0.0001, + "loss": 1.583, + "step": 8640 + }, + { + "epoch": 0.9925908908161507, + "grad_norm": 0.4940207600593567, + "learning_rate": 0.0001, + "loss": 1.3618, + "step": 8641 + }, + { + "epoch": 0.9927057607259778, + "grad_norm": 0.5407283902168274, + "learning_rate": 0.0001, + "loss": 1.5611, + "step": 8642 + }, + { + "epoch": 0.992820630635805, + "grad_norm": 0.5184419751167297, + "learning_rate": 0.0001, + "loss": 1.7046, + "step": 8643 + }, + { + "epoch": 0.9929355005456321, + "grad_norm": 0.4674243628978729, + "learning_rate": 0.0001, + "loss": 1.4962, + "step": 8644 + }, + { + "epoch": 0.9930503704554592, + "grad_norm": 0.519338846206665, + "learning_rate": 0.0001, + "loss": 1.731, + "step": 8645 + }, + { + "epoch": 0.9931652403652863, + "grad_norm": 0.43663740158081055, + "learning_rate": 0.0001, + "loss": 1.34, + "step": 8646 + }, + { + "epoch": 0.9932801102751134, + "grad_norm": 0.4806329309940338, + "learning_rate": 0.0001, + "loss": 1.322, + "step": 8647 + }, + { + "epoch": 0.9933949801849405, + "grad_norm": 0.509090006351471, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 8648 + }, + { + "epoch": 0.9935098500947677, + "grad_norm": 0.4767511487007141, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 8649 + }, + { + "epoch": 0.9936247200045948, + "grad_norm": 0.4973703920841217, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 8650 + }, + { + "epoch": 0.9937395899144219, + "grad_norm": 0.4734041392803192, + "learning_rate": 0.0001, + "loss": 1.7253, + "step": 8651 + }, + { + "epoch": 0.993854459824249, + "grad_norm": 0.44982102513313293, + "learning_rate": 0.0001, + "loss": 1.5608, + "step": 8652 + }, + { + "epoch": 0.9939693297340761, + "grad_norm": 0.4307469129562378, + "learning_rate": 0.0001, + "loss": 1.3253, + "step": 8653 + }, + { + "epoch": 0.9940841996439033, + "grad_norm": 0.5234149694442749, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 8654 + }, + { + "epoch": 0.9941990695537304, + "grad_norm": 0.4377621114253998, + "learning_rate": 0.0001, + "loss": 1.1823, + "step": 8655 + }, + { + "epoch": 0.9943139394635575, + "grad_norm": 0.4438837766647339, + "learning_rate": 0.0001, + "loss": 1.415, + "step": 8656 + }, + { + "epoch": 0.9944288093733846, + "grad_norm": 0.4784393012523651, + "learning_rate": 0.0001, + "loss": 1.499, + "step": 8657 + }, + { + "epoch": 0.9945436792832117, + "grad_norm": 0.468770831823349, + "learning_rate": 0.0001, + "loss": 1.5585, + "step": 8658 + }, + { + "epoch": 0.9946585491930389, + "grad_norm": 0.4884924292564392, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 8659 + }, + { + "epoch": 0.994773419102866, + "grad_norm": 0.5758072733879089, + "learning_rate": 0.0001, + "loss": 1.6505, + "step": 8660 + }, + { + "epoch": 0.9948882890126931, + "grad_norm": 0.5178779363632202, + "learning_rate": 0.0001, + "loss": 1.7349, + "step": 8661 + }, + { + "epoch": 0.9950031589225202, + "grad_norm": 0.49969327449798584, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 8662 + }, + { + "epoch": 0.9951180288323473, + "grad_norm": 0.4617246985435486, + "learning_rate": 0.0001, + "loss": 1.467, + "step": 8663 + }, + { + "epoch": 0.9952328987421745, + "grad_norm": 0.4882105886936188, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 8664 + }, + { + "epoch": 0.9953477686520016, + "grad_norm": 0.47335708141326904, + "learning_rate": 0.0001, + "loss": 1.6539, + "step": 8665 + }, + { + "epoch": 0.9954626385618287, + "grad_norm": 0.5231687426567078, + "learning_rate": 0.0001, + "loss": 1.3034, + "step": 8666 + }, + { + "epoch": 0.9955775084716558, + "grad_norm": 0.48819267749786377, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 8667 + }, + { + "epoch": 0.995692378381483, + "grad_norm": 0.4395897686481476, + "learning_rate": 0.0001, + "loss": 1.349, + "step": 8668 + }, + { + "epoch": 0.9958072482913101, + "grad_norm": 0.44600117206573486, + "learning_rate": 0.0001, + "loss": 1.2248, + "step": 8669 + }, + { + "epoch": 0.9959221182011372, + "grad_norm": 0.5204833745956421, + "learning_rate": 0.0001, + "loss": 1.8804, + "step": 8670 + }, + { + "epoch": 0.9960369881109643, + "grad_norm": 0.5219905972480774, + "learning_rate": 0.0001, + "loss": 1.7275, + "step": 8671 + }, + { + "epoch": 0.9961518580207914, + "grad_norm": 0.5194002389907837, + "learning_rate": 0.0001, + "loss": 1.7193, + "step": 8672 + }, + { + "epoch": 0.9962667279306185, + "grad_norm": 0.4883606433868408, + "learning_rate": 0.0001, + "loss": 1.5022, + "step": 8673 + }, + { + "epoch": 0.9963815978404457, + "grad_norm": 0.5156763792037964, + "learning_rate": 0.0001, + "loss": 1.7083, + "step": 8674 + }, + { + "epoch": 0.9964964677502728, + "grad_norm": 0.4738801121711731, + "learning_rate": 0.0001, + "loss": 1.4915, + "step": 8675 + }, + { + "epoch": 0.9966113376600999, + "grad_norm": 0.4632324278354645, + "learning_rate": 0.0001, + "loss": 1.702, + "step": 8676 + }, + { + "epoch": 0.996726207569927, + "grad_norm": 0.4966660439968109, + "learning_rate": 0.0001, + "loss": 1.6701, + "step": 8677 + }, + { + "epoch": 0.9968410774797541, + "grad_norm": 0.48212072253227234, + "learning_rate": 0.0001, + "loss": 1.5969, + "step": 8678 + }, + { + "epoch": 0.9969559473895813, + "grad_norm": 0.498323529958725, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 8679 + }, + { + "epoch": 0.9970708172994084, + "grad_norm": 0.5070053935050964, + "learning_rate": 0.0001, + "loss": 1.8766, + "step": 8680 + }, + { + "epoch": 0.9971856872092355, + "grad_norm": 0.4994966387748718, + "learning_rate": 0.0001, + "loss": 1.6799, + "step": 8681 + }, + { + "epoch": 0.9973005571190626, + "grad_norm": 0.5102283954620361, + "learning_rate": 0.0001, + "loss": 1.4916, + "step": 8682 + }, + { + "epoch": 0.9974154270288897, + "grad_norm": 0.5465260148048401, + "learning_rate": 0.0001, + "loss": 1.7164, + "step": 8683 + }, + { + "epoch": 0.9975302969387169, + "grad_norm": 0.46429330110549927, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 8684 + }, + { + "epoch": 0.997645166848544, + "grad_norm": 0.4978303611278534, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 8685 + }, + { + "epoch": 0.9977600367583711, + "grad_norm": 0.4677812457084656, + "learning_rate": 0.0001, + "loss": 1.5271, + "step": 8686 + }, + { + "epoch": 0.9978749066681982, + "grad_norm": 0.49235597252845764, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 8687 + }, + { + "epoch": 0.9979897765780253, + "grad_norm": 0.48482832312583923, + "learning_rate": 0.0001, + "loss": 1.5123, + "step": 8688 + }, + { + "epoch": 0.9981046464878525, + "grad_norm": 0.5265589356422424, + "learning_rate": 0.0001, + "loss": 1.7991, + "step": 8689 + }, + { + "epoch": 0.9982195163976796, + "grad_norm": 0.47693178057670593, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 8690 + }, + { + "epoch": 0.9983343863075067, + "grad_norm": 0.4894190728664398, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 8691 + }, + { + "epoch": 0.9984492562173338, + "grad_norm": 0.46358221769332886, + "learning_rate": 0.0001, + "loss": 1.5936, + "step": 8692 + }, + { + "epoch": 0.998564126127161, + "grad_norm": 0.45858120918273926, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 8693 + }, + { + "epoch": 0.9986789960369881, + "grad_norm": 0.45518410205841064, + "learning_rate": 0.0001, + "loss": 1.5492, + "step": 8694 + }, + { + "epoch": 0.9987938659468152, + "grad_norm": 0.47189682722091675, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 8695 + }, + { + "epoch": 0.9989087358566423, + "grad_norm": 0.5133928060531616, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 8696 + }, + { + "epoch": 0.9990236057664694, + "grad_norm": 0.5051494240760803, + "learning_rate": 0.0001, + "loss": 1.6099, + "step": 8697 + }, + { + "epoch": 0.9991384756762965, + "grad_norm": 0.5052590370178223, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 8698 + }, + { + "epoch": 0.9992533455861237, + "grad_norm": 0.4864301085472107, + "learning_rate": 0.0001, + "loss": 1.4405, + "step": 8699 + }, + { + "epoch": 0.9993682154959508, + "grad_norm": 0.49039098620414734, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 8700 + }, + { + "epoch": 0.9994830854057779, + "grad_norm": 0.5003101229667664, + "learning_rate": 0.0001, + "loss": 1.729, + "step": 8701 + }, + { + "epoch": 0.999597955315605, + "grad_norm": 0.4698851704597473, + "learning_rate": 0.0001, + "loss": 1.5738, + "step": 8702 + }, + { + "epoch": 0.9997128252254321, + "grad_norm": 0.5079613327980042, + "learning_rate": 0.0001, + "loss": 1.6768, + "step": 8703 + }, + { + "epoch": 0.9998276951352593, + "grad_norm": 0.5211082100868225, + "learning_rate": 0.0001, + "loss": 1.536, + "step": 8704 + }, + { + "epoch": 0.9999425650450864, + "grad_norm": 0.45943862199783325, + "learning_rate": 0.0001, + "loss": 1.5753, + "step": 8705 + }, + { + "epoch": 1.0000574349549136, + "grad_norm": 0.45664501190185547, + "learning_rate": 0.0001, + "loss": 1.5082, + "step": 8706 + }, + { + "epoch": 1.0001723048647406, + "grad_norm": 0.4673193097114563, + "learning_rate": 0.0001, + "loss": 1.6225, + "step": 8707 + }, + { + "epoch": 1.0002871747745679, + "grad_norm": 0.46662116050720215, + "learning_rate": 0.0001, + "loss": 1.3963, + "step": 8708 + }, + { + "epoch": 1.0004020446843949, + "grad_norm": 0.543308675289154, + "learning_rate": 0.0001, + "loss": 1.2983, + "step": 8709 + }, + { + "epoch": 1.000516914594222, + "grad_norm": 0.49053019285202026, + "learning_rate": 0.0001, + "loss": 1.417, + "step": 8710 + }, + { + "epoch": 1.000631784504049, + "grad_norm": 0.504199743270874, + "learning_rate": 0.0001, + "loss": 1.4342, + "step": 8711 + }, + { + "epoch": 1.0007466544138763, + "grad_norm": 0.5157631635665894, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 8712 + }, + { + "epoch": 1.0008615243237033, + "grad_norm": 0.5185043215751648, + "learning_rate": 0.0001, + "loss": 1.401, + "step": 8713 + }, + { + "epoch": 1.0009763942335306, + "grad_norm": 0.5186070203781128, + "learning_rate": 0.0001, + "loss": 1.5388, + "step": 8714 + }, + { + "epoch": 1.0010912641433576, + "grad_norm": 0.5666443705558777, + "learning_rate": 0.0001, + "loss": 1.579, + "step": 8715 + }, + { + "epoch": 1.0012061340531848, + "grad_norm": 0.6023586988449097, + "learning_rate": 0.0001, + "loss": 1.4872, + "step": 8716 + }, + { + "epoch": 1.0013210039630118, + "grad_norm": 0.504416823387146, + "learning_rate": 0.0001, + "loss": 1.4496, + "step": 8717 + }, + { + "epoch": 1.001435873872839, + "grad_norm": 0.563830554485321, + "learning_rate": 0.0001, + "loss": 1.2972, + "step": 8718 + }, + { + "epoch": 1.001550743782666, + "grad_norm": 0.5681375861167908, + "learning_rate": 0.0001, + "loss": 1.5167, + "step": 8719 + }, + { + "epoch": 1.0016656136924933, + "grad_norm": 0.5679755210876465, + "learning_rate": 0.0001, + "loss": 1.4425, + "step": 8720 + }, + { + "epoch": 1.0017804836023203, + "grad_norm": 0.5443891286849976, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 8721 + }, + { + "epoch": 1.0018953535121475, + "grad_norm": 0.5426947474479675, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 8722 + }, + { + "epoch": 1.0020102234219745, + "grad_norm": 0.5179216861724854, + "learning_rate": 0.0001, + "loss": 1.463, + "step": 8723 + }, + { + "epoch": 1.0021250933318018, + "grad_norm": 0.5044360160827637, + "learning_rate": 0.0001, + "loss": 1.4617, + "step": 8724 + }, + { + "epoch": 1.0022399632416288, + "grad_norm": 0.5553032159805298, + "learning_rate": 0.0001, + "loss": 1.671, + "step": 8725 + }, + { + "epoch": 1.002354833151456, + "grad_norm": 0.5114759206771851, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 8726 + }, + { + "epoch": 1.002469703061283, + "grad_norm": 0.5439996719360352, + "learning_rate": 0.0001, + "loss": 1.54, + "step": 8727 + }, + { + "epoch": 1.0025845729711103, + "grad_norm": 0.5239347815513611, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 8728 + }, + { + "epoch": 1.0026994428809373, + "grad_norm": 0.49250611662864685, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 8729 + }, + { + "epoch": 1.0028143127907645, + "grad_norm": 0.5004076361656189, + "learning_rate": 0.0001, + "loss": 1.435, + "step": 8730 + }, + { + "epoch": 1.0029291827005915, + "grad_norm": 0.519535481929779, + "learning_rate": 0.0001, + "loss": 1.5809, + "step": 8731 + }, + { + "epoch": 1.0030440526104187, + "grad_norm": 0.536308228969574, + "learning_rate": 0.0001, + "loss": 1.5055, + "step": 8732 + }, + { + "epoch": 1.0031589225202457, + "grad_norm": 0.5679396986961365, + "learning_rate": 0.0001, + "loss": 1.8589, + "step": 8733 + }, + { + "epoch": 1.003273792430073, + "grad_norm": 0.5036899447441101, + "learning_rate": 0.0001, + "loss": 1.4946, + "step": 8734 + }, + { + "epoch": 1.0033886623399, + "grad_norm": 0.4876912236213684, + "learning_rate": 0.0001, + "loss": 1.4041, + "step": 8735 + }, + { + "epoch": 1.0035035322497272, + "grad_norm": 0.4986098110675812, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 8736 + }, + { + "epoch": 1.0036184021595542, + "grad_norm": 0.4937874674797058, + "learning_rate": 0.0001, + "loss": 1.4384, + "step": 8737 + }, + { + "epoch": 1.0037332720693815, + "grad_norm": 0.5576295852661133, + "learning_rate": 0.0001, + "loss": 1.4278, + "step": 8738 + }, + { + "epoch": 1.0038481419792085, + "grad_norm": 0.5405202507972717, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 8739 + }, + { + "epoch": 1.0039630118890357, + "grad_norm": 0.5165756344795227, + "learning_rate": 0.0001, + "loss": 1.4411, + "step": 8740 + }, + { + "epoch": 1.0040778817988627, + "grad_norm": 0.4816116988658905, + "learning_rate": 0.0001, + "loss": 1.2506, + "step": 8741 + }, + { + "epoch": 1.00419275170869, + "grad_norm": 0.5027560591697693, + "learning_rate": 0.0001, + "loss": 1.4022, + "step": 8742 + }, + { + "epoch": 1.004307621618517, + "grad_norm": 0.5379549264907837, + "learning_rate": 0.0001, + "loss": 1.6524, + "step": 8743 + }, + { + "epoch": 1.0044224915283442, + "grad_norm": 0.6135854721069336, + "learning_rate": 0.0001, + "loss": 1.5376, + "step": 8744 + }, + { + "epoch": 1.0045373614381712, + "grad_norm": 0.4991096556186676, + "learning_rate": 0.0001, + "loss": 1.454, + "step": 8745 + }, + { + "epoch": 1.0046522313479984, + "grad_norm": 0.5324615836143494, + "learning_rate": 0.0001, + "loss": 1.3607, + "step": 8746 + }, + { + "epoch": 1.0047671012578254, + "grad_norm": 0.5299088358879089, + "learning_rate": 0.0001, + "loss": 1.4187, + "step": 8747 + }, + { + "epoch": 1.0048819711676527, + "grad_norm": 0.4894624650478363, + "learning_rate": 0.0001, + "loss": 1.5665, + "step": 8748 + }, + { + "epoch": 1.0049968410774797, + "grad_norm": 0.5205270648002625, + "learning_rate": 0.0001, + "loss": 1.4387, + "step": 8749 + }, + { + "epoch": 1.005111710987307, + "grad_norm": 0.4865090250968933, + "learning_rate": 0.0001, + "loss": 1.478, + "step": 8750 + }, + { + "epoch": 1.005226580897134, + "grad_norm": 0.5512129664421082, + "learning_rate": 0.0001, + "loss": 1.5537, + "step": 8751 + }, + { + "epoch": 1.0053414508069611, + "grad_norm": 0.484320729970932, + "learning_rate": 0.0001, + "loss": 1.5362, + "step": 8752 + }, + { + "epoch": 1.0054563207167881, + "grad_norm": 0.5077235698699951, + "learning_rate": 0.0001, + "loss": 1.5192, + "step": 8753 + }, + { + "epoch": 1.0055711906266154, + "grad_norm": 0.5155119895935059, + "learning_rate": 0.0001, + "loss": 1.4048, + "step": 8754 + }, + { + "epoch": 1.0056860605364424, + "grad_norm": 0.54921954870224, + "learning_rate": 0.0001, + "loss": 1.459, + "step": 8755 + }, + { + "epoch": 1.0058009304462696, + "grad_norm": 0.4844646751880646, + "learning_rate": 0.0001, + "loss": 1.7109, + "step": 8756 + }, + { + "epoch": 1.0059158003560966, + "grad_norm": 0.47289541363716125, + "learning_rate": 0.0001, + "loss": 1.3917, + "step": 8757 + }, + { + "epoch": 1.0060306702659239, + "grad_norm": 0.5090060234069824, + "learning_rate": 0.0001, + "loss": 1.4632, + "step": 8758 + }, + { + "epoch": 1.0061455401757509, + "grad_norm": 0.5446418523788452, + "learning_rate": 0.0001, + "loss": 1.4641, + "step": 8759 + }, + { + "epoch": 1.006260410085578, + "grad_norm": 0.4822421669960022, + "learning_rate": 0.0001, + "loss": 1.3481, + "step": 8760 + }, + { + "epoch": 1.006375279995405, + "grad_norm": 0.5040774345397949, + "learning_rate": 0.0001, + "loss": 1.5635, + "step": 8761 + }, + { + "epoch": 1.0064901499052323, + "grad_norm": 0.5044432282447815, + "learning_rate": 0.0001, + "loss": 1.3254, + "step": 8762 + }, + { + "epoch": 1.0066050198150593, + "grad_norm": 0.5607876181602478, + "learning_rate": 0.0001, + "loss": 1.5483, + "step": 8763 + }, + { + "epoch": 1.0067198897248866, + "grad_norm": 0.5420013666152954, + "learning_rate": 0.0001, + "loss": 1.5704, + "step": 8764 + }, + { + "epoch": 1.0068347596347136, + "grad_norm": 0.5551548004150391, + "learning_rate": 0.0001, + "loss": 1.5416, + "step": 8765 + }, + { + "epoch": 1.0069496295445408, + "grad_norm": 0.48607149720191956, + "learning_rate": 0.0001, + "loss": 1.4455, + "step": 8766 + }, + { + "epoch": 1.0070644994543678, + "grad_norm": 0.5381131172180176, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 8767 + }, + { + "epoch": 1.007179369364195, + "grad_norm": 0.5239553451538086, + "learning_rate": 0.0001, + "loss": 1.4548, + "step": 8768 + }, + { + "epoch": 1.007294239274022, + "grad_norm": 0.5344405174255371, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 8769 + }, + { + "epoch": 1.0074091091838493, + "grad_norm": 0.4784035384654999, + "learning_rate": 0.0001, + "loss": 1.3902, + "step": 8770 + }, + { + "epoch": 1.0075239790936763, + "grad_norm": 0.49824607372283936, + "learning_rate": 0.0001, + "loss": 1.4876, + "step": 8771 + }, + { + "epoch": 1.0076388490035035, + "grad_norm": 0.4996454119682312, + "learning_rate": 0.0001, + "loss": 1.4361, + "step": 8772 + }, + { + "epoch": 1.0077537189133305, + "grad_norm": 0.5669626593589783, + "learning_rate": 0.0001, + "loss": 1.5217, + "step": 8773 + }, + { + "epoch": 1.0078685888231578, + "grad_norm": 0.5511415600776672, + "learning_rate": 0.0001, + "loss": 1.4873, + "step": 8774 + }, + { + "epoch": 1.0079834587329848, + "grad_norm": 0.5880183577537537, + "learning_rate": 0.0001, + "loss": 1.6994, + "step": 8775 + }, + { + "epoch": 1.008098328642812, + "grad_norm": 0.47547656297683716, + "learning_rate": 0.0001, + "loss": 1.3952, + "step": 8776 + }, + { + "epoch": 1.0082131985526392, + "grad_norm": 0.49340614676475525, + "learning_rate": 0.0001, + "loss": 1.4364, + "step": 8777 + }, + { + "epoch": 1.0083280684624663, + "grad_norm": 0.5298573970794678, + "learning_rate": 0.0001, + "loss": 1.6306, + "step": 8778 + }, + { + "epoch": 1.0084429383722935, + "grad_norm": 0.5538324117660522, + "learning_rate": 0.0001, + "loss": 1.6803, + "step": 8779 + }, + { + "epoch": 1.0085578082821205, + "grad_norm": 0.5182627439498901, + "learning_rate": 0.0001, + "loss": 1.5341, + "step": 8780 + }, + { + "epoch": 1.0086726781919477, + "grad_norm": 0.516372561454773, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 8781 + }, + { + "epoch": 1.0087875481017747, + "grad_norm": 0.47818052768707275, + "learning_rate": 0.0001, + "loss": 1.4983, + "step": 8782 + }, + { + "epoch": 1.008902418011602, + "grad_norm": 0.5226326584815979, + "learning_rate": 0.0001, + "loss": 1.3928, + "step": 8783 + }, + { + "epoch": 1.009017287921429, + "grad_norm": 0.4940192401409149, + "learning_rate": 0.0001, + "loss": 1.4149, + "step": 8784 + }, + { + "epoch": 1.0091321578312562, + "grad_norm": 0.5674400329589844, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 8785 + }, + { + "epoch": 1.0092470277410832, + "grad_norm": 0.4928450286388397, + "learning_rate": 0.0001, + "loss": 1.4894, + "step": 8786 + }, + { + "epoch": 1.0093618976509104, + "grad_norm": 0.48630955815315247, + "learning_rate": 0.0001, + "loss": 1.3444, + "step": 8787 + }, + { + "epoch": 1.0094767675607375, + "grad_norm": 0.4933887720108032, + "learning_rate": 0.0001, + "loss": 1.414, + "step": 8788 + }, + { + "epoch": 1.0095916374705647, + "grad_norm": 0.587792158126831, + "learning_rate": 0.0001, + "loss": 1.3733, + "step": 8789 + }, + { + "epoch": 1.0097065073803917, + "grad_norm": 0.5260916352272034, + "learning_rate": 0.0001, + "loss": 1.3751, + "step": 8790 + }, + { + "epoch": 1.009821377290219, + "grad_norm": 0.55689936876297, + "learning_rate": 0.0001, + "loss": 1.501, + "step": 8791 + }, + { + "epoch": 1.009936247200046, + "grad_norm": 0.5287038087844849, + "learning_rate": 0.0001, + "loss": 1.3097, + "step": 8792 + }, + { + "epoch": 1.0100511171098732, + "grad_norm": 0.521734356880188, + "learning_rate": 0.0001, + "loss": 1.5742, + "step": 8793 + }, + { + "epoch": 1.0101659870197002, + "grad_norm": 0.5244892239570618, + "learning_rate": 0.0001, + "loss": 1.3727, + "step": 8794 + }, + { + "epoch": 1.0102808569295274, + "grad_norm": 0.519815981388092, + "learning_rate": 0.0001, + "loss": 1.3018, + "step": 8795 + }, + { + "epoch": 1.0103957268393544, + "grad_norm": 0.5823325514793396, + "learning_rate": 0.0001, + "loss": 1.5463, + "step": 8796 + }, + { + "epoch": 1.0105105967491816, + "grad_norm": 0.51022869348526, + "learning_rate": 0.0001, + "loss": 1.2856, + "step": 8797 + }, + { + "epoch": 1.0106254666590087, + "grad_norm": 0.530922532081604, + "learning_rate": 0.0001, + "loss": 1.7368, + "step": 8798 + }, + { + "epoch": 1.0107403365688359, + "grad_norm": 0.48666539788246155, + "learning_rate": 0.0001, + "loss": 1.3965, + "step": 8799 + }, + { + "epoch": 1.010855206478663, + "grad_norm": 0.5583677291870117, + "learning_rate": 0.0001, + "loss": 1.499, + "step": 8800 + }, + { + "epoch": 1.0109700763884901, + "grad_norm": 0.5303579568862915, + "learning_rate": 0.0001, + "loss": 1.5556, + "step": 8801 + }, + { + "epoch": 1.0110849462983171, + "grad_norm": 0.5240969657897949, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 8802 + }, + { + "epoch": 1.0111998162081444, + "grad_norm": 0.5126948356628418, + "learning_rate": 0.0001, + "loss": 1.4357, + "step": 8803 + }, + { + "epoch": 1.0113146861179714, + "grad_norm": 0.5634292960166931, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 8804 + }, + { + "epoch": 1.0114295560277986, + "grad_norm": 0.5226359963417053, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 8805 + }, + { + "epoch": 1.0115444259376256, + "grad_norm": 0.4996303617954254, + "learning_rate": 0.0001, + "loss": 1.3072, + "step": 8806 + }, + { + "epoch": 1.0116592958474528, + "grad_norm": 0.49595144391059875, + "learning_rate": 0.0001, + "loss": 1.5657, + "step": 8807 + }, + { + "epoch": 1.0117741657572799, + "grad_norm": 0.5020740628242493, + "learning_rate": 0.0001, + "loss": 1.3781, + "step": 8808 + }, + { + "epoch": 1.011889035667107, + "grad_norm": 0.49597975611686707, + "learning_rate": 0.0001, + "loss": 1.4879, + "step": 8809 + }, + { + "epoch": 1.012003905576934, + "grad_norm": 0.5139532685279846, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 8810 + }, + { + "epoch": 1.0121187754867613, + "grad_norm": 0.4895060658454895, + "learning_rate": 0.0001, + "loss": 1.3705, + "step": 8811 + }, + { + "epoch": 1.0122336453965883, + "grad_norm": 0.47956687211990356, + "learning_rate": 0.0001, + "loss": 1.3134, + "step": 8812 + }, + { + "epoch": 1.0123485153064156, + "grad_norm": 0.4845159649848938, + "learning_rate": 0.0001, + "loss": 1.3541, + "step": 8813 + }, + { + "epoch": 1.0124633852162426, + "grad_norm": 0.5296355485916138, + "learning_rate": 0.0001, + "loss": 1.5696, + "step": 8814 + }, + { + "epoch": 1.0125782551260698, + "grad_norm": 0.5246522426605225, + "learning_rate": 0.0001, + "loss": 1.6872, + "step": 8815 + }, + { + "epoch": 1.0126931250358968, + "grad_norm": 0.5685099959373474, + "learning_rate": 0.0001, + "loss": 1.4409, + "step": 8816 + }, + { + "epoch": 1.012807994945724, + "grad_norm": 0.47896215319633484, + "learning_rate": 0.0001, + "loss": 1.4345, + "step": 8817 + }, + { + "epoch": 1.012922864855551, + "grad_norm": 0.487551748752594, + "learning_rate": 0.0001, + "loss": 1.1566, + "step": 8818 + }, + { + "epoch": 1.0130377347653783, + "grad_norm": 0.5409836173057556, + "learning_rate": 0.0001, + "loss": 1.6797, + "step": 8819 + }, + { + "epoch": 1.0131526046752053, + "grad_norm": 0.5281198620796204, + "learning_rate": 0.0001, + "loss": 1.5419, + "step": 8820 + }, + { + "epoch": 1.0132674745850325, + "grad_norm": 0.5546361804008484, + "learning_rate": 0.0001, + "loss": 1.5202, + "step": 8821 + }, + { + "epoch": 1.0133823444948595, + "grad_norm": 0.5375561118125916, + "learning_rate": 0.0001, + "loss": 1.4132, + "step": 8822 + }, + { + "epoch": 1.0134972144046868, + "grad_norm": 0.5230275988578796, + "learning_rate": 0.0001, + "loss": 1.242, + "step": 8823 + }, + { + "epoch": 1.0136120843145138, + "grad_norm": 0.5615382194519043, + "learning_rate": 0.0001, + "loss": 1.585, + "step": 8824 + }, + { + "epoch": 1.013726954224341, + "grad_norm": 0.49817347526550293, + "learning_rate": 0.0001, + "loss": 1.2932, + "step": 8825 + }, + { + "epoch": 1.013841824134168, + "grad_norm": 0.5396572947502136, + "learning_rate": 0.0001, + "loss": 1.5191, + "step": 8826 + }, + { + "epoch": 1.0139566940439952, + "grad_norm": 0.5678941607475281, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 8827 + }, + { + "epoch": 1.0140715639538223, + "grad_norm": 0.5782578587532043, + "learning_rate": 0.0001, + "loss": 1.6673, + "step": 8828 + }, + { + "epoch": 1.0141864338636495, + "grad_norm": 0.502582311630249, + "learning_rate": 0.0001, + "loss": 1.614, + "step": 8829 + }, + { + "epoch": 1.0143013037734765, + "grad_norm": 0.6554863452911377, + "learning_rate": 0.0001, + "loss": 1.4321, + "step": 8830 + }, + { + "epoch": 1.0144161736833037, + "grad_norm": 0.5245055556297302, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 8831 + }, + { + "epoch": 1.0145310435931307, + "grad_norm": 0.4950437843799591, + "learning_rate": 0.0001, + "loss": 1.404, + "step": 8832 + }, + { + "epoch": 1.014645913502958, + "grad_norm": 0.48556363582611084, + "learning_rate": 0.0001, + "loss": 1.3682, + "step": 8833 + }, + { + "epoch": 1.014760783412785, + "grad_norm": 0.5288364291191101, + "learning_rate": 0.0001, + "loss": 1.3036, + "step": 8834 + }, + { + "epoch": 1.0148756533226122, + "grad_norm": 0.5161809325218201, + "learning_rate": 0.0001, + "loss": 1.4239, + "step": 8835 + }, + { + "epoch": 1.0149905232324392, + "grad_norm": 0.6864306330680847, + "learning_rate": 0.0001, + "loss": 1.4739, + "step": 8836 + }, + { + "epoch": 1.0151053931422664, + "grad_norm": 0.5075218081474304, + "learning_rate": 0.0001, + "loss": 1.4884, + "step": 8837 + }, + { + "epoch": 1.0152202630520935, + "grad_norm": 0.5431599617004395, + "learning_rate": 0.0001, + "loss": 1.522, + "step": 8838 + }, + { + "epoch": 1.0153351329619207, + "grad_norm": 0.49773699045181274, + "learning_rate": 0.0001, + "loss": 1.4085, + "step": 8839 + }, + { + "epoch": 1.0154500028717477, + "grad_norm": 0.5445903539657593, + "learning_rate": 0.0001, + "loss": 1.6883, + "step": 8840 + }, + { + "epoch": 1.015564872781575, + "grad_norm": 0.5013106465339661, + "learning_rate": 0.0001, + "loss": 1.489, + "step": 8841 + }, + { + "epoch": 1.015679742691402, + "grad_norm": 0.48422935605049133, + "learning_rate": 0.0001, + "loss": 1.4696, + "step": 8842 + }, + { + "epoch": 1.0157946126012292, + "grad_norm": 0.507689893245697, + "learning_rate": 0.0001, + "loss": 1.5015, + "step": 8843 + }, + { + "epoch": 1.0159094825110562, + "grad_norm": 0.48591098189353943, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 8844 + }, + { + "epoch": 1.0160243524208834, + "grad_norm": 0.5136837363243103, + "learning_rate": 0.0001, + "loss": 1.3958, + "step": 8845 + }, + { + "epoch": 1.0161392223307104, + "grad_norm": 0.5529723167419434, + "learning_rate": 0.0001, + "loss": 1.6141, + "step": 8846 + }, + { + "epoch": 1.0162540922405376, + "grad_norm": 0.5329626202583313, + "learning_rate": 0.0001, + "loss": 1.5841, + "step": 8847 + }, + { + "epoch": 1.0163689621503647, + "grad_norm": 0.5492246747016907, + "learning_rate": 0.0001, + "loss": 1.322, + "step": 8848 + }, + { + "epoch": 1.0164838320601919, + "grad_norm": 0.5101956725120544, + "learning_rate": 0.0001, + "loss": 1.4037, + "step": 8849 + }, + { + "epoch": 1.016598701970019, + "grad_norm": 0.5277436971664429, + "learning_rate": 0.0001, + "loss": 1.5163, + "step": 8850 + }, + { + "epoch": 1.0167135718798461, + "grad_norm": 0.5436544418334961, + "learning_rate": 0.0001, + "loss": 1.4159, + "step": 8851 + }, + { + "epoch": 1.0168284417896731, + "grad_norm": 0.5090106725692749, + "learning_rate": 0.0001, + "loss": 1.4402, + "step": 8852 + }, + { + "epoch": 1.0169433116995004, + "grad_norm": 0.5208089351654053, + "learning_rate": 0.0001, + "loss": 1.3969, + "step": 8853 + }, + { + "epoch": 1.0170581816093274, + "grad_norm": 0.506205141544342, + "learning_rate": 0.0001, + "loss": 1.2677, + "step": 8854 + }, + { + "epoch": 1.0171730515191546, + "grad_norm": 0.4834211468696594, + "learning_rate": 0.0001, + "loss": 1.4207, + "step": 8855 + }, + { + "epoch": 1.0172879214289816, + "grad_norm": 0.4916054606437683, + "learning_rate": 0.0001, + "loss": 1.6002, + "step": 8856 + }, + { + "epoch": 1.0174027913388088, + "grad_norm": 0.5485023856163025, + "learning_rate": 0.0001, + "loss": 1.4735, + "step": 8857 + }, + { + "epoch": 1.0175176612486359, + "grad_norm": 0.5115214586257935, + "learning_rate": 0.0001, + "loss": 1.4413, + "step": 8858 + }, + { + "epoch": 1.017632531158463, + "grad_norm": 0.526879608631134, + "learning_rate": 0.0001, + "loss": 1.5169, + "step": 8859 + }, + { + "epoch": 1.01774740106829, + "grad_norm": 0.547588050365448, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 8860 + }, + { + "epoch": 1.0178622709781173, + "grad_norm": 0.4805614948272705, + "learning_rate": 0.0001, + "loss": 1.2985, + "step": 8861 + }, + { + "epoch": 1.0179771408879443, + "grad_norm": 0.5069999098777771, + "learning_rate": 0.0001, + "loss": 1.4824, + "step": 8862 + }, + { + "epoch": 1.0180920107977716, + "grad_norm": 0.4949933588504791, + "learning_rate": 0.0001, + "loss": 1.483, + "step": 8863 + }, + { + "epoch": 1.0182068807075986, + "grad_norm": 0.4933357238769531, + "learning_rate": 0.0001, + "loss": 1.4847, + "step": 8864 + }, + { + "epoch": 1.0183217506174258, + "grad_norm": 0.5064468383789062, + "learning_rate": 0.0001, + "loss": 1.5914, + "step": 8865 + }, + { + "epoch": 1.0184366205272528, + "grad_norm": 0.5541262030601501, + "learning_rate": 0.0001, + "loss": 1.5795, + "step": 8866 + }, + { + "epoch": 1.01855149043708, + "grad_norm": 0.5688692331314087, + "learning_rate": 0.0001, + "loss": 1.565, + "step": 8867 + }, + { + "epoch": 1.018666360346907, + "grad_norm": 0.5120310187339783, + "learning_rate": 0.0001, + "loss": 1.4911, + "step": 8868 + }, + { + "epoch": 1.0187812302567343, + "grad_norm": 0.5274046063423157, + "learning_rate": 0.0001, + "loss": 1.6001, + "step": 8869 + }, + { + "epoch": 1.0188961001665613, + "grad_norm": 0.5022189021110535, + "learning_rate": 0.0001, + "loss": 1.4302, + "step": 8870 + }, + { + "epoch": 1.0190109700763885, + "grad_norm": 0.5581625699996948, + "learning_rate": 0.0001, + "loss": 1.6717, + "step": 8871 + }, + { + "epoch": 1.0191258399862155, + "grad_norm": 0.535330593585968, + "learning_rate": 0.0001, + "loss": 1.4295, + "step": 8872 + }, + { + "epoch": 1.0192407098960428, + "grad_norm": 0.4544619023799896, + "learning_rate": 0.0001, + "loss": 1.3384, + "step": 8873 + }, + { + "epoch": 1.0193555798058698, + "grad_norm": 0.6101300716400146, + "learning_rate": 0.0001, + "loss": 1.3963, + "step": 8874 + }, + { + "epoch": 1.019470449715697, + "grad_norm": 0.5120131969451904, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 8875 + }, + { + "epoch": 1.019585319625524, + "grad_norm": 0.49319207668304443, + "learning_rate": 0.0001, + "loss": 1.5138, + "step": 8876 + }, + { + "epoch": 1.0197001895353512, + "grad_norm": 0.5753055810928345, + "learning_rate": 0.0001, + "loss": 1.5648, + "step": 8877 + }, + { + "epoch": 1.0198150594451783, + "grad_norm": 0.5088034868240356, + "learning_rate": 0.0001, + "loss": 1.3886, + "step": 8878 + }, + { + "epoch": 1.0199299293550055, + "grad_norm": 0.5577898621559143, + "learning_rate": 0.0001, + "loss": 1.4367, + "step": 8879 + }, + { + "epoch": 1.0200447992648325, + "grad_norm": 0.5172377228736877, + "learning_rate": 0.0001, + "loss": 1.5446, + "step": 8880 + }, + { + "epoch": 1.0201596691746597, + "grad_norm": 0.49840047955513, + "learning_rate": 0.0001, + "loss": 1.3737, + "step": 8881 + }, + { + "epoch": 1.0202745390844867, + "grad_norm": 0.5156157612800598, + "learning_rate": 0.0001, + "loss": 1.4591, + "step": 8882 + }, + { + "epoch": 1.020389408994314, + "grad_norm": 0.4850468337535858, + "learning_rate": 0.0001, + "loss": 1.4474, + "step": 8883 + }, + { + "epoch": 1.020504278904141, + "grad_norm": 0.5439092516899109, + "learning_rate": 0.0001, + "loss": 1.3706, + "step": 8884 + }, + { + "epoch": 1.0206191488139682, + "grad_norm": 0.5357840657234192, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 8885 + }, + { + "epoch": 1.0207340187237952, + "grad_norm": 0.5319818258285522, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 8886 + }, + { + "epoch": 1.0208488886336224, + "grad_norm": 0.52201247215271, + "learning_rate": 0.0001, + "loss": 1.6158, + "step": 8887 + }, + { + "epoch": 1.0209637585434495, + "grad_norm": 0.4789310097694397, + "learning_rate": 0.0001, + "loss": 1.4419, + "step": 8888 + }, + { + "epoch": 1.0210786284532767, + "grad_norm": 0.5354008078575134, + "learning_rate": 0.0001, + "loss": 1.6706, + "step": 8889 + }, + { + "epoch": 1.0211934983631037, + "grad_norm": 0.5723029971122742, + "learning_rate": 0.0001, + "loss": 1.5574, + "step": 8890 + }, + { + "epoch": 1.021308368272931, + "grad_norm": 0.4998638927936554, + "learning_rate": 0.0001, + "loss": 1.5325, + "step": 8891 + }, + { + "epoch": 1.021423238182758, + "grad_norm": 0.5805894732475281, + "learning_rate": 0.0001, + "loss": 1.7066, + "step": 8892 + }, + { + "epoch": 1.0215381080925852, + "grad_norm": 0.6122562885284424, + "learning_rate": 0.0001, + "loss": 1.5378, + "step": 8893 + }, + { + "epoch": 1.0216529780024122, + "grad_norm": 0.5063309073448181, + "learning_rate": 0.0001, + "loss": 1.4165, + "step": 8894 + }, + { + "epoch": 1.0217678479122394, + "grad_norm": 0.5316261053085327, + "learning_rate": 0.0001, + "loss": 1.4106, + "step": 8895 + }, + { + "epoch": 1.0218827178220664, + "grad_norm": 0.5456799268722534, + "learning_rate": 0.0001, + "loss": 1.3952, + "step": 8896 + }, + { + "epoch": 1.0219975877318936, + "grad_norm": 0.5277631878852844, + "learning_rate": 0.0001, + "loss": 1.502, + "step": 8897 + }, + { + "epoch": 1.0221124576417207, + "grad_norm": 0.5488672852516174, + "learning_rate": 0.0001, + "loss": 1.4201, + "step": 8898 + }, + { + "epoch": 1.0222273275515479, + "grad_norm": 0.5567140579223633, + "learning_rate": 0.0001, + "loss": 1.5074, + "step": 8899 + }, + { + "epoch": 1.022342197461375, + "grad_norm": 0.5117017030715942, + "learning_rate": 0.0001, + "loss": 1.5041, + "step": 8900 + }, + { + "epoch": 1.0224570673712021, + "grad_norm": 0.569999098777771, + "learning_rate": 0.0001, + "loss": 1.4689, + "step": 8901 + }, + { + "epoch": 1.0225719372810291, + "grad_norm": 0.5409071445465088, + "learning_rate": 0.0001, + "loss": 1.3848, + "step": 8902 + }, + { + "epoch": 1.0226868071908564, + "grad_norm": 0.5431987047195435, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 8903 + }, + { + "epoch": 1.0228016771006834, + "grad_norm": 0.4773072302341461, + "learning_rate": 0.0001, + "loss": 1.3969, + "step": 8904 + }, + { + "epoch": 1.0229165470105106, + "grad_norm": 0.48939380049705505, + "learning_rate": 0.0001, + "loss": 1.2671, + "step": 8905 + }, + { + "epoch": 1.0230314169203376, + "grad_norm": 0.5301325917243958, + "learning_rate": 0.0001, + "loss": 1.6652, + "step": 8906 + }, + { + "epoch": 1.0231462868301648, + "grad_norm": 0.5332849621772766, + "learning_rate": 0.0001, + "loss": 1.4009, + "step": 8907 + }, + { + "epoch": 1.0232611567399919, + "grad_norm": 0.5592193603515625, + "learning_rate": 0.0001, + "loss": 1.4986, + "step": 8908 + }, + { + "epoch": 1.023376026649819, + "grad_norm": 0.5041161179542542, + "learning_rate": 0.0001, + "loss": 1.3943, + "step": 8909 + }, + { + "epoch": 1.0234908965596463, + "grad_norm": 0.5014781951904297, + "learning_rate": 0.0001, + "loss": 1.424, + "step": 8910 + }, + { + "epoch": 1.0236057664694733, + "grad_norm": 0.49788087606430054, + "learning_rate": 0.0001, + "loss": 1.442, + "step": 8911 + }, + { + "epoch": 1.0237206363793003, + "grad_norm": 0.5221337080001831, + "learning_rate": 0.0001, + "loss": 1.3539, + "step": 8912 + }, + { + "epoch": 1.0238355062891276, + "grad_norm": 0.5878422856330872, + "learning_rate": 0.0001, + "loss": 1.5483, + "step": 8913 + }, + { + "epoch": 1.0239503761989548, + "grad_norm": 0.5616274476051331, + "learning_rate": 0.0001, + "loss": 1.4821, + "step": 8914 + }, + { + "epoch": 1.0240652461087818, + "grad_norm": 0.4898712635040283, + "learning_rate": 0.0001, + "loss": 1.4507, + "step": 8915 + }, + { + "epoch": 1.024180116018609, + "grad_norm": 0.5116493105888367, + "learning_rate": 0.0001, + "loss": 1.4732, + "step": 8916 + }, + { + "epoch": 1.024294985928436, + "grad_norm": 0.5652375221252441, + "learning_rate": 0.0001, + "loss": 1.7441, + "step": 8917 + }, + { + "epoch": 1.0244098558382633, + "grad_norm": 0.525242805480957, + "learning_rate": 0.0001, + "loss": 1.3921, + "step": 8918 + }, + { + "epoch": 1.0245247257480903, + "grad_norm": 0.5136474370956421, + "learning_rate": 0.0001, + "loss": 1.4856, + "step": 8919 + }, + { + "epoch": 1.0246395956579175, + "grad_norm": 0.5519577860832214, + "learning_rate": 0.0001, + "loss": 1.4674, + "step": 8920 + }, + { + "epoch": 1.0247544655677445, + "grad_norm": 0.5397735834121704, + "learning_rate": 0.0001, + "loss": 1.4107, + "step": 8921 + }, + { + "epoch": 1.0248693354775718, + "grad_norm": 0.530933141708374, + "learning_rate": 0.0001, + "loss": 1.399, + "step": 8922 + }, + { + "epoch": 1.0249842053873988, + "grad_norm": 0.5000669956207275, + "learning_rate": 0.0001, + "loss": 1.5068, + "step": 8923 + }, + { + "epoch": 1.025099075297226, + "grad_norm": 0.5061250329017639, + "learning_rate": 0.0001, + "loss": 1.2578, + "step": 8924 + }, + { + "epoch": 1.025213945207053, + "grad_norm": 0.5225130915641785, + "learning_rate": 0.0001, + "loss": 1.5042, + "step": 8925 + }, + { + "epoch": 1.0253288151168802, + "grad_norm": 0.567387044429779, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 8926 + }, + { + "epoch": 1.0254436850267072, + "grad_norm": 0.6644077897071838, + "learning_rate": 0.0001, + "loss": 1.3388, + "step": 8927 + }, + { + "epoch": 1.0255585549365345, + "grad_norm": 0.5568031668663025, + "learning_rate": 0.0001, + "loss": 1.5316, + "step": 8928 + }, + { + "epoch": 1.0256734248463615, + "grad_norm": 0.45952072739601135, + "learning_rate": 0.0001, + "loss": 1.2552, + "step": 8929 + }, + { + "epoch": 1.0257882947561887, + "grad_norm": 0.4860810935497284, + "learning_rate": 0.0001, + "loss": 1.2508, + "step": 8930 + }, + { + "epoch": 1.0259031646660157, + "grad_norm": 0.5749561190605164, + "learning_rate": 0.0001, + "loss": 1.4491, + "step": 8931 + }, + { + "epoch": 1.026018034575843, + "grad_norm": 0.5022095441818237, + "learning_rate": 0.0001, + "loss": 1.2589, + "step": 8932 + }, + { + "epoch": 1.02613290448567, + "grad_norm": 0.4758133888244629, + "learning_rate": 0.0001, + "loss": 1.3553, + "step": 8933 + }, + { + "epoch": 1.0262477743954972, + "grad_norm": 0.5137754678726196, + "learning_rate": 0.0001, + "loss": 1.4829, + "step": 8934 + }, + { + "epoch": 1.0263626443053242, + "grad_norm": 0.517978847026825, + "learning_rate": 0.0001, + "loss": 1.2764, + "step": 8935 + }, + { + "epoch": 1.0264775142151514, + "grad_norm": 0.527491569519043, + "learning_rate": 0.0001, + "loss": 1.4839, + "step": 8936 + }, + { + "epoch": 1.0265923841249784, + "grad_norm": 0.5537298321723938, + "learning_rate": 0.0001, + "loss": 1.4757, + "step": 8937 + }, + { + "epoch": 1.0267072540348057, + "grad_norm": 0.49527880549430847, + "learning_rate": 0.0001, + "loss": 1.4042, + "step": 8938 + }, + { + "epoch": 1.0268221239446327, + "grad_norm": 0.5914473533630371, + "learning_rate": 0.0001, + "loss": 1.5999, + "step": 8939 + }, + { + "epoch": 1.02693699385446, + "grad_norm": 0.5270557403564453, + "learning_rate": 0.0001, + "loss": 1.5776, + "step": 8940 + }, + { + "epoch": 1.027051863764287, + "grad_norm": 0.4984856843948364, + "learning_rate": 0.0001, + "loss": 1.4058, + "step": 8941 + }, + { + "epoch": 1.0271667336741142, + "grad_norm": 0.5142775177955627, + "learning_rate": 0.0001, + "loss": 1.4986, + "step": 8942 + }, + { + "epoch": 1.0272816035839412, + "grad_norm": 0.5666712522506714, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 8943 + }, + { + "epoch": 1.0273964734937684, + "grad_norm": 0.515430212020874, + "learning_rate": 0.0001, + "loss": 1.5156, + "step": 8944 + }, + { + "epoch": 1.0275113434035954, + "grad_norm": 0.4515966475009918, + "learning_rate": 0.0001, + "loss": 1.0998, + "step": 8945 + }, + { + "epoch": 1.0276262133134226, + "grad_norm": 0.5609683394432068, + "learning_rate": 0.0001, + "loss": 1.5127, + "step": 8946 + }, + { + "epoch": 1.0277410832232496, + "grad_norm": 0.616452157497406, + "learning_rate": 0.0001, + "loss": 1.4311, + "step": 8947 + }, + { + "epoch": 1.0278559531330769, + "grad_norm": 0.5427160859107971, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 8948 + }, + { + "epoch": 1.0279708230429039, + "grad_norm": 0.5293609499931335, + "learning_rate": 0.0001, + "loss": 1.3271, + "step": 8949 + }, + { + "epoch": 1.0280856929527311, + "grad_norm": 0.5510618686676025, + "learning_rate": 0.0001, + "loss": 1.4305, + "step": 8950 + }, + { + "epoch": 1.0282005628625581, + "grad_norm": 0.5403707027435303, + "learning_rate": 0.0001, + "loss": 1.5898, + "step": 8951 + }, + { + "epoch": 1.0283154327723854, + "grad_norm": 0.5419697761535645, + "learning_rate": 0.0001, + "loss": 1.6068, + "step": 8952 + }, + { + "epoch": 1.0284303026822124, + "grad_norm": 0.5279735326766968, + "learning_rate": 0.0001, + "loss": 1.3857, + "step": 8953 + }, + { + "epoch": 1.0285451725920396, + "grad_norm": 0.48474159836769104, + "learning_rate": 0.0001, + "loss": 1.4308, + "step": 8954 + }, + { + "epoch": 1.0286600425018666, + "grad_norm": 0.5220118761062622, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 8955 + }, + { + "epoch": 1.0287749124116938, + "grad_norm": 0.5131214261054993, + "learning_rate": 0.0001, + "loss": 1.449, + "step": 8956 + }, + { + "epoch": 1.0288897823215208, + "grad_norm": 0.5555469989776611, + "learning_rate": 0.0001, + "loss": 1.4271, + "step": 8957 + }, + { + "epoch": 1.029004652231348, + "grad_norm": 0.5716768503189087, + "learning_rate": 0.0001, + "loss": 1.7778, + "step": 8958 + }, + { + "epoch": 1.029119522141175, + "grad_norm": 0.5097254514694214, + "learning_rate": 0.0001, + "loss": 1.5314, + "step": 8959 + }, + { + "epoch": 1.0292343920510023, + "grad_norm": 0.5094704031944275, + "learning_rate": 0.0001, + "loss": 1.5173, + "step": 8960 + }, + { + "epoch": 1.0293492619608293, + "grad_norm": 0.49823570251464844, + "learning_rate": 0.0001, + "loss": 1.4139, + "step": 8961 + }, + { + "epoch": 1.0294641318706566, + "grad_norm": 0.5155112147331238, + "learning_rate": 0.0001, + "loss": 1.4046, + "step": 8962 + }, + { + "epoch": 1.0295790017804836, + "grad_norm": 0.4962422549724579, + "learning_rate": 0.0001, + "loss": 1.5553, + "step": 8963 + }, + { + "epoch": 1.0296938716903108, + "grad_norm": 0.530081570148468, + "learning_rate": 0.0001, + "loss": 1.5269, + "step": 8964 + }, + { + "epoch": 1.0298087416001378, + "grad_norm": 0.5400624871253967, + "learning_rate": 0.0001, + "loss": 1.5078, + "step": 8965 + }, + { + "epoch": 1.029923611509965, + "grad_norm": 0.57288658618927, + "learning_rate": 0.0001, + "loss": 1.2249, + "step": 8966 + }, + { + "epoch": 1.030038481419792, + "grad_norm": 0.5239868760108948, + "learning_rate": 0.0001, + "loss": 1.3921, + "step": 8967 + }, + { + "epoch": 1.0301533513296193, + "grad_norm": 0.5611276030540466, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 8968 + }, + { + "epoch": 1.0302682212394463, + "grad_norm": 0.5283910036087036, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 8969 + }, + { + "epoch": 1.0303830911492735, + "grad_norm": 0.5439105033874512, + "learning_rate": 0.0001, + "loss": 1.4536, + "step": 8970 + }, + { + "epoch": 1.0304979610591005, + "grad_norm": 0.5273348093032837, + "learning_rate": 0.0001, + "loss": 1.683, + "step": 8971 + }, + { + "epoch": 1.0306128309689278, + "grad_norm": 0.5008038282394409, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 8972 + }, + { + "epoch": 1.0307277008787548, + "grad_norm": 0.5743789672851562, + "learning_rate": 0.0001, + "loss": 1.4544, + "step": 8973 + }, + { + "epoch": 1.030842570788582, + "grad_norm": 0.5148991346359253, + "learning_rate": 0.0001, + "loss": 1.3179, + "step": 8974 + }, + { + "epoch": 1.030957440698409, + "grad_norm": 0.5285124182701111, + "learning_rate": 0.0001, + "loss": 1.5723, + "step": 8975 + }, + { + "epoch": 1.0310723106082362, + "grad_norm": 0.5112460851669312, + "learning_rate": 0.0001, + "loss": 1.486, + "step": 8976 + }, + { + "epoch": 1.0311871805180632, + "grad_norm": 0.5338481068611145, + "learning_rate": 0.0001, + "loss": 1.3635, + "step": 8977 + }, + { + "epoch": 1.0313020504278905, + "grad_norm": 0.5114549994468689, + "learning_rate": 0.0001, + "loss": 1.1468, + "step": 8978 + }, + { + "epoch": 1.0314169203377175, + "grad_norm": 0.5220407247543335, + "learning_rate": 0.0001, + "loss": 1.5003, + "step": 8979 + }, + { + "epoch": 1.0315317902475447, + "grad_norm": 0.47584792971611023, + "learning_rate": 0.0001, + "loss": 1.4724, + "step": 8980 + }, + { + "epoch": 1.0316466601573717, + "grad_norm": 0.5218107104301453, + "learning_rate": 0.0001, + "loss": 1.4703, + "step": 8981 + }, + { + "epoch": 1.031761530067199, + "grad_norm": 0.5085425972938538, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 8982 + }, + { + "epoch": 1.031876399977026, + "grad_norm": 0.5143735408782959, + "learning_rate": 0.0001, + "loss": 1.2407, + "step": 8983 + }, + { + "epoch": 1.0319912698868532, + "grad_norm": 0.47457659244537354, + "learning_rate": 0.0001, + "loss": 1.3498, + "step": 8984 + }, + { + "epoch": 1.0321061397966802, + "grad_norm": 0.5300225615501404, + "learning_rate": 0.0001, + "loss": 1.4482, + "step": 8985 + }, + { + "epoch": 1.0322210097065074, + "grad_norm": 0.5350293517112732, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 8986 + }, + { + "epoch": 1.0323358796163344, + "grad_norm": 0.5425633788108826, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 8987 + }, + { + "epoch": 1.0324507495261617, + "grad_norm": 0.5491638779640198, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 8988 + }, + { + "epoch": 1.0325656194359887, + "grad_norm": 0.47783395648002625, + "learning_rate": 0.0001, + "loss": 1.3218, + "step": 8989 + }, + { + "epoch": 1.032680489345816, + "grad_norm": 0.57433021068573, + "learning_rate": 0.0001, + "loss": 1.5094, + "step": 8990 + }, + { + "epoch": 1.032795359255643, + "grad_norm": 0.5770861506462097, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 8991 + }, + { + "epoch": 1.0329102291654702, + "grad_norm": 0.5487734079360962, + "learning_rate": 0.0001, + "loss": 1.6067, + "step": 8992 + }, + { + "epoch": 1.0330250990752972, + "grad_norm": 0.5157055854797363, + "learning_rate": 0.0001, + "loss": 1.4258, + "step": 8993 + }, + { + "epoch": 1.0331399689851244, + "grad_norm": 0.5037803649902344, + "learning_rate": 0.0001, + "loss": 1.3896, + "step": 8994 + }, + { + "epoch": 1.0332548388949514, + "grad_norm": 0.5406895875930786, + "learning_rate": 0.0001, + "loss": 1.5333, + "step": 8995 + }, + { + "epoch": 1.0333697088047786, + "grad_norm": 0.5066888332366943, + "learning_rate": 0.0001, + "loss": 1.5356, + "step": 8996 + }, + { + "epoch": 1.0334845787146056, + "grad_norm": 0.48853886127471924, + "learning_rate": 0.0001, + "loss": 1.5338, + "step": 8997 + }, + { + "epoch": 1.0335994486244329, + "grad_norm": 0.48481160402297974, + "learning_rate": 0.0001, + "loss": 1.406, + "step": 8998 + }, + { + "epoch": 1.0337143185342599, + "grad_norm": 0.4979383945465088, + "learning_rate": 0.0001, + "loss": 1.3085, + "step": 8999 + }, + { + "epoch": 1.033829188444087, + "grad_norm": 0.5427919030189514, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 9000 + }, + { + "epoch": 1.0339440583539141, + "grad_norm": 0.5497532486915588, + "learning_rate": 0.0001, + "loss": 1.4961, + "step": 9001 + }, + { + "epoch": 1.0340589282637414, + "grad_norm": 0.5383864045143127, + "learning_rate": 0.0001, + "loss": 1.263, + "step": 9002 + }, + { + "epoch": 1.0341737981735684, + "grad_norm": 0.5036965012550354, + "learning_rate": 0.0001, + "loss": 1.4651, + "step": 9003 + }, + { + "epoch": 1.0342886680833956, + "grad_norm": 0.555041491985321, + "learning_rate": 0.0001, + "loss": 1.5879, + "step": 9004 + }, + { + "epoch": 1.0344035379932226, + "grad_norm": 0.5549968481063843, + "learning_rate": 0.0001, + "loss": 1.5353, + "step": 9005 + }, + { + "epoch": 1.0345184079030498, + "grad_norm": 0.579571008682251, + "learning_rate": 0.0001, + "loss": 1.458, + "step": 9006 + }, + { + "epoch": 1.0346332778128768, + "grad_norm": 0.5244538187980652, + "learning_rate": 0.0001, + "loss": 1.4796, + "step": 9007 + }, + { + "epoch": 1.034748147722704, + "grad_norm": 0.544011116027832, + "learning_rate": 0.0001, + "loss": 1.5416, + "step": 9008 + }, + { + "epoch": 1.034863017632531, + "grad_norm": 0.5533955097198486, + "learning_rate": 0.0001, + "loss": 1.6356, + "step": 9009 + }, + { + "epoch": 1.0349778875423583, + "grad_norm": 0.5618937611579895, + "learning_rate": 0.0001, + "loss": 1.6885, + "step": 9010 + }, + { + "epoch": 1.0350927574521853, + "grad_norm": 0.5388464331626892, + "learning_rate": 0.0001, + "loss": 1.4549, + "step": 9011 + }, + { + "epoch": 1.0352076273620126, + "grad_norm": 0.5120463967323303, + "learning_rate": 0.0001, + "loss": 1.509, + "step": 9012 + }, + { + "epoch": 1.0353224972718396, + "grad_norm": 0.5093674659729004, + "learning_rate": 0.0001, + "loss": 1.4645, + "step": 9013 + }, + { + "epoch": 1.0354373671816668, + "grad_norm": 0.4842431843280792, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 9014 + }, + { + "epoch": 1.0355522370914938, + "grad_norm": 0.5035402774810791, + "learning_rate": 0.0001, + "loss": 1.443, + "step": 9015 + }, + { + "epoch": 1.035667107001321, + "grad_norm": 0.5231695771217346, + "learning_rate": 0.0001, + "loss": 1.4145, + "step": 9016 + }, + { + "epoch": 1.035781976911148, + "grad_norm": 0.5481805801391602, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 9017 + }, + { + "epoch": 1.0358968468209753, + "grad_norm": 0.510319173336029, + "learning_rate": 0.0001, + "loss": 1.4618, + "step": 9018 + }, + { + "epoch": 1.0360117167308023, + "grad_norm": 0.5354859828948975, + "learning_rate": 0.0001, + "loss": 1.3844, + "step": 9019 + }, + { + "epoch": 1.0361265866406295, + "grad_norm": 0.4900701642036438, + "learning_rate": 0.0001, + "loss": 1.3788, + "step": 9020 + }, + { + "epoch": 1.0362414565504565, + "grad_norm": 0.5398866534233093, + "learning_rate": 0.0001, + "loss": 1.5015, + "step": 9021 + }, + { + "epoch": 1.0363563264602837, + "grad_norm": 0.5214775204658508, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 9022 + }, + { + "epoch": 1.0364711963701108, + "grad_norm": 0.4958128035068512, + "learning_rate": 0.0001, + "loss": 1.6112, + "step": 9023 + }, + { + "epoch": 1.036586066279938, + "grad_norm": 0.5190637111663818, + "learning_rate": 0.0001, + "loss": 1.4925, + "step": 9024 + }, + { + "epoch": 1.036700936189765, + "grad_norm": 0.568577766418457, + "learning_rate": 0.0001, + "loss": 1.5747, + "step": 9025 + }, + { + "epoch": 1.0368158060995922, + "grad_norm": 0.5356246829032898, + "learning_rate": 0.0001, + "loss": 1.5718, + "step": 9026 + }, + { + "epoch": 1.0369306760094192, + "grad_norm": 0.5341411232948303, + "learning_rate": 0.0001, + "loss": 1.5258, + "step": 9027 + }, + { + "epoch": 1.0370455459192465, + "grad_norm": 0.5894232392311096, + "learning_rate": 0.0001, + "loss": 1.7847, + "step": 9028 + }, + { + "epoch": 1.0371604158290735, + "grad_norm": 0.4876749515533447, + "learning_rate": 0.0001, + "loss": 1.4103, + "step": 9029 + }, + { + "epoch": 1.0372752857389007, + "grad_norm": 0.5098420977592468, + "learning_rate": 0.0001, + "loss": 1.59, + "step": 9030 + }, + { + "epoch": 1.0373901556487277, + "grad_norm": 0.5374711751937866, + "learning_rate": 0.0001, + "loss": 1.406, + "step": 9031 + }, + { + "epoch": 1.037505025558555, + "grad_norm": 0.5379756689071655, + "learning_rate": 0.0001, + "loss": 1.4881, + "step": 9032 + }, + { + "epoch": 1.037619895468382, + "grad_norm": 0.5455212593078613, + "learning_rate": 0.0001, + "loss": 1.4375, + "step": 9033 + }, + { + "epoch": 1.0377347653782092, + "grad_norm": 0.5194243788719177, + "learning_rate": 0.0001, + "loss": 1.4984, + "step": 9034 + }, + { + "epoch": 1.0378496352880362, + "grad_norm": 0.5264743566513062, + "learning_rate": 0.0001, + "loss": 1.3457, + "step": 9035 + }, + { + "epoch": 1.0379645051978634, + "grad_norm": 0.5833313465118408, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 9036 + }, + { + "epoch": 1.0380793751076904, + "grad_norm": 0.5064496994018555, + "learning_rate": 0.0001, + "loss": 1.3317, + "step": 9037 + }, + { + "epoch": 1.0381942450175177, + "grad_norm": 0.5563880205154419, + "learning_rate": 0.0001, + "loss": 1.5072, + "step": 9038 + }, + { + "epoch": 1.0383091149273447, + "grad_norm": 0.5034350156784058, + "learning_rate": 0.0001, + "loss": 1.5021, + "step": 9039 + }, + { + "epoch": 1.038423984837172, + "grad_norm": 0.5645545125007629, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 9040 + }, + { + "epoch": 1.038538854746999, + "grad_norm": 0.5010533928871155, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 9041 + }, + { + "epoch": 1.0386537246568261, + "grad_norm": 0.5693106651306152, + "learning_rate": 0.0001, + "loss": 1.624, + "step": 9042 + }, + { + "epoch": 1.0387685945666532, + "grad_norm": 0.5178468227386475, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 9043 + }, + { + "epoch": 1.0388834644764804, + "grad_norm": 0.49142345786094666, + "learning_rate": 0.0001, + "loss": 1.3921, + "step": 9044 + }, + { + "epoch": 1.0389983343863074, + "grad_norm": 0.512225329875946, + "learning_rate": 0.0001, + "loss": 1.6559, + "step": 9045 + }, + { + "epoch": 1.0391132042961346, + "grad_norm": 0.500827968120575, + "learning_rate": 0.0001, + "loss": 1.444, + "step": 9046 + }, + { + "epoch": 1.0392280742059619, + "grad_norm": 0.6273430585861206, + "learning_rate": 0.0001, + "loss": 1.6012, + "step": 9047 + }, + { + "epoch": 1.0393429441157889, + "grad_norm": 0.610727071762085, + "learning_rate": 0.0001, + "loss": 1.6243, + "step": 9048 + }, + { + "epoch": 1.0394578140256159, + "grad_norm": 0.4880732297897339, + "learning_rate": 0.0001, + "loss": 1.3747, + "step": 9049 + }, + { + "epoch": 1.039572683935443, + "grad_norm": 0.5448058843612671, + "learning_rate": 0.0001, + "loss": 1.422, + "step": 9050 + }, + { + "epoch": 1.0396875538452703, + "grad_norm": 0.5537201762199402, + "learning_rate": 0.0001, + "loss": 1.4965, + "step": 9051 + }, + { + "epoch": 1.0398024237550973, + "grad_norm": 0.518325686454773, + "learning_rate": 0.0001, + "loss": 1.6048, + "step": 9052 + }, + { + "epoch": 1.0399172936649246, + "grad_norm": 0.49593982100486755, + "learning_rate": 0.0001, + "loss": 1.3273, + "step": 9053 + }, + { + "epoch": 1.0400321635747516, + "grad_norm": 0.5287415385246277, + "learning_rate": 0.0001, + "loss": 1.5204, + "step": 9054 + }, + { + "epoch": 1.0401470334845788, + "grad_norm": 0.48811841011047363, + "learning_rate": 0.0001, + "loss": 1.3151, + "step": 9055 + }, + { + "epoch": 1.0402619033944058, + "grad_norm": 0.557843804359436, + "learning_rate": 0.0001, + "loss": 1.5249, + "step": 9056 + }, + { + "epoch": 1.040376773304233, + "grad_norm": 0.5163830518722534, + "learning_rate": 0.0001, + "loss": 1.5059, + "step": 9057 + }, + { + "epoch": 1.04049164321406, + "grad_norm": 0.5076297521591187, + "learning_rate": 0.0001, + "loss": 1.3535, + "step": 9058 + }, + { + "epoch": 1.0406065131238873, + "grad_norm": 0.5205563902854919, + "learning_rate": 0.0001, + "loss": 1.5076, + "step": 9059 + }, + { + "epoch": 1.0407213830337143, + "grad_norm": 0.5657637119293213, + "learning_rate": 0.0001, + "loss": 1.6143, + "step": 9060 + }, + { + "epoch": 1.0408362529435415, + "grad_norm": 0.5690082907676697, + "learning_rate": 0.0001, + "loss": 1.4616, + "step": 9061 + }, + { + "epoch": 1.0409511228533685, + "grad_norm": 0.5683187246322632, + "learning_rate": 0.0001, + "loss": 1.3052, + "step": 9062 + }, + { + "epoch": 1.0410659927631958, + "grad_norm": 0.5876998901367188, + "learning_rate": 0.0001, + "loss": 1.5554, + "step": 9063 + }, + { + "epoch": 1.0411808626730228, + "grad_norm": 0.5582718849182129, + "learning_rate": 0.0001, + "loss": 1.3038, + "step": 9064 + }, + { + "epoch": 1.04129573258285, + "grad_norm": 0.4977602958679199, + "learning_rate": 0.0001, + "loss": 1.4045, + "step": 9065 + }, + { + "epoch": 1.041410602492677, + "grad_norm": 0.5169378519058228, + "learning_rate": 0.0001, + "loss": 1.5039, + "step": 9066 + }, + { + "epoch": 1.0415254724025043, + "grad_norm": 0.5189085006713867, + "learning_rate": 0.0001, + "loss": 1.4188, + "step": 9067 + }, + { + "epoch": 1.0416403423123313, + "grad_norm": 0.49691158533096313, + "learning_rate": 0.0001, + "loss": 1.2885, + "step": 9068 + }, + { + "epoch": 1.0417552122221585, + "grad_norm": 0.5555067658424377, + "learning_rate": 0.0001, + "loss": 1.6002, + "step": 9069 + }, + { + "epoch": 1.0418700821319855, + "grad_norm": 0.5489440560340881, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 9070 + }, + { + "epoch": 1.0419849520418127, + "grad_norm": 0.5622000098228455, + "learning_rate": 0.0001, + "loss": 1.5963, + "step": 9071 + }, + { + "epoch": 1.0420998219516397, + "grad_norm": 0.546467661857605, + "learning_rate": 0.0001, + "loss": 1.2848, + "step": 9072 + }, + { + "epoch": 1.042214691861467, + "grad_norm": 0.496011883020401, + "learning_rate": 0.0001, + "loss": 1.2943, + "step": 9073 + }, + { + "epoch": 1.042329561771294, + "grad_norm": 0.500908613204956, + "learning_rate": 0.0001, + "loss": 1.3917, + "step": 9074 + }, + { + "epoch": 1.0424444316811212, + "grad_norm": 0.554749071598053, + "learning_rate": 0.0001, + "loss": 1.547, + "step": 9075 + }, + { + "epoch": 1.0425593015909482, + "grad_norm": 0.5059463977813721, + "learning_rate": 0.0001, + "loss": 1.4333, + "step": 9076 + }, + { + "epoch": 1.0426741715007755, + "grad_norm": 0.5207396745681763, + "learning_rate": 0.0001, + "loss": 1.387, + "step": 9077 + }, + { + "epoch": 1.0427890414106025, + "grad_norm": 0.48321661353111267, + "learning_rate": 0.0001, + "loss": 1.4378, + "step": 9078 + }, + { + "epoch": 1.0429039113204297, + "grad_norm": 0.5234763622283936, + "learning_rate": 0.0001, + "loss": 1.5333, + "step": 9079 + }, + { + "epoch": 1.0430187812302567, + "grad_norm": 0.5880540609359741, + "learning_rate": 0.0001, + "loss": 1.5906, + "step": 9080 + }, + { + "epoch": 1.043133651140084, + "grad_norm": 0.548507809638977, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 9081 + }, + { + "epoch": 1.043248521049911, + "grad_norm": 0.545957088470459, + "learning_rate": 0.0001, + "loss": 1.5419, + "step": 9082 + }, + { + "epoch": 1.0433633909597382, + "grad_norm": 0.5886163115501404, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 9083 + }, + { + "epoch": 1.0434782608695652, + "grad_norm": 0.5363225340843201, + "learning_rate": 0.0001, + "loss": 1.5265, + "step": 9084 + }, + { + "epoch": 1.0435931307793924, + "grad_norm": 0.5168259739875793, + "learning_rate": 0.0001, + "loss": 1.3905, + "step": 9085 + }, + { + "epoch": 1.0437080006892194, + "grad_norm": 0.5514031052589417, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 9086 + }, + { + "epoch": 1.0438228705990467, + "grad_norm": 0.5472466945648193, + "learning_rate": 0.0001, + "loss": 1.382, + "step": 9087 + }, + { + "epoch": 1.0439377405088737, + "grad_norm": 0.5088561773300171, + "learning_rate": 0.0001, + "loss": 1.4711, + "step": 9088 + }, + { + "epoch": 1.044052610418701, + "grad_norm": 0.535428524017334, + "learning_rate": 0.0001, + "loss": 1.4639, + "step": 9089 + }, + { + "epoch": 1.044167480328528, + "grad_norm": 0.4890363812446594, + "learning_rate": 0.0001, + "loss": 1.3678, + "step": 9090 + }, + { + "epoch": 1.0442823502383551, + "grad_norm": 0.5183672308921814, + "learning_rate": 0.0001, + "loss": 1.4534, + "step": 9091 + }, + { + "epoch": 1.0443972201481821, + "grad_norm": 0.5594877004623413, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 9092 + }, + { + "epoch": 1.0445120900580094, + "grad_norm": 0.5082018375396729, + "learning_rate": 0.0001, + "loss": 1.5265, + "step": 9093 + }, + { + "epoch": 1.0446269599678364, + "grad_norm": 0.5126281380653381, + "learning_rate": 0.0001, + "loss": 1.3822, + "step": 9094 + }, + { + "epoch": 1.0447418298776636, + "grad_norm": 0.5137059092521667, + "learning_rate": 0.0001, + "loss": 1.4601, + "step": 9095 + }, + { + "epoch": 1.0448566997874906, + "grad_norm": 0.491517573595047, + "learning_rate": 0.0001, + "loss": 1.4795, + "step": 9096 + }, + { + "epoch": 1.0449715696973179, + "grad_norm": 0.5336720943450928, + "learning_rate": 0.0001, + "loss": 1.4341, + "step": 9097 + }, + { + "epoch": 1.0450864396071449, + "grad_norm": 0.5771209001541138, + "learning_rate": 0.0001, + "loss": 1.6016, + "step": 9098 + }, + { + "epoch": 1.045201309516972, + "grad_norm": 0.5206882357597351, + "learning_rate": 0.0001, + "loss": 1.5159, + "step": 9099 + }, + { + "epoch": 1.045316179426799, + "grad_norm": 0.5490942597389221, + "learning_rate": 0.0001, + "loss": 1.6517, + "step": 9100 + }, + { + "epoch": 1.0454310493366263, + "grad_norm": 0.5432296991348267, + "learning_rate": 0.0001, + "loss": 1.6488, + "step": 9101 + }, + { + "epoch": 1.0455459192464533, + "grad_norm": 0.5428939461708069, + "learning_rate": 0.0001, + "loss": 1.3368, + "step": 9102 + }, + { + "epoch": 1.0456607891562806, + "grad_norm": 0.4910656213760376, + "learning_rate": 0.0001, + "loss": 1.2355, + "step": 9103 + }, + { + "epoch": 1.0457756590661076, + "grad_norm": 0.5123563408851624, + "learning_rate": 0.0001, + "loss": 1.5744, + "step": 9104 + }, + { + "epoch": 1.0458905289759348, + "grad_norm": 0.5455781817436218, + "learning_rate": 0.0001, + "loss": 1.4858, + "step": 9105 + }, + { + "epoch": 1.0460053988857618, + "grad_norm": 0.5809556245803833, + "learning_rate": 0.0001, + "loss": 1.515, + "step": 9106 + }, + { + "epoch": 1.046120268795589, + "grad_norm": 0.5579778552055359, + "learning_rate": 0.0001, + "loss": 1.6172, + "step": 9107 + }, + { + "epoch": 1.046235138705416, + "grad_norm": 0.5217399597167969, + "learning_rate": 0.0001, + "loss": 1.4473, + "step": 9108 + }, + { + "epoch": 1.0463500086152433, + "grad_norm": 0.5513988137245178, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 9109 + }, + { + "epoch": 1.0464648785250703, + "grad_norm": 0.5201188921928406, + "learning_rate": 0.0001, + "loss": 1.3266, + "step": 9110 + }, + { + "epoch": 1.0465797484348975, + "grad_norm": 0.5076174736022949, + "learning_rate": 0.0001, + "loss": 1.4519, + "step": 9111 + }, + { + "epoch": 1.0466946183447245, + "grad_norm": 0.543914258480072, + "learning_rate": 0.0001, + "loss": 1.5246, + "step": 9112 + }, + { + "epoch": 1.0468094882545518, + "grad_norm": 0.5245856642723083, + "learning_rate": 0.0001, + "loss": 1.587, + "step": 9113 + }, + { + "epoch": 1.0469243581643788, + "grad_norm": 0.5603649616241455, + "learning_rate": 0.0001, + "loss": 1.462, + "step": 9114 + }, + { + "epoch": 1.047039228074206, + "grad_norm": 0.5366967916488647, + "learning_rate": 0.0001, + "loss": 1.7216, + "step": 9115 + }, + { + "epoch": 1.047154097984033, + "grad_norm": 0.6067121624946594, + "learning_rate": 0.0001, + "loss": 1.6592, + "step": 9116 + }, + { + "epoch": 1.0472689678938603, + "grad_norm": 0.5686327219009399, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 9117 + }, + { + "epoch": 1.0473838378036873, + "grad_norm": 0.5154968500137329, + "learning_rate": 0.0001, + "loss": 1.2529, + "step": 9118 + }, + { + "epoch": 1.0474987077135145, + "grad_norm": 0.49103155732154846, + "learning_rate": 0.0001, + "loss": 1.3669, + "step": 9119 + }, + { + "epoch": 1.0476135776233415, + "grad_norm": 0.5241073966026306, + "learning_rate": 0.0001, + "loss": 1.4359, + "step": 9120 + }, + { + "epoch": 1.0477284475331687, + "grad_norm": 0.5242918133735657, + "learning_rate": 0.0001, + "loss": 1.4032, + "step": 9121 + }, + { + "epoch": 1.0478433174429957, + "grad_norm": 0.500344455242157, + "learning_rate": 0.0001, + "loss": 1.3488, + "step": 9122 + }, + { + "epoch": 1.047958187352823, + "grad_norm": 0.5362123250961304, + "learning_rate": 0.0001, + "loss": 1.3767, + "step": 9123 + }, + { + "epoch": 1.04807305726265, + "grad_norm": 0.5414429903030396, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 9124 + }, + { + "epoch": 1.0481879271724772, + "grad_norm": 0.5353612899780273, + "learning_rate": 0.0001, + "loss": 1.5028, + "step": 9125 + }, + { + "epoch": 1.0483027970823042, + "grad_norm": 0.5600303411483765, + "learning_rate": 0.0001, + "loss": 1.6894, + "step": 9126 + }, + { + "epoch": 1.0484176669921315, + "grad_norm": 0.5072823166847229, + "learning_rate": 0.0001, + "loss": 1.4421, + "step": 9127 + }, + { + "epoch": 1.0485325369019585, + "grad_norm": 0.5189083218574524, + "learning_rate": 0.0001, + "loss": 1.4933, + "step": 9128 + }, + { + "epoch": 1.0486474068117857, + "grad_norm": 0.5282391309738159, + "learning_rate": 0.0001, + "loss": 1.4874, + "step": 9129 + }, + { + "epoch": 1.0487622767216127, + "grad_norm": 0.5408895015716553, + "learning_rate": 0.0001, + "loss": 1.4552, + "step": 9130 + }, + { + "epoch": 1.04887714663144, + "grad_norm": 0.5891426205635071, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 9131 + }, + { + "epoch": 1.048992016541267, + "grad_norm": 0.5693600177764893, + "learning_rate": 0.0001, + "loss": 1.4055, + "step": 9132 + }, + { + "epoch": 1.0491068864510942, + "grad_norm": 0.534770667552948, + "learning_rate": 0.0001, + "loss": 1.4429, + "step": 9133 + }, + { + "epoch": 1.0492217563609212, + "grad_norm": 0.5606382489204407, + "learning_rate": 0.0001, + "loss": 1.5205, + "step": 9134 + }, + { + "epoch": 1.0493366262707484, + "grad_norm": 0.5300989747047424, + "learning_rate": 0.0001, + "loss": 1.4919, + "step": 9135 + }, + { + "epoch": 1.0494514961805754, + "grad_norm": 0.56827712059021, + "learning_rate": 0.0001, + "loss": 1.6758, + "step": 9136 + }, + { + "epoch": 1.0495663660904027, + "grad_norm": 0.5045410394668579, + "learning_rate": 0.0001, + "loss": 1.277, + "step": 9137 + }, + { + "epoch": 1.0496812360002297, + "grad_norm": 0.5462251901626587, + "learning_rate": 0.0001, + "loss": 1.3595, + "step": 9138 + }, + { + "epoch": 1.049796105910057, + "grad_norm": 0.5165095925331116, + "learning_rate": 0.0001, + "loss": 1.5132, + "step": 9139 + }, + { + "epoch": 1.049910975819884, + "grad_norm": 0.4886758327484131, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 9140 + }, + { + "epoch": 1.0500258457297111, + "grad_norm": 0.5420746207237244, + "learning_rate": 0.0001, + "loss": 1.2517, + "step": 9141 + }, + { + "epoch": 1.0501407156395381, + "grad_norm": 0.5480278730392456, + "learning_rate": 0.0001, + "loss": 1.554, + "step": 9142 + }, + { + "epoch": 1.0502555855493654, + "grad_norm": 0.5709562301635742, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 9143 + }, + { + "epoch": 1.0503704554591924, + "grad_norm": 0.5308666229248047, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 9144 + }, + { + "epoch": 1.0504853253690196, + "grad_norm": 0.5184751749038696, + "learning_rate": 0.0001, + "loss": 1.4136, + "step": 9145 + }, + { + "epoch": 1.0506001952788466, + "grad_norm": 0.5020022392272949, + "learning_rate": 0.0001, + "loss": 1.4594, + "step": 9146 + }, + { + "epoch": 1.0507150651886739, + "grad_norm": 0.531156063079834, + "learning_rate": 0.0001, + "loss": 1.4978, + "step": 9147 + }, + { + "epoch": 1.0508299350985009, + "grad_norm": 0.5110874772071838, + "learning_rate": 0.0001, + "loss": 1.3658, + "step": 9148 + }, + { + "epoch": 1.050944805008328, + "grad_norm": 0.6354340314865112, + "learning_rate": 0.0001, + "loss": 1.511, + "step": 9149 + }, + { + "epoch": 1.051059674918155, + "grad_norm": 0.5681877136230469, + "learning_rate": 0.0001, + "loss": 1.382, + "step": 9150 + }, + { + "epoch": 1.0511745448279823, + "grad_norm": 0.4902039170265198, + "learning_rate": 0.0001, + "loss": 1.3208, + "step": 9151 + }, + { + "epoch": 1.0512894147378093, + "grad_norm": 0.5818510055541992, + "learning_rate": 0.0001, + "loss": 1.4707, + "step": 9152 + }, + { + "epoch": 1.0514042846476366, + "grad_norm": 0.5937033891677856, + "learning_rate": 0.0001, + "loss": 1.5029, + "step": 9153 + }, + { + "epoch": 1.0515191545574636, + "grad_norm": 0.5286517143249512, + "learning_rate": 0.0001, + "loss": 1.4477, + "step": 9154 + }, + { + "epoch": 1.0516340244672908, + "grad_norm": 0.5208611488342285, + "learning_rate": 0.0001, + "loss": 1.3308, + "step": 9155 + }, + { + "epoch": 1.0517488943771178, + "grad_norm": 0.5702365636825562, + "learning_rate": 0.0001, + "loss": 1.7063, + "step": 9156 + }, + { + "epoch": 1.051863764286945, + "grad_norm": 0.551328182220459, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 9157 + }, + { + "epoch": 1.051978634196772, + "grad_norm": 0.5106463432312012, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 9158 + }, + { + "epoch": 1.0520935041065993, + "grad_norm": 0.5503958463668823, + "learning_rate": 0.0001, + "loss": 1.5551, + "step": 9159 + }, + { + "epoch": 1.0522083740164263, + "grad_norm": 0.5540024042129517, + "learning_rate": 0.0001, + "loss": 1.3085, + "step": 9160 + }, + { + "epoch": 1.0523232439262535, + "grad_norm": 0.5419454574584961, + "learning_rate": 0.0001, + "loss": 1.5515, + "step": 9161 + }, + { + "epoch": 1.0524381138360805, + "grad_norm": 0.5251337885856628, + "learning_rate": 0.0001, + "loss": 1.3589, + "step": 9162 + }, + { + "epoch": 1.0525529837459078, + "grad_norm": 0.553993284702301, + "learning_rate": 0.0001, + "loss": 1.5545, + "step": 9163 + }, + { + "epoch": 1.0526678536557348, + "grad_norm": 0.5526601672172546, + "learning_rate": 0.0001, + "loss": 1.3881, + "step": 9164 + }, + { + "epoch": 1.052782723565562, + "grad_norm": 0.5293091535568237, + "learning_rate": 0.0001, + "loss": 1.5249, + "step": 9165 + }, + { + "epoch": 1.052897593475389, + "grad_norm": 0.5309327840805054, + "learning_rate": 0.0001, + "loss": 1.2998, + "step": 9166 + }, + { + "epoch": 1.0530124633852163, + "grad_norm": 0.5376285314559937, + "learning_rate": 0.0001, + "loss": 1.5854, + "step": 9167 + }, + { + "epoch": 1.0531273332950433, + "grad_norm": 0.5321171283721924, + "learning_rate": 0.0001, + "loss": 1.4715, + "step": 9168 + }, + { + "epoch": 1.0532422032048705, + "grad_norm": 0.5454419851303101, + "learning_rate": 0.0001, + "loss": 1.565, + "step": 9169 + }, + { + "epoch": 1.0533570731146975, + "grad_norm": 0.5634780526161194, + "learning_rate": 0.0001, + "loss": 1.2175, + "step": 9170 + }, + { + "epoch": 1.0534719430245247, + "grad_norm": 0.6618204712867737, + "learning_rate": 0.0001, + "loss": 1.6722, + "step": 9171 + }, + { + "epoch": 1.0535868129343517, + "grad_norm": 0.5510662198066711, + "learning_rate": 0.0001, + "loss": 1.5464, + "step": 9172 + }, + { + "epoch": 1.053701682844179, + "grad_norm": 0.5555408596992493, + "learning_rate": 0.0001, + "loss": 1.5615, + "step": 9173 + }, + { + "epoch": 1.053816552754006, + "grad_norm": 0.5211841464042664, + "learning_rate": 0.0001, + "loss": 1.3639, + "step": 9174 + }, + { + "epoch": 1.0539314226638332, + "grad_norm": 0.5432996153831482, + "learning_rate": 0.0001, + "loss": 1.6417, + "step": 9175 + }, + { + "epoch": 1.0540462925736602, + "grad_norm": 0.5261744260787964, + "learning_rate": 0.0001, + "loss": 1.4003, + "step": 9176 + }, + { + "epoch": 1.0541611624834875, + "grad_norm": 0.5317165851593018, + "learning_rate": 0.0001, + "loss": 1.3629, + "step": 9177 + }, + { + "epoch": 1.0542760323933145, + "grad_norm": 0.5396149754524231, + "learning_rate": 0.0001, + "loss": 1.4707, + "step": 9178 + }, + { + "epoch": 1.0543909023031417, + "grad_norm": 0.5117886066436768, + "learning_rate": 0.0001, + "loss": 1.2257, + "step": 9179 + }, + { + "epoch": 1.0545057722129687, + "grad_norm": 0.5107698440551758, + "learning_rate": 0.0001, + "loss": 1.5142, + "step": 9180 + }, + { + "epoch": 1.054620642122796, + "grad_norm": 0.5465172529220581, + "learning_rate": 0.0001, + "loss": 1.4611, + "step": 9181 + }, + { + "epoch": 1.054735512032623, + "grad_norm": 0.5007506012916565, + "learning_rate": 0.0001, + "loss": 1.3207, + "step": 9182 + }, + { + "epoch": 1.0548503819424502, + "grad_norm": 0.5821637511253357, + "learning_rate": 0.0001, + "loss": 1.5268, + "step": 9183 + }, + { + "epoch": 1.0549652518522774, + "grad_norm": 0.5477774739265442, + "learning_rate": 0.0001, + "loss": 1.1992, + "step": 9184 + }, + { + "epoch": 1.0550801217621044, + "grad_norm": 0.5345506072044373, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 9185 + }, + { + "epoch": 1.0551949916719314, + "grad_norm": 0.4949226379394531, + "learning_rate": 0.0001, + "loss": 1.4013, + "step": 9186 + }, + { + "epoch": 1.0553098615817587, + "grad_norm": 0.543696403503418, + "learning_rate": 0.0001, + "loss": 1.6937, + "step": 9187 + }, + { + "epoch": 1.0554247314915859, + "grad_norm": 0.5482311248779297, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 9188 + }, + { + "epoch": 1.055539601401413, + "grad_norm": 0.500032901763916, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 9189 + }, + { + "epoch": 1.0556544713112401, + "grad_norm": 0.5224020481109619, + "learning_rate": 0.0001, + "loss": 1.4317, + "step": 9190 + }, + { + "epoch": 1.0557693412210671, + "grad_norm": 0.5447744131088257, + "learning_rate": 0.0001, + "loss": 1.3627, + "step": 9191 + }, + { + "epoch": 1.0558842111308944, + "grad_norm": 0.551868200302124, + "learning_rate": 0.0001, + "loss": 1.5095, + "step": 9192 + }, + { + "epoch": 1.0559990810407214, + "grad_norm": 0.5407156944274902, + "learning_rate": 0.0001, + "loss": 1.6561, + "step": 9193 + }, + { + "epoch": 1.0561139509505486, + "grad_norm": 0.5364171862602234, + "learning_rate": 0.0001, + "loss": 1.5194, + "step": 9194 + }, + { + "epoch": 1.0562288208603756, + "grad_norm": 0.5270219445228577, + "learning_rate": 0.0001, + "loss": 1.3945, + "step": 9195 + }, + { + "epoch": 1.0563436907702028, + "grad_norm": 0.5508518218994141, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 9196 + }, + { + "epoch": 1.0564585606800299, + "grad_norm": 0.5767947435379028, + "learning_rate": 0.0001, + "loss": 1.3796, + "step": 9197 + }, + { + "epoch": 1.056573430589857, + "grad_norm": 0.516804575920105, + "learning_rate": 0.0001, + "loss": 1.3958, + "step": 9198 + }, + { + "epoch": 1.056688300499684, + "grad_norm": 0.5120447278022766, + "learning_rate": 0.0001, + "loss": 1.5065, + "step": 9199 + }, + { + "epoch": 1.0568031704095113, + "grad_norm": 0.5243872404098511, + "learning_rate": 0.0001, + "loss": 1.5117, + "step": 9200 + }, + { + "epoch": 1.0569180403193383, + "grad_norm": 0.5689402222633362, + "learning_rate": 0.0001, + "loss": 1.5406, + "step": 9201 + }, + { + "epoch": 1.0570329102291656, + "grad_norm": 0.5817130208015442, + "learning_rate": 0.0001, + "loss": 1.4481, + "step": 9202 + }, + { + "epoch": 1.0571477801389926, + "grad_norm": 0.5574333667755127, + "learning_rate": 0.0001, + "loss": 1.4967, + "step": 9203 + }, + { + "epoch": 1.0572626500488198, + "grad_norm": 0.5260115265846252, + "learning_rate": 0.0001, + "loss": 1.51, + "step": 9204 + }, + { + "epoch": 1.0573775199586468, + "grad_norm": 0.5010888576507568, + "learning_rate": 0.0001, + "loss": 1.46, + "step": 9205 + }, + { + "epoch": 1.057492389868474, + "grad_norm": 0.5643918514251709, + "learning_rate": 0.0001, + "loss": 1.335, + "step": 9206 + }, + { + "epoch": 1.057607259778301, + "grad_norm": 0.5270254015922546, + "learning_rate": 0.0001, + "loss": 1.2919, + "step": 9207 + }, + { + "epoch": 1.0577221296881283, + "grad_norm": 0.5090634226799011, + "learning_rate": 0.0001, + "loss": 1.4459, + "step": 9208 + }, + { + "epoch": 1.0578369995979553, + "grad_norm": 0.5546037554740906, + "learning_rate": 0.0001, + "loss": 1.4886, + "step": 9209 + }, + { + "epoch": 1.0579518695077825, + "grad_norm": 0.5387788414955139, + "learning_rate": 0.0001, + "loss": 1.5268, + "step": 9210 + }, + { + "epoch": 1.0580667394176095, + "grad_norm": 0.5352441668510437, + "learning_rate": 0.0001, + "loss": 1.5164, + "step": 9211 + }, + { + "epoch": 1.0581816093274368, + "grad_norm": 0.5658225417137146, + "learning_rate": 0.0001, + "loss": 1.472, + "step": 9212 + }, + { + "epoch": 1.0582964792372638, + "grad_norm": 0.541875958442688, + "learning_rate": 0.0001, + "loss": 1.0823, + "step": 9213 + }, + { + "epoch": 1.058411349147091, + "grad_norm": 0.5292019248008728, + "learning_rate": 0.0001, + "loss": 1.4035, + "step": 9214 + }, + { + "epoch": 1.058526219056918, + "grad_norm": 0.5192826986312866, + "learning_rate": 0.0001, + "loss": 1.2651, + "step": 9215 + }, + { + "epoch": 1.0586410889667452, + "grad_norm": 0.5812119245529175, + "learning_rate": 0.0001, + "loss": 1.3717, + "step": 9216 + }, + { + "epoch": 1.0587559588765723, + "grad_norm": 0.6301900744438171, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 9217 + }, + { + "epoch": 1.0588708287863995, + "grad_norm": 0.5290337800979614, + "learning_rate": 0.0001, + "loss": 1.3963, + "step": 9218 + }, + { + "epoch": 1.0589856986962265, + "grad_norm": 0.6721698641777039, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 9219 + }, + { + "epoch": 1.0591005686060537, + "grad_norm": 0.6142293810844421, + "learning_rate": 0.0001, + "loss": 1.6459, + "step": 9220 + }, + { + "epoch": 1.0592154385158807, + "grad_norm": 0.5450447797775269, + "learning_rate": 0.0001, + "loss": 1.5673, + "step": 9221 + }, + { + "epoch": 1.059330308425708, + "grad_norm": 0.5755720138549805, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 9222 + }, + { + "epoch": 1.059445178335535, + "grad_norm": 0.6007996201515198, + "learning_rate": 0.0001, + "loss": 1.7402, + "step": 9223 + }, + { + "epoch": 1.0595600482453622, + "grad_norm": 0.5676028728485107, + "learning_rate": 0.0001, + "loss": 1.4648, + "step": 9224 + }, + { + "epoch": 1.0596749181551892, + "grad_norm": 0.5260946750640869, + "learning_rate": 0.0001, + "loss": 1.5316, + "step": 9225 + }, + { + "epoch": 1.0597897880650164, + "grad_norm": 0.5359583497047424, + "learning_rate": 0.0001, + "loss": 1.3536, + "step": 9226 + }, + { + "epoch": 1.0599046579748435, + "grad_norm": 0.5493340492248535, + "learning_rate": 0.0001, + "loss": 1.6997, + "step": 9227 + }, + { + "epoch": 1.0600195278846707, + "grad_norm": 0.5583151578903198, + "learning_rate": 0.0001, + "loss": 1.5545, + "step": 9228 + }, + { + "epoch": 1.0601343977944977, + "grad_norm": 0.5902903079986572, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 9229 + }, + { + "epoch": 1.060249267704325, + "grad_norm": 0.5786603689193726, + "learning_rate": 0.0001, + "loss": 1.3942, + "step": 9230 + }, + { + "epoch": 1.060364137614152, + "grad_norm": 0.5523187518119812, + "learning_rate": 0.0001, + "loss": 1.5173, + "step": 9231 + }, + { + "epoch": 1.0604790075239792, + "grad_norm": 0.5302920341491699, + "learning_rate": 0.0001, + "loss": 1.4374, + "step": 9232 + }, + { + "epoch": 1.0605938774338062, + "grad_norm": 0.5174841284751892, + "learning_rate": 0.0001, + "loss": 1.4171, + "step": 9233 + }, + { + "epoch": 1.0607087473436334, + "grad_norm": 0.5988504886627197, + "learning_rate": 0.0001, + "loss": 1.6997, + "step": 9234 + }, + { + "epoch": 1.0608236172534604, + "grad_norm": 0.5470107197761536, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 9235 + }, + { + "epoch": 1.0609384871632876, + "grad_norm": 0.5042878985404968, + "learning_rate": 0.0001, + "loss": 1.4093, + "step": 9236 + }, + { + "epoch": 1.0610533570731147, + "grad_norm": 0.5748067498207092, + "learning_rate": 0.0001, + "loss": 1.5261, + "step": 9237 + }, + { + "epoch": 1.0611682269829419, + "grad_norm": 0.5632085204124451, + "learning_rate": 0.0001, + "loss": 1.6162, + "step": 9238 + }, + { + "epoch": 1.061283096892769, + "grad_norm": 0.5615068674087524, + "learning_rate": 0.0001, + "loss": 1.5225, + "step": 9239 + }, + { + "epoch": 1.0613979668025961, + "grad_norm": 0.5865307450294495, + "learning_rate": 0.0001, + "loss": 1.3333, + "step": 9240 + }, + { + "epoch": 1.0615128367124231, + "grad_norm": 0.5403387546539307, + "learning_rate": 0.0001, + "loss": 1.3786, + "step": 9241 + }, + { + "epoch": 1.0616277066222504, + "grad_norm": 0.5474202036857605, + "learning_rate": 0.0001, + "loss": 1.2461, + "step": 9242 + }, + { + "epoch": 1.0617425765320774, + "grad_norm": 0.5543806552886963, + "learning_rate": 0.0001, + "loss": 1.4252, + "step": 9243 + }, + { + "epoch": 1.0618574464419046, + "grad_norm": 0.5339075326919556, + "learning_rate": 0.0001, + "loss": 1.3884, + "step": 9244 + }, + { + "epoch": 1.0619723163517316, + "grad_norm": 0.49664506316185, + "learning_rate": 0.0001, + "loss": 1.376, + "step": 9245 + }, + { + "epoch": 1.0620871862615588, + "grad_norm": 0.5427022576332092, + "learning_rate": 0.0001, + "loss": 1.5364, + "step": 9246 + }, + { + "epoch": 1.0622020561713859, + "grad_norm": 0.5456292629241943, + "learning_rate": 0.0001, + "loss": 1.6116, + "step": 9247 + }, + { + "epoch": 1.062316926081213, + "grad_norm": 0.533386766910553, + "learning_rate": 0.0001, + "loss": 1.4291, + "step": 9248 + }, + { + "epoch": 1.06243179599104, + "grad_norm": 0.5511561632156372, + "learning_rate": 0.0001, + "loss": 1.4988, + "step": 9249 + }, + { + "epoch": 1.0625466659008673, + "grad_norm": 0.5116701126098633, + "learning_rate": 0.0001, + "loss": 1.367, + "step": 9250 + }, + { + "epoch": 1.0626615358106943, + "grad_norm": 0.5287088751792908, + "learning_rate": 0.0001, + "loss": 1.5248, + "step": 9251 + }, + { + "epoch": 1.0627764057205216, + "grad_norm": 0.5381811261177063, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 9252 + }, + { + "epoch": 1.0628912756303486, + "grad_norm": 0.5121027231216431, + "learning_rate": 0.0001, + "loss": 1.3487, + "step": 9253 + }, + { + "epoch": 1.0630061455401758, + "grad_norm": 0.49183768033981323, + "learning_rate": 0.0001, + "loss": 1.4759, + "step": 9254 + }, + { + "epoch": 1.0631210154500028, + "grad_norm": 0.5818896293640137, + "learning_rate": 0.0001, + "loss": 1.5526, + "step": 9255 + }, + { + "epoch": 1.06323588535983, + "grad_norm": 0.5403487682342529, + "learning_rate": 0.0001, + "loss": 1.0887, + "step": 9256 + }, + { + "epoch": 1.063350755269657, + "grad_norm": 0.5411499738693237, + "learning_rate": 0.0001, + "loss": 1.4933, + "step": 9257 + }, + { + "epoch": 1.0634656251794843, + "grad_norm": 0.5758698582649231, + "learning_rate": 0.0001, + "loss": 1.5877, + "step": 9258 + }, + { + "epoch": 1.0635804950893113, + "grad_norm": 0.5343381762504578, + "learning_rate": 0.0001, + "loss": 1.5186, + "step": 9259 + }, + { + "epoch": 1.0636953649991385, + "grad_norm": 0.5107303261756897, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 9260 + }, + { + "epoch": 1.0638102349089655, + "grad_norm": 0.5439027547836304, + "learning_rate": 0.0001, + "loss": 1.6627, + "step": 9261 + }, + { + "epoch": 1.0639251048187928, + "grad_norm": 0.5605586767196655, + "learning_rate": 0.0001, + "loss": 1.6073, + "step": 9262 + }, + { + "epoch": 1.0640399747286198, + "grad_norm": 0.5504233837127686, + "learning_rate": 0.0001, + "loss": 1.5642, + "step": 9263 + }, + { + "epoch": 1.064154844638447, + "grad_norm": 0.5588496923446655, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 9264 + }, + { + "epoch": 1.064269714548274, + "grad_norm": 0.5234324932098389, + "learning_rate": 0.0001, + "loss": 1.4739, + "step": 9265 + }, + { + "epoch": 1.0643845844581012, + "grad_norm": 0.5559567809104919, + "learning_rate": 0.0001, + "loss": 1.4358, + "step": 9266 + }, + { + "epoch": 1.0644994543679283, + "grad_norm": 0.5315650105476379, + "learning_rate": 0.0001, + "loss": 1.5795, + "step": 9267 + }, + { + "epoch": 1.0646143242777555, + "grad_norm": 0.586476743221283, + "learning_rate": 0.0001, + "loss": 1.693, + "step": 9268 + }, + { + "epoch": 1.0647291941875825, + "grad_norm": 0.545502245426178, + "learning_rate": 0.0001, + "loss": 1.5206, + "step": 9269 + }, + { + "epoch": 1.0648440640974097, + "grad_norm": 0.4984760582447052, + "learning_rate": 0.0001, + "loss": 1.3982, + "step": 9270 + }, + { + "epoch": 1.0649589340072367, + "grad_norm": 0.5218265056610107, + "learning_rate": 0.0001, + "loss": 1.4516, + "step": 9271 + }, + { + "epoch": 1.065073803917064, + "grad_norm": 0.5094042420387268, + "learning_rate": 0.0001, + "loss": 1.5032, + "step": 9272 + }, + { + "epoch": 1.065188673826891, + "grad_norm": 0.5259179472923279, + "learning_rate": 0.0001, + "loss": 1.5218, + "step": 9273 + }, + { + "epoch": 1.0653035437367182, + "grad_norm": 0.4852963387966156, + "learning_rate": 0.0001, + "loss": 1.2565, + "step": 9274 + }, + { + "epoch": 1.0654184136465452, + "grad_norm": 0.5180572867393494, + "learning_rate": 0.0001, + "loss": 1.4978, + "step": 9275 + }, + { + "epoch": 1.0655332835563724, + "grad_norm": 0.5113215446472168, + "learning_rate": 0.0001, + "loss": 1.5048, + "step": 9276 + }, + { + "epoch": 1.0656481534661995, + "grad_norm": 0.5396996140480042, + "learning_rate": 0.0001, + "loss": 1.5992, + "step": 9277 + }, + { + "epoch": 1.0657630233760267, + "grad_norm": 0.5498672723770142, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 9278 + }, + { + "epoch": 1.0658778932858537, + "grad_norm": 0.5394907593727112, + "learning_rate": 0.0001, + "loss": 1.4526, + "step": 9279 + }, + { + "epoch": 1.065992763195681, + "grad_norm": 0.5096206665039062, + "learning_rate": 0.0001, + "loss": 1.5192, + "step": 9280 + }, + { + "epoch": 1.066107633105508, + "grad_norm": 0.49931639432907104, + "learning_rate": 0.0001, + "loss": 1.3194, + "step": 9281 + }, + { + "epoch": 1.0662225030153352, + "grad_norm": 0.5749096274375916, + "learning_rate": 0.0001, + "loss": 1.634, + "step": 9282 + }, + { + "epoch": 1.0663373729251622, + "grad_norm": 0.5646559000015259, + "learning_rate": 0.0001, + "loss": 1.5262, + "step": 9283 + }, + { + "epoch": 1.0664522428349894, + "grad_norm": 0.5303424000740051, + "learning_rate": 0.0001, + "loss": 1.2186, + "step": 9284 + }, + { + "epoch": 1.0665671127448164, + "grad_norm": 0.5843081474304199, + "learning_rate": 0.0001, + "loss": 1.5284, + "step": 9285 + }, + { + "epoch": 1.0666819826546436, + "grad_norm": 0.506814181804657, + "learning_rate": 0.0001, + "loss": 1.2877, + "step": 9286 + }, + { + "epoch": 1.0667968525644707, + "grad_norm": 0.5348950028419495, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 9287 + }, + { + "epoch": 1.0669117224742979, + "grad_norm": 0.5017212629318237, + "learning_rate": 0.0001, + "loss": 1.4226, + "step": 9288 + }, + { + "epoch": 1.067026592384125, + "grad_norm": 0.5333235859870911, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 9289 + }, + { + "epoch": 1.0671414622939521, + "grad_norm": 0.4682038426399231, + "learning_rate": 0.0001, + "loss": 1.3008, + "step": 9290 + }, + { + "epoch": 1.0672563322037791, + "grad_norm": 0.4889301061630249, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 9291 + }, + { + "epoch": 1.0673712021136064, + "grad_norm": 0.52678382396698, + "learning_rate": 0.0001, + "loss": 1.4328, + "step": 9292 + }, + { + "epoch": 1.0674860720234334, + "grad_norm": 0.5729466676712036, + "learning_rate": 0.0001, + "loss": 1.6461, + "step": 9293 + }, + { + "epoch": 1.0676009419332606, + "grad_norm": 0.5632121562957764, + "learning_rate": 0.0001, + "loss": 1.5527, + "step": 9294 + }, + { + "epoch": 1.0677158118430876, + "grad_norm": 0.5657988786697388, + "learning_rate": 0.0001, + "loss": 1.4767, + "step": 9295 + }, + { + "epoch": 1.0678306817529148, + "grad_norm": 0.5650860071182251, + "learning_rate": 0.0001, + "loss": 1.4778, + "step": 9296 + }, + { + "epoch": 1.0679455516627419, + "grad_norm": 0.5312690138816833, + "learning_rate": 0.0001, + "loss": 1.5812, + "step": 9297 + }, + { + "epoch": 1.068060421572569, + "grad_norm": 0.5419080853462219, + "learning_rate": 0.0001, + "loss": 1.544, + "step": 9298 + }, + { + "epoch": 1.068175291482396, + "grad_norm": 0.5316713452339172, + "learning_rate": 0.0001, + "loss": 1.4472, + "step": 9299 + }, + { + "epoch": 1.0682901613922233, + "grad_norm": 0.5229997634887695, + "learning_rate": 0.0001, + "loss": 1.333, + "step": 9300 + }, + { + "epoch": 1.0684050313020503, + "grad_norm": 0.5338220596313477, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 9301 + }, + { + "epoch": 1.0685199012118776, + "grad_norm": 0.5003429651260376, + "learning_rate": 0.0001, + "loss": 1.3834, + "step": 9302 + }, + { + "epoch": 1.0686347711217046, + "grad_norm": 0.5164094567298889, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 9303 + }, + { + "epoch": 1.0687496410315318, + "grad_norm": 0.5205984115600586, + "learning_rate": 0.0001, + "loss": 1.2881, + "step": 9304 + }, + { + "epoch": 1.0688645109413588, + "grad_norm": 0.5387089252471924, + "learning_rate": 0.0001, + "loss": 1.4241, + "step": 9305 + }, + { + "epoch": 1.068979380851186, + "grad_norm": 0.5411062836647034, + "learning_rate": 0.0001, + "loss": 1.4488, + "step": 9306 + }, + { + "epoch": 1.069094250761013, + "grad_norm": 0.522082507610321, + "learning_rate": 0.0001, + "loss": 1.48, + "step": 9307 + }, + { + "epoch": 1.0692091206708403, + "grad_norm": 0.5525926351547241, + "learning_rate": 0.0001, + "loss": 1.359, + "step": 9308 + }, + { + "epoch": 1.0693239905806673, + "grad_norm": 0.6042261719703674, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 9309 + }, + { + "epoch": 1.0694388604904945, + "grad_norm": 0.5462369918823242, + "learning_rate": 0.0001, + "loss": 1.713, + "step": 9310 + }, + { + "epoch": 1.0695537304003215, + "grad_norm": 0.5386967658996582, + "learning_rate": 0.0001, + "loss": 1.4737, + "step": 9311 + }, + { + "epoch": 1.0696686003101488, + "grad_norm": 0.5753105282783508, + "learning_rate": 0.0001, + "loss": 1.7007, + "step": 9312 + }, + { + "epoch": 1.0697834702199758, + "grad_norm": 0.509254515171051, + "learning_rate": 0.0001, + "loss": 1.3826, + "step": 9313 + }, + { + "epoch": 1.069898340129803, + "grad_norm": 0.5281634330749512, + "learning_rate": 0.0001, + "loss": 1.5919, + "step": 9314 + }, + { + "epoch": 1.07001321003963, + "grad_norm": 0.5292205214500427, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 9315 + }, + { + "epoch": 1.0701280799494572, + "grad_norm": 0.522649347782135, + "learning_rate": 0.0001, + "loss": 1.4171, + "step": 9316 + }, + { + "epoch": 1.0702429498592845, + "grad_norm": 0.5478872656822205, + "learning_rate": 0.0001, + "loss": 1.5693, + "step": 9317 + }, + { + "epoch": 1.0703578197691115, + "grad_norm": 0.5441164374351501, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 9318 + }, + { + "epoch": 1.0704726896789385, + "grad_norm": 0.5701571702957153, + "learning_rate": 0.0001, + "loss": 1.4287, + "step": 9319 + }, + { + "epoch": 1.0705875595887657, + "grad_norm": 0.5586724281311035, + "learning_rate": 0.0001, + "loss": 1.4541, + "step": 9320 + }, + { + "epoch": 1.070702429498593, + "grad_norm": 0.5121877789497375, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 9321 + }, + { + "epoch": 1.07081729940842, + "grad_norm": 0.5557401776313782, + "learning_rate": 0.0001, + "loss": 1.4617, + "step": 9322 + }, + { + "epoch": 1.070932169318247, + "grad_norm": 0.5157942175865173, + "learning_rate": 0.0001, + "loss": 1.5935, + "step": 9323 + }, + { + "epoch": 1.0710470392280742, + "grad_norm": 0.5756588578224182, + "learning_rate": 0.0001, + "loss": 1.5785, + "step": 9324 + }, + { + "epoch": 1.0711619091379014, + "grad_norm": 0.5151377320289612, + "learning_rate": 0.0001, + "loss": 1.4551, + "step": 9325 + }, + { + "epoch": 1.0712767790477284, + "grad_norm": 0.5527158975601196, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 9326 + }, + { + "epoch": 1.0713916489575555, + "grad_norm": 0.517097532749176, + "learning_rate": 0.0001, + "loss": 1.2917, + "step": 9327 + }, + { + "epoch": 1.0715065188673827, + "grad_norm": 0.5368314385414124, + "learning_rate": 0.0001, + "loss": 1.4936, + "step": 9328 + }, + { + "epoch": 1.07162138877721, + "grad_norm": 0.5495009422302246, + "learning_rate": 0.0001, + "loss": 1.589, + "step": 9329 + }, + { + "epoch": 1.071736258687037, + "grad_norm": 0.5558755993843079, + "learning_rate": 0.0001, + "loss": 1.4963, + "step": 9330 + }, + { + "epoch": 1.0718511285968642, + "grad_norm": 0.5218509435653687, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 9331 + }, + { + "epoch": 1.0719659985066912, + "grad_norm": 0.5873258709907532, + "learning_rate": 0.0001, + "loss": 1.5986, + "step": 9332 + }, + { + "epoch": 1.0720808684165184, + "grad_norm": 0.542597234249115, + "learning_rate": 0.0001, + "loss": 1.4964, + "step": 9333 + }, + { + "epoch": 1.0721957383263454, + "grad_norm": 0.5635952949523926, + "learning_rate": 0.0001, + "loss": 1.5998, + "step": 9334 + }, + { + "epoch": 1.0723106082361726, + "grad_norm": 0.5015137791633606, + "learning_rate": 0.0001, + "loss": 1.409, + "step": 9335 + }, + { + "epoch": 1.0724254781459996, + "grad_norm": 0.5085453987121582, + "learning_rate": 0.0001, + "loss": 1.4133, + "step": 9336 + }, + { + "epoch": 1.0725403480558269, + "grad_norm": 0.5499141216278076, + "learning_rate": 0.0001, + "loss": 1.5787, + "step": 9337 + }, + { + "epoch": 1.0726552179656539, + "grad_norm": 0.5491046905517578, + "learning_rate": 0.0001, + "loss": 1.389, + "step": 9338 + }, + { + "epoch": 1.0727700878754811, + "grad_norm": 0.58219975233078, + "learning_rate": 0.0001, + "loss": 1.5334, + "step": 9339 + }, + { + "epoch": 1.0728849577853081, + "grad_norm": 0.5443148612976074, + "learning_rate": 0.0001, + "loss": 1.4332, + "step": 9340 + }, + { + "epoch": 1.0729998276951354, + "grad_norm": 0.5684245824813843, + "learning_rate": 0.0001, + "loss": 1.5124, + "step": 9341 + }, + { + "epoch": 1.0731146976049624, + "grad_norm": 0.6064300537109375, + "learning_rate": 0.0001, + "loss": 1.5588, + "step": 9342 + }, + { + "epoch": 1.0732295675147896, + "grad_norm": 0.5258216857910156, + "learning_rate": 0.0001, + "loss": 1.3385, + "step": 9343 + }, + { + "epoch": 1.0733444374246166, + "grad_norm": 0.5391144752502441, + "learning_rate": 0.0001, + "loss": 1.4488, + "step": 9344 + }, + { + "epoch": 1.0734593073344438, + "grad_norm": 0.5189342498779297, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 9345 + }, + { + "epoch": 1.0735741772442708, + "grad_norm": 0.527388870716095, + "learning_rate": 0.0001, + "loss": 1.4789, + "step": 9346 + }, + { + "epoch": 1.073689047154098, + "grad_norm": 0.569824755191803, + "learning_rate": 0.0001, + "loss": 1.2466, + "step": 9347 + }, + { + "epoch": 1.073803917063925, + "grad_norm": 0.5429185628890991, + "learning_rate": 0.0001, + "loss": 1.4456, + "step": 9348 + }, + { + "epoch": 1.0739187869737523, + "grad_norm": 0.5612589120864868, + "learning_rate": 0.0001, + "loss": 1.2836, + "step": 9349 + }, + { + "epoch": 1.0740336568835793, + "grad_norm": 0.562879204750061, + "learning_rate": 0.0001, + "loss": 1.2671, + "step": 9350 + }, + { + "epoch": 1.0741485267934066, + "grad_norm": 0.6129858493804932, + "learning_rate": 0.0001, + "loss": 1.6126, + "step": 9351 + }, + { + "epoch": 1.0742633967032336, + "grad_norm": 0.5683223009109497, + "learning_rate": 0.0001, + "loss": 1.5117, + "step": 9352 + }, + { + "epoch": 1.0743782666130608, + "grad_norm": 0.549032986164093, + "learning_rate": 0.0001, + "loss": 1.3925, + "step": 9353 + }, + { + "epoch": 1.0744931365228878, + "grad_norm": 0.5353028774261475, + "learning_rate": 0.0001, + "loss": 1.4364, + "step": 9354 + }, + { + "epoch": 1.074608006432715, + "grad_norm": 0.5511956810951233, + "learning_rate": 0.0001, + "loss": 1.6354, + "step": 9355 + }, + { + "epoch": 1.074722876342542, + "grad_norm": 0.5705670118331909, + "learning_rate": 0.0001, + "loss": 1.3258, + "step": 9356 + }, + { + "epoch": 1.0748377462523693, + "grad_norm": 0.5503584742546082, + "learning_rate": 0.0001, + "loss": 1.3599, + "step": 9357 + }, + { + "epoch": 1.0749526161621963, + "grad_norm": 0.48784855008125305, + "learning_rate": 0.0001, + "loss": 1.3298, + "step": 9358 + }, + { + "epoch": 1.0750674860720235, + "grad_norm": 0.5487546324729919, + "learning_rate": 0.0001, + "loss": 1.3159, + "step": 9359 + }, + { + "epoch": 1.0751823559818505, + "grad_norm": 0.6427622437477112, + "learning_rate": 0.0001, + "loss": 1.4016, + "step": 9360 + }, + { + "epoch": 1.0752972258916778, + "grad_norm": 0.5325155258178711, + "learning_rate": 0.0001, + "loss": 1.4784, + "step": 9361 + }, + { + "epoch": 1.0754120958015048, + "grad_norm": 0.5536829829216003, + "learning_rate": 0.0001, + "loss": 1.5445, + "step": 9362 + }, + { + "epoch": 1.075526965711332, + "grad_norm": 0.49843937158584595, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 9363 + }, + { + "epoch": 1.075641835621159, + "grad_norm": 0.5196369290351868, + "learning_rate": 0.0001, + "loss": 1.5446, + "step": 9364 + }, + { + "epoch": 1.0757567055309862, + "grad_norm": 0.5939671993255615, + "learning_rate": 0.0001, + "loss": 1.532, + "step": 9365 + }, + { + "epoch": 1.0758715754408132, + "grad_norm": 0.5654057860374451, + "learning_rate": 0.0001, + "loss": 1.7005, + "step": 9366 + }, + { + "epoch": 1.0759864453506405, + "grad_norm": 0.5311039686203003, + "learning_rate": 0.0001, + "loss": 1.4949, + "step": 9367 + }, + { + "epoch": 1.0761013152604675, + "grad_norm": 0.5418568849563599, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 9368 + }, + { + "epoch": 1.0762161851702947, + "grad_norm": 0.5042740106582642, + "learning_rate": 0.0001, + "loss": 1.4819, + "step": 9369 + }, + { + "epoch": 1.0763310550801217, + "grad_norm": 0.5391519665718079, + "learning_rate": 0.0001, + "loss": 1.5778, + "step": 9370 + }, + { + "epoch": 1.076445924989949, + "grad_norm": 0.48869240283966064, + "learning_rate": 0.0001, + "loss": 1.3108, + "step": 9371 + }, + { + "epoch": 1.076560794899776, + "grad_norm": 0.5286645889282227, + "learning_rate": 0.0001, + "loss": 1.6169, + "step": 9372 + }, + { + "epoch": 1.0766756648096032, + "grad_norm": 0.5066417455673218, + "learning_rate": 0.0001, + "loss": 1.2543, + "step": 9373 + }, + { + "epoch": 1.0767905347194302, + "grad_norm": 0.5383975505828857, + "learning_rate": 0.0001, + "loss": 1.2786, + "step": 9374 + }, + { + "epoch": 1.0769054046292574, + "grad_norm": 0.5614655017852783, + "learning_rate": 0.0001, + "loss": 1.5742, + "step": 9375 + }, + { + "epoch": 1.0770202745390844, + "grad_norm": 0.5506321787834167, + "learning_rate": 0.0001, + "loss": 1.4668, + "step": 9376 + }, + { + "epoch": 1.0771351444489117, + "grad_norm": 0.6028522253036499, + "learning_rate": 0.0001, + "loss": 1.6613, + "step": 9377 + }, + { + "epoch": 1.0772500143587387, + "grad_norm": 0.535199761390686, + "learning_rate": 0.0001, + "loss": 1.5098, + "step": 9378 + }, + { + "epoch": 1.077364884268566, + "grad_norm": 0.5362054705619812, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 9379 + }, + { + "epoch": 1.077479754178393, + "grad_norm": 0.5681738257408142, + "learning_rate": 0.0001, + "loss": 1.6243, + "step": 9380 + }, + { + "epoch": 1.0775946240882202, + "grad_norm": 0.5353713631629944, + "learning_rate": 0.0001, + "loss": 1.5445, + "step": 9381 + }, + { + "epoch": 1.0777094939980472, + "grad_norm": 0.5305677652359009, + "learning_rate": 0.0001, + "loss": 1.5472, + "step": 9382 + }, + { + "epoch": 1.0778243639078744, + "grad_norm": 0.5264517068862915, + "learning_rate": 0.0001, + "loss": 1.3933, + "step": 9383 + }, + { + "epoch": 1.0779392338177014, + "grad_norm": 0.5325207114219666, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 9384 + }, + { + "epoch": 1.0780541037275286, + "grad_norm": 0.5129626989364624, + "learning_rate": 0.0001, + "loss": 1.392, + "step": 9385 + }, + { + "epoch": 1.0781689736373556, + "grad_norm": 0.5592479109764099, + "learning_rate": 0.0001, + "loss": 1.4948, + "step": 9386 + }, + { + "epoch": 1.0782838435471829, + "grad_norm": 0.5640060901641846, + "learning_rate": 0.0001, + "loss": 1.4874, + "step": 9387 + }, + { + "epoch": 1.0783987134570099, + "grad_norm": 0.5315003991127014, + "learning_rate": 0.0001, + "loss": 1.4705, + "step": 9388 + }, + { + "epoch": 1.0785135833668371, + "grad_norm": 0.5562155842781067, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 9389 + }, + { + "epoch": 1.0786284532766641, + "grad_norm": 0.5401049256324768, + "learning_rate": 0.0001, + "loss": 1.5833, + "step": 9390 + }, + { + "epoch": 1.0787433231864914, + "grad_norm": 0.5349462032318115, + "learning_rate": 0.0001, + "loss": 1.471, + "step": 9391 + }, + { + "epoch": 1.0788581930963184, + "grad_norm": 0.543636679649353, + "learning_rate": 0.0001, + "loss": 1.4982, + "step": 9392 + }, + { + "epoch": 1.0789730630061456, + "grad_norm": 0.5344265103340149, + "learning_rate": 0.0001, + "loss": 1.5697, + "step": 9393 + }, + { + "epoch": 1.0790879329159726, + "grad_norm": 0.5137317180633545, + "learning_rate": 0.0001, + "loss": 1.509, + "step": 9394 + }, + { + "epoch": 1.0792028028257998, + "grad_norm": 0.5402754545211792, + "learning_rate": 0.0001, + "loss": 1.422, + "step": 9395 + }, + { + "epoch": 1.0793176727356268, + "grad_norm": 0.6155654191970825, + "learning_rate": 0.0001, + "loss": 1.7829, + "step": 9396 + }, + { + "epoch": 1.079432542645454, + "grad_norm": 0.5445329546928406, + "learning_rate": 0.0001, + "loss": 1.3084, + "step": 9397 + }, + { + "epoch": 1.079547412555281, + "grad_norm": 0.5331481099128723, + "learning_rate": 0.0001, + "loss": 1.2886, + "step": 9398 + }, + { + "epoch": 1.0796622824651083, + "grad_norm": 0.4990472197532654, + "learning_rate": 0.0001, + "loss": 1.4427, + "step": 9399 + }, + { + "epoch": 1.0797771523749353, + "grad_norm": 0.5740399956703186, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 9400 + }, + { + "epoch": 1.0798920222847626, + "grad_norm": 0.5508021712303162, + "learning_rate": 0.0001, + "loss": 1.725, + "step": 9401 + }, + { + "epoch": 1.0800068921945896, + "grad_norm": 0.5360609889030457, + "learning_rate": 0.0001, + "loss": 1.5607, + "step": 9402 + }, + { + "epoch": 1.0801217621044168, + "grad_norm": 0.4839753806591034, + "learning_rate": 0.0001, + "loss": 1.3208, + "step": 9403 + }, + { + "epoch": 1.0802366320142438, + "grad_norm": 0.5315743684768677, + "learning_rate": 0.0001, + "loss": 1.4254, + "step": 9404 + }, + { + "epoch": 1.080351501924071, + "grad_norm": 0.62214195728302, + "learning_rate": 0.0001, + "loss": 1.5521, + "step": 9405 + }, + { + "epoch": 1.080466371833898, + "grad_norm": 0.5492658615112305, + "learning_rate": 0.0001, + "loss": 1.5138, + "step": 9406 + }, + { + "epoch": 1.0805812417437253, + "grad_norm": 0.5915753841400146, + "learning_rate": 0.0001, + "loss": 1.6729, + "step": 9407 + }, + { + "epoch": 1.0806961116535523, + "grad_norm": 0.49104854464530945, + "learning_rate": 0.0001, + "loss": 1.284, + "step": 9408 + }, + { + "epoch": 1.0808109815633795, + "grad_norm": 0.5419100522994995, + "learning_rate": 0.0001, + "loss": 1.3962, + "step": 9409 + }, + { + "epoch": 1.0809258514732065, + "grad_norm": 0.5406461358070374, + "learning_rate": 0.0001, + "loss": 1.4855, + "step": 9410 + }, + { + "epoch": 1.0810407213830338, + "grad_norm": 0.5958341360092163, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 9411 + }, + { + "epoch": 1.0811555912928608, + "grad_norm": 0.5315396189689636, + "learning_rate": 0.0001, + "loss": 1.5714, + "step": 9412 + }, + { + "epoch": 1.081270461202688, + "grad_norm": 0.5059750080108643, + "learning_rate": 0.0001, + "loss": 1.3242, + "step": 9413 + }, + { + "epoch": 1.081385331112515, + "grad_norm": 0.5587995052337646, + "learning_rate": 0.0001, + "loss": 1.446, + "step": 9414 + }, + { + "epoch": 1.0815002010223422, + "grad_norm": 0.5836725831031799, + "learning_rate": 0.0001, + "loss": 1.6805, + "step": 9415 + }, + { + "epoch": 1.0816150709321692, + "grad_norm": 0.5699766278266907, + "learning_rate": 0.0001, + "loss": 1.3352, + "step": 9416 + }, + { + "epoch": 1.0817299408419965, + "grad_norm": 0.5126357078552246, + "learning_rate": 0.0001, + "loss": 1.3888, + "step": 9417 + }, + { + "epoch": 1.0818448107518235, + "grad_norm": 0.5475831627845764, + "learning_rate": 0.0001, + "loss": 1.6174, + "step": 9418 + }, + { + "epoch": 1.0819596806616507, + "grad_norm": 0.5603591203689575, + "learning_rate": 0.0001, + "loss": 1.5595, + "step": 9419 + }, + { + "epoch": 1.0820745505714777, + "grad_norm": 0.5825121998786926, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 9420 + }, + { + "epoch": 1.082189420481305, + "grad_norm": 0.5266685485839844, + "learning_rate": 0.0001, + "loss": 1.5018, + "step": 9421 + }, + { + "epoch": 1.082304290391132, + "grad_norm": 0.5489354133605957, + "learning_rate": 0.0001, + "loss": 1.3732, + "step": 9422 + }, + { + "epoch": 1.0824191603009592, + "grad_norm": 0.5415682792663574, + "learning_rate": 0.0001, + "loss": 1.4901, + "step": 9423 + }, + { + "epoch": 1.0825340302107862, + "grad_norm": 0.5336417555809021, + "learning_rate": 0.0001, + "loss": 1.4311, + "step": 9424 + }, + { + "epoch": 1.0826489001206134, + "grad_norm": 0.5154435038566589, + "learning_rate": 0.0001, + "loss": 1.4462, + "step": 9425 + }, + { + "epoch": 1.0827637700304404, + "grad_norm": 0.5345861911773682, + "learning_rate": 0.0001, + "loss": 1.3614, + "step": 9426 + }, + { + "epoch": 1.0828786399402677, + "grad_norm": 0.5687355399131775, + "learning_rate": 0.0001, + "loss": 1.4912, + "step": 9427 + }, + { + "epoch": 1.0829935098500947, + "grad_norm": 0.508594810962677, + "learning_rate": 0.0001, + "loss": 1.471, + "step": 9428 + }, + { + "epoch": 1.083108379759922, + "grad_norm": 0.5026642084121704, + "learning_rate": 0.0001, + "loss": 1.3456, + "step": 9429 + }, + { + "epoch": 1.083223249669749, + "grad_norm": 0.5498985648155212, + "learning_rate": 0.0001, + "loss": 1.3439, + "step": 9430 + }, + { + "epoch": 1.0833381195795762, + "grad_norm": 0.5488834977149963, + "learning_rate": 0.0001, + "loss": 1.5431, + "step": 9431 + }, + { + "epoch": 1.0834529894894032, + "grad_norm": 0.5577558279037476, + "learning_rate": 0.0001, + "loss": 1.4781, + "step": 9432 + }, + { + "epoch": 1.0835678593992304, + "grad_norm": 0.5541431307792664, + "learning_rate": 0.0001, + "loss": 1.5649, + "step": 9433 + }, + { + "epoch": 1.0836827293090574, + "grad_norm": 0.6540974378585815, + "learning_rate": 0.0001, + "loss": 1.7835, + "step": 9434 + }, + { + "epoch": 1.0837975992188846, + "grad_norm": 0.6181141138076782, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 9435 + }, + { + "epoch": 1.0839124691287116, + "grad_norm": 0.5547922849655151, + "learning_rate": 0.0001, + "loss": 1.4657, + "step": 9436 + }, + { + "epoch": 1.0840273390385389, + "grad_norm": 0.5842359066009521, + "learning_rate": 0.0001, + "loss": 1.5911, + "step": 9437 + }, + { + "epoch": 1.0841422089483659, + "grad_norm": 0.5383824110031128, + "learning_rate": 0.0001, + "loss": 1.5272, + "step": 9438 + }, + { + "epoch": 1.0842570788581931, + "grad_norm": 0.5450701713562012, + "learning_rate": 0.0001, + "loss": 1.5627, + "step": 9439 + }, + { + "epoch": 1.0843719487680201, + "grad_norm": 0.5500561594963074, + "learning_rate": 0.0001, + "loss": 1.5371, + "step": 9440 + }, + { + "epoch": 1.0844868186778474, + "grad_norm": 0.6040080785751343, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 9441 + }, + { + "epoch": 1.0846016885876744, + "grad_norm": 0.5557257533073425, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 9442 + }, + { + "epoch": 1.0847165584975016, + "grad_norm": 0.5492468476295471, + "learning_rate": 0.0001, + "loss": 1.5363, + "step": 9443 + }, + { + "epoch": 1.0848314284073286, + "grad_norm": 0.5362855195999146, + "learning_rate": 0.0001, + "loss": 1.3116, + "step": 9444 + }, + { + "epoch": 1.0849462983171558, + "grad_norm": 0.491189569234848, + "learning_rate": 0.0001, + "loss": 1.1433, + "step": 9445 + }, + { + "epoch": 1.0850611682269828, + "grad_norm": 0.561486005783081, + "learning_rate": 0.0001, + "loss": 1.5065, + "step": 9446 + }, + { + "epoch": 1.08517603813681, + "grad_norm": 0.5247159600257874, + "learning_rate": 0.0001, + "loss": 1.5988, + "step": 9447 + }, + { + "epoch": 1.085290908046637, + "grad_norm": 0.5002375841140747, + "learning_rate": 0.0001, + "loss": 1.3614, + "step": 9448 + }, + { + "epoch": 1.0854057779564643, + "grad_norm": 0.5415583252906799, + "learning_rate": 0.0001, + "loss": 1.3459, + "step": 9449 + }, + { + "epoch": 1.0855206478662913, + "grad_norm": 0.5931771993637085, + "learning_rate": 0.0001, + "loss": 1.5553, + "step": 9450 + }, + { + "epoch": 1.0856355177761186, + "grad_norm": 0.6163171529769897, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 9451 + }, + { + "epoch": 1.0857503876859456, + "grad_norm": 0.5527577996253967, + "learning_rate": 0.0001, + "loss": 1.3209, + "step": 9452 + }, + { + "epoch": 1.0858652575957728, + "grad_norm": 0.5498605966567993, + "learning_rate": 0.0001, + "loss": 1.3325, + "step": 9453 + }, + { + "epoch": 1.0859801275056, + "grad_norm": 0.5718228220939636, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 9454 + }, + { + "epoch": 1.086094997415427, + "grad_norm": 0.537427544593811, + "learning_rate": 0.0001, + "loss": 1.3966, + "step": 9455 + }, + { + "epoch": 1.086209867325254, + "grad_norm": 0.5483804941177368, + "learning_rate": 0.0001, + "loss": 1.2163, + "step": 9456 + }, + { + "epoch": 1.0863247372350813, + "grad_norm": 0.5748531222343445, + "learning_rate": 0.0001, + "loss": 1.4913, + "step": 9457 + }, + { + "epoch": 1.0864396071449085, + "grad_norm": 0.5303325653076172, + "learning_rate": 0.0001, + "loss": 1.5366, + "step": 9458 + }, + { + "epoch": 1.0865544770547355, + "grad_norm": 0.5701431632041931, + "learning_rate": 0.0001, + "loss": 1.5886, + "step": 9459 + }, + { + "epoch": 1.0866693469645625, + "grad_norm": 0.5297312140464783, + "learning_rate": 0.0001, + "loss": 1.49, + "step": 9460 + }, + { + "epoch": 1.0867842168743898, + "grad_norm": 0.5745691061019897, + "learning_rate": 0.0001, + "loss": 1.6774, + "step": 9461 + }, + { + "epoch": 1.086899086784217, + "grad_norm": 0.5294743180274963, + "learning_rate": 0.0001, + "loss": 1.3204, + "step": 9462 + }, + { + "epoch": 1.087013956694044, + "grad_norm": 0.5992701649665833, + "learning_rate": 0.0001, + "loss": 1.3512, + "step": 9463 + }, + { + "epoch": 1.087128826603871, + "grad_norm": 0.5554730892181396, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 9464 + }, + { + "epoch": 1.0872436965136982, + "grad_norm": 0.555023193359375, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 9465 + }, + { + "epoch": 1.0873585664235255, + "grad_norm": 0.5271770358085632, + "learning_rate": 0.0001, + "loss": 1.2415, + "step": 9466 + }, + { + "epoch": 1.0874734363333525, + "grad_norm": 0.520576536655426, + "learning_rate": 0.0001, + "loss": 1.6088, + "step": 9467 + }, + { + "epoch": 1.0875883062431795, + "grad_norm": 0.5427173972129822, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 9468 + }, + { + "epoch": 1.0877031761530067, + "grad_norm": 0.5307012796401978, + "learning_rate": 0.0001, + "loss": 1.3187, + "step": 9469 + }, + { + "epoch": 1.087818046062834, + "grad_norm": 0.49438974261283875, + "learning_rate": 0.0001, + "loss": 1.3237, + "step": 9470 + }, + { + "epoch": 1.087932915972661, + "grad_norm": 0.5442484617233276, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 9471 + }, + { + "epoch": 1.0880477858824882, + "grad_norm": 0.5300331711769104, + "learning_rate": 0.0001, + "loss": 1.6, + "step": 9472 + }, + { + "epoch": 1.0881626557923152, + "grad_norm": 0.5837119817733765, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 9473 + }, + { + "epoch": 1.0882775257021424, + "grad_norm": 0.5622014999389648, + "learning_rate": 0.0001, + "loss": 1.5176, + "step": 9474 + }, + { + "epoch": 1.0883923956119694, + "grad_norm": 0.5535881519317627, + "learning_rate": 0.0001, + "loss": 1.4017, + "step": 9475 + }, + { + "epoch": 1.0885072655217967, + "grad_norm": 0.5968254804611206, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 9476 + }, + { + "epoch": 1.0886221354316237, + "grad_norm": 0.5204367637634277, + "learning_rate": 0.0001, + "loss": 1.4491, + "step": 9477 + }, + { + "epoch": 1.088737005341451, + "grad_norm": 0.5856305956840515, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 9478 + }, + { + "epoch": 1.088851875251278, + "grad_norm": 0.5582146048545837, + "learning_rate": 0.0001, + "loss": 1.5893, + "step": 9479 + }, + { + "epoch": 1.0889667451611051, + "grad_norm": 0.5371931195259094, + "learning_rate": 0.0001, + "loss": 1.4195, + "step": 9480 + }, + { + "epoch": 1.0890816150709322, + "grad_norm": 0.5727693438529968, + "learning_rate": 0.0001, + "loss": 1.5191, + "step": 9481 + }, + { + "epoch": 1.0891964849807594, + "grad_norm": 0.5168283581733704, + "learning_rate": 0.0001, + "loss": 1.4037, + "step": 9482 + }, + { + "epoch": 1.0893113548905864, + "grad_norm": 0.5147133469581604, + "learning_rate": 0.0001, + "loss": 1.3784, + "step": 9483 + }, + { + "epoch": 1.0894262248004136, + "grad_norm": 0.5450608730316162, + "learning_rate": 0.0001, + "loss": 1.5136, + "step": 9484 + }, + { + "epoch": 1.0895410947102406, + "grad_norm": 0.5697365403175354, + "learning_rate": 0.0001, + "loss": 1.5325, + "step": 9485 + }, + { + "epoch": 1.0896559646200679, + "grad_norm": 0.5053684711456299, + "learning_rate": 0.0001, + "loss": 1.3655, + "step": 9486 + }, + { + "epoch": 1.0897708345298949, + "grad_norm": 0.5777620673179626, + "learning_rate": 0.0001, + "loss": 1.5045, + "step": 9487 + }, + { + "epoch": 1.089885704439722, + "grad_norm": 0.5835351943969727, + "learning_rate": 0.0001, + "loss": 1.3695, + "step": 9488 + }, + { + "epoch": 1.0900005743495491, + "grad_norm": 0.5509904623031616, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 9489 + }, + { + "epoch": 1.0901154442593763, + "grad_norm": 0.5363668203353882, + "learning_rate": 0.0001, + "loss": 1.321, + "step": 9490 + }, + { + "epoch": 1.0902303141692034, + "grad_norm": 0.5192736983299255, + "learning_rate": 0.0001, + "loss": 1.4694, + "step": 9491 + }, + { + "epoch": 1.0903451840790306, + "grad_norm": 0.5779258608818054, + "learning_rate": 0.0001, + "loss": 1.4376, + "step": 9492 + }, + { + "epoch": 1.0904600539888576, + "grad_norm": 0.5055257678031921, + "learning_rate": 0.0001, + "loss": 1.3177, + "step": 9493 + }, + { + "epoch": 1.0905749238986848, + "grad_norm": 0.5672529935836792, + "learning_rate": 0.0001, + "loss": 1.5611, + "step": 9494 + }, + { + "epoch": 1.0906897938085118, + "grad_norm": 0.5167456865310669, + "learning_rate": 0.0001, + "loss": 1.4562, + "step": 9495 + }, + { + "epoch": 1.090804663718339, + "grad_norm": 0.522533118724823, + "learning_rate": 0.0001, + "loss": 1.4801, + "step": 9496 + }, + { + "epoch": 1.090919533628166, + "grad_norm": 0.5050001740455627, + "learning_rate": 0.0001, + "loss": 1.5072, + "step": 9497 + }, + { + "epoch": 1.0910344035379933, + "grad_norm": 0.5358273983001709, + "learning_rate": 0.0001, + "loss": 1.496, + "step": 9498 + }, + { + "epoch": 1.0911492734478203, + "grad_norm": 0.5162444114685059, + "learning_rate": 0.0001, + "loss": 1.4414, + "step": 9499 + }, + { + "epoch": 1.0912641433576475, + "grad_norm": 0.533893346786499, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 9500 + }, + { + "epoch": 1.0913790132674746, + "grad_norm": 0.5578415989875793, + "learning_rate": 0.0001, + "loss": 1.5303, + "step": 9501 + }, + { + "epoch": 1.0914938831773018, + "grad_norm": 0.5179681777954102, + "learning_rate": 0.0001, + "loss": 1.4064, + "step": 9502 + }, + { + "epoch": 1.0916087530871288, + "grad_norm": 0.5305806994438171, + "learning_rate": 0.0001, + "loss": 1.5176, + "step": 9503 + }, + { + "epoch": 1.091723622996956, + "grad_norm": 0.5723759531974792, + "learning_rate": 0.0001, + "loss": 1.1344, + "step": 9504 + }, + { + "epoch": 1.091838492906783, + "grad_norm": 0.520270049571991, + "learning_rate": 0.0001, + "loss": 1.3782, + "step": 9505 + }, + { + "epoch": 1.0919533628166103, + "grad_norm": 0.5493273138999939, + "learning_rate": 0.0001, + "loss": 1.4887, + "step": 9506 + }, + { + "epoch": 1.0920682327264373, + "grad_norm": 0.5690314769744873, + "learning_rate": 0.0001, + "loss": 1.49, + "step": 9507 + }, + { + "epoch": 1.0921831026362645, + "grad_norm": 0.5376046895980835, + "learning_rate": 0.0001, + "loss": 1.4984, + "step": 9508 + }, + { + "epoch": 1.0922979725460915, + "grad_norm": 0.5700758695602417, + "learning_rate": 0.0001, + "loss": 1.5265, + "step": 9509 + }, + { + "epoch": 1.0924128424559187, + "grad_norm": 0.5583578944206238, + "learning_rate": 0.0001, + "loss": 1.4936, + "step": 9510 + }, + { + "epoch": 1.0925277123657458, + "grad_norm": 0.5892508625984192, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 9511 + }, + { + "epoch": 1.092642582275573, + "grad_norm": 0.5539071559906006, + "learning_rate": 0.0001, + "loss": 1.4654, + "step": 9512 + }, + { + "epoch": 1.0927574521854, + "grad_norm": 0.5687013268470764, + "learning_rate": 0.0001, + "loss": 1.4186, + "step": 9513 + }, + { + "epoch": 1.0928723220952272, + "grad_norm": 0.612593412399292, + "learning_rate": 0.0001, + "loss": 1.6754, + "step": 9514 + }, + { + "epoch": 1.0929871920050542, + "grad_norm": 0.5612267851829529, + "learning_rate": 0.0001, + "loss": 1.5463, + "step": 9515 + }, + { + "epoch": 1.0931020619148815, + "grad_norm": 0.5222997665405273, + "learning_rate": 0.0001, + "loss": 1.3799, + "step": 9516 + }, + { + "epoch": 1.0932169318247085, + "grad_norm": 0.5479604005813599, + "learning_rate": 0.0001, + "loss": 1.5003, + "step": 9517 + }, + { + "epoch": 1.0933318017345357, + "grad_norm": 0.5224593281745911, + "learning_rate": 0.0001, + "loss": 1.4149, + "step": 9518 + }, + { + "epoch": 1.0934466716443627, + "grad_norm": 0.524739682674408, + "learning_rate": 0.0001, + "loss": 1.3937, + "step": 9519 + }, + { + "epoch": 1.09356154155419, + "grad_norm": 0.5201284885406494, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 9520 + }, + { + "epoch": 1.093676411464017, + "grad_norm": 0.5348097681999207, + "learning_rate": 0.0001, + "loss": 1.3621, + "step": 9521 + }, + { + "epoch": 1.0937912813738442, + "grad_norm": 0.5572234988212585, + "learning_rate": 0.0001, + "loss": 1.3629, + "step": 9522 + }, + { + "epoch": 1.0939061512836712, + "grad_norm": 0.5672652721405029, + "learning_rate": 0.0001, + "loss": 1.6203, + "step": 9523 + }, + { + "epoch": 1.0940210211934984, + "grad_norm": 0.5444334149360657, + "learning_rate": 0.0001, + "loss": 1.7165, + "step": 9524 + }, + { + "epoch": 1.0941358911033254, + "grad_norm": 0.54752516746521, + "learning_rate": 0.0001, + "loss": 1.4252, + "step": 9525 + }, + { + "epoch": 1.0942507610131527, + "grad_norm": 0.49809858202934265, + "learning_rate": 0.0001, + "loss": 1.387, + "step": 9526 + }, + { + "epoch": 1.0943656309229797, + "grad_norm": 0.512008786201477, + "learning_rate": 0.0001, + "loss": 1.295, + "step": 9527 + }, + { + "epoch": 1.094480500832807, + "grad_norm": 0.5522312521934509, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 9528 + }, + { + "epoch": 1.094595370742634, + "grad_norm": 0.5582612752914429, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 9529 + }, + { + "epoch": 1.0947102406524611, + "grad_norm": 0.48772454261779785, + "learning_rate": 0.0001, + "loss": 1.3134, + "step": 9530 + }, + { + "epoch": 1.0948251105622882, + "grad_norm": 0.49887439608573914, + "learning_rate": 0.0001, + "loss": 1.2972, + "step": 9531 + }, + { + "epoch": 1.0949399804721154, + "grad_norm": 0.5389269590377808, + "learning_rate": 0.0001, + "loss": 1.5936, + "step": 9532 + }, + { + "epoch": 1.0950548503819424, + "grad_norm": 0.5234885215759277, + "learning_rate": 0.0001, + "loss": 1.3984, + "step": 9533 + }, + { + "epoch": 1.0951697202917696, + "grad_norm": 0.5532096028327942, + "learning_rate": 0.0001, + "loss": 1.7103, + "step": 9534 + }, + { + "epoch": 1.0952845902015966, + "grad_norm": 0.5728646516799927, + "learning_rate": 0.0001, + "loss": 1.5835, + "step": 9535 + }, + { + "epoch": 1.0953994601114239, + "grad_norm": 0.5275560021400452, + "learning_rate": 0.0001, + "loss": 1.5167, + "step": 9536 + }, + { + "epoch": 1.0955143300212509, + "grad_norm": 0.5168836116790771, + "learning_rate": 0.0001, + "loss": 1.4736, + "step": 9537 + }, + { + "epoch": 1.095629199931078, + "grad_norm": 0.5980050563812256, + "learning_rate": 0.0001, + "loss": 1.5943, + "step": 9538 + }, + { + "epoch": 1.095744069840905, + "grad_norm": 0.553145706653595, + "learning_rate": 0.0001, + "loss": 1.5197, + "step": 9539 + }, + { + "epoch": 1.0958589397507323, + "grad_norm": 0.5515928864479065, + "learning_rate": 0.0001, + "loss": 1.3338, + "step": 9540 + }, + { + "epoch": 1.0959738096605594, + "grad_norm": 0.5219181776046753, + "learning_rate": 0.0001, + "loss": 1.3579, + "step": 9541 + }, + { + "epoch": 1.0960886795703866, + "grad_norm": 0.4816209077835083, + "learning_rate": 0.0001, + "loss": 1.3447, + "step": 9542 + }, + { + "epoch": 1.0962035494802136, + "grad_norm": 0.5502380132675171, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 9543 + }, + { + "epoch": 1.0963184193900408, + "grad_norm": 0.5180455446243286, + "learning_rate": 0.0001, + "loss": 1.4077, + "step": 9544 + }, + { + "epoch": 1.0964332892998678, + "grad_norm": 0.524360716342926, + "learning_rate": 0.0001, + "loss": 1.3433, + "step": 9545 + }, + { + "epoch": 1.096548159209695, + "grad_norm": 0.5326607823371887, + "learning_rate": 0.0001, + "loss": 1.4864, + "step": 9546 + }, + { + "epoch": 1.096663029119522, + "grad_norm": 0.578113853931427, + "learning_rate": 0.0001, + "loss": 1.3848, + "step": 9547 + }, + { + "epoch": 1.0967778990293493, + "grad_norm": 0.5228033065795898, + "learning_rate": 0.0001, + "loss": 1.4384, + "step": 9548 + }, + { + "epoch": 1.0968927689391763, + "grad_norm": 0.5391709804534912, + "learning_rate": 0.0001, + "loss": 1.5039, + "step": 9549 + }, + { + "epoch": 1.0970076388490035, + "grad_norm": 0.5661507248878479, + "learning_rate": 0.0001, + "loss": 1.5187, + "step": 9550 + }, + { + "epoch": 1.0971225087588306, + "grad_norm": 0.5772484540939331, + "learning_rate": 0.0001, + "loss": 1.526, + "step": 9551 + }, + { + "epoch": 1.0972373786686578, + "grad_norm": 0.5205997824668884, + "learning_rate": 0.0001, + "loss": 1.2596, + "step": 9552 + }, + { + "epoch": 1.0973522485784848, + "grad_norm": 0.4969817101955414, + "learning_rate": 0.0001, + "loss": 1.3083, + "step": 9553 + }, + { + "epoch": 1.097467118488312, + "grad_norm": 0.5085188746452332, + "learning_rate": 0.0001, + "loss": 1.4074, + "step": 9554 + }, + { + "epoch": 1.097581988398139, + "grad_norm": 0.5870746970176697, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 9555 + }, + { + "epoch": 1.0976968583079663, + "grad_norm": 0.5884862542152405, + "learning_rate": 0.0001, + "loss": 1.5905, + "step": 9556 + }, + { + "epoch": 1.0978117282177933, + "grad_norm": 0.6046068668365479, + "learning_rate": 0.0001, + "loss": 1.5751, + "step": 9557 + }, + { + "epoch": 1.0979265981276205, + "grad_norm": 0.5737109780311584, + "learning_rate": 0.0001, + "loss": 1.5331, + "step": 9558 + }, + { + "epoch": 1.0980414680374475, + "grad_norm": 0.5459645390510559, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 9559 + }, + { + "epoch": 1.0981563379472747, + "grad_norm": 0.5659840106964111, + "learning_rate": 0.0001, + "loss": 1.5431, + "step": 9560 + }, + { + "epoch": 1.0982712078571017, + "grad_norm": 0.6050505042076111, + "learning_rate": 0.0001, + "loss": 1.4345, + "step": 9561 + }, + { + "epoch": 1.098386077766929, + "grad_norm": 0.5667203068733215, + "learning_rate": 0.0001, + "loss": 1.4814, + "step": 9562 + }, + { + "epoch": 1.098500947676756, + "grad_norm": 0.5636523962020874, + "learning_rate": 0.0001, + "loss": 1.6369, + "step": 9563 + }, + { + "epoch": 1.0986158175865832, + "grad_norm": 0.6931208372116089, + "learning_rate": 0.0001, + "loss": 1.5261, + "step": 9564 + }, + { + "epoch": 1.0987306874964102, + "grad_norm": 0.6440662145614624, + "learning_rate": 0.0001, + "loss": 1.7034, + "step": 9565 + }, + { + "epoch": 1.0988455574062375, + "grad_norm": 0.5347530245780945, + "learning_rate": 0.0001, + "loss": 1.4818, + "step": 9566 + }, + { + "epoch": 1.0989604273160645, + "grad_norm": 0.504833996295929, + "learning_rate": 0.0001, + "loss": 1.3958, + "step": 9567 + }, + { + "epoch": 1.0990752972258917, + "grad_norm": 0.5506554245948792, + "learning_rate": 0.0001, + "loss": 1.3691, + "step": 9568 + }, + { + "epoch": 1.0991901671357187, + "grad_norm": 0.573026180267334, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 9569 + }, + { + "epoch": 1.099305037045546, + "grad_norm": 0.5038986206054688, + "learning_rate": 0.0001, + "loss": 1.4027, + "step": 9570 + }, + { + "epoch": 1.099419906955373, + "grad_norm": 0.5019423961639404, + "learning_rate": 0.0001, + "loss": 1.239, + "step": 9571 + }, + { + "epoch": 1.0995347768652002, + "grad_norm": 0.5748392343521118, + "learning_rate": 0.0001, + "loss": 1.346, + "step": 9572 + }, + { + "epoch": 1.0996496467750272, + "grad_norm": 0.5261626243591309, + "learning_rate": 0.0001, + "loss": 1.3476, + "step": 9573 + }, + { + "epoch": 1.0997645166848544, + "grad_norm": 0.5816366672515869, + "learning_rate": 0.0001, + "loss": 1.6714, + "step": 9574 + }, + { + "epoch": 1.0998793865946814, + "grad_norm": 0.5417282581329346, + "learning_rate": 0.0001, + "loss": 1.3579, + "step": 9575 + }, + { + "epoch": 1.0999942565045087, + "grad_norm": 0.5732975602149963, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 9576 + }, + { + "epoch": 1.1001091264143357, + "grad_norm": 0.532828688621521, + "learning_rate": 0.0001, + "loss": 1.438, + "step": 9577 + }, + { + "epoch": 1.100223996324163, + "grad_norm": 0.5136977434158325, + "learning_rate": 0.0001, + "loss": 1.4017, + "step": 9578 + }, + { + "epoch": 1.10033886623399, + "grad_norm": 0.5484635233879089, + "learning_rate": 0.0001, + "loss": 1.5923, + "step": 9579 + }, + { + "epoch": 1.1004537361438171, + "grad_norm": 0.5523190498352051, + "learning_rate": 0.0001, + "loss": 1.4729, + "step": 9580 + }, + { + "epoch": 1.1005686060536441, + "grad_norm": 0.5286394357681274, + "learning_rate": 0.0001, + "loss": 1.5283, + "step": 9581 + }, + { + "epoch": 1.1006834759634714, + "grad_norm": 0.5363751649856567, + "learning_rate": 0.0001, + "loss": 1.3553, + "step": 9582 + }, + { + "epoch": 1.1007983458732984, + "grad_norm": 0.6055129766464233, + "learning_rate": 0.0001, + "loss": 1.7219, + "step": 9583 + }, + { + "epoch": 1.1009132157831256, + "grad_norm": 0.5629612803459167, + "learning_rate": 0.0001, + "loss": 1.1768, + "step": 9584 + }, + { + "epoch": 1.1010280856929526, + "grad_norm": 0.5564055442810059, + "learning_rate": 0.0001, + "loss": 1.4302, + "step": 9585 + }, + { + "epoch": 1.1011429556027799, + "grad_norm": 0.6236995458602905, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 9586 + }, + { + "epoch": 1.1012578255126069, + "grad_norm": 0.5221498012542725, + "learning_rate": 0.0001, + "loss": 1.4249, + "step": 9587 + }, + { + "epoch": 1.101372695422434, + "grad_norm": 0.5449710488319397, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 9588 + }, + { + "epoch": 1.101487565332261, + "grad_norm": 0.5365622043609619, + "learning_rate": 0.0001, + "loss": 1.4603, + "step": 9589 + }, + { + "epoch": 1.1016024352420883, + "grad_norm": 0.5306606888771057, + "learning_rate": 0.0001, + "loss": 1.232, + "step": 9590 + }, + { + "epoch": 1.1017173051519156, + "grad_norm": 0.5195592045783997, + "learning_rate": 0.0001, + "loss": 1.4095, + "step": 9591 + }, + { + "epoch": 1.1018321750617426, + "grad_norm": 0.5111302733421326, + "learning_rate": 0.0001, + "loss": 1.4183, + "step": 9592 + }, + { + "epoch": 1.1019470449715696, + "grad_norm": 0.6306554079055786, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 9593 + }, + { + "epoch": 1.1020619148813968, + "grad_norm": 0.5819112062454224, + "learning_rate": 0.0001, + "loss": 1.5985, + "step": 9594 + }, + { + "epoch": 1.102176784791224, + "grad_norm": 0.5284945368766785, + "learning_rate": 0.0001, + "loss": 1.457, + "step": 9595 + }, + { + "epoch": 1.102291654701051, + "grad_norm": 0.5951351523399353, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 9596 + }, + { + "epoch": 1.102406524610878, + "grad_norm": 0.5186183452606201, + "learning_rate": 0.0001, + "loss": 1.452, + "step": 9597 + }, + { + "epoch": 1.1025213945207053, + "grad_norm": 0.5780056118965149, + "learning_rate": 0.0001, + "loss": 1.4106, + "step": 9598 + }, + { + "epoch": 1.1026362644305325, + "grad_norm": 0.5407369136810303, + "learning_rate": 0.0001, + "loss": 1.4405, + "step": 9599 + }, + { + "epoch": 1.1027511343403595, + "grad_norm": 0.5503196120262146, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 9600 + }, + { + "epoch": 1.1028660042501865, + "grad_norm": 0.5217320919036865, + "learning_rate": 0.0001, + "loss": 1.5486, + "step": 9601 + }, + { + "epoch": 1.1029808741600138, + "grad_norm": 0.5520971417427063, + "learning_rate": 0.0001, + "loss": 1.5373, + "step": 9602 + }, + { + "epoch": 1.103095744069841, + "grad_norm": 0.531853973865509, + "learning_rate": 0.0001, + "loss": 1.2797, + "step": 9603 + }, + { + "epoch": 1.103210613979668, + "grad_norm": 0.5419740676879883, + "learning_rate": 0.0001, + "loss": 1.4649, + "step": 9604 + }, + { + "epoch": 1.103325483889495, + "grad_norm": 0.5819375514984131, + "learning_rate": 0.0001, + "loss": 1.3623, + "step": 9605 + }, + { + "epoch": 1.1034403537993223, + "grad_norm": 0.5647183656692505, + "learning_rate": 0.0001, + "loss": 1.6388, + "step": 9606 + }, + { + "epoch": 1.1035552237091495, + "grad_norm": 0.5452742576599121, + "learning_rate": 0.0001, + "loss": 1.5939, + "step": 9607 + }, + { + "epoch": 1.1036700936189765, + "grad_norm": 0.5725043416023254, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 9608 + }, + { + "epoch": 1.1037849635288037, + "grad_norm": 0.5144510269165039, + "learning_rate": 0.0001, + "loss": 1.5325, + "step": 9609 + }, + { + "epoch": 1.1038998334386307, + "grad_norm": 0.5603897571563721, + "learning_rate": 0.0001, + "loss": 1.4753, + "step": 9610 + }, + { + "epoch": 1.104014703348458, + "grad_norm": 0.5356127023696899, + "learning_rate": 0.0001, + "loss": 1.2814, + "step": 9611 + }, + { + "epoch": 1.104129573258285, + "grad_norm": 0.5167484879493713, + "learning_rate": 0.0001, + "loss": 1.4795, + "step": 9612 + }, + { + "epoch": 1.1042444431681122, + "grad_norm": 0.5493837594985962, + "learning_rate": 0.0001, + "loss": 1.226, + "step": 9613 + }, + { + "epoch": 1.1043593130779392, + "grad_norm": 0.5558966398239136, + "learning_rate": 0.0001, + "loss": 1.6028, + "step": 9614 + }, + { + "epoch": 1.1044741829877665, + "grad_norm": 0.5911669135093689, + "learning_rate": 0.0001, + "loss": 1.4025, + "step": 9615 + }, + { + "epoch": 1.1045890528975935, + "grad_norm": 0.5736728310585022, + "learning_rate": 0.0001, + "loss": 1.2921, + "step": 9616 + }, + { + "epoch": 1.1047039228074207, + "grad_norm": 0.55420982837677, + "learning_rate": 0.0001, + "loss": 1.4201, + "step": 9617 + }, + { + "epoch": 1.1048187927172477, + "grad_norm": 0.6312325596809387, + "learning_rate": 0.0001, + "loss": 1.5217, + "step": 9618 + }, + { + "epoch": 1.104933662627075, + "grad_norm": 0.5303093194961548, + "learning_rate": 0.0001, + "loss": 1.3404, + "step": 9619 + }, + { + "epoch": 1.105048532536902, + "grad_norm": 0.5138710737228394, + "learning_rate": 0.0001, + "loss": 1.4007, + "step": 9620 + }, + { + "epoch": 1.1051634024467292, + "grad_norm": 0.6012336015701294, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 9621 + }, + { + "epoch": 1.1052782723565562, + "grad_norm": 0.5534988641738892, + "learning_rate": 0.0001, + "loss": 1.6167, + "step": 9622 + }, + { + "epoch": 1.1053931422663834, + "grad_norm": 0.5863144993782043, + "learning_rate": 0.0001, + "loss": 1.5825, + "step": 9623 + }, + { + "epoch": 1.1055080121762104, + "grad_norm": 0.5527574419975281, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 9624 + }, + { + "epoch": 1.1056228820860377, + "grad_norm": 0.5487685203552246, + "learning_rate": 0.0001, + "loss": 1.3806, + "step": 9625 + }, + { + "epoch": 1.1057377519958647, + "grad_norm": 0.541068971157074, + "learning_rate": 0.0001, + "loss": 1.5204, + "step": 9626 + }, + { + "epoch": 1.105852621905692, + "grad_norm": 0.5597257018089294, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 9627 + }, + { + "epoch": 1.105967491815519, + "grad_norm": 0.5184458494186401, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 9628 + }, + { + "epoch": 1.1060823617253461, + "grad_norm": 0.6078442931175232, + "learning_rate": 0.0001, + "loss": 1.6372, + "step": 9629 + }, + { + "epoch": 1.1061972316351731, + "grad_norm": 0.5217808485031128, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 9630 + }, + { + "epoch": 1.1063121015450004, + "grad_norm": 0.5544619560241699, + "learning_rate": 0.0001, + "loss": 1.5177, + "step": 9631 + }, + { + "epoch": 1.1064269714548274, + "grad_norm": 0.5434312224388123, + "learning_rate": 0.0001, + "loss": 1.497, + "step": 9632 + }, + { + "epoch": 1.1065418413646546, + "grad_norm": 0.5772722959518433, + "learning_rate": 0.0001, + "loss": 1.2362, + "step": 9633 + }, + { + "epoch": 1.1066567112744816, + "grad_norm": 0.571907103061676, + "learning_rate": 0.0001, + "loss": 1.4295, + "step": 9634 + }, + { + "epoch": 1.1067715811843089, + "grad_norm": 0.5757927894592285, + "learning_rate": 0.0001, + "loss": 1.437, + "step": 9635 + }, + { + "epoch": 1.1068864510941359, + "grad_norm": 0.5998075008392334, + "learning_rate": 0.0001, + "loss": 1.6233, + "step": 9636 + }, + { + "epoch": 1.107001321003963, + "grad_norm": 0.5600723624229431, + "learning_rate": 0.0001, + "loss": 1.624, + "step": 9637 + }, + { + "epoch": 1.10711619091379, + "grad_norm": 0.5362415909767151, + "learning_rate": 0.0001, + "loss": 1.4906, + "step": 9638 + }, + { + "epoch": 1.1072310608236173, + "grad_norm": 0.5417195558547974, + "learning_rate": 0.0001, + "loss": 1.4588, + "step": 9639 + }, + { + "epoch": 1.1073459307334443, + "grad_norm": 0.5452782511711121, + "learning_rate": 0.0001, + "loss": 1.5054, + "step": 9640 + }, + { + "epoch": 1.1074608006432716, + "grad_norm": 0.565497100353241, + "learning_rate": 0.0001, + "loss": 1.626, + "step": 9641 + }, + { + "epoch": 1.1075756705530986, + "grad_norm": 0.5098233222961426, + "learning_rate": 0.0001, + "loss": 1.2684, + "step": 9642 + }, + { + "epoch": 1.1076905404629258, + "grad_norm": 0.5035114288330078, + "learning_rate": 0.0001, + "loss": 1.4774, + "step": 9643 + }, + { + "epoch": 1.1078054103727528, + "grad_norm": 0.5948421359062195, + "learning_rate": 0.0001, + "loss": 1.198, + "step": 9644 + }, + { + "epoch": 1.10792028028258, + "grad_norm": 0.5490888357162476, + "learning_rate": 0.0001, + "loss": 1.576, + "step": 9645 + }, + { + "epoch": 1.108035150192407, + "grad_norm": 0.5749881267547607, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 9646 + }, + { + "epoch": 1.1081500201022343, + "grad_norm": 0.5411537885665894, + "learning_rate": 0.0001, + "loss": 1.4558, + "step": 9647 + }, + { + "epoch": 1.1082648900120613, + "grad_norm": 0.5704604387283325, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 9648 + }, + { + "epoch": 1.1083797599218885, + "grad_norm": 0.5247997641563416, + "learning_rate": 0.0001, + "loss": 1.2063, + "step": 9649 + }, + { + "epoch": 1.1084946298317155, + "grad_norm": 0.5762208700180054, + "learning_rate": 0.0001, + "loss": 1.6697, + "step": 9650 + }, + { + "epoch": 1.1086094997415428, + "grad_norm": 0.5289835333824158, + "learning_rate": 0.0001, + "loss": 1.3045, + "step": 9651 + }, + { + "epoch": 1.1087243696513698, + "grad_norm": 0.5333086848258972, + "learning_rate": 0.0001, + "loss": 1.5886, + "step": 9652 + }, + { + "epoch": 1.108839239561197, + "grad_norm": 0.5131956338882446, + "learning_rate": 0.0001, + "loss": 1.3351, + "step": 9653 + }, + { + "epoch": 1.108954109471024, + "grad_norm": 0.5169805884361267, + "learning_rate": 0.0001, + "loss": 1.5557, + "step": 9654 + }, + { + "epoch": 1.1090689793808512, + "grad_norm": 0.5443220734596252, + "learning_rate": 0.0001, + "loss": 1.5177, + "step": 9655 + }, + { + "epoch": 1.1091838492906783, + "grad_norm": 0.5020636916160583, + "learning_rate": 0.0001, + "loss": 1.2518, + "step": 9656 + }, + { + "epoch": 1.1092987192005055, + "grad_norm": 0.49906498193740845, + "learning_rate": 0.0001, + "loss": 1.3971, + "step": 9657 + }, + { + "epoch": 1.1094135891103325, + "grad_norm": 0.5414373874664307, + "learning_rate": 0.0001, + "loss": 1.6474, + "step": 9658 + }, + { + "epoch": 1.1095284590201597, + "grad_norm": 0.5560583472251892, + "learning_rate": 0.0001, + "loss": 1.5238, + "step": 9659 + }, + { + "epoch": 1.1096433289299867, + "grad_norm": 0.5157898664474487, + "learning_rate": 0.0001, + "loss": 1.3337, + "step": 9660 + }, + { + "epoch": 1.109758198839814, + "grad_norm": 0.5900468826293945, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 9661 + }, + { + "epoch": 1.109873068749641, + "grad_norm": 0.5296939611434937, + "learning_rate": 0.0001, + "loss": 1.468, + "step": 9662 + }, + { + "epoch": 1.1099879386594682, + "grad_norm": 0.5018689632415771, + "learning_rate": 0.0001, + "loss": 1.4018, + "step": 9663 + }, + { + "epoch": 1.1101028085692952, + "grad_norm": 0.4990765452384949, + "learning_rate": 0.0001, + "loss": 1.3802, + "step": 9664 + }, + { + "epoch": 1.1102176784791224, + "grad_norm": 0.4909820556640625, + "learning_rate": 0.0001, + "loss": 1.3892, + "step": 9665 + }, + { + "epoch": 1.1103325483889495, + "grad_norm": 0.5232742428779602, + "learning_rate": 0.0001, + "loss": 1.5673, + "step": 9666 + }, + { + "epoch": 1.1104474182987767, + "grad_norm": 0.5467817783355713, + "learning_rate": 0.0001, + "loss": 1.3905, + "step": 9667 + }, + { + "epoch": 1.1105622882086037, + "grad_norm": 0.49867168068885803, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 9668 + }, + { + "epoch": 1.110677158118431, + "grad_norm": 0.5683446526527405, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 9669 + }, + { + "epoch": 1.110792028028258, + "grad_norm": 0.5260730385780334, + "learning_rate": 0.0001, + "loss": 1.4308, + "step": 9670 + }, + { + "epoch": 1.1109068979380852, + "grad_norm": 0.5530440807342529, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 9671 + }, + { + "epoch": 1.1110217678479122, + "grad_norm": 0.597644567489624, + "learning_rate": 0.0001, + "loss": 1.6696, + "step": 9672 + }, + { + "epoch": 1.1111366377577394, + "grad_norm": 0.5474144220352173, + "learning_rate": 0.0001, + "loss": 1.4307, + "step": 9673 + }, + { + "epoch": 1.1112515076675664, + "grad_norm": 0.499236136674881, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 9674 + }, + { + "epoch": 1.1113663775773936, + "grad_norm": 0.5453770756721497, + "learning_rate": 0.0001, + "loss": 1.3632, + "step": 9675 + }, + { + "epoch": 1.1114812474872207, + "grad_norm": 0.5170800685882568, + "learning_rate": 0.0001, + "loss": 1.4903, + "step": 9676 + }, + { + "epoch": 1.111596117397048, + "grad_norm": 0.5183429718017578, + "learning_rate": 0.0001, + "loss": 1.3797, + "step": 9677 + }, + { + "epoch": 1.111710987306875, + "grad_norm": 0.5546143054962158, + "learning_rate": 0.0001, + "loss": 1.5733, + "step": 9678 + }, + { + "epoch": 1.1118258572167021, + "grad_norm": 0.5528146028518677, + "learning_rate": 0.0001, + "loss": 1.3667, + "step": 9679 + }, + { + "epoch": 1.1119407271265291, + "grad_norm": 0.5532962679862976, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 9680 + }, + { + "epoch": 1.1120555970363564, + "grad_norm": 0.5805869698524475, + "learning_rate": 0.0001, + "loss": 1.5082, + "step": 9681 + }, + { + "epoch": 1.1121704669461834, + "grad_norm": 0.5238086581230164, + "learning_rate": 0.0001, + "loss": 1.5363, + "step": 9682 + }, + { + "epoch": 1.1122853368560106, + "grad_norm": 0.5135483741760254, + "learning_rate": 0.0001, + "loss": 1.3031, + "step": 9683 + }, + { + "epoch": 1.1124002067658376, + "grad_norm": 0.535915732383728, + "learning_rate": 0.0001, + "loss": 1.4422, + "step": 9684 + }, + { + "epoch": 1.1125150766756648, + "grad_norm": 0.5621902346611023, + "learning_rate": 0.0001, + "loss": 1.5421, + "step": 9685 + }, + { + "epoch": 1.1126299465854919, + "grad_norm": 0.5478909611701965, + "learning_rate": 0.0001, + "loss": 1.3292, + "step": 9686 + }, + { + "epoch": 1.112744816495319, + "grad_norm": 0.5914627313613892, + "learning_rate": 0.0001, + "loss": 1.5824, + "step": 9687 + }, + { + "epoch": 1.112859686405146, + "grad_norm": 0.555410623550415, + "learning_rate": 0.0001, + "loss": 1.4208, + "step": 9688 + }, + { + "epoch": 1.1129745563149733, + "grad_norm": 0.6287572979927063, + "learning_rate": 0.0001, + "loss": 1.5024, + "step": 9689 + }, + { + "epoch": 1.1130894262248003, + "grad_norm": 0.5535327792167664, + "learning_rate": 0.0001, + "loss": 1.283, + "step": 9690 + }, + { + "epoch": 1.1132042961346276, + "grad_norm": 0.6567327976226807, + "learning_rate": 0.0001, + "loss": 1.6511, + "step": 9691 + }, + { + "epoch": 1.1133191660444546, + "grad_norm": 0.5440387725830078, + "learning_rate": 0.0001, + "loss": 1.4729, + "step": 9692 + }, + { + "epoch": 1.1134340359542818, + "grad_norm": 0.576004683971405, + "learning_rate": 0.0001, + "loss": 1.5678, + "step": 9693 + }, + { + "epoch": 1.1135489058641088, + "grad_norm": 0.5710541605949402, + "learning_rate": 0.0001, + "loss": 1.5312, + "step": 9694 + }, + { + "epoch": 1.113663775773936, + "grad_norm": 0.5407060384750366, + "learning_rate": 0.0001, + "loss": 1.4169, + "step": 9695 + }, + { + "epoch": 1.113778645683763, + "grad_norm": 0.618705689907074, + "learning_rate": 0.0001, + "loss": 1.7292, + "step": 9696 + }, + { + "epoch": 1.1138935155935903, + "grad_norm": 0.5561045408248901, + "learning_rate": 0.0001, + "loss": 1.4788, + "step": 9697 + }, + { + "epoch": 1.1140083855034173, + "grad_norm": 0.590338945388794, + "learning_rate": 0.0001, + "loss": 1.4074, + "step": 9698 + }, + { + "epoch": 1.1141232554132445, + "grad_norm": 0.5989879369735718, + "learning_rate": 0.0001, + "loss": 1.574, + "step": 9699 + }, + { + "epoch": 1.1142381253230715, + "grad_norm": 0.5720906853675842, + "learning_rate": 0.0001, + "loss": 1.7072, + "step": 9700 + }, + { + "epoch": 1.1143529952328988, + "grad_norm": 0.53765869140625, + "learning_rate": 0.0001, + "loss": 1.2634, + "step": 9701 + }, + { + "epoch": 1.1144678651427258, + "grad_norm": 0.5853990316390991, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 9702 + }, + { + "epoch": 1.114582735052553, + "grad_norm": 0.5783793926239014, + "learning_rate": 0.0001, + "loss": 1.5545, + "step": 9703 + }, + { + "epoch": 1.11469760496238, + "grad_norm": 0.5769542455673218, + "learning_rate": 0.0001, + "loss": 1.4782, + "step": 9704 + }, + { + "epoch": 1.1148124748722072, + "grad_norm": 0.56031334400177, + "learning_rate": 0.0001, + "loss": 1.4986, + "step": 9705 + }, + { + "epoch": 1.1149273447820343, + "grad_norm": 0.5601339936256409, + "learning_rate": 0.0001, + "loss": 1.2713, + "step": 9706 + }, + { + "epoch": 1.1150422146918615, + "grad_norm": 0.552882730960846, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 9707 + }, + { + "epoch": 1.1151570846016885, + "grad_norm": 0.513989269733429, + "learning_rate": 0.0001, + "loss": 1.4661, + "step": 9708 + }, + { + "epoch": 1.1152719545115157, + "grad_norm": 0.5489285588264465, + "learning_rate": 0.0001, + "loss": 1.456, + "step": 9709 + }, + { + "epoch": 1.1153868244213427, + "grad_norm": 0.5569881200790405, + "learning_rate": 0.0001, + "loss": 1.5918, + "step": 9710 + }, + { + "epoch": 1.11550169433117, + "grad_norm": 0.5199733972549438, + "learning_rate": 0.0001, + "loss": 1.3066, + "step": 9711 + }, + { + "epoch": 1.115616564240997, + "grad_norm": 0.5051583051681519, + "learning_rate": 0.0001, + "loss": 1.4748, + "step": 9712 + }, + { + "epoch": 1.1157314341508242, + "grad_norm": 0.5694435238838196, + "learning_rate": 0.0001, + "loss": 1.5148, + "step": 9713 + }, + { + "epoch": 1.1158463040606512, + "grad_norm": 0.5439704656600952, + "learning_rate": 0.0001, + "loss": 1.4556, + "step": 9714 + }, + { + "epoch": 1.1159611739704784, + "grad_norm": 0.5158405900001526, + "learning_rate": 0.0001, + "loss": 1.3826, + "step": 9715 + }, + { + "epoch": 1.1160760438803055, + "grad_norm": 0.5488100647926331, + "learning_rate": 0.0001, + "loss": 1.5873, + "step": 9716 + }, + { + "epoch": 1.1161909137901327, + "grad_norm": 0.5777077078819275, + "learning_rate": 0.0001, + "loss": 1.3642, + "step": 9717 + }, + { + "epoch": 1.1163057836999597, + "grad_norm": 0.5701835751533508, + "learning_rate": 0.0001, + "loss": 1.2141, + "step": 9718 + }, + { + "epoch": 1.116420653609787, + "grad_norm": 0.5419429540634155, + "learning_rate": 0.0001, + "loss": 1.4514, + "step": 9719 + }, + { + "epoch": 1.116535523519614, + "grad_norm": 0.5637021660804749, + "learning_rate": 0.0001, + "loss": 1.4831, + "step": 9720 + }, + { + "epoch": 1.1166503934294412, + "grad_norm": 0.6277347803115845, + "learning_rate": 0.0001, + "loss": 1.6107, + "step": 9721 + }, + { + "epoch": 1.1167652633392682, + "grad_norm": 0.6077524423599243, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 9722 + }, + { + "epoch": 1.1168801332490954, + "grad_norm": 0.579602837562561, + "learning_rate": 0.0001, + "loss": 1.5203, + "step": 9723 + }, + { + "epoch": 1.1169950031589224, + "grad_norm": 0.6332294344902039, + "learning_rate": 0.0001, + "loss": 1.5833, + "step": 9724 + }, + { + "epoch": 1.1171098730687496, + "grad_norm": 0.5484510064125061, + "learning_rate": 0.0001, + "loss": 1.4893, + "step": 9725 + }, + { + "epoch": 1.1172247429785767, + "grad_norm": 0.5461903214454651, + "learning_rate": 0.0001, + "loss": 1.424, + "step": 9726 + }, + { + "epoch": 1.1173396128884039, + "grad_norm": 0.5597689151763916, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 9727 + }, + { + "epoch": 1.1174544827982311, + "grad_norm": 0.5503045916557312, + "learning_rate": 0.0001, + "loss": 1.5391, + "step": 9728 + }, + { + "epoch": 1.1175693527080581, + "grad_norm": 0.5259599685668945, + "learning_rate": 0.0001, + "loss": 1.4662, + "step": 9729 + }, + { + "epoch": 1.1176842226178851, + "grad_norm": 0.5283674001693726, + "learning_rate": 0.0001, + "loss": 1.4768, + "step": 9730 + }, + { + "epoch": 1.1177990925277124, + "grad_norm": 0.5296016335487366, + "learning_rate": 0.0001, + "loss": 1.2765, + "step": 9731 + }, + { + "epoch": 1.1179139624375396, + "grad_norm": 0.528243899345398, + "learning_rate": 0.0001, + "loss": 1.3637, + "step": 9732 + }, + { + "epoch": 1.1180288323473666, + "grad_norm": 0.5380933880805969, + "learning_rate": 0.0001, + "loss": 1.5179, + "step": 9733 + }, + { + "epoch": 1.1181437022571936, + "grad_norm": 0.5619294047355652, + "learning_rate": 0.0001, + "loss": 1.3127, + "step": 9734 + }, + { + "epoch": 1.1182585721670208, + "grad_norm": 0.5538292527198792, + "learning_rate": 0.0001, + "loss": 1.4159, + "step": 9735 + }, + { + "epoch": 1.118373442076848, + "grad_norm": 0.5651335120201111, + "learning_rate": 0.0001, + "loss": 1.3661, + "step": 9736 + }, + { + "epoch": 1.118488311986675, + "grad_norm": 0.5316612720489502, + "learning_rate": 0.0001, + "loss": 1.5018, + "step": 9737 + }, + { + "epoch": 1.118603181896502, + "grad_norm": 0.5468516945838928, + "learning_rate": 0.0001, + "loss": 1.3585, + "step": 9738 + }, + { + "epoch": 1.1187180518063293, + "grad_norm": 0.60384202003479, + "learning_rate": 0.0001, + "loss": 1.5244, + "step": 9739 + }, + { + "epoch": 1.1188329217161566, + "grad_norm": 0.5294783711433411, + "learning_rate": 0.0001, + "loss": 1.6064, + "step": 9740 + }, + { + "epoch": 1.1189477916259836, + "grad_norm": 0.5466625690460205, + "learning_rate": 0.0001, + "loss": 1.5823, + "step": 9741 + }, + { + "epoch": 1.1190626615358106, + "grad_norm": 0.5397646427154541, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 9742 + }, + { + "epoch": 1.1191775314456378, + "grad_norm": 0.525026798248291, + "learning_rate": 0.0001, + "loss": 1.4643, + "step": 9743 + }, + { + "epoch": 1.119292401355465, + "grad_norm": 0.5274828672409058, + "learning_rate": 0.0001, + "loss": 1.2221, + "step": 9744 + }, + { + "epoch": 1.119407271265292, + "grad_norm": 0.5290265679359436, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 9745 + }, + { + "epoch": 1.1195221411751193, + "grad_norm": 0.5176373720169067, + "learning_rate": 0.0001, + "loss": 1.3871, + "step": 9746 + }, + { + "epoch": 1.1196370110849463, + "grad_norm": 0.5549317598342896, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 9747 + }, + { + "epoch": 1.1197518809947735, + "grad_norm": 0.5477745532989502, + "learning_rate": 0.0001, + "loss": 1.4009, + "step": 9748 + }, + { + "epoch": 1.1198667509046005, + "grad_norm": 0.5613062381744385, + "learning_rate": 0.0001, + "loss": 1.3842, + "step": 9749 + }, + { + "epoch": 1.1199816208144278, + "grad_norm": 0.5442327260971069, + "learning_rate": 0.0001, + "loss": 1.4955, + "step": 9750 + }, + { + "epoch": 1.1200964907242548, + "grad_norm": 0.5651378631591797, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 9751 + }, + { + "epoch": 1.120211360634082, + "grad_norm": 0.5338380336761475, + "learning_rate": 0.0001, + "loss": 1.2772, + "step": 9752 + }, + { + "epoch": 1.120326230543909, + "grad_norm": 0.5371922850608826, + "learning_rate": 0.0001, + "loss": 1.4111, + "step": 9753 + }, + { + "epoch": 1.1204411004537362, + "grad_norm": 0.49218225479125977, + "learning_rate": 0.0001, + "loss": 1.3432, + "step": 9754 + }, + { + "epoch": 1.1205559703635632, + "grad_norm": 0.5054636001586914, + "learning_rate": 0.0001, + "loss": 1.2946, + "step": 9755 + }, + { + "epoch": 1.1206708402733905, + "grad_norm": 0.5905545949935913, + "learning_rate": 0.0001, + "loss": 1.5725, + "step": 9756 + }, + { + "epoch": 1.1207857101832175, + "grad_norm": 0.5989205241203308, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 9757 + }, + { + "epoch": 1.1209005800930447, + "grad_norm": 0.5933013558387756, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 9758 + }, + { + "epoch": 1.1210154500028717, + "grad_norm": 0.5364729166030884, + "learning_rate": 0.0001, + "loss": 1.5455, + "step": 9759 + }, + { + "epoch": 1.121130319912699, + "grad_norm": 0.638589084148407, + "learning_rate": 0.0001, + "loss": 1.5624, + "step": 9760 + }, + { + "epoch": 1.121245189822526, + "grad_norm": 0.5210257172584534, + "learning_rate": 0.0001, + "loss": 1.3005, + "step": 9761 + }, + { + "epoch": 1.1213600597323532, + "grad_norm": 0.581584095954895, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 9762 + }, + { + "epoch": 1.1214749296421802, + "grad_norm": 0.5900195240974426, + "learning_rate": 0.0001, + "loss": 1.6183, + "step": 9763 + }, + { + "epoch": 1.1215897995520074, + "grad_norm": 0.5471107959747314, + "learning_rate": 0.0001, + "loss": 1.457, + "step": 9764 + }, + { + "epoch": 1.1217046694618344, + "grad_norm": 0.5870269536972046, + "learning_rate": 0.0001, + "loss": 1.3416, + "step": 9765 + }, + { + "epoch": 1.1218195393716617, + "grad_norm": 0.6071326732635498, + "learning_rate": 0.0001, + "loss": 1.362, + "step": 9766 + }, + { + "epoch": 1.1219344092814887, + "grad_norm": 0.5291370749473572, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 9767 + }, + { + "epoch": 1.122049279191316, + "grad_norm": 0.5899519920349121, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 9768 + }, + { + "epoch": 1.122164149101143, + "grad_norm": 0.5426907539367676, + "learning_rate": 0.0001, + "loss": 1.3719, + "step": 9769 + }, + { + "epoch": 1.1222790190109702, + "grad_norm": 0.634436309337616, + "learning_rate": 0.0001, + "loss": 1.6449, + "step": 9770 + }, + { + "epoch": 1.1223938889207972, + "grad_norm": 0.5499168038368225, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 9771 + }, + { + "epoch": 1.1225087588306244, + "grad_norm": 0.5340597629547119, + "learning_rate": 0.0001, + "loss": 1.5949, + "step": 9772 + }, + { + "epoch": 1.1226236287404514, + "grad_norm": 0.5629338622093201, + "learning_rate": 0.0001, + "loss": 1.4928, + "step": 9773 + }, + { + "epoch": 1.1227384986502786, + "grad_norm": 0.5041453242301941, + "learning_rate": 0.0001, + "loss": 1.2472, + "step": 9774 + }, + { + "epoch": 1.1228533685601056, + "grad_norm": 0.5762504935264587, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 9775 + }, + { + "epoch": 1.1229682384699329, + "grad_norm": 0.5604518055915833, + "learning_rate": 0.0001, + "loss": 1.452, + "step": 9776 + }, + { + "epoch": 1.1230831083797599, + "grad_norm": 0.5645705461502075, + "learning_rate": 0.0001, + "loss": 1.2915, + "step": 9777 + }, + { + "epoch": 1.1231979782895871, + "grad_norm": 0.47870761156082153, + "learning_rate": 0.0001, + "loss": 1.2856, + "step": 9778 + }, + { + "epoch": 1.1233128481994141, + "grad_norm": 0.5456746220588684, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 9779 + }, + { + "epoch": 1.1234277181092414, + "grad_norm": 0.5596235394477844, + "learning_rate": 0.0001, + "loss": 1.502, + "step": 9780 + }, + { + "epoch": 1.1235425880190684, + "grad_norm": 0.5665818452835083, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 9781 + }, + { + "epoch": 1.1236574579288956, + "grad_norm": 0.605762779712677, + "learning_rate": 0.0001, + "loss": 1.3177, + "step": 9782 + }, + { + "epoch": 1.1237723278387226, + "grad_norm": 0.5420957207679749, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 9783 + }, + { + "epoch": 1.1238871977485498, + "grad_norm": 0.6387947201728821, + "learning_rate": 0.0001, + "loss": 1.3037, + "step": 9784 + }, + { + "epoch": 1.1240020676583768, + "grad_norm": 0.5580546259880066, + "learning_rate": 0.0001, + "loss": 1.4796, + "step": 9785 + }, + { + "epoch": 1.124116937568204, + "grad_norm": 0.5426499843597412, + "learning_rate": 0.0001, + "loss": 1.5066, + "step": 9786 + }, + { + "epoch": 1.124231807478031, + "grad_norm": 0.5494298934936523, + "learning_rate": 0.0001, + "loss": 1.565, + "step": 9787 + }, + { + "epoch": 1.1243466773878583, + "grad_norm": 0.5614138245582581, + "learning_rate": 0.0001, + "loss": 1.4699, + "step": 9788 + }, + { + "epoch": 1.1244615472976853, + "grad_norm": 0.5716959238052368, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 9789 + }, + { + "epoch": 1.1245764172075126, + "grad_norm": 0.5259473919868469, + "learning_rate": 0.0001, + "loss": 1.4069, + "step": 9790 + }, + { + "epoch": 1.1246912871173396, + "grad_norm": 0.5336620211601257, + "learning_rate": 0.0001, + "loss": 1.5229, + "step": 9791 + }, + { + "epoch": 1.1248061570271668, + "grad_norm": 0.5299500226974487, + "learning_rate": 0.0001, + "loss": 1.5239, + "step": 9792 + }, + { + "epoch": 1.1249210269369938, + "grad_norm": 0.5537680387496948, + "learning_rate": 0.0001, + "loss": 1.3462, + "step": 9793 + }, + { + "epoch": 1.125035896846821, + "grad_norm": 0.5444726943969727, + "learning_rate": 0.0001, + "loss": 1.5284, + "step": 9794 + }, + { + "epoch": 1.125150766756648, + "grad_norm": 0.5653538107872009, + "learning_rate": 0.0001, + "loss": 1.7283, + "step": 9795 + }, + { + "epoch": 1.1252656366664753, + "grad_norm": 0.5434619784355164, + "learning_rate": 0.0001, + "loss": 1.2987, + "step": 9796 + }, + { + "epoch": 1.1253805065763023, + "grad_norm": 0.5725109577178955, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 9797 + }, + { + "epoch": 1.1254953764861295, + "grad_norm": 0.6203802227973938, + "learning_rate": 0.0001, + "loss": 1.5608, + "step": 9798 + }, + { + "epoch": 1.1256102463959565, + "grad_norm": 0.5914912223815918, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 9799 + }, + { + "epoch": 1.1257251163057838, + "grad_norm": 0.7401518225669861, + "learning_rate": 0.0001, + "loss": 1.5393, + "step": 9800 + }, + { + "epoch": 1.1258399862156108, + "grad_norm": 0.54278165102005, + "learning_rate": 0.0001, + "loss": 1.3736, + "step": 9801 + }, + { + "epoch": 1.125954856125438, + "grad_norm": 0.5903399586677551, + "learning_rate": 0.0001, + "loss": 1.6015, + "step": 9802 + }, + { + "epoch": 1.126069726035265, + "grad_norm": 0.5835331678390503, + "learning_rate": 0.0001, + "loss": 1.5263, + "step": 9803 + }, + { + "epoch": 1.1261845959450922, + "grad_norm": 0.5370138883590698, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 9804 + }, + { + "epoch": 1.1262994658549192, + "grad_norm": 0.6110140681266785, + "learning_rate": 0.0001, + "loss": 1.7323, + "step": 9805 + }, + { + "epoch": 1.1264143357647465, + "grad_norm": 0.5909807085990906, + "learning_rate": 0.0001, + "loss": 1.493, + "step": 9806 + }, + { + "epoch": 1.1265292056745735, + "grad_norm": 0.5648689270019531, + "learning_rate": 0.0001, + "loss": 1.4891, + "step": 9807 + }, + { + "epoch": 1.1266440755844007, + "grad_norm": 0.6406177282333374, + "learning_rate": 0.0001, + "loss": 1.6556, + "step": 9808 + }, + { + "epoch": 1.1267589454942277, + "grad_norm": 0.5629665851593018, + "learning_rate": 0.0001, + "loss": 1.4789, + "step": 9809 + }, + { + "epoch": 1.126873815404055, + "grad_norm": 0.5698572993278503, + "learning_rate": 0.0001, + "loss": 1.456, + "step": 9810 + }, + { + "epoch": 1.126988685313882, + "grad_norm": 0.5605424046516418, + "learning_rate": 0.0001, + "loss": 1.3712, + "step": 9811 + }, + { + "epoch": 1.1271035552237092, + "grad_norm": 0.5104329586029053, + "learning_rate": 0.0001, + "loss": 1.372, + "step": 9812 + }, + { + "epoch": 1.1272184251335362, + "grad_norm": 0.5657603144645691, + "learning_rate": 0.0001, + "loss": 1.3791, + "step": 9813 + }, + { + "epoch": 1.1273332950433634, + "grad_norm": 0.5272459983825684, + "learning_rate": 0.0001, + "loss": 1.3004, + "step": 9814 + }, + { + "epoch": 1.1274481649531904, + "grad_norm": 0.5541790127754211, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 9815 + }, + { + "epoch": 1.1275630348630177, + "grad_norm": 0.5871829986572266, + "learning_rate": 0.0001, + "loss": 1.5131, + "step": 9816 + }, + { + "epoch": 1.1276779047728447, + "grad_norm": 0.5780866742134094, + "learning_rate": 0.0001, + "loss": 1.561, + "step": 9817 + }, + { + "epoch": 1.127792774682672, + "grad_norm": 0.606268048286438, + "learning_rate": 0.0001, + "loss": 1.733, + "step": 9818 + }, + { + "epoch": 1.127907644592499, + "grad_norm": 0.559883713722229, + "learning_rate": 0.0001, + "loss": 1.4383, + "step": 9819 + }, + { + "epoch": 1.1280225145023262, + "grad_norm": 0.5222516655921936, + "learning_rate": 0.0001, + "loss": 1.3852, + "step": 9820 + }, + { + "epoch": 1.1281373844121532, + "grad_norm": 0.5751352310180664, + "learning_rate": 0.0001, + "loss": 1.5563, + "step": 9821 + }, + { + "epoch": 1.1282522543219804, + "grad_norm": 0.5874543786048889, + "learning_rate": 0.0001, + "loss": 1.6203, + "step": 9822 + }, + { + "epoch": 1.1283671242318074, + "grad_norm": 0.5984853506088257, + "learning_rate": 0.0001, + "loss": 1.5788, + "step": 9823 + }, + { + "epoch": 1.1284819941416346, + "grad_norm": 0.5243591070175171, + "learning_rate": 0.0001, + "loss": 1.2917, + "step": 9824 + }, + { + "epoch": 1.1285968640514616, + "grad_norm": 0.5501663088798523, + "learning_rate": 0.0001, + "loss": 1.5474, + "step": 9825 + }, + { + "epoch": 1.1287117339612889, + "grad_norm": 0.5242180824279785, + "learning_rate": 0.0001, + "loss": 1.3522, + "step": 9826 + }, + { + "epoch": 1.1288266038711159, + "grad_norm": 0.5722183585166931, + "learning_rate": 0.0001, + "loss": 1.4354, + "step": 9827 + }, + { + "epoch": 1.1289414737809431, + "grad_norm": 0.4917822778224945, + "learning_rate": 0.0001, + "loss": 1.3709, + "step": 9828 + }, + { + "epoch": 1.1290563436907701, + "grad_norm": 0.5814813375473022, + "learning_rate": 0.0001, + "loss": 1.7127, + "step": 9829 + }, + { + "epoch": 1.1291712136005974, + "grad_norm": 0.5022294521331787, + "learning_rate": 0.0001, + "loss": 1.4454, + "step": 9830 + }, + { + "epoch": 1.1292860835104244, + "grad_norm": 0.5753988027572632, + "learning_rate": 0.0001, + "loss": 1.5266, + "step": 9831 + }, + { + "epoch": 1.1294009534202516, + "grad_norm": 0.5708458423614502, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 9832 + }, + { + "epoch": 1.1295158233300786, + "grad_norm": 0.5471024513244629, + "learning_rate": 0.0001, + "loss": 1.2932, + "step": 9833 + }, + { + "epoch": 1.1296306932399058, + "grad_norm": 0.5528780817985535, + "learning_rate": 0.0001, + "loss": 1.3153, + "step": 9834 + }, + { + "epoch": 1.1297455631497328, + "grad_norm": 0.5226410627365112, + "learning_rate": 0.0001, + "loss": 1.4946, + "step": 9835 + }, + { + "epoch": 1.12986043305956, + "grad_norm": 0.5486363768577576, + "learning_rate": 0.0001, + "loss": 1.4376, + "step": 9836 + }, + { + "epoch": 1.129975302969387, + "grad_norm": 0.526906430721283, + "learning_rate": 0.0001, + "loss": 1.4453, + "step": 9837 + }, + { + "epoch": 1.1300901728792143, + "grad_norm": 0.5345343351364136, + "learning_rate": 0.0001, + "loss": 1.4311, + "step": 9838 + }, + { + "epoch": 1.1302050427890413, + "grad_norm": 0.6694420576095581, + "learning_rate": 0.0001, + "loss": 1.6289, + "step": 9839 + }, + { + "epoch": 1.1303199126988686, + "grad_norm": 0.5467296242713928, + "learning_rate": 0.0001, + "loss": 1.4171, + "step": 9840 + }, + { + "epoch": 1.1304347826086956, + "grad_norm": 0.6066096425056458, + "learning_rate": 0.0001, + "loss": 1.5134, + "step": 9841 + }, + { + "epoch": 1.1305496525185228, + "grad_norm": 0.5474829077720642, + "learning_rate": 0.0001, + "loss": 1.4532, + "step": 9842 + }, + { + "epoch": 1.1306645224283498, + "grad_norm": 0.5534715056419373, + "learning_rate": 0.0001, + "loss": 1.5475, + "step": 9843 + }, + { + "epoch": 1.130779392338177, + "grad_norm": 0.5239380598068237, + "learning_rate": 0.0001, + "loss": 1.4398, + "step": 9844 + }, + { + "epoch": 1.130894262248004, + "grad_norm": 0.6033841371536255, + "learning_rate": 0.0001, + "loss": 1.7539, + "step": 9845 + }, + { + "epoch": 1.1310091321578313, + "grad_norm": 0.5651752352714539, + "learning_rate": 0.0001, + "loss": 1.3981, + "step": 9846 + }, + { + "epoch": 1.1311240020676583, + "grad_norm": 0.5665850043296814, + "learning_rate": 0.0001, + "loss": 1.3451, + "step": 9847 + }, + { + "epoch": 1.1312388719774855, + "grad_norm": 0.5850709676742554, + "learning_rate": 0.0001, + "loss": 1.4922, + "step": 9848 + }, + { + "epoch": 1.1313537418873125, + "grad_norm": 0.5818759799003601, + "learning_rate": 0.0001, + "loss": 1.4969, + "step": 9849 + }, + { + "epoch": 1.1314686117971398, + "grad_norm": 0.59281986951828, + "learning_rate": 0.0001, + "loss": 1.4706, + "step": 9850 + }, + { + "epoch": 1.1315834817069668, + "grad_norm": 0.5399245023727417, + "learning_rate": 0.0001, + "loss": 1.3788, + "step": 9851 + }, + { + "epoch": 1.131698351616794, + "grad_norm": 0.5174872875213623, + "learning_rate": 0.0001, + "loss": 1.3944, + "step": 9852 + }, + { + "epoch": 1.131813221526621, + "grad_norm": 0.5072240829467773, + "learning_rate": 0.0001, + "loss": 1.3622, + "step": 9853 + }, + { + "epoch": 1.1319280914364482, + "grad_norm": 0.5090083479881287, + "learning_rate": 0.0001, + "loss": 1.4824, + "step": 9854 + }, + { + "epoch": 1.1320429613462752, + "grad_norm": 0.5711216926574707, + "learning_rate": 0.0001, + "loss": 1.4839, + "step": 9855 + }, + { + "epoch": 1.1321578312561025, + "grad_norm": 0.5663666129112244, + "learning_rate": 0.0001, + "loss": 1.4702, + "step": 9856 + }, + { + "epoch": 1.1322727011659297, + "grad_norm": 0.5527740120887756, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 9857 + }, + { + "epoch": 1.1323875710757567, + "grad_norm": 0.5693678855895996, + "learning_rate": 0.0001, + "loss": 1.534, + "step": 9858 + }, + { + "epoch": 1.1325024409855837, + "grad_norm": 0.5752173066139221, + "learning_rate": 0.0001, + "loss": 1.4533, + "step": 9859 + }, + { + "epoch": 1.132617310895411, + "grad_norm": 0.5374181866645813, + "learning_rate": 0.0001, + "loss": 1.2613, + "step": 9860 + }, + { + "epoch": 1.1327321808052382, + "grad_norm": 0.5573873519897461, + "learning_rate": 0.0001, + "loss": 1.4246, + "step": 9861 + }, + { + "epoch": 1.1328470507150652, + "grad_norm": 0.5458858609199524, + "learning_rate": 0.0001, + "loss": 1.355, + "step": 9862 + }, + { + "epoch": 1.1329619206248922, + "grad_norm": 0.5189163684844971, + "learning_rate": 0.0001, + "loss": 1.4186, + "step": 9863 + }, + { + "epoch": 1.1330767905347194, + "grad_norm": 0.5939661860466003, + "learning_rate": 0.0001, + "loss": 1.6622, + "step": 9864 + }, + { + "epoch": 1.1331916604445467, + "grad_norm": 0.5526455044746399, + "learning_rate": 0.0001, + "loss": 1.6, + "step": 9865 + }, + { + "epoch": 1.1333065303543737, + "grad_norm": 0.5535423755645752, + "learning_rate": 0.0001, + "loss": 1.5184, + "step": 9866 + }, + { + "epoch": 1.1334214002642007, + "grad_norm": 0.5559777617454529, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 9867 + }, + { + "epoch": 1.133536270174028, + "grad_norm": 0.5639746785163879, + "learning_rate": 0.0001, + "loss": 1.3472, + "step": 9868 + }, + { + "epoch": 1.1336511400838551, + "grad_norm": 0.5288705229759216, + "learning_rate": 0.0001, + "loss": 1.3665, + "step": 9869 + }, + { + "epoch": 1.1337660099936822, + "grad_norm": 0.6040565371513367, + "learning_rate": 0.0001, + "loss": 1.7257, + "step": 9870 + }, + { + "epoch": 1.1338808799035092, + "grad_norm": 0.5919288992881775, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 9871 + }, + { + "epoch": 1.1339957498133364, + "grad_norm": 0.5653237104415894, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 9872 + }, + { + "epoch": 1.1341106197231636, + "grad_norm": 0.5587108731269836, + "learning_rate": 0.0001, + "loss": 1.5307, + "step": 9873 + }, + { + "epoch": 1.1342254896329906, + "grad_norm": 0.5890237092971802, + "learning_rate": 0.0001, + "loss": 1.5432, + "step": 9874 + }, + { + "epoch": 1.1343403595428176, + "grad_norm": 0.5263911485671997, + "learning_rate": 0.0001, + "loss": 1.4927, + "step": 9875 + }, + { + "epoch": 1.1344552294526449, + "grad_norm": 0.494157075881958, + "learning_rate": 0.0001, + "loss": 1.3288, + "step": 9876 + }, + { + "epoch": 1.134570099362472, + "grad_norm": 0.5565462112426758, + "learning_rate": 0.0001, + "loss": 1.1901, + "step": 9877 + }, + { + "epoch": 1.1346849692722991, + "grad_norm": 0.5866313576698303, + "learning_rate": 0.0001, + "loss": 1.3815, + "step": 9878 + }, + { + "epoch": 1.1347998391821261, + "grad_norm": 0.5734443068504333, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 9879 + }, + { + "epoch": 1.1349147090919534, + "grad_norm": 0.533345103263855, + "learning_rate": 0.0001, + "loss": 1.4551, + "step": 9880 + }, + { + "epoch": 1.1350295790017806, + "grad_norm": 0.5391797423362732, + "learning_rate": 0.0001, + "loss": 1.4961, + "step": 9881 + }, + { + "epoch": 1.1351444489116076, + "grad_norm": 0.5512882471084595, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 9882 + }, + { + "epoch": 1.1352593188214346, + "grad_norm": 0.565726637840271, + "learning_rate": 0.0001, + "loss": 1.5068, + "step": 9883 + }, + { + "epoch": 1.1353741887312618, + "grad_norm": 0.5509957671165466, + "learning_rate": 0.0001, + "loss": 1.4008, + "step": 9884 + }, + { + "epoch": 1.135489058641089, + "grad_norm": 0.6037254333496094, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 9885 + }, + { + "epoch": 1.135603928550916, + "grad_norm": 0.5137581825256348, + "learning_rate": 0.0001, + "loss": 1.431, + "step": 9886 + }, + { + "epoch": 1.135718798460743, + "grad_norm": 0.5548390746116638, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 9887 + }, + { + "epoch": 1.1358336683705703, + "grad_norm": 0.538205087184906, + "learning_rate": 0.0001, + "loss": 1.5042, + "step": 9888 + }, + { + "epoch": 1.1359485382803975, + "grad_norm": 0.5580369830131531, + "learning_rate": 0.0001, + "loss": 1.4531, + "step": 9889 + }, + { + "epoch": 1.1360634081902246, + "grad_norm": 0.5717020034790039, + "learning_rate": 0.0001, + "loss": 1.3808, + "step": 9890 + }, + { + "epoch": 1.1361782781000518, + "grad_norm": 0.5775260329246521, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 9891 + }, + { + "epoch": 1.1362931480098788, + "grad_norm": 0.6119628548622131, + "learning_rate": 0.0001, + "loss": 1.5353, + "step": 9892 + }, + { + "epoch": 1.136408017919706, + "grad_norm": 0.533612072467804, + "learning_rate": 0.0001, + "loss": 1.2585, + "step": 9893 + }, + { + "epoch": 1.136522887829533, + "grad_norm": 0.5262476205825806, + "learning_rate": 0.0001, + "loss": 1.3074, + "step": 9894 + }, + { + "epoch": 1.1366377577393603, + "grad_norm": 0.5193493366241455, + "learning_rate": 0.0001, + "loss": 1.43, + "step": 9895 + }, + { + "epoch": 1.1367526276491873, + "grad_norm": 0.5410826206207275, + "learning_rate": 0.0001, + "loss": 1.5718, + "step": 9896 + }, + { + "epoch": 1.1368674975590145, + "grad_norm": 0.5434339642524719, + "learning_rate": 0.0001, + "loss": 1.4059, + "step": 9897 + }, + { + "epoch": 1.1369823674688415, + "grad_norm": 0.6266562342643738, + "learning_rate": 0.0001, + "loss": 1.6898, + "step": 9898 + }, + { + "epoch": 1.1370972373786687, + "grad_norm": 0.5530896782875061, + "learning_rate": 0.0001, + "loss": 1.4496, + "step": 9899 + }, + { + "epoch": 1.1372121072884958, + "grad_norm": 0.5319846868515015, + "learning_rate": 0.0001, + "loss": 1.4595, + "step": 9900 + }, + { + "epoch": 1.137326977198323, + "grad_norm": 0.5379259586334229, + "learning_rate": 0.0001, + "loss": 1.5903, + "step": 9901 + }, + { + "epoch": 1.13744184710815, + "grad_norm": 0.5139333009719849, + "learning_rate": 0.0001, + "loss": 1.3436, + "step": 9902 + }, + { + "epoch": 1.1375567170179772, + "grad_norm": 0.5401899218559265, + "learning_rate": 0.0001, + "loss": 1.7349, + "step": 9903 + }, + { + "epoch": 1.1376715869278042, + "grad_norm": 0.551706075668335, + "learning_rate": 0.0001, + "loss": 1.4543, + "step": 9904 + }, + { + "epoch": 1.1377864568376315, + "grad_norm": 0.5094159245491028, + "learning_rate": 0.0001, + "loss": 1.4717, + "step": 9905 + }, + { + "epoch": 1.1379013267474585, + "grad_norm": 0.514320433139801, + "learning_rate": 0.0001, + "loss": 1.275, + "step": 9906 + }, + { + "epoch": 1.1380161966572857, + "grad_norm": 0.5413798689842224, + "learning_rate": 0.0001, + "loss": 1.4261, + "step": 9907 + }, + { + "epoch": 1.1381310665671127, + "grad_norm": 0.6346654295921326, + "learning_rate": 0.0001, + "loss": 1.4869, + "step": 9908 + }, + { + "epoch": 1.13824593647694, + "grad_norm": 0.5805730223655701, + "learning_rate": 0.0001, + "loss": 1.4811, + "step": 9909 + }, + { + "epoch": 1.138360806386767, + "grad_norm": 0.5956717133522034, + "learning_rate": 0.0001, + "loss": 1.6149, + "step": 9910 + }, + { + "epoch": 1.1384756762965942, + "grad_norm": 0.5871737599372864, + "learning_rate": 0.0001, + "loss": 1.5696, + "step": 9911 + }, + { + "epoch": 1.1385905462064212, + "grad_norm": 0.6298748254776001, + "learning_rate": 0.0001, + "loss": 1.5124, + "step": 9912 + }, + { + "epoch": 1.1387054161162484, + "grad_norm": 0.5252795219421387, + "learning_rate": 0.0001, + "loss": 1.4858, + "step": 9913 + }, + { + "epoch": 1.1388202860260754, + "grad_norm": 0.5625512003898621, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 9914 + }, + { + "epoch": 1.1389351559359027, + "grad_norm": 0.47873494029045105, + "learning_rate": 0.0001, + "loss": 1.2768, + "step": 9915 + }, + { + "epoch": 1.1390500258457297, + "grad_norm": 0.5676445960998535, + "learning_rate": 0.0001, + "loss": 1.6487, + "step": 9916 + }, + { + "epoch": 1.139164895755557, + "grad_norm": 0.5738489031791687, + "learning_rate": 0.0001, + "loss": 1.5787, + "step": 9917 + }, + { + "epoch": 1.139279765665384, + "grad_norm": 0.5460545420646667, + "learning_rate": 0.0001, + "loss": 1.375, + "step": 9918 + }, + { + "epoch": 1.1393946355752111, + "grad_norm": 0.5179741382598877, + "learning_rate": 0.0001, + "loss": 1.3943, + "step": 9919 + }, + { + "epoch": 1.1395095054850382, + "grad_norm": 0.6625398993492126, + "learning_rate": 0.0001, + "loss": 1.7408, + "step": 9920 + }, + { + "epoch": 1.1396243753948654, + "grad_norm": 0.55277019739151, + "learning_rate": 0.0001, + "loss": 1.5409, + "step": 9921 + }, + { + "epoch": 1.1397392453046924, + "grad_norm": 0.5727016925811768, + "learning_rate": 0.0001, + "loss": 1.4051, + "step": 9922 + }, + { + "epoch": 1.1398541152145196, + "grad_norm": 0.5403141975402832, + "learning_rate": 0.0001, + "loss": 1.5129, + "step": 9923 + }, + { + "epoch": 1.1399689851243466, + "grad_norm": 0.5404056906700134, + "learning_rate": 0.0001, + "loss": 1.4899, + "step": 9924 + }, + { + "epoch": 1.1400838550341739, + "grad_norm": 0.5808719992637634, + "learning_rate": 0.0001, + "loss": 1.4641, + "step": 9925 + }, + { + "epoch": 1.1401987249440009, + "grad_norm": 0.5395628809928894, + "learning_rate": 0.0001, + "loss": 1.4619, + "step": 9926 + }, + { + "epoch": 1.140313594853828, + "grad_norm": 0.5930881500244141, + "learning_rate": 0.0001, + "loss": 1.52, + "step": 9927 + }, + { + "epoch": 1.1404284647636551, + "grad_norm": 0.5815712809562683, + "learning_rate": 0.0001, + "loss": 1.2868, + "step": 9928 + }, + { + "epoch": 1.1405433346734823, + "grad_norm": 0.5318073630332947, + "learning_rate": 0.0001, + "loss": 1.4842, + "step": 9929 + }, + { + "epoch": 1.1406582045833094, + "grad_norm": 0.5083526968955994, + "learning_rate": 0.0001, + "loss": 1.411, + "step": 9930 + }, + { + "epoch": 1.1407730744931366, + "grad_norm": 0.6382246017456055, + "learning_rate": 0.0001, + "loss": 1.6084, + "step": 9931 + }, + { + "epoch": 1.1408879444029636, + "grad_norm": 0.5410823225975037, + "learning_rate": 0.0001, + "loss": 1.3782, + "step": 9932 + }, + { + "epoch": 1.1410028143127908, + "grad_norm": 0.5354570746421814, + "learning_rate": 0.0001, + "loss": 1.5489, + "step": 9933 + }, + { + "epoch": 1.1411176842226178, + "grad_norm": 0.5121961832046509, + "learning_rate": 0.0001, + "loss": 1.4649, + "step": 9934 + }, + { + "epoch": 1.141232554132445, + "grad_norm": 0.5164211392402649, + "learning_rate": 0.0001, + "loss": 1.3373, + "step": 9935 + }, + { + "epoch": 1.141347424042272, + "grad_norm": 0.5433701276779175, + "learning_rate": 0.0001, + "loss": 1.4922, + "step": 9936 + }, + { + "epoch": 1.1414622939520993, + "grad_norm": 0.5362388491630554, + "learning_rate": 0.0001, + "loss": 1.2922, + "step": 9937 + }, + { + "epoch": 1.1415771638619263, + "grad_norm": 0.5343228578567505, + "learning_rate": 0.0001, + "loss": 1.3815, + "step": 9938 + }, + { + "epoch": 1.1416920337717535, + "grad_norm": 0.5795764327049255, + "learning_rate": 0.0001, + "loss": 1.5437, + "step": 9939 + }, + { + "epoch": 1.1418069036815806, + "grad_norm": 0.5852019190788269, + "learning_rate": 0.0001, + "loss": 1.463, + "step": 9940 + }, + { + "epoch": 1.1419217735914078, + "grad_norm": 0.5105196237564087, + "learning_rate": 0.0001, + "loss": 1.3497, + "step": 9941 + }, + { + "epoch": 1.1420366435012348, + "grad_norm": 0.5408787131309509, + "learning_rate": 0.0001, + "loss": 1.3446, + "step": 9942 + }, + { + "epoch": 1.142151513411062, + "grad_norm": 0.552517294883728, + "learning_rate": 0.0001, + "loss": 1.431, + "step": 9943 + }, + { + "epoch": 1.142266383320889, + "grad_norm": 0.5696591138839722, + "learning_rate": 0.0001, + "loss": 1.5123, + "step": 9944 + }, + { + "epoch": 1.1423812532307163, + "grad_norm": 0.560500979423523, + "learning_rate": 0.0001, + "loss": 1.5635, + "step": 9945 + }, + { + "epoch": 1.1424961231405433, + "grad_norm": 0.6407963037490845, + "learning_rate": 0.0001, + "loss": 1.5365, + "step": 9946 + }, + { + "epoch": 1.1426109930503705, + "grad_norm": 0.5425936579704285, + "learning_rate": 0.0001, + "loss": 1.3761, + "step": 9947 + }, + { + "epoch": 1.1427258629601975, + "grad_norm": 0.5866653323173523, + "learning_rate": 0.0001, + "loss": 1.6785, + "step": 9948 + }, + { + "epoch": 1.1428407328700247, + "grad_norm": 0.5960055589675903, + "learning_rate": 0.0001, + "loss": 1.3274, + "step": 9949 + }, + { + "epoch": 1.1429556027798518, + "grad_norm": 0.5556841492652893, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 9950 + }, + { + "epoch": 1.143070472689679, + "grad_norm": 0.5279529094696045, + "learning_rate": 0.0001, + "loss": 1.2829, + "step": 9951 + }, + { + "epoch": 1.143185342599506, + "grad_norm": 0.5272794961929321, + "learning_rate": 0.0001, + "loss": 1.5075, + "step": 9952 + }, + { + "epoch": 1.1433002125093332, + "grad_norm": 0.5824765563011169, + "learning_rate": 0.0001, + "loss": 1.5392, + "step": 9953 + }, + { + "epoch": 1.1434150824191602, + "grad_norm": 0.5903116464614868, + "learning_rate": 0.0001, + "loss": 1.6437, + "step": 9954 + }, + { + "epoch": 1.1435299523289875, + "grad_norm": 0.525478184223175, + "learning_rate": 0.0001, + "loss": 1.4775, + "step": 9955 + }, + { + "epoch": 1.1436448222388145, + "grad_norm": 0.4939769208431244, + "learning_rate": 0.0001, + "loss": 1.2772, + "step": 9956 + }, + { + "epoch": 1.1437596921486417, + "grad_norm": 0.5516688823699951, + "learning_rate": 0.0001, + "loss": 1.3937, + "step": 9957 + }, + { + "epoch": 1.1438745620584687, + "grad_norm": 0.5349141955375671, + "learning_rate": 0.0001, + "loss": 1.4102, + "step": 9958 + }, + { + "epoch": 1.143989431968296, + "grad_norm": 0.5700792670249939, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 9959 + }, + { + "epoch": 1.144104301878123, + "grad_norm": 0.5753479599952698, + "learning_rate": 0.0001, + "loss": 1.5559, + "step": 9960 + }, + { + "epoch": 1.1442191717879502, + "grad_norm": 0.49369415640830994, + "learning_rate": 0.0001, + "loss": 1.3815, + "step": 9961 + }, + { + "epoch": 1.1443340416977772, + "grad_norm": 0.5094558000564575, + "learning_rate": 0.0001, + "loss": 1.3144, + "step": 9962 + }, + { + "epoch": 1.1444489116076044, + "grad_norm": 0.5119041800498962, + "learning_rate": 0.0001, + "loss": 1.3715, + "step": 9963 + }, + { + "epoch": 1.1445637815174314, + "grad_norm": 0.7021076083183289, + "learning_rate": 0.0001, + "loss": 1.818, + "step": 9964 + }, + { + "epoch": 1.1446786514272587, + "grad_norm": 0.5357758402824402, + "learning_rate": 0.0001, + "loss": 1.5008, + "step": 9965 + }, + { + "epoch": 1.1447935213370857, + "grad_norm": 0.5504557490348816, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 9966 + }, + { + "epoch": 1.144908391246913, + "grad_norm": 0.5839464068412781, + "learning_rate": 0.0001, + "loss": 1.5019, + "step": 9967 + }, + { + "epoch": 1.14502326115674, + "grad_norm": 0.5492793917655945, + "learning_rate": 0.0001, + "loss": 1.6016, + "step": 9968 + }, + { + "epoch": 1.1451381310665671, + "grad_norm": 0.5798627138137817, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 9969 + }, + { + "epoch": 1.1452530009763942, + "grad_norm": 0.5143793225288391, + "learning_rate": 0.0001, + "loss": 1.429, + "step": 9970 + }, + { + "epoch": 1.1453678708862214, + "grad_norm": 0.5312758684158325, + "learning_rate": 0.0001, + "loss": 1.4962, + "step": 9971 + }, + { + "epoch": 1.1454827407960484, + "grad_norm": 0.5357524156570435, + "learning_rate": 0.0001, + "loss": 1.4713, + "step": 9972 + }, + { + "epoch": 1.1455976107058756, + "grad_norm": 0.5727779269218445, + "learning_rate": 0.0001, + "loss": 1.3171, + "step": 9973 + }, + { + "epoch": 1.1457124806157026, + "grad_norm": 0.5089569091796875, + "learning_rate": 0.0001, + "loss": 1.5816, + "step": 9974 + }, + { + "epoch": 1.1458273505255299, + "grad_norm": 0.587568998336792, + "learning_rate": 0.0001, + "loss": 1.4017, + "step": 9975 + }, + { + "epoch": 1.1459422204353569, + "grad_norm": 0.5800287127494812, + "learning_rate": 0.0001, + "loss": 1.5692, + "step": 9976 + }, + { + "epoch": 1.146057090345184, + "grad_norm": 0.5658348798751831, + "learning_rate": 0.0001, + "loss": 1.5379, + "step": 9977 + }, + { + "epoch": 1.1461719602550111, + "grad_norm": 0.5689564347267151, + "learning_rate": 0.0001, + "loss": 1.4476, + "step": 9978 + }, + { + "epoch": 1.1462868301648383, + "grad_norm": 0.5329977869987488, + "learning_rate": 0.0001, + "loss": 1.5136, + "step": 9979 + }, + { + "epoch": 1.1464017000746654, + "grad_norm": 0.5485842227935791, + "learning_rate": 0.0001, + "loss": 1.494, + "step": 9980 + }, + { + "epoch": 1.1465165699844926, + "grad_norm": 0.5831310153007507, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 9981 + }, + { + "epoch": 1.1466314398943196, + "grad_norm": 0.5669857263565063, + "learning_rate": 0.0001, + "loss": 1.436, + "step": 9982 + }, + { + "epoch": 1.1467463098041468, + "grad_norm": 0.569514274597168, + "learning_rate": 0.0001, + "loss": 1.6238, + "step": 9983 + }, + { + "epoch": 1.1468611797139738, + "grad_norm": 0.5632662773132324, + "learning_rate": 0.0001, + "loss": 1.345, + "step": 9984 + }, + { + "epoch": 1.146976049623801, + "grad_norm": 0.5908129215240479, + "learning_rate": 0.0001, + "loss": 1.4914, + "step": 9985 + }, + { + "epoch": 1.147090919533628, + "grad_norm": 0.6112525463104248, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 9986 + }, + { + "epoch": 1.1472057894434553, + "grad_norm": 0.5455618500709534, + "learning_rate": 0.0001, + "loss": 1.4891, + "step": 9987 + }, + { + "epoch": 1.1473206593532823, + "grad_norm": 0.5896759033203125, + "learning_rate": 0.0001, + "loss": 1.4798, + "step": 9988 + }, + { + "epoch": 1.1474355292631095, + "grad_norm": 0.571235179901123, + "learning_rate": 0.0001, + "loss": 1.5773, + "step": 9989 + }, + { + "epoch": 1.1475503991729366, + "grad_norm": 0.554031252861023, + "learning_rate": 0.0001, + "loss": 1.4231, + "step": 9990 + }, + { + "epoch": 1.1476652690827638, + "grad_norm": 0.5455310344696045, + "learning_rate": 0.0001, + "loss": 1.4302, + "step": 9991 + }, + { + "epoch": 1.1477801389925908, + "grad_norm": 0.5141569972038269, + "learning_rate": 0.0001, + "loss": 1.377, + "step": 9992 + }, + { + "epoch": 1.147895008902418, + "grad_norm": 0.5764080286026001, + "learning_rate": 0.0001, + "loss": 1.4583, + "step": 9993 + }, + { + "epoch": 1.1480098788122453, + "grad_norm": 0.5481144189834595, + "learning_rate": 0.0001, + "loss": 1.3084, + "step": 9994 + }, + { + "epoch": 1.1481247487220723, + "grad_norm": 0.5942389965057373, + "learning_rate": 0.0001, + "loss": 1.3149, + "step": 9995 + }, + { + "epoch": 1.1482396186318993, + "grad_norm": 0.5685340166091919, + "learning_rate": 0.0001, + "loss": 1.6043, + "step": 9996 + }, + { + "epoch": 1.1483544885417265, + "grad_norm": 0.5472145080566406, + "learning_rate": 0.0001, + "loss": 1.3976, + "step": 9997 + }, + { + "epoch": 1.1484693584515537, + "grad_norm": 0.6134904623031616, + "learning_rate": 0.0001, + "loss": 1.3928, + "step": 9998 + }, + { + "epoch": 1.1485842283613807, + "grad_norm": 0.5569812655448914, + "learning_rate": 0.0001, + "loss": 1.3422, + "step": 9999 + }, + { + "epoch": 1.1486990982712078, + "grad_norm": 0.5773634314537048, + "learning_rate": 0.0001, + "loss": 1.2924, + "step": 10000 + }, + { + "epoch": 1.148813968181035, + "grad_norm": 0.5295365452766418, + "learning_rate": 0.0001, + "loss": 1.4651, + "step": 10001 + }, + { + "epoch": 1.1489288380908622, + "grad_norm": 0.5416709780693054, + "learning_rate": 0.0001, + "loss": 1.3973, + "step": 10002 + }, + { + "epoch": 1.1490437080006892, + "grad_norm": 0.5793564319610596, + "learning_rate": 0.0001, + "loss": 1.4149, + "step": 10003 + }, + { + "epoch": 1.1491585779105162, + "grad_norm": 0.6285029649734497, + "learning_rate": 0.0001, + "loss": 1.4183, + "step": 10004 + }, + { + "epoch": 1.1492734478203435, + "grad_norm": 0.5785372853279114, + "learning_rate": 0.0001, + "loss": 1.2509, + "step": 10005 + }, + { + "epoch": 1.1493883177301707, + "grad_norm": 0.6486859321594238, + "learning_rate": 0.0001, + "loss": 1.6246, + "step": 10006 + }, + { + "epoch": 1.1495031876399977, + "grad_norm": 0.5618028044700623, + "learning_rate": 0.0001, + "loss": 1.4458, + "step": 10007 + }, + { + "epoch": 1.1496180575498247, + "grad_norm": 0.5516600012779236, + "learning_rate": 0.0001, + "loss": 1.3189, + "step": 10008 + }, + { + "epoch": 1.149732927459652, + "grad_norm": 0.5638642907142639, + "learning_rate": 0.0001, + "loss": 1.4698, + "step": 10009 + }, + { + "epoch": 1.1498477973694792, + "grad_norm": 0.5339157581329346, + "learning_rate": 0.0001, + "loss": 1.4751, + "step": 10010 + }, + { + "epoch": 1.1499626672793062, + "grad_norm": 0.5690274834632874, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 10011 + }, + { + "epoch": 1.1500775371891332, + "grad_norm": 0.5620351433753967, + "learning_rate": 0.0001, + "loss": 1.5383, + "step": 10012 + }, + { + "epoch": 1.1501924070989604, + "grad_norm": 0.5315772294998169, + "learning_rate": 0.0001, + "loss": 1.5289, + "step": 10013 + }, + { + "epoch": 1.1503072770087877, + "grad_norm": 0.5182132124900818, + "learning_rate": 0.0001, + "loss": 1.3737, + "step": 10014 + }, + { + "epoch": 1.1504221469186147, + "grad_norm": 0.5324050188064575, + "learning_rate": 0.0001, + "loss": 1.3595, + "step": 10015 + }, + { + "epoch": 1.1505370168284417, + "grad_norm": 0.5690122842788696, + "learning_rate": 0.0001, + "loss": 1.6227, + "step": 10016 + }, + { + "epoch": 1.150651886738269, + "grad_norm": 0.5757220983505249, + "learning_rate": 0.0001, + "loss": 1.4787, + "step": 10017 + }, + { + "epoch": 1.1507667566480961, + "grad_norm": 0.5967593789100647, + "learning_rate": 0.0001, + "loss": 1.6556, + "step": 10018 + }, + { + "epoch": 1.1508816265579231, + "grad_norm": 0.5282356142997742, + "learning_rate": 0.0001, + "loss": 1.4053, + "step": 10019 + }, + { + "epoch": 1.1509964964677502, + "grad_norm": 0.5435159206390381, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 10020 + }, + { + "epoch": 1.1511113663775774, + "grad_norm": 0.5466963052749634, + "learning_rate": 0.0001, + "loss": 1.3257, + "step": 10021 + }, + { + "epoch": 1.1512262362874046, + "grad_norm": 0.5418528914451599, + "learning_rate": 0.0001, + "loss": 1.4969, + "step": 10022 + }, + { + "epoch": 1.1513411061972316, + "grad_norm": 0.6105648875236511, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 10023 + }, + { + "epoch": 1.1514559761070586, + "grad_norm": 0.5551570057868958, + "learning_rate": 0.0001, + "loss": 1.5054, + "step": 10024 + }, + { + "epoch": 1.1515708460168859, + "grad_norm": 0.5221659541130066, + "learning_rate": 0.0001, + "loss": 1.3441, + "step": 10025 + }, + { + "epoch": 1.151685715926713, + "grad_norm": 0.6351895332336426, + "learning_rate": 0.0001, + "loss": 1.5949, + "step": 10026 + }, + { + "epoch": 1.15180058583654, + "grad_norm": 0.5757087469100952, + "learning_rate": 0.0001, + "loss": 1.4843, + "step": 10027 + }, + { + "epoch": 1.1519154557463673, + "grad_norm": 0.5461332201957703, + "learning_rate": 0.0001, + "loss": 1.4231, + "step": 10028 + }, + { + "epoch": 1.1520303256561943, + "grad_norm": 0.5618088841438293, + "learning_rate": 0.0001, + "loss": 1.2567, + "step": 10029 + }, + { + "epoch": 1.1521451955660216, + "grad_norm": 0.668555736541748, + "learning_rate": 0.0001, + "loss": 1.4302, + "step": 10030 + }, + { + "epoch": 1.1522600654758486, + "grad_norm": 0.5682727694511414, + "learning_rate": 0.0001, + "loss": 1.3702, + "step": 10031 + }, + { + "epoch": 1.1523749353856758, + "grad_norm": 0.531745195388794, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 10032 + }, + { + "epoch": 1.1524898052955028, + "grad_norm": 0.5723196864128113, + "learning_rate": 0.0001, + "loss": 1.4564, + "step": 10033 + }, + { + "epoch": 1.15260467520533, + "grad_norm": 0.5728488564491272, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 10034 + }, + { + "epoch": 1.152719545115157, + "grad_norm": 0.528705894947052, + "learning_rate": 0.0001, + "loss": 1.4234, + "step": 10035 + }, + { + "epoch": 1.1528344150249843, + "grad_norm": 0.5178189873695374, + "learning_rate": 0.0001, + "loss": 1.3147, + "step": 10036 + }, + { + "epoch": 1.1529492849348113, + "grad_norm": 0.6135823130607605, + "learning_rate": 0.0001, + "loss": 1.4328, + "step": 10037 + }, + { + "epoch": 1.1530641548446385, + "grad_norm": 0.5911353230476379, + "learning_rate": 0.0001, + "loss": 1.1557, + "step": 10038 + }, + { + "epoch": 1.1531790247544655, + "grad_norm": 0.5582436919212341, + "learning_rate": 0.0001, + "loss": 1.2749, + "step": 10039 + }, + { + "epoch": 1.1532938946642928, + "grad_norm": 0.5512915849685669, + "learning_rate": 0.0001, + "loss": 1.4394, + "step": 10040 + }, + { + "epoch": 1.1534087645741198, + "grad_norm": 0.5502411127090454, + "learning_rate": 0.0001, + "loss": 1.2865, + "step": 10041 + }, + { + "epoch": 1.153523634483947, + "grad_norm": 0.5328064560890198, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 10042 + }, + { + "epoch": 1.153638504393774, + "grad_norm": 0.4983726441860199, + "learning_rate": 0.0001, + "loss": 1.2559, + "step": 10043 + }, + { + "epoch": 1.1537533743036013, + "grad_norm": 0.6484810709953308, + "learning_rate": 0.0001, + "loss": 1.77, + "step": 10044 + }, + { + "epoch": 1.1538682442134283, + "grad_norm": 0.5316923260688782, + "learning_rate": 0.0001, + "loss": 1.3421, + "step": 10045 + }, + { + "epoch": 1.1539831141232555, + "grad_norm": 0.5837815403938293, + "learning_rate": 0.0001, + "loss": 1.5662, + "step": 10046 + }, + { + "epoch": 1.1540979840330825, + "grad_norm": 0.557544469833374, + "learning_rate": 0.0001, + "loss": 1.6774, + "step": 10047 + }, + { + "epoch": 1.1542128539429097, + "grad_norm": 0.5785412192344666, + "learning_rate": 0.0001, + "loss": 1.4416, + "step": 10048 + }, + { + "epoch": 1.1543277238527367, + "grad_norm": 0.5959882140159607, + "learning_rate": 0.0001, + "loss": 1.5142, + "step": 10049 + }, + { + "epoch": 1.154442593762564, + "grad_norm": 0.5870964527130127, + "learning_rate": 0.0001, + "loss": 1.4598, + "step": 10050 + }, + { + "epoch": 1.154557463672391, + "grad_norm": 0.6367591619491577, + "learning_rate": 0.0001, + "loss": 1.3788, + "step": 10051 + }, + { + "epoch": 1.1546723335822182, + "grad_norm": 0.5441646575927734, + "learning_rate": 0.0001, + "loss": 1.4919, + "step": 10052 + }, + { + "epoch": 1.1547872034920452, + "grad_norm": 0.5900537967681885, + "learning_rate": 0.0001, + "loss": 1.2413, + "step": 10053 + }, + { + "epoch": 1.1549020734018725, + "grad_norm": 0.6054137945175171, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 10054 + }, + { + "epoch": 1.1550169433116995, + "grad_norm": 0.5724448561668396, + "learning_rate": 0.0001, + "loss": 1.3456, + "step": 10055 + }, + { + "epoch": 1.1551318132215267, + "grad_norm": 0.6357987523078918, + "learning_rate": 0.0001, + "loss": 1.5517, + "step": 10056 + }, + { + "epoch": 1.1552466831313537, + "grad_norm": 0.5552554130554199, + "learning_rate": 0.0001, + "loss": 1.5688, + "step": 10057 + }, + { + "epoch": 1.155361553041181, + "grad_norm": 0.5502124428749084, + "learning_rate": 0.0001, + "loss": 1.3949, + "step": 10058 + }, + { + "epoch": 1.155476422951008, + "grad_norm": 0.5750370025634766, + "learning_rate": 0.0001, + "loss": 1.4175, + "step": 10059 + }, + { + "epoch": 1.1555912928608352, + "grad_norm": 0.5867468118667603, + "learning_rate": 0.0001, + "loss": 1.5483, + "step": 10060 + }, + { + "epoch": 1.1557061627706622, + "grad_norm": 0.5584085583686829, + "learning_rate": 0.0001, + "loss": 1.3498, + "step": 10061 + }, + { + "epoch": 1.1558210326804894, + "grad_norm": 0.5806280970573425, + "learning_rate": 0.0001, + "loss": 1.4329, + "step": 10062 + }, + { + "epoch": 1.1559359025903164, + "grad_norm": 0.6064799427986145, + "learning_rate": 0.0001, + "loss": 1.5273, + "step": 10063 + }, + { + "epoch": 1.1560507725001437, + "grad_norm": 0.5346385836601257, + "learning_rate": 0.0001, + "loss": 1.4026, + "step": 10064 + }, + { + "epoch": 1.1561656424099707, + "grad_norm": 0.5625292062759399, + "learning_rate": 0.0001, + "loss": 1.4929, + "step": 10065 + }, + { + "epoch": 1.156280512319798, + "grad_norm": 0.5931135416030884, + "learning_rate": 0.0001, + "loss": 1.4473, + "step": 10066 + }, + { + "epoch": 1.156395382229625, + "grad_norm": 0.5838515162467957, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 10067 + }, + { + "epoch": 1.1565102521394521, + "grad_norm": 0.5748473405838013, + "learning_rate": 0.0001, + "loss": 1.5563, + "step": 10068 + }, + { + "epoch": 1.1566251220492791, + "grad_norm": 0.5717789530754089, + "learning_rate": 0.0001, + "loss": 1.359, + "step": 10069 + }, + { + "epoch": 1.1567399919591064, + "grad_norm": 0.5722107887268066, + "learning_rate": 0.0001, + "loss": 1.6154, + "step": 10070 + }, + { + "epoch": 1.1568548618689334, + "grad_norm": 0.5895611643791199, + "learning_rate": 0.0001, + "loss": 1.4712, + "step": 10071 + }, + { + "epoch": 1.1569697317787606, + "grad_norm": 0.5189610719680786, + "learning_rate": 0.0001, + "loss": 1.4458, + "step": 10072 + }, + { + "epoch": 1.1570846016885876, + "grad_norm": 0.5800937414169312, + "learning_rate": 0.0001, + "loss": 1.5325, + "step": 10073 + }, + { + "epoch": 1.1571994715984149, + "grad_norm": 0.5774216055870056, + "learning_rate": 0.0001, + "loss": 1.3827, + "step": 10074 + }, + { + "epoch": 1.1573143415082419, + "grad_norm": 0.6525434255599976, + "learning_rate": 0.0001, + "loss": 1.655, + "step": 10075 + }, + { + "epoch": 1.157429211418069, + "grad_norm": 0.5682554244995117, + "learning_rate": 0.0001, + "loss": 1.5512, + "step": 10076 + }, + { + "epoch": 1.157544081327896, + "grad_norm": 0.606096088886261, + "learning_rate": 0.0001, + "loss": 1.4762, + "step": 10077 + }, + { + "epoch": 1.1576589512377233, + "grad_norm": 0.5647655129432678, + "learning_rate": 0.0001, + "loss": 1.6138, + "step": 10078 + }, + { + "epoch": 1.1577738211475503, + "grad_norm": 0.580194354057312, + "learning_rate": 0.0001, + "loss": 1.5909, + "step": 10079 + }, + { + "epoch": 1.1578886910573776, + "grad_norm": 0.5711573958396912, + "learning_rate": 0.0001, + "loss": 1.5894, + "step": 10080 + }, + { + "epoch": 1.1580035609672046, + "grad_norm": 0.5933459401130676, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 10081 + }, + { + "epoch": 1.1581184308770318, + "grad_norm": 0.5333853960037231, + "learning_rate": 0.0001, + "loss": 1.3838, + "step": 10082 + }, + { + "epoch": 1.1582333007868588, + "grad_norm": 0.5227363705635071, + "learning_rate": 0.0001, + "loss": 1.5504, + "step": 10083 + }, + { + "epoch": 1.158348170696686, + "grad_norm": 0.5608334541320801, + "learning_rate": 0.0001, + "loss": 1.5402, + "step": 10084 + }, + { + "epoch": 1.158463040606513, + "grad_norm": 0.5283211469650269, + "learning_rate": 0.0001, + "loss": 1.2809, + "step": 10085 + }, + { + "epoch": 1.1585779105163403, + "grad_norm": 0.5522334575653076, + "learning_rate": 0.0001, + "loss": 1.4767, + "step": 10086 + }, + { + "epoch": 1.1586927804261673, + "grad_norm": 0.5605513453483582, + "learning_rate": 0.0001, + "loss": 1.3811, + "step": 10087 + }, + { + "epoch": 1.1588076503359945, + "grad_norm": 0.584088146686554, + "learning_rate": 0.0001, + "loss": 1.3513, + "step": 10088 + }, + { + "epoch": 1.1589225202458215, + "grad_norm": 0.5514279007911682, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 10089 + }, + { + "epoch": 1.1590373901556488, + "grad_norm": 0.5283199548721313, + "learning_rate": 0.0001, + "loss": 1.4285, + "step": 10090 + }, + { + "epoch": 1.1591522600654758, + "grad_norm": 0.5820270776748657, + "learning_rate": 0.0001, + "loss": 1.6182, + "step": 10091 + }, + { + "epoch": 1.159267129975303, + "grad_norm": 0.5973749160766602, + "learning_rate": 0.0001, + "loss": 1.3123, + "step": 10092 + }, + { + "epoch": 1.15938199988513, + "grad_norm": 0.5737087726593018, + "learning_rate": 0.0001, + "loss": 1.5244, + "step": 10093 + }, + { + "epoch": 1.1594968697949573, + "grad_norm": 0.5605661273002625, + "learning_rate": 0.0001, + "loss": 1.5358, + "step": 10094 + }, + { + "epoch": 1.1596117397047843, + "grad_norm": 0.5315190553665161, + "learning_rate": 0.0001, + "loss": 1.4233, + "step": 10095 + }, + { + "epoch": 1.1597266096146115, + "grad_norm": 0.5676159262657166, + "learning_rate": 0.0001, + "loss": 1.48, + "step": 10096 + }, + { + "epoch": 1.1598414795244385, + "grad_norm": 0.5562459826469421, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 10097 + }, + { + "epoch": 1.1599563494342657, + "grad_norm": 0.5750492215156555, + "learning_rate": 0.0001, + "loss": 1.5335, + "step": 10098 + }, + { + "epoch": 1.1600712193440927, + "grad_norm": 0.565864622592926, + "learning_rate": 0.0001, + "loss": 1.3542, + "step": 10099 + }, + { + "epoch": 1.16018608925392, + "grad_norm": 0.5741795897483826, + "learning_rate": 0.0001, + "loss": 1.5441, + "step": 10100 + }, + { + "epoch": 1.160300959163747, + "grad_norm": 0.5474345088005066, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 10101 + }, + { + "epoch": 1.1604158290735742, + "grad_norm": 0.5959522724151611, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 10102 + }, + { + "epoch": 1.1605306989834012, + "grad_norm": 0.5602507591247559, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 10103 + }, + { + "epoch": 1.1606455688932285, + "grad_norm": 0.5457338690757751, + "learning_rate": 0.0001, + "loss": 1.4527, + "step": 10104 + }, + { + "epoch": 1.1607604388030555, + "grad_norm": 0.5657529830932617, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 10105 + }, + { + "epoch": 1.1608753087128827, + "grad_norm": 0.5475678443908691, + "learning_rate": 0.0001, + "loss": 1.51, + "step": 10106 + }, + { + "epoch": 1.1609901786227097, + "grad_norm": 0.5341169238090515, + "learning_rate": 0.0001, + "loss": 1.5948, + "step": 10107 + }, + { + "epoch": 1.161105048532537, + "grad_norm": 0.5745497345924377, + "learning_rate": 0.0001, + "loss": 1.5087, + "step": 10108 + }, + { + "epoch": 1.161219918442364, + "grad_norm": 0.56389319896698, + "learning_rate": 0.0001, + "loss": 1.4402, + "step": 10109 + }, + { + "epoch": 1.1613347883521912, + "grad_norm": 0.5795552134513855, + "learning_rate": 0.0001, + "loss": 1.4112, + "step": 10110 + }, + { + "epoch": 1.1614496582620182, + "grad_norm": 0.5938699841499329, + "learning_rate": 0.0001, + "loss": 1.453, + "step": 10111 + }, + { + "epoch": 1.1615645281718454, + "grad_norm": 0.5595149993896484, + "learning_rate": 0.0001, + "loss": 1.4956, + "step": 10112 + }, + { + "epoch": 1.1616793980816724, + "grad_norm": 0.6180020570755005, + "learning_rate": 0.0001, + "loss": 1.6828, + "step": 10113 + }, + { + "epoch": 1.1617942679914997, + "grad_norm": 0.5853464603424072, + "learning_rate": 0.0001, + "loss": 1.3735, + "step": 10114 + }, + { + "epoch": 1.1619091379013267, + "grad_norm": 0.5351958870887756, + "learning_rate": 0.0001, + "loss": 1.3615, + "step": 10115 + }, + { + "epoch": 1.162024007811154, + "grad_norm": 0.5981417894363403, + "learning_rate": 0.0001, + "loss": 1.518, + "step": 10116 + }, + { + "epoch": 1.162138877720981, + "grad_norm": 0.5747444033622742, + "learning_rate": 0.0001, + "loss": 1.5427, + "step": 10117 + }, + { + "epoch": 1.1622537476308081, + "grad_norm": 0.5646340250968933, + "learning_rate": 0.0001, + "loss": 1.5378, + "step": 10118 + }, + { + "epoch": 1.1623686175406351, + "grad_norm": 0.5687159299850464, + "learning_rate": 0.0001, + "loss": 1.5357, + "step": 10119 + }, + { + "epoch": 1.1624834874504624, + "grad_norm": 0.5587418079376221, + "learning_rate": 0.0001, + "loss": 1.4571, + "step": 10120 + }, + { + "epoch": 1.1625983573602894, + "grad_norm": 0.575074315071106, + "learning_rate": 0.0001, + "loss": 1.3044, + "step": 10121 + }, + { + "epoch": 1.1627132272701166, + "grad_norm": 0.6066795587539673, + "learning_rate": 0.0001, + "loss": 1.7487, + "step": 10122 + }, + { + "epoch": 1.1628280971799436, + "grad_norm": 0.6262649893760681, + "learning_rate": 0.0001, + "loss": 1.5702, + "step": 10123 + }, + { + "epoch": 1.1629429670897709, + "grad_norm": 0.5629507303237915, + "learning_rate": 0.0001, + "loss": 1.4479, + "step": 10124 + }, + { + "epoch": 1.1630578369995979, + "grad_norm": 0.6446200013160706, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 10125 + }, + { + "epoch": 1.163172706909425, + "grad_norm": 0.567599356174469, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 10126 + }, + { + "epoch": 1.163287576819252, + "grad_norm": 0.5245000720024109, + "learning_rate": 0.0001, + "loss": 1.3432, + "step": 10127 + }, + { + "epoch": 1.1634024467290793, + "grad_norm": 0.6057835817337036, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 10128 + }, + { + "epoch": 1.1635173166389063, + "grad_norm": 0.5868533253669739, + "learning_rate": 0.0001, + "loss": 1.6142, + "step": 10129 + }, + { + "epoch": 1.1636321865487336, + "grad_norm": 0.533150315284729, + "learning_rate": 0.0001, + "loss": 1.4869, + "step": 10130 + }, + { + "epoch": 1.1637470564585608, + "grad_norm": 0.6112939715385437, + "learning_rate": 0.0001, + "loss": 1.3912, + "step": 10131 + }, + { + "epoch": 1.1638619263683878, + "grad_norm": 0.5236315131187439, + "learning_rate": 0.0001, + "loss": 1.4242, + "step": 10132 + }, + { + "epoch": 1.1639767962782148, + "grad_norm": 0.5508266091346741, + "learning_rate": 0.0001, + "loss": 1.582, + "step": 10133 + }, + { + "epoch": 1.164091666188042, + "grad_norm": 0.5487030744552612, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 10134 + }, + { + "epoch": 1.1642065360978693, + "grad_norm": 0.6089741587638855, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 10135 + }, + { + "epoch": 1.1643214060076963, + "grad_norm": 0.6037748456001282, + "learning_rate": 0.0001, + "loss": 1.3875, + "step": 10136 + }, + { + "epoch": 1.1644362759175233, + "grad_norm": 0.5815845131874084, + "learning_rate": 0.0001, + "loss": 1.5777, + "step": 10137 + }, + { + "epoch": 1.1645511458273505, + "grad_norm": 0.5638243556022644, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 10138 + }, + { + "epoch": 1.1646660157371778, + "grad_norm": 0.546511173248291, + "learning_rate": 0.0001, + "loss": 1.4114, + "step": 10139 + }, + { + "epoch": 1.1647808856470048, + "grad_norm": 0.5502268671989441, + "learning_rate": 0.0001, + "loss": 1.4463, + "step": 10140 + }, + { + "epoch": 1.1648957555568318, + "grad_norm": 0.5502709150314331, + "learning_rate": 0.0001, + "loss": 1.4657, + "step": 10141 + }, + { + "epoch": 1.165010625466659, + "grad_norm": 0.5224112868309021, + "learning_rate": 0.0001, + "loss": 1.2935, + "step": 10142 + }, + { + "epoch": 1.1651254953764862, + "grad_norm": 0.5977753400802612, + "learning_rate": 0.0001, + "loss": 1.4873, + "step": 10143 + }, + { + "epoch": 1.1652403652863133, + "grad_norm": 0.5752722024917603, + "learning_rate": 0.0001, + "loss": 1.4752, + "step": 10144 + }, + { + "epoch": 1.1653552351961403, + "grad_norm": 0.5518738031387329, + "learning_rate": 0.0001, + "loss": 1.3145, + "step": 10145 + }, + { + "epoch": 1.1654701051059675, + "grad_norm": 0.5660110712051392, + "learning_rate": 0.0001, + "loss": 1.4595, + "step": 10146 + }, + { + "epoch": 1.1655849750157947, + "grad_norm": 0.6433327198028564, + "learning_rate": 0.0001, + "loss": 1.7201, + "step": 10147 + }, + { + "epoch": 1.1656998449256217, + "grad_norm": 0.5778366327285767, + "learning_rate": 0.0001, + "loss": 1.5793, + "step": 10148 + }, + { + "epoch": 1.1658147148354487, + "grad_norm": 0.5929848551750183, + "learning_rate": 0.0001, + "loss": 1.3827, + "step": 10149 + }, + { + "epoch": 1.165929584745276, + "grad_norm": 0.5661731958389282, + "learning_rate": 0.0001, + "loss": 1.3457, + "step": 10150 + }, + { + "epoch": 1.1660444546551032, + "grad_norm": 0.5540391802787781, + "learning_rate": 0.0001, + "loss": 1.513, + "step": 10151 + }, + { + "epoch": 1.1661593245649302, + "grad_norm": 0.5562129616737366, + "learning_rate": 0.0001, + "loss": 1.5181, + "step": 10152 + }, + { + "epoch": 1.1662741944747572, + "grad_norm": 0.5246435403823853, + "learning_rate": 0.0001, + "loss": 1.4335, + "step": 10153 + }, + { + "epoch": 1.1663890643845845, + "grad_norm": 0.650452733039856, + "learning_rate": 0.0001, + "loss": 1.7906, + "step": 10154 + }, + { + "epoch": 1.1665039342944117, + "grad_norm": 0.5996583104133606, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 10155 + }, + { + "epoch": 1.1666188042042387, + "grad_norm": 0.52683424949646, + "learning_rate": 0.0001, + "loss": 1.2967, + "step": 10156 + }, + { + "epoch": 1.1667336741140657, + "grad_norm": 0.5419752597808838, + "learning_rate": 0.0001, + "loss": 1.5335, + "step": 10157 + }, + { + "epoch": 1.166848544023893, + "grad_norm": 0.5371859073638916, + "learning_rate": 0.0001, + "loss": 1.2567, + "step": 10158 + }, + { + "epoch": 1.1669634139337202, + "grad_norm": 0.6510452032089233, + "learning_rate": 0.0001, + "loss": 1.7457, + "step": 10159 + }, + { + "epoch": 1.1670782838435472, + "grad_norm": 0.5702193379402161, + "learning_rate": 0.0001, + "loss": 1.5441, + "step": 10160 + }, + { + "epoch": 1.1671931537533742, + "grad_norm": 0.5725999474525452, + "learning_rate": 0.0001, + "loss": 1.5023, + "step": 10161 + }, + { + "epoch": 1.1673080236632014, + "grad_norm": 0.5554792284965515, + "learning_rate": 0.0001, + "loss": 1.4167, + "step": 10162 + }, + { + "epoch": 1.1674228935730286, + "grad_norm": 0.5548241138458252, + "learning_rate": 0.0001, + "loss": 1.3767, + "step": 10163 + }, + { + "epoch": 1.1675377634828557, + "grad_norm": 0.6183516979217529, + "learning_rate": 0.0001, + "loss": 1.626, + "step": 10164 + }, + { + "epoch": 1.1676526333926829, + "grad_norm": 0.5450295209884644, + "learning_rate": 0.0001, + "loss": 1.5589, + "step": 10165 + }, + { + "epoch": 1.16776750330251, + "grad_norm": 0.5990351438522339, + "learning_rate": 0.0001, + "loss": 1.544, + "step": 10166 + }, + { + "epoch": 1.1678823732123371, + "grad_norm": 0.5059851408004761, + "learning_rate": 0.0001, + "loss": 1.4095, + "step": 10167 + }, + { + "epoch": 1.1679972431221641, + "grad_norm": 0.534441351890564, + "learning_rate": 0.0001, + "loss": 1.5801, + "step": 10168 + }, + { + "epoch": 1.1681121130319914, + "grad_norm": 0.5403814911842346, + "learning_rate": 0.0001, + "loss": 1.4503, + "step": 10169 + }, + { + "epoch": 1.1682269829418184, + "grad_norm": 0.5525118112564087, + "learning_rate": 0.0001, + "loss": 1.2902, + "step": 10170 + }, + { + "epoch": 1.1683418528516456, + "grad_norm": 0.5069050788879395, + "learning_rate": 0.0001, + "loss": 1.3002, + "step": 10171 + }, + { + "epoch": 1.1684567227614726, + "grad_norm": 0.5735329389572144, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 10172 + }, + { + "epoch": 1.1685715926712998, + "grad_norm": 0.5312854647636414, + "learning_rate": 0.0001, + "loss": 1.6037, + "step": 10173 + }, + { + "epoch": 1.1686864625811269, + "grad_norm": 0.5686579346656799, + "learning_rate": 0.0001, + "loss": 1.5586, + "step": 10174 + }, + { + "epoch": 1.168801332490954, + "grad_norm": 0.5913828015327454, + "learning_rate": 0.0001, + "loss": 1.4098, + "step": 10175 + }, + { + "epoch": 1.168916202400781, + "grad_norm": 0.5945882797241211, + "learning_rate": 0.0001, + "loss": 1.1721, + "step": 10176 + }, + { + "epoch": 1.1690310723106083, + "grad_norm": 0.565756618976593, + "learning_rate": 0.0001, + "loss": 1.3638, + "step": 10177 + }, + { + "epoch": 1.1691459422204353, + "grad_norm": 0.557446300983429, + "learning_rate": 0.0001, + "loss": 1.3487, + "step": 10178 + }, + { + "epoch": 1.1692608121302626, + "grad_norm": 0.5504045486450195, + "learning_rate": 0.0001, + "loss": 1.4271, + "step": 10179 + }, + { + "epoch": 1.1693756820400896, + "grad_norm": 0.576124906539917, + "learning_rate": 0.0001, + "loss": 1.5313, + "step": 10180 + }, + { + "epoch": 1.1694905519499168, + "grad_norm": 0.5811876058578491, + "learning_rate": 0.0001, + "loss": 1.3186, + "step": 10181 + }, + { + "epoch": 1.1696054218597438, + "grad_norm": 0.5304403305053711, + "learning_rate": 0.0001, + "loss": 1.4356, + "step": 10182 + }, + { + "epoch": 1.169720291769571, + "grad_norm": 0.5441917777061462, + "learning_rate": 0.0001, + "loss": 1.4705, + "step": 10183 + }, + { + "epoch": 1.169835161679398, + "grad_norm": 0.5729200839996338, + "learning_rate": 0.0001, + "loss": 1.5841, + "step": 10184 + }, + { + "epoch": 1.1699500315892253, + "grad_norm": 0.5269560813903809, + "learning_rate": 0.0001, + "loss": 1.4486, + "step": 10185 + }, + { + "epoch": 1.1700649014990523, + "grad_norm": 0.6391506791114807, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 10186 + }, + { + "epoch": 1.1701797714088795, + "grad_norm": 0.5685237646102905, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 10187 + }, + { + "epoch": 1.1702946413187065, + "grad_norm": 0.6011033654212952, + "learning_rate": 0.0001, + "loss": 1.5688, + "step": 10188 + }, + { + "epoch": 1.1704095112285338, + "grad_norm": 0.5902635455131531, + "learning_rate": 0.0001, + "loss": 1.3985, + "step": 10189 + }, + { + "epoch": 1.1705243811383608, + "grad_norm": 0.5687843561172485, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 10190 + }, + { + "epoch": 1.170639251048188, + "grad_norm": 0.5689520239830017, + "learning_rate": 0.0001, + "loss": 1.6174, + "step": 10191 + }, + { + "epoch": 1.170754120958015, + "grad_norm": 0.5470170378684998, + "learning_rate": 0.0001, + "loss": 1.2694, + "step": 10192 + }, + { + "epoch": 1.1708689908678422, + "grad_norm": 0.5402204394340515, + "learning_rate": 0.0001, + "loss": 1.3891, + "step": 10193 + }, + { + "epoch": 1.1709838607776692, + "grad_norm": 0.5700439214706421, + "learning_rate": 0.0001, + "loss": 1.2554, + "step": 10194 + }, + { + "epoch": 1.1710987306874965, + "grad_norm": 0.5934173464775085, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 10195 + }, + { + "epoch": 1.1712136005973235, + "grad_norm": 0.5891930460929871, + "learning_rate": 0.0001, + "loss": 1.5339, + "step": 10196 + }, + { + "epoch": 1.1713284705071507, + "grad_norm": 0.5680025219917297, + "learning_rate": 0.0001, + "loss": 1.5624, + "step": 10197 + }, + { + "epoch": 1.1714433404169777, + "grad_norm": 0.557145893573761, + "learning_rate": 0.0001, + "loss": 1.363, + "step": 10198 + }, + { + "epoch": 1.171558210326805, + "grad_norm": 0.5671913623809814, + "learning_rate": 0.0001, + "loss": 1.1882, + "step": 10199 + }, + { + "epoch": 1.171673080236632, + "grad_norm": 0.5525150895118713, + "learning_rate": 0.0001, + "loss": 1.4802, + "step": 10200 + }, + { + "epoch": 1.1717879501464592, + "grad_norm": 0.5642703771591187, + "learning_rate": 0.0001, + "loss": 1.4937, + "step": 10201 + }, + { + "epoch": 1.1719028200562862, + "grad_norm": 0.5610083341598511, + "learning_rate": 0.0001, + "loss": 1.5124, + "step": 10202 + }, + { + "epoch": 1.1720176899661134, + "grad_norm": 0.5838199257850647, + "learning_rate": 0.0001, + "loss": 1.4971, + "step": 10203 + }, + { + "epoch": 1.1721325598759404, + "grad_norm": 0.6114639639854431, + "learning_rate": 0.0001, + "loss": 1.45, + "step": 10204 + }, + { + "epoch": 1.1722474297857677, + "grad_norm": 0.6125414371490479, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 10205 + }, + { + "epoch": 1.1723622996955947, + "grad_norm": 0.6367550492286682, + "learning_rate": 0.0001, + "loss": 1.7869, + "step": 10206 + }, + { + "epoch": 1.172477169605422, + "grad_norm": 0.5995863080024719, + "learning_rate": 0.0001, + "loss": 1.5015, + "step": 10207 + }, + { + "epoch": 1.172592039515249, + "grad_norm": 0.5278496742248535, + "learning_rate": 0.0001, + "loss": 1.4433, + "step": 10208 + }, + { + "epoch": 1.1727069094250762, + "grad_norm": 0.5485237240791321, + "learning_rate": 0.0001, + "loss": 1.5175, + "step": 10209 + }, + { + "epoch": 1.1728217793349032, + "grad_norm": 0.6074742674827576, + "learning_rate": 0.0001, + "loss": 1.3414, + "step": 10210 + }, + { + "epoch": 1.1729366492447304, + "grad_norm": 0.5964166522026062, + "learning_rate": 0.0001, + "loss": 1.3031, + "step": 10211 + }, + { + "epoch": 1.1730515191545574, + "grad_norm": 0.5800673365592957, + "learning_rate": 0.0001, + "loss": 1.4228, + "step": 10212 + }, + { + "epoch": 1.1731663890643846, + "grad_norm": 0.5409739017486572, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 10213 + }, + { + "epoch": 1.1732812589742116, + "grad_norm": 0.5506752729415894, + "learning_rate": 0.0001, + "loss": 1.5131, + "step": 10214 + }, + { + "epoch": 1.1733961288840389, + "grad_norm": 0.551644504070282, + "learning_rate": 0.0001, + "loss": 1.4612, + "step": 10215 + }, + { + "epoch": 1.173510998793866, + "grad_norm": 0.5593010783195496, + "learning_rate": 0.0001, + "loss": 1.4069, + "step": 10216 + }, + { + "epoch": 1.1736258687036931, + "grad_norm": 0.5570374131202698, + "learning_rate": 0.0001, + "loss": 1.1389, + "step": 10217 + }, + { + "epoch": 1.1737407386135201, + "grad_norm": 0.6743999719619751, + "learning_rate": 0.0001, + "loss": 1.5835, + "step": 10218 + }, + { + "epoch": 1.1738556085233474, + "grad_norm": 0.5389916300773621, + "learning_rate": 0.0001, + "loss": 1.3792, + "step": 10219 + }, + { + "epoch": 1.1739704784331744, + "grad_norm": 0.5596214532852173, + "learning_rate": 0.0001, + "loss": 1.3943, + "step": 10220 + }, + { + "epoch": 1.1740853483430016, + "grad_norm": 0.6074917912483215, + "learning_rate": 0.0001, + "loss": 1.2292, + "step": 10221 + }, + { + "epoch": 1.1742002182528286, + "grad_norm": 0.5609689950942993, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 10222 + }, + { + "epoch": 1.1743150881626558, + "grad_norm": 0.554728627204895, + "learning_rate": 0.0001, + "loss": 1.4622, + "step": 10223 + }, + { + "epoch": 1.1744299580724828, + "grad_norm": 0.548478901386261, + "learning_rate": 0.0001, + "loss": 1.4344, + "step": 10224 + }, + { + "epoch": 1.17454482798231, + "grad_norm": 0.5518718957901001, + "learning_rate": 0.0001, + "loss": 1.4001, + "step": 10225 + }, + { + "epoch": 1.174659697892137, + "grad_norm": 0.6477909088134766, + "learning_rate": 0.0001, + "loss": 1.5887, + "step": 10226 + }, + { + "epoch": 1.1747745678019643, + "grad_norm": 0.5640280842781067, + "learning_rate": 0.0001, + "loss": 1.4278, + "step": 10227 + }, + { + "epoch": 1.1748894377117913, + "grad_norm": 0.5902442932128906, + "learning_rate": 0.0001, + "loss": 1.3603, + "step": 10228 + }, + { + "epoch": 1.1750043076216186, + "grad_norm": 0.575197160243988, + "learning_rate": 0.0001, + "loss": 1.4094, + "step": 10229 + }, + { + "epoch": 1.1751191775314456, + "grad_norm": 0.5721563696861267, + "learning_rate": 0.0001, + "loss": 1.5245, + "step": 10230 + }, + { + "epoch": 1.1752340474412728, + "grad_norm": 0.7208542823791504, + "learning_rate": 0.0001, + "loss": 1.5508, + "step": 10231 + }, + { + "epoch": 1.1753489173510998, + "grad_norm": 0.5777501463890076, + "learning_rate": 0.0001, + "loss": 1.3529, + "step": 10232 + }, + { + "epoch": 1.175463787260927, + "grad_norm": 0.5787646174430847, + "learning_rate": 0.0001, + "loss": 1.4706, + "step": 10233 + }, + { + "epoch": 1.175578657170754, + "grad_norm": 0.5515586733818054, + "learning_rate": 0.0001, + "loss": 1.5613, + "step": 10234 + }, + { + "epoch": 1.1756935270805813, + "grad_norm": 0.5776956677436829, + "learning_rate": 0.0001, + "loss": 1.4891, + "step": 10235 + }, + { + "epoch": 1.1758083969904083, + "grad_norm": 0.6034742593765259, + "learning_rate": 0.0001, + "loss": 1.5673, + "step": 10236 + }, + { + "epoch": 1.1759232669002355, + "grad_norm": 0.5228462219238281, + "learning_rate": 0.0001, + "loss": 1.5063, + "step": 10237 + }, + { + "epoch": 1.1760381368100625, + "grad_norm": 0.5508900880813599, + "learning_rate": 0.0001, + "loss": 1.477, + "step": 10238 + }, + { + "epoch": 1.1761530067198898, + "grad_norm": 0.5309219360351562, + "learning_rate": 0.0001, + "loss": 1.446, + "step": 10239 + }, + { + "epoch": 1.1762678766297168, + "grad_norm": 0.5277041792869568, + "learning_rate": 0.0001, + "loss": 1.405, + "step": 10240 + }, + { + "epoch": 1.176382746539544, + "grad_norm": 0.5799806118011475, + "learning_rate": 0.0001, + "loss": 1.342, + "step": 10241 + }, + { + "epoch": 1.176497616449371, + "grad_norm": 0.5781192183494568, + "learning_rate": 0.0001, + "loss": 1.356, + "step": 10242 + }, + { + "epoch": 1.1766124863591982, + "grad_norm": 0.5948365330696106, + "learning_rate": 0.0001, + "loss": 1.5877, + "step": 10243 + }, + { + "epoch": 1.1767273562690252, + "grad_norm": 0.5791916251182556, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 10244 + }, + { + "epoch": 1.1768422261788525, + "grad_norm": 0.60085129737854, + "learning_rate": 0.0001, + "loss": 1.4501, + "step": 10245 + }, + { + "epoch": 1.1769570960886795, + "grad_norm": 0.5680752396583557, + "learning_rate": 0.0001, + "loss": 1.5569, + "step": 10246 + }, + { + "epoch": 1.1770719659985067, + "grad_norm": 0.5210364460945129, + "learning_rate": 0.0001, + "loss": 1.4383, + "step": 10247 + }, + { + "epoch": 1.1771868359083337, + "grad_norm": 0.5816622972488403, + "learning_rate": 0.0001, + "loss": 1.543, + "step": 10248 + }, + { + "epoch": 1.177301705818161, + "grad_norm": 0.5394155383110046, + "learning_rate": 0.0001, + "loss": 1.4946, + "step": 10249 + }, + { + "epoch": 1.177416575727988, + "grad_norm": 0.5853523015975952, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 10250 + }, + { + "epoch": 1.1775314456378152, + "grad_norm": 0.5694888234138489, + "learning_rate": 0.0001, + "loss": 1.5744, + "step": 10251 + }, + { + "epoch": 1.1776463155476422, + "grad_norm": 0.5495486855506897, + "learning_rate": 0.0001, + "loss": 1.5335, + "step": 10252 + }, + { + "epoch": 1.1777611854574694, + "grad_norm": 0.5514094233512878, + "learning_rate": 0.0001, + "loss": 1.1921, + "step": 10253 + }, + { + "epoch": 1.1778760553672964, + "grad_norm": 0.5321869850158691, + "learning_rate": 0.0001, + "loss": 1.4492, + "step": 10254 + }, + { + "epoch": 1.1779909252771237, + "grad_norm": 0.5383161902427673, + "learning_rate": 0.0001, + "loss": 1.4645, + "step": 10255 + }, + { + "epoch": 1.1781057951869507, + "grad_norm": 0.5531543493270874, + "learning_rate": 0.0001, + "loss": 1.494, + "step": 10256 + }, + { + "epoch": 1.178220665096778, + "grad_norm": 0.5813853144645691, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 10257 + }, + { + "epoch": 1.178335535006605, + "grad_norm": 0.6172130703926086, + "learning_rate": 0.0001, + "loss": 1.5318, + "step": 10258 + }, + { + "epoch": 1.1784504049164322, + "grad_norm": 0.6372018456459045, + "learning_rate": 0.0001, + "loss": 1.4326, + "step": 10259 + }, + { + "epoch": 1.1785652748262592, + "grad_norm": 0.5197803974151611, + "learning_rate": 0.0001, + "loss": 1.2676, + "step": 10260 + }, + { + "epoch": 1.1786801447360864, + "grad_norm": 0.549474835395813, + "learning_rate": 0.0001, + "loss": 1.6087, + "step": 10261 + }, + { + "epoch": 1.1787950146459134, + "grad_norm": 0.5710943937301636, + "learning_rate": 0.0001, + "loss": 1.6127, + "step": 10262 + }, + { + "epoch": 1.1789098845557406, + "grad_norm": 0.5572322010993958, + "learning_rate": 0.0001, + "loss": 1.4759, + "step": 10263 + }, + { + "epoch": 1.1790247544655676, + "grad_norm": 0.5510053038597107, + "learning_rate": 0.0001, + "loss": 1.2321, + "step": 10264 + }, + { + "epoch": 1.1791396243753949, + "grad_norm": 0.5195276141166687, + "learning_rate": 0.0001, + "loss": 1.2855, + "step": 10265 + }, + { + "epoch": 1.1792544942852219, + "grad_norm": 0.5562936663627625, + "learning_rate": 0.0001, + "loss": 1.3052, + "step": 10266 + }, + { + "epoch": 1.1793693641950491, + "grad_norm": 0.5841695666313171, + "learning_rate": 0.0001, + "loss": 1.6325, + "step": 10267 + }, + { + "epoch": 1.1794842341048764, + "grad_norm": 0.5963363647460938, + "learning_rate": 0.0001, + "loss": 1.4882, + "step": 10268 + }, + { + "epoch": 1.1795991040147034, + "grad_norm": 0.5893837809562683, + "learning_rate": 0.0001, + "loss": 1.5622, + "step": 10269 + }, + { + "epoch": 1.1797139739245304, + "grad_norm": 0.5855959057807922, + "learning_rate": 0.0001, + "loss": 1.4191, + "step": 10270 + }, + { + "epoch": 1.1798288438343576, + "grad_norm": 0.5364851355552673, + "learning_rate": 0.0001, + "loss": 1.3232, + "step": 10271 + }, + { + "epoch": 1.1799437137441848, + "grad_norm": 0.574670135974884, + "learning_rate": 0.0001, + "loss": 1.5866, + "step": 10272 + }, + { + "epoch": 1.1800585836540118, + "grad_norm": 0.5624828934669495, + "learning_rate": 0.0001, + "loss": 1.4357, + "step": 10273 + }, + { + "epoch": 1.1801734535638388, + "grad_norm": 0.585806131362915, + "learning_rate": 0.0001, + "loss": 1.4264, + "step": 10274 + }, + { + "epoch": 1.180288323473666, + "grad_norm": 0.6353849172592163, + "learning_rate": 0.0001, + "loss": 1.76, + "step": 10275 + }, + { + "epoch": 1.1804031933834933, + "grad_norm": 0.5217344760894775, + "learning_rate": 0.0001, + "loss": 1.4722, + "step": 10276 + }, + { + "epoch": 1.1805180632933203, + "grad_norm": 0.5620312690734863, + "learning_rate": 0.0001, + "loss": 1.3626, + "step": 10277 + }, + { + "epoch": 1.1806329332031473, + "grad_norm": 0.6162904500961304, + "learning_rate": 0.0001, + "loss": 1.2, + "step": 10278 + }, + { + "epoch": 1.1807478031129746, + "grad_norm": 0.5573971271514893, + "learning_rate": 0.0001, + "loss": 1.2377, + "step": 10279 + }, + { + "epoch": 1.1808626730228018, + "grad_norm": 0.5593479871749878, + "learning_rate": 0.0001, + "loss": 1.2591, + "step": 10280 + }, + { + "epoch": 1.1809775429326288, + "grad_norm": 0.5311997532844543, + "learning_rate": 0.0001, + "loss": 1.4939, + "step": 10281 + }, + { + "epoch": 1.1810924128424558, + "grad_norm": 0.5329908728599548, + "learning_rate": 0.0001, + "loss": 1.4148, + "step": 10282 + }, + { + "epoch": 1.181207282752283, + "grad_norm": 0.6186027526855469, + "learning_rate": 0.0001, + "loss": 1.7251, + "step": 10283 + }, + { + "epoch": 1.1813221526621103, + "grad_norm": 0.5550754070281982, + "learning_rate": 0.0001, + "loss": 1.3355, + "step": 10284 + }, + { + "epoch": 1.1814370225719373, + "grad_norm": 0.5469382405281067, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 10285 + }, + { + "epoch": 1.1815518924817643, + "grad_norm": 0.5690637826919556, + "learning_rate": 0.0001, + "loss": 1.4224, + "step": 10286 + }, + { + "epoch": 1.1816667623915915, + "grad_norm": 0.5610548853874207, + "learning_rate": 0.0001, + "loss": 1.4282, + "step": 10287 + }, + { + "epoch": 1.1817816323014187, + "grad_norm": 0.5904365181922913, + "learning_rate": 0.0001, + "loss": 1.5588, + "step": 10288 + }, + { + "epoch": 1.1818965022112458, + "grad_norm": 0.538159966468811, + "learning_rate": 0.0001, + "loss": 1.3842, + "step": 10289 + }, + { + "epoch": 1.1820113721210728, + "grad_norm": 0.5710332989692688, + "learning_rate": 0.0001, + "loss": 1.441, + "step": 10290 + }, + { + "epoch": 1.1821262420309, + "grad_norm": 0.5427202582359314, + "learning_rate": 0.0001, + "loss": 1.4344, + "step": 10291 + }, + { + "epoch": 1.1822411119407272, + "grad_norm": 0.5821972489356995, + "learning_rate": 0.0001, + "loss": 1.3847, + "step": 10292 + }, + { + "epoch": 1.1823559818505542, + "grad_norm": 0.5653535723686218, + "learning_rate": 0.0001, + "loss": 1.5676, + "step": 10293 + }, + { + "epoch": 1.1824708517603812, + "grad_norm": 0.5518774390220642, + "learning_rate": 0.0001, + "loss": 1.4604, + "step": 10294 + }, + { + "epoch": 1.1825857216702085, + "grad_norm": 0.506719172000885, + "learning_rate": 0.0001, + "loss": 1.4198, + "step": 10295 + }, + { + "epoch": 1.1827005915800357, + "grad_norm": 0.5615983009338379, + "learning_rate": 0.0001, + "loss": 1.5238, + "step": 10296 + }, + { + "epoch": 1.1828154614898627, + "grad_norm": 0.5364108681678772, + "learning_rate": 0.0001, + "loss": 1.3754, + "step": 10297 + }, + { + "epoch": 1.1829303313996897, + "grad_norm": 0.552790105342865, + "learning_rate": 0.0001, + "loss": 1.3189, + "step": 10298 + }, + { + "epoch": 1.183045201309517, + "grad_norm": 0.5753331780433655, + "learning_rate": 0.0001, + "loss": 1.6094, + "step": 10299 + }, + { + "epoch": 1.1831600712193442, + "grad_norm": 0.5673770308494568, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 10300 + }, + { + "epoch": 1.1832749411291712, + "grad_norm": 0.5506584644317627, + "learning_rate": 0.0001, + "loss": 1.4499, + "step": 10301 + }, + { + "epoch": 1.1833898110389984, + "grad_norm": 0.5558772087097168, + "learning_rate": 0.0001, + "loss": 1.5703, + "step": 10302 + }, + { + "epoch": 1.1835046809488254, + "grad_norm": 0.5837011337280273, + "learning_rate": 0.0001, + "loss": 1.6082, + "step": 10303 + }, + { + "epoch": 1.1836195508586527, + "grad_norm": 0.5474368333816528, + "learning_rate": 0.0001, + "loss": 1.4428, + "step": 10304 + }, + { + "epoch": 1.1837344207684797, + "grad_norm": 0.5366905331611633, + "learning_rate": 0.0001, + "loss": 1.4069, + "step": 10305 + }, + { + "epoch": 1.183849290678307, + "grad_norm": 0.5617532134056091, + "learning_rate": 0.0001, + "loss": 1.4165, + "step": 10306 + }, + { + "epoch": 1.183964160588134, + "grad_norm": 0.5564953684806824, + "learning_rate": 0.0001, + "loss": 1.5541, + "step": 10307 + }, + { + "epoch": 1.1840790304979611, + "grad_norm": 0.5196337699890137, + "learning_rate": 0.0001, + "loss": 1.3772, + "step": 10308 + }, + { + "epoch": 1.1841939004077882, + "grad_norm": 0.5515973567962646, + "learning_rate": 0.0001, + "loss": 1.5849, + "step": 10309 + }, + { + "epoch": 1.1843087703176154, + "grad_norm": 0.5052083730697632, + "learning_rate": 0.0001, + "loss": 1.4214, + "step": 10310 + }, + { + "epoch": 1.1844236402274424, + "grad_norm": 0.6033722162246704, + "learning_rate": 0.0001, + "loss": 1.7061, + "step": 10311 + }, + { + "epoch": 1.1845385101372696, + "grad_norm": 0.535667359828949, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 10312 + }, + { + "epoch": 1.1846533800470966, + "grad_norm": 0.5910223722457886, + "learning_rate": 0.0001, + "loss": 1.5376, + "step": 10313 + }, + { + "epoch": 1.1847682499569239, + "grad_norm": 0.542930543422699, + "learning_rate": 0.0001, + "loss": 1.4848, + "step": 10314 + }, + { + "epoch": 1.1848831198667509, + "grad_norm": 0.5212183594703674, + "learning_rate": 0.0001, + "loss": 1.5306, + "step": 10315 + }, + { + "epoch": 1.184997989776578, + "grad_norm": 0.5621635317802429, + "learning_rate": 0.0001, + "loss": 1.5478, + "step": 10316 + }, + { + "epoch": 1.1851128596864051, + "grad_norm": 0.6084299087524414, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 10317 + }, + { + "epoch": 1.1852277295962323, + "grad_norm": 0.5304902195930481, + "learning_rate": 0.0001, + "loss": 1.3968, + "step": 10318 + }, + { + "epoch": 1.1853425995060594, + "grad_norm": 0.5503674745559692, + "learning_rate": 0.0001, + "loss": 1.5019, + "step": 10319 + }, + { + "epoch": 1.1854574694158866, + "grad_norm": 0.5336641073226929, + "learning_rate": 0.0001, + "loss": 1.3079, + "step": 10320 + }, + { + "epoch": 1.1855723393257136, + "grad_norm": 0.6056388020515442, + "learning_rate": 0.0001, + "loss": 1.6839, + "step": 10321 + }, + { + "epoch": 1.1856872092355408, + "grad_norm": 0.5721645355224609, + "learning_rate": 0.0001, + "loss": 1.4073, + "step": 10322 + }, + { + "epoch": 1.1858020791453678, + "grad_norm": 0.6094290018081665, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 10323 + }, + { + "epoch": 1.185916949055195, + "grad_norm": 0.558312177658081, + "learning_rate": 0.0001, + "loss": 1.4004, + "step": 10324 + }, + { + "epoch": 1.186031818965022, + "grad_norm": 0.5515640377998352, + "learning_rate": 0.0001, + "loss": 1.4245, + "step": 10325 + }, + { + "epoch": 1.1861466888748493, + "grad_norm": 0.6152453422546387, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 10326 + }, + { + "epoch": 1.1862615587846763, + "grad_norm": 0.5758329629898071, + "learning_rate": 0.0001, + "loss": 1.3415, + "step": 10327 + }, + { + "epoch": 1.1863764286945035, + "grad_norm": 0.5209949612617493, + "learning_rate": 0.0001, + "loss": 1.3905, + "step": 10328 + }, + { + "epoch": 1.1864912986043306, + "grad_norm": 0.5001648664474487, + "learning_rate": 0.0001, + "loss": 1.31, + "step": 10329 + }, + { + "epoch": 1.1866061685141578, + "grad_norm": 0.5593850612640381, + "learning_rate": 0.0001, + "loss": 1.518, + "step": 10330 + }, + { + "epoch": 1.1867210384239848, + "grad_norm": 0.5500064492225647, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 10331 + }, + { + "epoch": 1.186835908333812, + "grad_norm": 0.5763779282569885, + "learning_rate": 0.0001, + "loss": 1.3441, + "step": 10332 + }, + { + "epoch": 1.186950778243639, + "grad_norm": 0.5549381375312805, + "learning_rate": 0.0001, + "loss": 1.5054, + "step": 10333 + }, + { + "epoch": 1.1870656481534663, + "grad_norm": 0.5587765574455261, + "learning_rate": 0.0001, + "loss": 1.3678, + "step": 10334 + }, + { + "epoch": 1.1871805180632933, + "grad_norm": 0.5837385654449463, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 10335 + }, + { + "epoch": 1.1872953879731205, + "grad_norm": 0.5909486413002014, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 10336 + }, + { + "epoch": 1.1874102578829475, + "grad_norm": 0.5553216338157654, + "learning_rate": 0.0001, + "loss": 1.4709, + "step": 10337 + }, + { + "epoch": 1.1875251277927747, + "grad_norm": 0.5260200500488281, + "learning_rate": 0.0001, + "loss": 1.3144, + "step": 10338 + }, + { + "epoch": 1.1876399977026018, + "grad_norm": 0.5829170346260071, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 10339 + }, + { + "epoch": 1.187754867612429, + "grad_norm": 0.5121257305145264, + "learning_rate": 0.0001, + "loss": 1.3636, + "step": 10340 + }, + { + "epoch": 1.187869737522256, + "grad_norm": 0.5356987714767456, + "learning_rate": 0.0001, + "loss": 1.2993, + "step": 10341 + }, + { + "epoch": 1.1879846074320832, + "grad_norm": 0.5900508761405945, + "learning_rate": 0.0001, + "loss": 1.5197, + "step": 10342 + }, + { + "epoch": 1.1880994773419102, + "grad_norm": 0.5535077452659607, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 10343 + }, + { + "epoch": 1.1882143472517375, + "grad_norm": 0.5894918441772461, + "learning_rate": 0.0001, + "loss": 1.6822, + "step": 10344 + }, + { + "epoch": 1.1883292171615645, + "grad_norm": 0.5290895104408264, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 10345 + }, + { + "epoch": 1.1884440870713917, + "grad_norm": 0.5344952344894409, + "learning_rate": 0.0001, + "loss": 1.3112, + "step": 10346 + }, + { + "epoch": 1.1885589569812187, + "grad_norm": 0.5518721342086792, + "learning_rate": 0.0001, + "loss": 1.386, + "step": 10347 + }, + { + "epoch": 1.188673826891046, + "grad_norm": 0.5459201335906982, + "learning_rate": 0.0001, + "loss": 1.3449, + "step": 10348 + }, + { + "epoch": 1.188788696800873, + "grad_norm": 0.5421302318572998, + "learning_rate": 0.0001, + "loss": 1.4268, + "step": 10349 + }, + { + "epoch": 1.1889035667107002, + "grad_norm": 0.5335027575492859, + "learning_rate": 0.0001, + "loss": 1.4623, + "step": 10350 + }, + { + "epoch": 1.1890184366205272, + "grad_norm": 0.5534794926643372, + "learning_rate": 0.0001, + "loss": 1.4581, + "step": 10351 + }, + { + "epoch": 1.1891333065303544, + "grad_norm": 0.5756953954696655, + "learning_rate": 0.0001, + "loss": 1.4053, + "step": 10352 + }, + { + "epoch": 1.1892481764401814, + "grad_norm": 0.6161737442016602, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 10353 + }, + { + "epoch": 1.1893630463500087, + "grad_norm": 0.6521564722061157, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 10354 + }, + { + "epoch": 1.1894779162598357, + "grad_norm": 0.5857300162315369, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 10355 + }, + { + "epoch": 1.189592786169663, + "grad_norm": 0.6228790283203125, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 10356 + }, + { + "epoch": 1.18970765607949, + "grad_norm": 0.5708868503570557, + "learning_rate": 0.0001, + "loss": 1.13, + "step": 10357 + }, + { + "epoch": 1.1898225259893171, + "grad_norm": 0.6147711873054504, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 10358 + }, + { + "epoch": 1.1899373958991442, + "grad_norm": 0.5267451405525208, + "learning_rate": 0.0001, + "loss": 1.4593, + "step": 10359 + }, + { + "epoch": 1.1900522658089714, + "grad_norm": 0.5641406178474426, + "learning_rate": 0.0001, + "loss": 1.6509, + "step": 10360 + }, + { + "epoch": 1.1901671357187984, + "grad_norm": 0.6026042699813843, + "learning_rate": 0.0001, + "loss": 1.5547, + "step": 10361 + }, + { + "epoch": 1.1902820056286256, + "grad_norm": 0.5601838231086731, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 10362 + }, + { + "epoch": 1.1903968755384526, + "grad_norm": 0.505669355392456, + "learning_rate": 0.0001, + "loss": 1.3774, + "step": 10363 + }, + { + "epoch": 1.1905117454482799, + "grad_norm": 0.5833644270896912, + "learning_rate": 0.0001, + "loss": 1.5833, + "step": 10364 + }, + { + "epoch": 1.1906266153581069, + "grad_norm": 0.5416715145111084, + "learning_rate": 0.0001, + "loss": 1.3589, + "step": 10365 + }, + { + "epoch": 1.190741485267934, + "grad_norm": 0.5769158601760864, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 10366 + }, + { + "epoch": 1.1908563551777611, + "grad_norm": 0.5252044200897217, + "learning_rate": 0.0001, + "loss": 1.2886, + "step": 10367 + }, + { + "epoch": 1.1909712250875883, + "grad_norm": 0.5603752136230469, + "learning_rate": 0.0001, + "loss": 1.4126, + "step": 10368 + }, + { + "epoch": 1.1910860949974154, + "grad_norm": 0.5847504138946533, + "learning_rate": 0.0001, + "loss": 1.4077, + "step": 10369 + }, + { + "epoch": 1.1912009649072426, + "grad_norm": 0.5426949262619019, + "learning_rate": 0.0001, + "loss": 1.3664, + "step": 10370 + }, + { + "epoch": 1.1913158348170696, + "grad_norm": 0.5475144386291504, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 10371 + }, + { + "epoch": 1.1914307047268968, + "grad_norm": 0.6072627305984497, + "learning_rate": 0.0001, + "loss": 1.6914, + "step": 10372 + }, + { + "epoch": 1.1915455746367238, + "grad_norm": 0.5702595710754395, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 10373 + }, + { + "epoch": 1.191660444546551, + "grad_norm": 0.5092419385910034, + "learning_rate": 0.0001, + "loss": 1.454, + "step": 10374 + }, + { + "epoch": 1.191775314456378, + "grad_norm": 0.5371357202529907, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 10375 + }, + { + "epoch": 1.1918901843662053, + "grad_norm": 0.6508344411849976, + "learning_rate": 0.0001, + "loss": 1.4152, + "step": 10376 + }, + { + "epoch": 1.1920050542760323, + "grad_norm": 0.5292761921882629, + "learning_rate": 0.0001, + "loss": 1.2684, + "step": 10377 + }, + { + "epoch": 1.1921199241858595, + "grad_norm": 0.5798367857933044, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 10378 + }, + { + "epoch": 1.1922347940956866, + "grad_norm": 0.5566953420639038, + "learning_rate": 0.0001, + "loss": 1.5344, + "step": 10379 + }, + { + "epoch": 1.1923496640055138, + "grad_norm": 0.6179848909378052, + "learning_rate": 0.0001, + "loss": 1.6928, + "step": 10380 + }, + { + "epoch": 1.1924645339153408, + "grad_norm": 0.5988585948944092, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 10381 + }, + { + "epoch": 1.192579403825168, + "grad_norm": 0.6032318472862244, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 10382 + }, + { + "epoch": 1.192694273734995, + "grad_norm": 0.5561224818229675, + "learning_rate": 0.0001, + "loss": 1.6675, + "step": 10383 + }, + { + "epoch": 1.1928091436448223, + "grad_norm": 0.5613009333610535, + "learning_rate": 0.0001, + "loss": 1.5307, + "step": 10384 + }, + { + "epoch": 1.1929240135546493, + "grad_norm": 0.5561355352401733, + "learning_rate": 0.0001, + "loss": 1.5057, + "step": 10385 + }, + { + "epoch": 1.1930388834644765, + "grad_norm": 0.5625793933868408, + "learning_rate": 0.0001, + "loss": 1.397, + "step": 10386 + }, + { + "epoch": 1.1931537533743035, + "grad_norm": 0.5584040284156799, + "learning_rate": 0.0001, + "loss": 1.529, + "step": 10387 + }, + { + "epoch": 1.1932686232841307, + "grad_norm": 0.5510308146476746, + "learning_rate": 0.0001, + "loss": 1.3537, + "step": 10388 + }, + { + "epoch": 1.1933834931939578, + "grad_norm": 0.5637242794036865, + "learning_rate": 0.0001, + "loss": 1.4676, + "step": 10389 + }, + { + "epoch": 1.193498363103785, + "grad_norm": 0.604081392288208, + "learning_rate": 0.0001, + "loss": 1.4873, + "step": 10390 + }, + { + "epoch": 1.193613233013612, + "grad_norm": 0.5362832546234131, + "learning_rate": 0.0001, + "loss": 1.2121, + "step": 10391 + }, + { + "epoch": 1.1937281029234392, + "grad_norm": 0.6020187735557556, + "learning_rate": 0.0001, + "loss": 1.5522, + "step": 10392 + }, + { + "epoch": 1.1938429728332662, + "grad_norm": 0.5550323724746704, + "learning_rate": 0.0001, + "loss": 1.3623, + "step": 10393 + }, + { + "epoch": 1.1939578427430935, + "grad_norm": 0.6165339350700378, + "learning_rate": 0.0001, + "loss": 1.6928, + "step": 10394 + }, + { + "epoch": 1.1940727126529205, + "grad_norm": 0.5294722318649292, + "learning_rate": 0.0001, + "loss": 1.3107, + "step": 10395 + }, + { + "epoch": 1.1941875825627477, + "grad_norm": 0.5582475662231445, + "learning_rate": 0.0001, + "loss": 1.5955, + "step": 10396 + }, + { + "epoch": 1.1943024524725747, + "grad_norm": 0.6636613607406616, + "learning_rate": 0.0001, + "loss": 1.2665, + "step": 10397 + }, + { + "epoch": 1.194417322382402, + "grad_norm": 0.5724130868911743, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 10398 + }, + { + "epoch": 1.194532192292229, + "grad_norm": 0.5657904148101807, + "learning_rate": 0.0001, + "loss": 1.5283, + "step": 10399 + }, + { + "epoch": 1.1946470622020562, + "grad_norm": 0.5498049259185791, + "learning_rate": 0.0001, + "loss": 1.4346, + "step": 10400 + }, + { + "epoch": 1.1947619321118832, + "grad_norm": 0.5673407912254333, + "learning_rate": 0.0001, + "loss": 1.3824, + "step": 10401 + }, + { + "epoch": 1.1948768020217104, + "grad_norm": 0.609406054019928, + "learning_rate": 0.0001, + "loss": 1.6127, + "step": 10402 + }, + { + "epoch": 1.1949916719315374, + "grad_norm": 0.5271400809288025, + "learning_rate": 0.0001, + "loss": 1.3876, + "step": 10403 + }, + { + "epoch": 1.1951065418413647, + "grad_norm": 0.5551247596740723, + "learning_rate": 0.0001, + "loss": 1.4809, + "step": 10404 + }, + { + "epoch": 1.195221411751192, + "grad_norm": 0.6490895748138428, + "learning_rate": 0.0001, + "loss": 1.5965, + "step": 10405 + }, + { + "epoch": 1.195336281661019, + "grad_norm": 0.5571427941322327, + "learning_rate": 0.0001, + "loss": 1.4025, + "step": 10406 + }, + { + "epoch": 1.195451151570846, + "grad_norm": 0.5582363605499268, + "learning_rate": 0.0001, + "loss": 1.4128, + "step": 10407 + }, + { + "epoch": 1.1955660214806731, + "grad_norm": 0.5777066946029663, + "learning_rate": 0.0001, + "loss": 1.4456, + "step": 10408 + }, + { + "epoch": 1.1956808913905004, + "grad_norm": 0.5664290189743042, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 10409 + }, + { + "epoch": 1.1957957613003274, + "grad_norm": 0.6081710457801819, + "learning_rate": 0.0001, + "loss": 1.5054, + "step": 10410 + }, + { + "epoch": 1.1959106312101544, + "grad_norm": 0.5311821103096008, + "learning_rate": 0.0001, + "loss": 1.1295, + "step": 10411 + }, + { + "epoch": 1.1960255011199816, + "grad_norm": 0.5118587017059326, + "learning_rate": 0.0001, + "loss": 1.2627, + "step": 10412 + }, + { + "epoch": 1.1961403710298089, + "grad_norm": 0.5950908660888672, + "learning_rate": 0.0001, + "loss": 1.5089, + "step": 10413 + }, + { + "epoch": 1.1962552409396359, + "grad_norm": 0.5565327405929565, + "learning_rate": 0.0001, + "loss": 1.4616, + "step": 10414 + }, + { + "epoch": 1.1963701108494629, + "grad_norm": 0.5890604853630066, + "learning_rate": 0.0001, + "loss": 1.4465, + "step": 10415 + }, + { + "epoch": 1.19648498075929, + "grad_norm": 0.5750487446784973, + "learning_rate": 0.0001, + "loss": 1.5495, + "step": 10416 + }, + { + "epoch": 1.1965998506691173, + "grad_norm": 0.5400042533874512, + "learning_rate": 0.0001, + "loss": 1.2722, + "step": 10417 + }, + { + "epoch": 1.1967147205789443, + "grad_norm": 0.5932859778404236, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 10418 + }, + { + "epoch": 1.1968295904887714, + "grad_norm": 0.5861518383026123, + "learning_rate": 0.0001, + "loss": 1.4621, + "step": 10419 + }, + { + "epoch": 1.1969444603985986, + "grad_norm": 0.5601950883865356, + "learning_rate": 0.0001, + "loss": 1.4763, + "step": 10420 + }, + { + "epoch": 1.1970593303084258, + "grad_norm": 0.5883669257164001, + "learning_rate": 0.0001, + "loss": 1.6461, + "step": 10421 + }, + { + "epoch": 1.1971742002182528, + "grad_norm": 0.5568323135375977, + "learning_rate": 0.0001, + "loss": 1.6238, + "step": 10422 + }, + { + "epoch": 1.1972890701280798, + "grad_norm": 0.5244744420051575, + "learning_rate": 0.0001, + "loss": 1.3016, + "step": 10423 + }, + { + "epoch": 1.197403940037907, + "grad_norm": 0.5605520606040955, + "learning_rate": 0.0001, + "loss": 1.4803, + "step": 10424 + }, + { + "epoch": 1.1975188099477343, + "grad_norm": 0.5980091691017151, + "learning_rate": 0.0001, + "loss": 1.5665, + "step": 10425 + }, + { + "epoch": 1.1976336798575613, + "grad_norm": 0.607279896736145, + "learning_rate": 0.0001, + "loss": 1.5404, + "step": 10426 + }, + { + "epoch": 1.1977485497673883, + "grad_norm": 0.5929856896400452, + "learning_rate": 0.0001, + "loss": 1.5402, + "step": 10427 + }, + { + "epoch": 1.1978634196772155, + "grad_norm": 0.5797104835510254, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 10428 + }, + { + "epoch": 1.1979782895870428, + "grad_norm": 0.5740180611610413, + "learning_rate": 0.0001, + "loss": 1.5289, + "step": 10429 + }, + { + "epoch": 1.1980931594968698, + "grad_norm": 0.5844727158546448, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 10430 + }, + { + "epoch": 1.1982080294066968, + "grad_norm": 0.542769193649292, + "learning_rate": 0.0001, + "loss": 1.5674, + "step": 10431 + }, + { + "epoch": 1.198322899316524, + "grad_norm": 0.5846665501594543, + "learning_rate": 0.0001, + "loss": 1.4399, + "step": 10432 + }, + { + "epoch": 1.1984377692263513, + "grad_norm": 0.5460920333862305, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 10433 + }, + { + "epoch": 1.1985526391361783, + "grad_norm": 0.582313597202301, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 10434 + }, + { + "epoch": 1.1986675090460053, + "grad_norm": 0.6199240684509277, + "learning_rate": 0.0001, + "loss": 1.4142, + "step": 10435 + }, + { + "epoch": 1.1987823789558325, + "grad_norm": 0.6095178127288818, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 10436 + }, + { + "epoch": 1.1988972488656597, + "grad_norm": 0.5635156631469727, + "learning_rate": 0.0001, + "loss": 1.2746, + "step": 10437 + }, + { + "epoch": 1.1990121187754867, + "grad_norm": 0.5342568755149841, + "learning_rate": 0.0001, + "loss": 1.3511, + "step": 10438 + }, + { + "epoch": 1.199126988685314, + "grad_norm": 0.5932930111885071, + "learning_rate": 0.0001, + "loss": 1.3836, + "step": 10439 + }, + { + "epoch": 1.199241858595141, + "grad_norm": 0.5218791365623474, + "learning_rate": 0.0001, + "loss": 1.169, + "step": 10440 + }, + { + "epoch": 1.1993567285049682, + "grad_norm": 0.5568528771400452, + "learning_rate": 0.0001, + "loss": 1.5939, + "step": 10441 + }, + { + "epoch": 1.1994715984147952, + "grad_norm": 0.512910008430481, + "learning_rate": 0.0001, + "loss": 1.34, + "step": 10442 + }, + { + "epoch": 1.1995864683246225, + "grad_norm": 0.5277653336524963, + "learning_rate": 0.0001, + "loss": 1.4526, + "step": 10443 + }, + { + "epoch": 1.1997013382344495, + "grad_norm": 0.5597155094146729, + "learning_rate": 0.0001, + "loss": 1.3142, + "step": 10444 + }, + { + "epoch": 1.1998162081442767, + "grad_norm": 0.5273895859718323, + "learning_rate": 0.0001, + "loss": 1.5123, + "step": 10445 + }, + { + "epoch": 1.1999310780541037, + "grad_norm": 0.5639094114303589, + "learning_rate": 0.0001, + "loss": 1.2597, + "step": 10446 + }, + { + "epoch": 1.200045947963931, + "grad_norm": 0.6367859244346619, + "learning_rate": 0.0001, + "loss": 1.6992, + "step": 10447 + }, + { + "epoch": 1.200160817873758, + "grad_norm": 0.6159301400184631, + "learning_rate": 0.0001, + "loss": 1.475, + "step": 10448 + }, + { + "epoch": 1.2002756877835852, + "grad_norm": 0.5858900547027588, + "learning_rate": 0.0001, + "loss": 1.4077, + "step": 10449 + }, + { + "epoch": 1.2003905576934122, + "grad_norm": 0.5637074112892151, + "learning_rate": 0.0001, + "loss": 1.4567, + "step": 10450 + }, + { + "epoch": 1.2005054276032394, + "grad_norm": 0.5393563508987427, + "learning_rate": 0.0001, + "loss": 1.3936, + "step": 10451 + }, + { + "epoch": 1.2006202975130664, + "grad_norm": 0.530194103717804, + "learning_rate": 0.0001, + "loss": 1.4639, + "step": 10452 + }, + { + "epoch": 1.2007351674228937, + "grad_norm": 0.5299816727638245, + "learning_rate": 0.0001, + "loss": 1.4348, + "step": 10453 + }, + { + "epoch": 1.2008500373327207, + "grad_norm": 0.5166096687316895, + "learning_rate": 0.0001, + "loss": 1.331, + "step": 10454 + }, + { + "epoch": 1.200964907242548, + "grad_norm": 0.5923041105270386, + "learning_rate": 0.0001, + "loss": 1.4583, + "step": 10455 + }, + { + "epoch": 1.201079777152375, + "grad_norm": 0.5478973388671875, + "learning_rate": 0.0001, + "loss": 1.3418, + "step": 10456 + }, + { + "epoch": 1.2011946470622021, + "grad_norm": 0.5634967684745789, + "learning_rate": 0.0001, + "loss": 1.3768, + "step": 10457 + }, + { + "epoch": 1.2013095169720291, + "grad_norm": 0.5654754042625427, + "learning_rate": 0.0001, + "loss": 1.6154, + "step": 10458 + }, + { + "epoch": 1.2014243868818564, + "grad_norm": 0.6268435716629028, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 10459 + }, + { + "epoch": 1.2015392567916834, + "grad_norm": 0.5597686171531677, + "learning_rate": 0.0001, + "loss": 1.2409, + "step": 10460 + }, + { + "epoch": 1.2016541267015106, + "grad_norm": 0.55586177110672, + "learning_rate": 0.0001, + "loss": 1.4184, + "step": 10461 + }, + { + "epoch": 1.2017689966113376, + "grad_norm": 0.6155359745025635, + "learning_rate": 0.0001, + "loss": 1.5127, + "step": 10462 + }, + { + "epoch": 1.2018838665211649, + "grad_norm": 0.6004906296730042, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 10463 + }, + { + "epoch": 1.2019987364309919, + "grad_norm": 0.5957759022712708, + "learning_rate": 0.0001, + "loss": 1.6768, + "step": 10464 + }, + { + "epoch": 1.202113606340819, + "grad_norm": 0.5964832901954651, + "learning_rate": 0.0001, + "loss": 1.5759, + "step": 10465 + }, + { + "epoch": 1.202228476250646, + "grad_norm": 0.6004605889320374, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 10466 + }, + { + "epoch": 1.2023433461604733, + "grad_norm": 0.5793516635894775, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 10467 + }, + { + "epoch": 1.2024582160703003, + "grad_norm": 0.565511167049408, + "learning_rate": 0.0001, + "loss": 1.3453, + "step": 10468 + }, + { + "epoch": 1.2025730859801276, + "grad_norm": 0.6401596069335938, + "learning_rate": 0.0001, + "loss": 1.5273, + "step": 10469 + }, + { + "epoch": 1.2026879558899546, + "grad_norm": 0.6024700999259949, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 10470 + }, + { + "epoch": 1.2028028257997818, + "grad_norm": 0.5722721219062805, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 10471 + }, + { + "epoch": 1.2029176957096088, + "grad_norm": 0.5830640196800232, + "learning_rate": 0.0001, + "loss": 1.4463, + "step": 10472 + }, + { + "epoch": 1.203032565619436, + "grad_norm": 0.5298201441764832, + "learning_rate": 0.0001, + "loss": 1.4246, + "step": 10473 + }, + { + "epoch": 1.203147435529263, + "grad_norm": 0.5793371796607971, + "learning_rate": 0.0001, + "loss": 1.4691, + "step": 10474 + }, + { + "epoch": 1.2032623054390903, + "grad_norm": 0.5723811984062195, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 10475 + }, + { + "epoch": 1.2033771753489173, + "grad_norm": 0.5795249342918396, + "learning_rate": 0.0001, + "loss": 1.398, + "step": 10476 + }, + { + "epoch": 1.2034920452587445, + "grad_norm": 0.5610243678092957, + "learning_rate": 0.0001, + "loss": 1.6885, + "step": 10477 + }, + { + "epoch": 1.2036069151685715, + "grad_norm": 0.6054256558418274, + "learning_rate": 0.0001, + "loss": 1.6161, + "step": 10478 + }, + { + "epoch": 1.2037217850783988, + "grad_norm": 0.5674859881401062, + "learning_rate": 0.0001, + "loss": 1.4247, + "step": 10479 + }, + { + "epoch": 1.2038366549882258, + "grad_norm": 0.5590078830718994, + "learning_rate": 0.0001, + "loss": 1.4417, + "step": 10480 + }, + { + "epoch": 1.203951524898053, + "grad_norm": 0.5892341732978821, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 10481 + }, + { + "epoch": 1.20406639480788, + "grad_norm": 0.5569952130317688, + "learning_rate": 0.0001, + "loss": 1.324, + "step": 10482 + }, + { + "epoch": 1.2041812647177073, + "grad_norm": 0.5327668190002441, + "learning_rate": 0.0001, + "loss": 1.369, + "step": 10483 + }, + { + "epoch": 1.2042961346275343, + "grad_norm": 0.5070328116416931, + "learning_rate": 0.0001, + "loss": 1.2728, + "step": 10484 + }, + { + "epoch": 1.2044110045373615, + "grad_norm": 0.5609564781188965, + "learning_rate": 0.0001, + "loss": 1.4026, + "step": 10485 + }, + { + "epoch": 1.2045258744471885, + "grad_norm": 0.5414237380027771, + "learning_rate": 0.0001, + "loss": 1.3149, + "step": 10486 + }, + { + "epoch": 1.2046407443570157, + "grad_norm": 0.5648483633995056, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 10487 + }, + { + "epoch": 1.2047556142668427, + "grad_norm": 0.5687791705131531, + "learning_rate": 0.0001, + "loss": 1.5176, + "step": 10488 + }, + { + "epoch": 1.20487048417667, + "grad_norm": 0.5945368409156799, + "learning_rate": 0.0001, + "loss": 1.4052, + "step": 10489 + }, + { + "epoch": 1.204985354086497, + "grad_norm": 0.5893955826759338, + "learning_rate": 0.0001, + "loss": 1.6138, + "step": 10490 + }, + { + "epoch": 1.2051002239963242, + "grad_norm": 0.5878959894180298, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 10491 + }, + { + "epoch": 1.2052150939061512, + "grad_norm": 0.5943409204483032, + "learning_rate": 0.0001, + "loss": 1.3367, + "step": 10492 + }, + { + "epoch": 1.2053299638159785, + "grad_norm": 0.5430295467376709, + "learning_rate": 0.0001, + "loss": 1.46, + "step": 10493 + }, + { + "epoch": 1.2054448337258055, + "grad_norm": 0.5535471439361572, + "learning_rate": 0.0001, + "loss": 1.3129, + "step": 10494 + }, + { + "epoch": 1.2055597036356327, + "grad_norm": 0.540138840675354, + "learning_rate": 0.0001, + "loss": 1.3614, + "step": 10495 + }, + { + "epoch": 1.2056745735454597, + "grad_norm": 0.6055250763893127, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 10496 + }, + { + "epoch": 1.205789443455287, + "grad_norm": 0.6143799424171448, + "learning_rate": 0.0001, + "loss": 1.5218, + "step": 10497 + }, + { + "epoch": 1.205904313365114, + "grad_norm": 0.5408201813697815, + "learning_rate": 0.0001, + "loss": 1.4474, + "step": 10498 + }, + { + "epoch": 1.2060191832749412, + "grad_norm": 0.5724786520004272, + "learning_rate": 0.0001, + "loss": 1.5761, + "step": 10499 + }, + { + "epoch": 1.2061340531847682, + "grad_norm": 0.523482620716095, + "learning_rate": 0.0001, + "loss": 1.155, + "step": 10500 + }, + { + "epoch": 1.2062489230945954, + "grad_norm": 0.5779107809066772, + "learning_rate": 0.0001, + "loss": 1.5714, + "step": 10501 + }, + { + "epoch": 1.2063637930044224, + "grad_norm": 0.5509933233261108, + "learning_rate": 0.0001, + "loss": 1.4959, + "step": 10502 + }, + { + "epoch": 1.2064786629142497, + "grad_norm": 0.55925053358078, + "learning_rate": 0.0001, + "loss": 1.2454, + "step": 10503 + }, + { + "epoch": 1.2065935328240767, + "grad_norm": 0.5909457802772522, + "learning_rate": 0.0001, + "loss": 1.4099, + "step": 10504 + }, + { + "epoch": 1.206708402733904, + "grad_norm": 0.6092385649681091, + "learning_rate": 0.0001, + "loss": 1.5369, + "step": 10505 + }, + { + "epoch": 1.206823272643731, + "grad_norm": 0.5715480446815491, + "learning_rate": 0.0001, + "loss": 1.4217, + "step": 10506 + }, + { + "epoch": 1.2069381425535581, + "grad_norm": 0.5696176290512085, + "learning_rate": 0.0001, + "loss": 1.4611, + "step": 10507 + }, + { + "epoch": 1.2070530124633851, + "grad_norm": 0.5600457787513733, + "learning_rate": 0.0001, + "loss": 1.3629, + "step": 10508 + }, + { + "epoch": 1.2071678823732124, + "grad_norm": 0.5562861561775208, + "learning_rate": 0.0001, + "loss": 1.6141, + "step": 10509 + }, + { + "epoch": 1.2072827522830394, + "grad_norm": 0.5927658081054688, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 10510 + }, + { + "epoch": 1.2073976221928666, + "grad_norm": 0.5737072825431824, + "learning_rate": 0.0001, + "loss": 1.5495, + "step": 10511 + }, + { + "epoch": 1.2075124921026936, + "grad_norm": 0.5820686221122742, + "learning_rate": 0.0001, + "loss": 1.4886, + "step": 10512 + }, + { + "epoch": 1.2076273620125209, + "grad_norm": 0.5760904550552368, + "learning_rate": 0.0001, + "loss": 1.3553, + "step": 10513 + }, + { + "epoch": 1.2077422319223479, + "grad_norm": 0.5971365571022034, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 10514 + }, + { + "epoch": 1.207857101832175, + "grad_norm": 0.5970271825790405, + "learning_rate": 0.0001, + "loss": 1.581, + "step": 10515 + }, + { + "epoch": 1.207971971742002, + "grad_norm": 0.5633013248443604, + "learning_rate": 0.0001, + "loss": 1.3834, + "step": 10516 + }, + { + "epoch": 1.2080868416518293, + "grad_norm": 0.5619766116142273, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 10517 + }, + { + "epoch": 1.2082017115616563, + "grad_norm": 0.5610624551773071, + "learning_rate": 0.0001, + "loss": 1.456, + "step": 10518 + }, + { + "epoch": 1.2083165814714836, + "grad_norm": 0.5785056948661804, + "learning_rate": 0.0001, + "loss": 1.5337, + "step": 10519 + }, + { + "epoch": 1.2084314513813106, + "grad_norm": 0.6023985147476196, + "learning_rate": 0.0001, + "loss": 1.4268, + "step": 10520 + }, + { + "epoch": 1.2085463212911378, + "grad_norm": 0.6198663115501404, + "learning_rate": 0.0001, + "loss": 1.4412, + "step": 10521 + }, + { + "epoch": 1.2086611912009648, + "grad_norm": 0.5379393100738525, + "learning_rate": 0.0001, + "loss": 1.5346, + "step": 10522 + }, + { + "epoch": 1.208776061110792, + "grad_norm": 0.5652551651000977, + "learning_rate": 0.0001, + "loss": 1.3385, + "step": 10523 + }, + { + "epoch": 1.208890931020619, + "grad_norm": 0.5869433879852295, + "learning_rate": 0.0001, + "loss": 1.4932, + "step": 10524 + }, + { + "epoch": 1.2090058009304463, + "grad_norm": 0.5975571870803833, + "learning_rate": 0.0001, + "loss": 1.4304, + "step": 10525 + }, + { + "epoch": 1.2091206708402733, + "grad_norm": 0.5863459706306458, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 10526 + }, + { + "epoch": 1.2092355407501005, + "grad_norm": 0.5686241984367371, + "learning_rate": 0.0001, + "loss": 1.4045, + "step": 10527 + }, + { + "epoch": 1.2093504106599275, + "grad_norm": 0.6071587204933167, + "learning_rate": 0.0001, + "loss": 1.438, + "step": 10528 + }, + { + "epoch": 1.2094652805697548, + "grad_norm": 0.5743688344955444, + "learning_rate": 0.0001, + "loss": 1.388, + "step": 10529 + }, + { + "epoch": 1.2095801504795818, + "grad_norm": 0.5639120936393738, + "learning_rate": 0.0001, + "loss": 1.538, + "step": 10530 + }, + { + "epoch": 1.209695020389409, + "grad_norm": 0.5790785551071167, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 10531 + }, + { + "epoch": 1.209809890299236, + "grad_norm": 0.5463698506355286, + "learning_rate": 0.0001, + "loss": 1.3599, + "step": 10532 + }, + { + "epoch": 1.2099247602090633, + "grad_norm": 0.5384880304336548, + "learning_rate": 0.0001, + "loss": 1.5287, + "step": 10533 + }, + { + "epoch": 1.2100396301188903, + "grad_norm": 0.5078497529029846, + "learning_rate": 0.0001, + "loss": 1.4054, + "step": 10534 + }, + { + "epoch": 1.2101545000287175, + "grad_norm": 0.5871212482452393, + "learning_rate": 0.0001, + "loss": 1.5067, + "step": 10535 + }, + { + "epoch": 1.2102693699385445, + "grad_norm": 0.5731205344200134, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 10536 + }, + { + "epoch": 1.2103842398483717, + "grad_norm": 0.5511032938957214, + "learning_rate": 0.0001, + "loss": 1.5654, + "step": 10537 + }, + { + "epoch": 1.2104991097581987, + "grad_norm": 0.6102611422538757, + "learning_rate": 0.0001, + "loss": 1.5906, + "step": 10538 + }, + { + "epoch": 1.210613979668026, + "grad_norm": 0.5358269214630127, + "learning_rate": 0.0001, + "loss": 1.2765, + "step": 10539 + }, + { + "epoch": 1.210728849577853, + "grad_norm": 0.5827724933624268, + "learning_rate": 0.0001, + "loss": 1.4465, + "step": 10540 + }, + { + "epoch": 1.2108437194876802, + "grad_norm": 0.6176273822784424, + "learning_rate": 0.0001, + "loss": 1.4566, + "step": 10541 + }, + { + "epoch": 1.2109585893975074, + "grad_norm": 0.6200592517852783, + "learning_rate": 0.0001, + "loss": 1.7385, + "step": 10542 + }, + { + "epoch": 1.2110734593073345, + "grad_norm": 0.5881513357162476, + "learning_rate": 0.0001, + "loss": 1.4508, + "step": 10543 + }, + { + "epoch": 1.2111883292171615, + "grad_norm": 0.5368597507476807, + "learning_rate": 0.0001, + "loss": 1.2807, + "step": 10544 + }, + { + "epoch": 1.2113031991269887, + "grad_norm": 0.5737367868423462, + "learning_rate": 0.0001, + "loss": 1.3851, + "step": 10545 + }, + { + "epoch": 1.211418069036816, + "grad_norm": 0.5376740097999573, + "learning_rate": 0.0001, + "loss": 1.5095, + "step": 10546 + }, + { + "epoch": 1.211532938946643, + "grad_norm": 0.5448538064956665, + "learning_rate": 0.0001, + "loss": 1.4943, + "step": 10547 + }, + { + "epoch": 1.21164780885647, + "grad_norm": 0.5198777914047241, + "learning_rate": 0.0001, + "loss": 1.4856, + "step": 10548 + }, + { + "epoch": 1.2117626787662972, + "grad_norm": 0.5620352625846863, + "learning_rate": 0.0001, + "loss": 1.4172, + "step": 10549 + }, + { + "epoch": 1.2118775486761244, + "grad_norm": 0.6086106896400452, + "learning_rate": 0.0001, + "loss": 1.3698, + "step": 10550 + }, + { + "epoch": 1.2119924185859514, + "grad_norm": 0.5318285822868347, + "learning_rate": 0.0001, + "loss": 1.1826, + "step": 10551 + }, + { + "epoch": 1.2121072884957784, + "grad_norm": 0.6011495590209961, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 10552 + }, + { + "epoch": 1.2122221584056057, + "grad_norm": 0.536434531211853, + "learning_rate": 0.0001, + "loss": 1.3322, + "step": 10553 + }, + { + "epoch": 1.2123370283154329, + "grad_norm": 0.57552170753479, + "learning_rate": 0.0001, + "loss": 1.7569, + "step": 10554 + }, + { + "epoch": 1.21245189822526, + "grad_norm": 0.5894425511360168, + "learning_rate": 0.0001, + "loss": 1.4857, + "step": 10555 + }, + { + "epoch": 1.212566768135087, + "grad_norm": 0.6130958795547485, + "learning_rate": 0.0001, + "loss": 1.7492, + "step": 10556 + }, + { + "epoch": 1.2126816380449141, + "grad_norm": 0.6135485172271729, + "learning_rate": 0.0001, + "loss": 1.5469, + "step": 10557 + }, + { + "epoch": 1.2127965079547414, + "grad_norm": 0.5616319179534912, + "learning_rate": 0.0001, + "loss": 1.4481, + "step": 10558 + }, + { + "epoch": 1.2129113778645684, + "grad_norm": 0.567959725856781, + "learning_rate": 0.0001, + "loss": 1.5242, + "step": 10559 + }, + { + "epoch": 1.2130262477743954, + "grad_norm": 0.5984885692596436, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 10560 + }, + { + "epoch": 1.2131411176842226, + "grad_norm": 0.5240800380706787, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 10561 + }, + { + "epoch": 1.2132559875940498, + "grad_norm": 0.5749354362487793, + "learning_rate": 0.0001, + "loss": 1.4552, + "step": 10562 + }, + { + "epoch": 1.2133708575038769, + "grad_norm": 0.5432642698287964, + "learning_rate": 0.0001, + "loss": 1.5985, + "step": 10563 + }, + { + "epoch": 1.2134857274137039, + "grad_norm": 0.5396465063095093, + "learning_rate": 0.0001, + "loss": 1.4983, + "step": 10564 + }, + { + "epoch": 1.213600597323531, + "grad_norm": 0.5398960113525391, + "learning_rate": 0.0001, + "loss": 1.3812, + "step": 10565 + }, + { + "epoch": 1.2137154672333583, + "grad_norm": 0.5659125447273254, + "learning_rate": 0.0001, + "loss": 1.5102, + "step": 10566 + }, + { + "epoch": 1.2138303371431853, + "grad_norm": 0.5628279447555542, + "learning_rate": 0.0001, + "loss": 1.2602, + "step": 10567 + }, + { + "epoch": 1.2139452070530123, + "grad_norm": 0.5381669998168945, + "learning_rate": 0.0001, + "loss": 1.4341, + "step": 10568 + }, + { + "epoch": 1.2140600769628396, + "grad_norm": 0.6219281554222107, + "learning_rate": 0.0001, + "loss": 1.5175, + "step": 10569 + }, + { + "epoch": 1.2141749468726668, + "grad_norm": 0.5203022956848145, + "learning_rate": 0.0001, + "loss": 1.3109, + "step": 10570 + }, + { + "epoch": 1.2142898167824938, + "grad_norm": 0.6202723383903503, + "learning_rate": 0.0001, + "loss": 1.5858, + "step": 10571 + }, + { + "epoch": 1.2144046866923208, + "grad_norm": 0.6326547861099243, + "learning_rate": 0.0001, + "loss": 1.4975, + "step": 10572 + }, + { + "epoch": 1.214519556602148, + "grad_norm": 0.5709686875343323, + "learning_rate": 0.0001, + "loss": 1.3928, + "step": 10573 + }, + { + "epoch": 1.2146344265119753, + "grad_norm": 0.5582396984100342, + "learning_rate": 0.0001, + "loss": 1.4887, + "step": 10574 + }, + { + "epoch": 1.2147492964218023, + "grad_norm": 0.678507387638092, + "learning_rate": 0.0001, + "loss": 1.7535, + "step": 10575 + }, + { + "epoch": 1.2148641663316295, + "grad_norm": 0.5705639123916626, + "learning_rate": 0.0001, + "loss": 1.6121, + "step": 10576 + }, + { + "epoch": 1.2149790362414565, + "grad_norm": 0.5790746212005615, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 10577 + }, + { + "epoch": 1.2150939061512838, + "grad_norm": 0.5600179433822632, + "learning_rate": 0.0001, + "loss": 1.4223, + "step": 10578 + }, + { + "epoch": 1.2152087760611108, + "grad_norm": 0.5656793713569641, + "learning_rate": 0.0001, + "loss": 1.4859, + "step": 10579 + }, + { + "epoch": 1.215323645970938, + "grad_norm": 0.537078857421875, + "learning_rate": 0.0001, + "loss": 1.4168, + "step": 10580 + }, + { + "epoch": 1.215438515880765, + "grad_norm": 0.5688410401344299, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 10581 + }, + { + "epoch": 1.2155533857905922, + "grad_norm": 0.6211932897567749, + "learning_rate": 0.0001, + "loss": 1.5774, + "step": 10582 + }, + { + "epoch": 1.2156682557004193, + "grad_norm": 0.5459011793136597, + "learning_rate": 0.0001, + "loss": 1.3783, + "step": 10583 + }, + { + "epoch": 1.2157831256102465, + "grad_norm": 0.540130615234375, + "learning_rate": 0.0001, + "loss": 1.35, + "step": 10584 + }, + { + "epoch": 1.2158979955200735, + "grad_norm": 0.5064961910247803, + "learning_rate": 0.0001, + "loss": 1.3146, + "step": 10585 + }, + { + "epoch": 1.2160128654299007, + "grad_norm": 0.5856577157974243, + "learning_rate": 0.0001, + "loss": 1.5325, + "step": 10586 + }, + { + "epoch": 1.2161277353397277, + "grad_norm": 0.5436236262321472, + "learning_rate": 0.0001, + "loss": 1.2324, + "step": 10587 + }, + { + "epoch": 1.216242605249555, + "grad_norm": 0.5454403162002563, + "learning_rate": 0.0001, + "loss": 1.5701, + "step": 10588 + }, + { + "epoch": 1.216357475159382, + "grad_norm": 0.5347267985343933, + "learning_rate": 0.0001, + "loss": 1.574, + "step": 10589 + }, + { + "epoch": 1.2164723450692092, + "grad_norm": 0.5841572284698486, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 10590 + }, + { + "epoch": 1.2165872149790362, + "grad_norm": 0.5742143988609314, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 10591 + }, + { + "epoch": 1.2167020848888634, + "grad_norm": 0.5593157410621643, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 10592 + }, + { + "epoch": 1.2168169547986905, + "grad_norm": 0.58852219581604, + "learning_rate": 0.0001, + "loss": 1.4253, + "step": 10593 + }, + { + "epoch": 1.2169318247085177, + "grad_norm": 0.5854587554931641, + "learning_rate": 0.0001, + "loss": 1.4844, + "step": 10594 + }, + { + "epoch": 1.2170466946183447, + "grad_norm": 0.6186012625694275, + "learning_rate": 0.0001, + "loss": 1.5365, + "step": 10595 + }, + { + "epoch": 1.217161564528172, + "grad_norm": 0.5782433748245239, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 10596 + }, + { + "epoch": 1.217276434437999, + "grad_norm": 0.5456607341766357, + "learning_rate": 0.0001, + "loss": 1.383, + "step": 10597 + }, + { + "epoch": 1.2173913043478262, + "grad_norm": 0.5986420512199402, + "learning_rate": 0.0001, + "loss": 1.746, + "step": 10598 + }, + { + "epoch": 1.2175061742576532, + "grad_norm": 0.5691726207733154, + "learning_rate": 0.0001, + "loss": 1.4222, + "step": 10599 + }, + { + "epoch": 1.2176210441674804, + "grad_norm": 0.5510737299919128, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 10600 + }, + { + "epoch": 1.2177359140773074, + "grad_norm": 0.528681755065918, + "learning_rate": 0.0001, + "loss": 1.2332, + "step": 10601 + }, + { + "epoch": 1.2178507839871346, + "grad_norm": 0.5856773853302002, + "learning_rate": 0.0001, + "loss": 1.3116, + "step": 10602 + }, + { + "epoch": 1.2179656538969617, + "grad_norm": 0.6299785375595093, + "learning_rate": 0.0001, + "loss": 1.3567, + "step": 10603 + }, + { + "epoch": 1.2180805238067889, + "grad_norm": 0.5841706395149231, + "learning_rate": 0.0001, + "loss": 1.2655, + "step": 10604 + }, + { + "epoch": 1.218195393716616, + "grad_norm": 0.5885977745056152, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 10605 + }, + { + "epoch": 1.2183102636264431, + "grad_norm": 0.5559094548225403, + "learning_rate": 0.0001, + "loss": 1.2172, + "step": 10606 + }, + { + "epoch": 1.2184251335362701, + "grad_norm": 0.5585634112358093, + "learning_rate": 0.0001, + "loss": 1.504, + "step": 10607 + }, + { + "epoch": 1.2185400034460974, + "grad_norm": 0.5708191394805908, + "learning_rate": 0.0001, + "loss": 1.549, + "step": 10608 + }, + { + "epoch": 1.2186548733559244, + "grad_norm": 0.5848360657691956, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 10609 + }, + { + "epoch": 1.2187697432657516, + "grad_norm": 0.702562153339386, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 10610 + }, + { + "epoch": 1.2188846131755786, + "grad_norm": 0.5700719952583313, + "learning_rate": 0.0001, + "loss": 1.5087, + "step": 10611 + }, + { + "epoch": 1.2189994830854058, + "grad_norm": 0.5429587960243225, + "learning_rate": 0.0001, + "loss": 1.4472, + "step": 10612 + }, + { + "epoch": 1.2191143529952329, + "grad_norm": 0.5469475388526917, + "learning_rate": 0.0001, + "loss": 1.5313, + "step": 10613 + }, + { + "epoch": 1.21922922290506, + "grad_norm": 0.5817015171051025, + "learning_rate": 0.0001, + "loss": 1.4517, + "step": 10614 + }, + { + "epoch": 1.219344092814887, + "grad_norm": 0.6177971363067627, + "learning_rate": 0.0001, + "loss": 1.3462, + "step": 10615 + }, + { + "epoch": 1.2194589627247143, + "grad_norm": 0.5310166478157043, + "learning_rate": 0.0001, + "loss": 1.3836, + "step": 10616 + }, + { + "epoch": 1.2195738326345413, + "grad_norm": 0.5617627501487732, + "learning_rate": 0.0001, + "loss": 1.4244, + "step": 10617 + }, + { + "epoch": 1.2196887025443686, + "grad_norm": 0.6105504035949707, + "learning_rate": 0.0001, + "loss": 1.3383, + "step": 10618 + }, + { + "epoch": 1.2198035724541956, + "grad_norm": 0.5250139832496643, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 10619 + }, + { + "epoch": 1.2199184423640228, + "grad_norm": 0.5927942991256714, + "learning_rate": 0.0001, + "loss": 1.5573, + "step": 10620 + }, + { + "epoch": 1.2200333122738498, + "grad_norm": 0.6836938858032227, + "learning_rate": 0.0001, + "loss": 1.6504, + "step": 10621 + }, + { + "epoch": 1.220148182183677, + "grad_norm": 0.6172232627868652, + "learning_rate": 0.0001, + "loss": 1.5157, + "step": 10622 + }, + { + "epoch": 1.220263052093504, + "grad_norm": 0.5897935032844543, + "learning_rate": 0.0001, + "loss": 1.3833, + "step": 10623 + }, + { + "epoch": 1.2203779220033313, + "grad_norm": 0.5973081588745117, + "learning_rate": 0.0001, + "loss": 1.3658, + "step": 10624 + }, + { + "epoch": 1.2204927919131583, + "grad_norm": 0.6020708680152893, + "learning_rate": 0.0001, + "loss": 1.4735, + "step": 10625 + }, + { + "epoch": 1.2206076618229855, + "grad_norm": 0.5734338164329529, + "learning_rate": 0.0001, + "loss": 1.5679, + "step": 10626 + }, + { + "epoch": 1.2207225317328125, + "grad_norm": 0.6005342602729797, + "learning_rate": 0.0001, + "loss": 1.6655, + "step": 10627 + }, + { + "epoch": 1.2208374016426398, + "grad_norm": 0.6020346283912659, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 10628 + }, + { + "epoch": 1.2209522715524668, + "grad_norm": 0.5923011302947998, + "learning_rate": 0.0001, + "loss": 1.471, + "step": 10629 + }, + { + "epoch": 1.221067141462294, + "grad_norm": 0.5160320997238159, + "learning_rate": 0.0001, + "loss": 1.4516, + "step": 10630 + }, + { + "epoch": 1.221182011372121, + "grad_norm": 0.6067989468574524, + "learning_rate": 0.0001, + "loss": 1.3357, + "step": 10631 + }, + { + "epoch": 1.2212968812819482, + "grad_norm": 0.5493841767311096, + "learning_rate": 0.0001, + "loss": 1.0806, + "step": 10632 + }, + { + "epoch": 1.2214117511917753, + "grad_norm": 0.6001129150390625, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 10633 + }, + { + "epoch": 1.2215266211016025, + "grad_norm": 0.5696105360984802, + "learning_rate": 0.0001, + "loss": 1.4366, + "step": 10634 + }, + { + "epoch": 1.2216414910114295, + "grad_norm": 0.5546556711196899, + "learning_rate": 0.0001, + "loss": 1.224, + "step": 10635 + }, + { + "epoch": 1.2217563609212567, + "grad_norm": 0.5779848098754883, + "learning_rate": 0.0001, + "loss": 1.4388, + "step": 10636 + }, + { + "epoch": 1.2218712308310837, + "grad_norm": 0.6062044501304626, + "learning_rate": 0.0001, + "loss": 1.4777, + "step": 10637 + }, + { + "epoch": 1.221986100740911, + "grad_norm": 0.530096173286438, + "learning_rate": 0.0001, + "loss": 1.5618, + "step": 10638 + }, + { + "epoch": 1.222100970650738, + "grad_norm": 0.5306719541549683, + "learning_rate": 0.0001, + "loss": 1.4092, + "step": 10639 + }, + { + "epoch": 1.2222158405605652, + "grad_norm": 0.5447390675544739, + "learning_rate": 0.0001, + "loss": 1.3903, + "step": 10640 + }, + { + "epoch": 1.2223307104703922, + "grad_norm": 0.5845821499824524, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 10641 + }, + { + "epoch": 1.2224455803802194, + "grad_norm": 0.5478098392486572, + "learning_rate": 0.0001, + "loss": 1.3178, + "step": 10642 + }, + { + "epoch": 1.2225604502900465, + "grad_norm": 0.5850222706794739, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 10643 + }, + { + "epoch": 1.2226753201998737, + "grad_norm": 0.533097505569458, + "learning_rate": 0.0001, + "loss": 1.3787, + "step": 10644 + }, + { + "epoch": 1.2227901901097007, + "grad_norm": 0.6224300861358643, + "learning_rate": 0.0001, + "loss": 1.6143, + "step": 10645 + }, + { + "epoch": 1.222905060019528, + "grad_norm": 0.5365055203437805, + "learning_rate": 0.0001, + "loss": 1.3455, + "step": 10646 + }, + { + "epoch": 1.223019929929355, + "grad_norm": 0.5840891599655151, + "learning_rate": 0.0001, + "loss": 1.5427, + "step": 10647 + }, + { + "epoch": 1.2231347998391822, + "grad_norm": 0.5483365654945374, + "learning_rate": 0.0001, + "loss": 1.4997, + "step": 10648 + }, + { + "epoch": 1.2232496697490092, + "grad_norm": 0.5992576479911804, + "learning_rate": 0.0001, + "loss": 1.4718, + "step": 10649 + }, + { + "epoch": 1.2233645396588364, + "grad_norm": 0.5553631782531738, + "learning_rate": 0.0001, + "loss": 1.4211, + "step": 10650 + }, + { + "epoch": 1.2234794095686634, + "grad_norm": 0.5532668232917786, + "learning_rate": 0.0001, + "loss": 1.526, + "step": 10651 + }, + { + "epoch": 1.2235942794784906, + "grad_norm": 0.6031765341758728, + "learning_rate": 0.0001, + "loss": 1.4622, + "step": 10652 + }, + { + "epoch": 1.2237091493883177, + "grad_norm": 0.5665226578712463, + "learning_rate": 0.0001, + "loss": 1.4103, + "step": 10653 + }, + { + "epoch": 1.2238240192981449, + "grad_norm": 0.5733132362365723, + "learning_rate": 0.0001, + "loss": 1.451, + "step": 10654 + }, + { + "epoch": 1.223938889207972, + "grad_norm": 0.5637727379798889, + "learning_rate": 0.0001, + "loss": 1.4781, + "step": 10655 + }, + { + "epoch": 1.2240537591177991, + "grad_norm": 0.5689738392829895, + "learning_rate": 0.0001, + "loss": 1.4647, + "step": 10656 + }, + { + "epoch": 1.2241686290276261, + "grad_norm": 0.5613055229187012, + "learning_rate": 0.0001, + "loss": 1.5456, + "step": 10657 + }, + { + "epoch": 1.2242834989374534, + "grad_norm": 0.6242927312850952, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 10658 + }, + { + "epoch": 1.2243983688472804, + "grad_norm": 0.558158278465271, + "learning_rate": 0.0001, + "loss": 1.5788, + "step": 10659 + }, + { + "epoch": 1.2245132387571076, + "grad_norm": 0.5290167331695557, + "learning_rate": 0.0001, + "loss": 1.3642, + "step": 10660 + }, + { + "epoch": 1.2246281086669346, + "grad_norm": 0.530127763748169, + "learning_rate": 0.0001, + "loss": 1.448, + "step": 10661 + }, + { + "epoch": 1.2247429785767618, + "grad_norm": 0.6060307621955872, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 10662 + }, + { + "epoch": 1.2248578484865889, + "grad_norm": 0.5839899182319641, + "learning_rate": 0.0001, + "loss": 1.5879, + "step": 10663 + }, + { + "epoch": 1.224972718396416, + "grad_norm": 0.5494903326034546, + "learning_rate": 0.0001, + "loss": 1.5572, + "step": 10664 + }, + { + "epoch": 1.225087588306243, + "grad_norm": 0.5795137882232666, + "learning_rate": 0.0001, + "loss": 1.4283, + "step": 10665 + }, + { + "epoch": 1.2252024582160703, + "grad_norm": 0.5595057606697083, + "learning_rate": 0.0001, + "loss": 1.3211, + "step": 10666 + }, + { + "epoch": 1.2253173281258973, + "grad_norm": 0.535707414150238, + "learning_rate": 0.0001, + "loss": 1.3843, + "step": 10667 + }, + { + "epoch": 1.2254321980357246, + "grad_norm": 0.555937647819519, + "learning_rate": 0.0001, + "loss": 1.5849, + "step": 10668 + }, + { + "epoch": 1.2255470679455516, + "grad_norm": 0.568341076374054, + "learning_rate": 0.0001, + "loss": 1.5307, + "step": 10669 + }, + { + "epoch": 1.2256619378553788, + "grad_norm": 0.5604590177536011, + "learning_rate": 0.0001, + "loss": 1.3813, + "step": 10670 + }, + { + "epoch": 1.2257768077652058, + "grad_norm": 0.5631393790245056, + "learning_rate": 0.0001, + "loss": 1.3685, + "step": 10671 + }, + { + "epoch": 1.225891677675033, + "grad_norm": 0.579447329044342, + "learning_rate": 0.0001, + "loss": 1.4172, + "step": 10672 + }, + { + "epoch": 1.22600654758486, + "grad_norm": 0.635651171207428, + "learning_rate": 0.0001, + "loss": 1.4747, + "step": 10673 + }, + { + "epoch": 1.2261214174946873, + "grad_norm": 0.5449775457382202, + "learning_rate": 0.0001, + "loss": 1.3641, + "step": 10674 + }, + { + "epoch": 1.2262362874045143, + "grad_norm": 0.5698413252830505, + "learning_rate": 0.0001, + "loss": 1.5557, + "step": 10675 + }, + { + "epoch": 1.2263511573143415, + "grad_norm": 0.5639238953590393, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 10676 + }, + { + "epoch": 1.2264660272241685, + "grad_norm": 0.6942810416221619, + "learning_rate": 0.0001, + "loss": 1.5553, + "step": 10677 + }, + { + "epoch": 1.2265808971339958, + "grad_norm": 0.5638768076896667, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 10678 + }, + { + "epoch": 1.226695767043823, + "grad_norm": 0.5303918123245239, + "learning_rate": 0.0001, + "loss": 1.1953, + "step": 10679 + }, + { + "epoch": 1.22681063695365, + "grad_norm": 0.5451003313064575, + "learning_rate": 0.0001, + "loss": 1.3523, + "step": 10680 + }, + { + "epoch": 1.226925506863477, + "grad_norm": 0.5491355657577515, + "learning_rate": 0.0001, + "loss": 1.4684, + "step": 10681 + }, + { + "epoch": 1.2270403767733042, + "grad_norm": 0.5453058481216431, + "learning_rate": 0.0001, + "loss": 1.4057, + "step": 10682 + }, + { + "epoch": 1.2271552466831315, + "grad_norm": 0.5452755093574524, + "learning_rate": 0.0001, + "loss": 1.2485, + "step": 10683 + }, + { + "epoch": 1.2272701165929585, + "grad_norm": 0.5503637194633484, + "learning_rate": 0.0001, + "loss": 1.4669, + "step": 10684 + }, + { + "epoch": 1.2273849865027855, + "grad_norm": 0.5912861227989197, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 10685 + }, + { + "epoch": 1.2274998564126127, + "grad_norm": 0.5763379335403442, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 10686 + }, + { + "epoch": 1.22761472632244, + "grad_norm": 0.590811550617218, + "learning_rate": 0.0001, + "loss": 1.4122, + "step": 10687 + }, + { + "epoch": 1.227729596232267, + "grad_norm": 0.6356752514839172, + "learning_rate": 0.0001, + "loss": 1.5515, + "step": 10688 + }, + { + "epoch": 1.227844466142094, + "grad_norm": 0.5507737398147583, + "learning_rate": 0.0001, + "loss": 1.4867, + "step": 10689 + }, + { + "epoch": 1.2279593360519212, + "grad_norm": 0.5941469073295593, + "learning_rate": 0.0001, + "loss": 1.5367, + "step": 10690 + }, + { + "epoch": 1.2280742059617484, + "grad_norm": 0.5983887314796448, + "learning_rate": 0.0001, + "loss": 1.6548, + "step": 10691 + }, + { + "epoch": 1.2281890758715754, + "grad_norm": 0.5874168276786804, + "learning_rate": 0.0001, + "loss": 1.3943, + "step": 10692 + }, + { + "epoch": 1.2283039457814025, + "grad_norm": 0.6247429847717285, + "learning_rate": 0.0001, + "loss": 1.712, + "step": 10693 + }, + { + "epoch": 1.2284188156912297, + "grad_norm": 0.5645884275436401, + "learning_rate": 0.0001, + "loss": 1.5444, + "step": 10694 + }, + { + "epoch": 1.228533685601057, + "grad_norm": 0.6020731925964355, + "learning_rate": 0.0001, + "loss": 1.4052, + "step": 10695 + }, + { + "epoch": 1.228648555510884, + "grad_norm": 0.6159355044364929, + "learning_rate": 0.0001, + "loss": 1.5297, + "step": 10696 + }, + { + "epoch": 1.228763425420711, + "grad_norm": 0.5802619457244873, + "learning_rate": 0.0001, + "loss": 1.4455, + "step": 10697 + }, + { + "epoch": 1.2288782953305382, + "grad_norm": 0.5384595990180969, + "learning_rate": 0.0001, + "loss": 1.3547, + "step": 10698 + }, + { + "epoch": 1.2289931652403654, + "grad_norm": 0.5957247018814087, + "learning_rate": 0.0001, + "loss": 1.4597, + "step": 10699 + }, + { + "epoch": 1.2291080351501924, + "grad_norm": 0.5814695954322815, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 10700 + }, + { + "epoch": 1.2292229050600194, + "grad_norm": 0.5483696460723877, + "learning_rate": 0.0001, + "loss": 1.3991, + "step": 10701 + }, + { + "epoch": 1.2293377749698466, + "grad_norm": 0.5747151374816895, + "learning_rate": 0.0001, + "loss": 1.4609, + "step": 10702 + }, + { + "epoch": 1.2294526448796739, + "grad_norm": 0.5760491490364075, + "learning_rate": 0.0001, + "loss": 1.5331, + "step": 10703 + }, + { + "epoch": 1.2295675147895009, + "grad_norm": 0.5944879055023193, + "learning_rate": 0.0001, + "loss": 1.2962, + "step": 10704 + }, + { + "epoch": 1.229682384699328, + "grad_norm": 0.5719342231750488, + "learning_rate": 0.0001, + "loss": 1.3065, + "step": 10705 + }, + { + "epoch": 1.2297972546091551, + "grad_norm": 0.6149895787239075, + "learning_rate": 0.0001, + "loss": 1.4154, + "step": 10706 + }, + { + "epoch": 1.2299121245189824, + "grad_norm": 0.6030192971229553, + "learning_rate": 0.0001, + "loss": 1.5497, + "step": 10707 + }, + { + "epoch": 1.2300269944288094, + "grad_norm": 0.6043549180030823, + "learning_rate": 0.0001, + "loss": 1.5306, + "step": 10708 + }, + { + "epoch": 1.2301418643386364, + "grad_norm": 0.5450412034988403, + "learning_rate": 0.0001, + "loss": 1.5068, + "step": 10709 + }, + { + "epoch": 1.2302567342484636, + "grad_norm": 0.5756334066390991, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 10710 + }, + { + "epoch": 1.2303716041582908, + "grad_norm": 0.5773504376411438, + "learning_rate": 0.0001, + "loss": 1.496, + "step": 10711 + }, + { + "epoch": 1.2304864740681178, + "grad_norm": 0.5983827710151672, + "learning_rate": 0.0001, + "loss": 1.4885, + "step": 10712 + }, + { + "epoch": 1.230601343977945, + "grad_norm": 0.5947871804237366, + "learning_rate": 0.0001, + "loss": 1.5242, + "step": 10713 + }, + { + "epoch": 1.230716213887772, + "grad_norm": 0.5693453550338745, + "learning_rate": 0.0001, + "loss": 1.4284, + "step": 10714 + }, + { + "epoch": 1.2308310837975993, + "grad_norm": 0.616321325302124, + "learning_rate": 0.0001, + "loss": 1.59, + "step": 10715 + }, + { + "epoch": 1.2309459537074263, + "grad_norm": 0.581452488899231, + "learning_rate": 0.0001, + "loss": 1.4594, + "step": 10716 + }, + { + "epoch": 1.2310608236172536, + "grad_norm": 0.5225065350532532, + "learning_rate": 0.0001, + "loss": 1.4939, + "step": 10717 + }, + { + "epoch": 1.2311756935270806, + "grad_norm": 0.5418626666069031, + "learning_rate": 0.0001, + "loss": 1.3131, + "step": 10718 + }, + { + "epoch": 1.2312905634369078, + "grad_norm": 0.5273642539978027, + "learning_rate": 0.0001, + "loss": 1.4067, + "step": 10719 + }, + { + "epoch": 1.2314054333467348, + "grad_norm": 0.5336480140686035, + "learning_rate": 0.0001, + "loss": 1.3815, + "step": 10720 + }, + { + "epoch": 1.231520303256562, + "grad_norm": 0.6629604697227478, + "learning_rate": 0.0001, + "loss": 1.5378, + "step": 10721 + }, + { + "epoch": 1.231635173166389, + "grad_norm": 0.5382062792778015, + "learning_rate": 0.0001, + "loss": 1.1965, + "step": 10722 + }, + { + "epoch": 1.2317500430762163, + "grad_norm": 0.5966381430625916, + "learning_rate": 0.0001, + "loss": 1.3991, + "step": 10723 + }, + { + "epoch": 1.2318649129860433, + "grad_norm": 0.5476511716842651, + "learning_rate": 0.0001, + "loss": 1.4263, + "step": 10724 + }, + { + "epoch": 1.2319797828958705, + "grad_norm": 0.5930783152580261, + "learning_rate": 0.0001, + "loss": 1.3903, + "step": 10725 + }, + { + "epoch": 1.2320946528056975, + "grad_norm": 0.5383990406990051, + "learning_rate": 0.0001, + "loss": 1.3619, + "step": 10726 + }, + { + "epoch": 1.2322095227155248, + "grad_norm": 0.5706599950790405, + "learning_rate": 0.0001, + "loss": 1.5259, + "step": 10727 + }, + { + "epoch": 1.2323243926253518, + "grad_norm": 0.5487735867500305, + "learning_rate": 0.0001, + "loss": 1.4059, + "step": 10728 + }, + { + "epoch": 1.232439262535179, + "grad_norm": 0.5330200791358948, + "learning_rate": 0.0001, + "loss": 1.4389, + "step": 10729 + }, + { + "epoch": 1.232554132445006, + "grad_norm": 0.5702281594276428, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 10730 + }, + { + "epoch": 1.2326690023548332, + "grad_norm": 0.5802901983261108, + "learning_rate": 0.0001, + "loss": 1.4692, + "step": 10731 + }, + { + "epoch": 1.2327838722646602, + "grad_norm": 0.6011072993278503, + "learning_rate": 0.0001, + "loss": 1.3248, + "step": 10732 + }, + { + "epoch": 1.2328987421744875, + "grad_norm": 0.6271477937698364, + "learning_rate": 0.0001, + "loss": 1.5501, + "step": 10733 + }, + { + "epoch": 1.2330136120843145, + "grad_norm": 0.5693656802177429, + "learning_rate": 0.0001, + "loss": 1.4817, + "step": 10734 + }, + { + "epoch": 1.2331284819941417, + "grad_norm": 0.625735342502594, + "learning_rate": 0.0001, + "loss": 1.5831, + "step": 10735 + }, + { + "epoch": 1.2332433519039687, + "grad_norm": 0.5748286247253418, + "learning_rate": 0.0001, + "loss": 1.5317, + "step": 10736 + }, + { + "epoch": 1.233358221813796, + "grad_norm": 0.636698842048645, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 10737 + }, + { + "epoch": 1.233473091723623, + "grad_norm": 0.5615163445472717, + "learning_rate": 0.0001, + "loss": 1.2444, + "step": 10738 + }, + { + "epoch": 1.2335879616334502, + "grad_norm": 0.572391927242279, + "learning_rate": 0.0001, + "loss": 1.3733, + "step": 10739 + }, + { + "epoch": 1.2337028315432772, + "grad_norm": 0.5233102440834045, + "learning_rate": 0.0001, + "loss": 1.2587, + "step": 10740 + }, + { + "epoch": 1.2338177014531044, + "grad_norm": 0.5385754108428955, + "learning_rate": 0.0001, + "loss": 1.3457, + "step": 10741 + }, + { + "epoch": 1.2339325713629314, + "grad_norm": 0.5545738339424133, + "learning_rate": 0.0001, + "loss": 1.4354, + "step": 10742 + }, + { + "epoch": 1.2340474412727587, + "grad_norm": 0.5675808787345886, + "learning_rate": 0.0001, + "loss": 1.4972, + "step": 10743 + }, + { + "epoch": 1.2341623111825857, + "grad_norm": 0.5583345293998718, + "learning_rate": 0.0001, + "loss": 1.4697, + "step": 10744 + }, + { + "epoch": 1.234277181092413, + "grad_norm": 0.5754501819610596, + "learning_rate": 0.0001, + "loss": 1.4148, + "step": 10745 + }, + { + "epoch": 1.23439205100224, + "grad_norm": 0.5526403784751892, + "learning_rate": 0.0001, + "loss": 1.4903, + "step": 10746 + }, + { + "epoch": 1.2345069209120672, + "grad_norm": 0.6143956184387207, + "learning_rate": 0.0001, + "loss": 1.4575, + "step": 10747 + }, + { + "epoch": 1.2346217908218942, + "grad_norm": 0.556839644908905, + "learning_rate": 0.0001, + "loss": 1.4682, + "step": 10748 + }, + { + "epoch": 1.2347366607317214, + "grad_norm": 0.5772037506103516, + "learning_rate": 0.0001, + "loss": 1.2345, + "step": 10749 + }, + { + "epoch": 1.2348515306415484, + "grad_norm": 0.5617235898971558, + "learning_rate": 0.0001, + "loss": 1.6019, + "step": 10750 + }, + { + "epoch": 1.2349664005513756, + "grad_norm": 0.5470720529556274, + "learning_rate": 0.0001, + "loss": 1.3394, + "step": 10751 + }, + { + "epoch": 1.2350812704612026, + "grad_norm": 0.6235065460205078, + "learning_rate": 0.0001, + "loss": 1.3382, + "step": 10752 + }, + { + "epoch": 1.2351961403710299, + "grad_norm": 0.5779804587364197, + "learning_rate": 0.0001, + "loss": 1.1888, + "step": 10753 + }, + { + "epoch": 1.2353110102808569, + "grad_norm": 0.6196669340133667, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 10754 + }, + { + "epoch": 1.2354258801906841, + "grad_norm": 0.6144828200340271, + "learning_rate": 0.0001, + "loss": 1.5581, + "step": 10755 + }, + { + "epoch": 1.2355407501005111, + "grad_norm": 0.5465558171272278, + "learning_rate": 0.0001, + "loss": 1.3457, + "step": 10756 + }, + { + "epoch": 1.2356556200103384, + "grad_norm": 0.6325592398643494, + "learning_rate": 0.0001, + "loss": 1.4344, + "step": 10757 + }, + { + "epoch": 1.2357704899201654, + "grad_norm": 0.5570918917655945, + "learning_rate": 0.0001, + "loss": 1.453, + "step": 10758 + }, + { + "epoch": 1.2358853598299926, + "grad_norm": 0.5771710872650146, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 10759 + }, + { + "epoch": 1.2360002297398196, + "grad_norm": 0.5577232837677002, + "learning_rate": 0.0001, + "loss": 1.4324, + "step": 10760 + }, + { + "epoch": 1.2361150996496468, + "grad_norm": 0.5303571224212646, + "learning_rate": 0.0001, + "loss": 1.2595, + "step": 10761 + }, + { + "epoch": 1.2362299695594738, + "grad_norm": 0.6245179176330566, + "learning_rate": 0.0001, + "loss": 1.4688, + "step": 10762 + }, + { + "epoch": 1.236344839469301, + "grad_norm": 0.5816663503646851, + "learning_rate": 0.0001, + "loss": 1.5545, + "step": 10763 + }, + { + "epoch": 1.236459709379128, + "grad_norm": 0.5651383996009827, + "learning_rate": 0.0001, + "loss": 1.293, + "step": 10764 + }, + { + "epoch": 1.2365745792889553, + "grad_norm": 0.563153862953186, + "learning_rate": 0.0001, + "loss": 1.3759, + "step": 10765 + }, + { + "epoch": 1.2366894491987823, + "grad_norm": 0.5659855604171753, + "learning_rate": 0.0001, + "loss": 1.5424, + "step": 10766 + }, + { + "epoch": 1.2368043191086096, + "grad_norm": 0.564038872718811, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 10767 + }, + { + "epoch": 1.2369191890184366, + "grad_norm": 0.6177818775177002, + "learning_rate": 0.0001, + "loss": 1.586, + "step": 10768 + }, + { + "epoch": 1.2370340589282638, + "grad_norm": 0.5920203328132629, + "learning_rate": 0.0001, + "loss": 1.5267, + "step": 10769 + }, + { + "epoch": 1.2371489288380908, + "grad_norm": 0.6030647158622742, + "learning_rate": 0.0001, + "loss": 1.6662, + "step": 10770 + }, + { + "epoch": 1.237263798747918, + "grad_norm": 0.5532615780830383, + "learning_rate": 0.0001, + "loss": 1.3566, + "step": 10771 + }, + { + "epoch": 1.237378668657745, + "grad_norm": 0.6269826292991638, + "learning_rate": 0.0001, + "loss": 1.3191, + "step": 10772 + }, + { + "epoch": 1.2374935385675723, + "grad_norm": 0.6108141541481018, + "learning_rate": 0.0001, + "loss": 1.3749, + "step": 10773 + }, + { + "epoch": 1.2376084084773993, + "grad_norm": 0.557407796382904, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 10774 + }, + { + "epoch": 1.2377232783872265, + "grad_norm": 0.6010729074478149, + "learning_rate": 0.0001, + "loss": 1.4623, + "step": 10775 + }, + { + "epoch": 1.2378381482970535, + "grad_norm": 0.6081424355506897, + "learning_rate": 0.0001, + "loss": 1.5455, + "step": 10776 + }, + { + "epoch": 1.2379530182068808, + "grad_norm": 0.6339059472084045, + "learning_rate": 0.0001, + "loss": 1.5383, + "step": 10777 + }, + { + "epoch": 1.2380678881167078, + "grad_norm": 0.5645229816436768, + "learning_rate": 0.0001, + "loss": 1.479, + "step": 10778 + }, + { + "epoch": 1.238182758026535, + "grad_norm": 0.5893407464027405, + "learning_rate": 0.0001, + "loss": 1.5504, + "step": 10779 + }, + { + "epoch": 1.238297627936362, + "grad_norm": 0.6675935387611389, + "learning_rate": 0.0001, + "loss": 1.7331, + "step": 10780 + }, + { + "epoch": 1.2384124978461892, + "grad_norm": 0.5694357752799988, + "learning_rate": 0.0001, + "loss": 1.3615, + "step": 10781 + }, + { + "epoch": 1.2385273677560162, + "grad_norm": 0.5906005501747131, + "learning_rate": 0.0001, + "loss": 1.569, + "step": 10782 + }, + { + "epoch": 1.2386422376658435, + "grad_norm": 0.6008540391921997, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 10783 + }, + { + "epoch": 1.2387571075756705, + "grad_norm": 0.5501822233200073, + "learning_rate": 0.0001, + "loss": 1.5858, + "step": 10784 + }, + { + "epoch": 1.2388719774854977, + "grad_norm": 0.5909137725830078, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 10785 + }, + { + "epoch": 1.2389868473953247, + "grad_norm": 0.5546433329582214, + "learning_rate": 0.0001, + "loss": 1.4521, + "step": 10786 + }, + { + "epoch": 1.239101717305152, + "grad_norm": 0.5489922165870667, + "learning_rate": 0.0001, + "loss": 1.2556, + "step": 10787 + }, + { + "epoch": 1.239216587214979, + "grad_norm": 0.5586884617805481, + "learning_rate": 0.0001, + "loss": 1.3457, + "step": 10788 + }, + { + "epoch": 1.2393314571248062, + "grad_norm": 0.5664530992507935, + "learning_rate": 0.0001, + "loss": 1.4248, + "step": 10789 + }, + { + "epoch": 1.2394463270346332, + "grad_norm": 0.5709450244903564, + "learning_rate": 0.0001, + "loss": 1.4569, + "step": 10790 + }, + { + "epoch": 1.2395611969444604, + "grad_norm": 0.514665424823761, + "learning_rate": 0.0001, + "loss": 1.394, + "step": 10791 + }, + { + "epoch": 1.2396760668542874, + "grad_norm": 0.5852550268173218, + "learning_rate": 0.0001, + "loss": 1.6148, + "step": 10792 + }, + { + "epoch": 1.2397909367641147, + "grad_norm": 0.5568869113922119, + "learning_rate": 0.0001, + "loss": 1.3949, + "step": 10793 + }, + { + "epoch": 1.2399058066739417, + "grad_norm": 0.5907896161079407, + "learning_rate": 0.0001, + "loss": 1.383, + "step": 10794 + }, + { + "epoch": 1.240020676583769, + "grad_norm": 0.6120040416717529, + "learning_rate": 0.0001, + "loss": 1.4843, + "step": 10795 + }, + { + "epoch": 1.240135546493596, + "grad_norm": 0.5711389183998108, + "learning_rate": 0.0001, + "loss": 1.4388, + "step": 10796 + }, + { + "epoch": 1.2402504164034232, + "grad_norm": 0.5505643486976624, + "learning_rate": 0.0001, + "loss": 1.502, + "step": 10797 + }, + { + "epoch": 1.2403652863132502, + "grad_norm": 0.5786750316619873, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 10798 + }, + { + "epoch": 1.2404801562230774, + "grad_norm": 0.5783752799034119, + "learning_rate": 0.0001, + "loss": 1.399, + "step": 10799 + }, + { + "epoch": 1.2405950261329044, + "grad_norm": 0.6035143733024597, + "learning_rate": 0.0001, + "loss": 1.4854, + "step": 10800 + }, + { + "epoch": 1.2407098960427316, + "grad_norm": 0.5874049663543701, + "learning_rate": 0.0001, + "loss": 1.5683, + "step": 10801 + }, + { + "epoch": 1.2408247659525586, + "grad_norm": 0.5582104921340942, + "learning_rate": 0.0001, + "loss": 1.3253, + "step": 10802 + }, + { + "epoch": 1.2409396358623859, + "grad_norm": 0.5821365118026733, + "learning_rate": 0.0001, + "loss": 1.396, + "step": 10803 + }, + { + "epoch": 1.2410545057722129, + "grad_norm": 0.5929169654846191, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 10804 + }, + { + "epoch": 1.24116937568204, + "grad_norm": 0.5773475766181946, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 10805 + }, + { + "epoch": 1.2412842455918671, + "grad_norm": 0.5636146068572998, + "learning_rate": 0.0001, + "loss": 1.5712, + "step": 10806 + }, + { + "epoch": 1.2413991155016944, + "grad_norm": 0.5757436156272888, + "learning_rate": 0.0001, + "loss": 1.6073, + "step": 10807 + }, + { + "epoch": 1.2415139854115214, + "grad_norm": 0.5821229815483093, + "learning_rate": 0.0001, + "loss": 1.4745, + "step": 10808 + }, + { + "epoch": 1.2416288553213486, + "grad_norm": 0.5732831954956055, + "learning_rate": 0.0001, + "loss": 1.4471, + "step": 10809 + }, + { + "epoch": 1.2417437252311756, + "grad_norm": 0.5851945877075195, + "learning_rate": 0.0001, + "loss": 1.4989, + "step": 10810 + }, + { + "epoch": 1.2418585951410028, + "grad_norm": 0.5864282846450806, + "learning_rate": 0.0001, + "loss": 1.6848, + "step": 10811 + }, + { + "epoch": 1.2419734650508298, + "grad_norm": 0.5669234395027161, + "learning_rate": 0.0001, + "loss": 1.2759, + "step": 10812 + }, + { + "epoch": 1.242088334960657, + "grad_norm": 0.5712340474128723, + "learning_rate": 0.0001, + "loss": 1.4233, + "step": 10813 + }, + { + "epoch": 1.242203204870484, + "grad_norm": 0.5951799750328064, + "learning_rate": 0.0001, + "loss": 1.4003, + "step": 10814 + }, + { + "epoch": 1.2423180747803113, + "grad_norm": 0.5672787427902222, + "learning_rate": 0.0001, + "loss": 1.5504, + "step": 10815 + }, + { + "epoch": 1.2424329446901385, + "grad_norm": 0.6768470406532288, + "learning_rate": 0.0001, + "loss": 1.7115, + "step": 10816 + }, + { + "epoch": 1.2425478145999655, + "grad_norm": 0.5624891519546509, + "learning_rate": 0.0001, + "loss": 1.5599, + "step": 10817 + }, + { + "epoch": 1.2426626845097926, + "grad_norm": 0.6003620624542236, + "learning_rate": 0.0001, + "loss": 1.6077, + "step": 10818 + }, + { + "epoch": 1.2427775544196198, + "grad_norm": 0.61441570520401, + "learning_rate": 0.0001, + "loss": 1.668, + "step": 10819 + }, + { + "epoch": 1.242892424329447, + "grad_norm": 0.5454705953598022, + "learning_rate": 0.0001, + "loss": 1.695, + "step": 10820 + }, + { + "epoch": 1.243007294239274, + "grad_norm": 0.6024851202964783, + "learning_rate": 0.0001, + "loss": 1.5696, + "step": 10821 + }, + { + "epoch": 1.243122164149101, + "grad_norm": 0.560492992401123, + "learning_rate": 0.0001, + "loss": 1.4578, + "step": 10822 + }, + { + "epoch": 1.2432370340589283, + "grad_norm": 0.5419345498085022, + "learning_rate": 0.0001, + "loss": 1.4609, + "step": 10823 + }, + { + "epoch": 1.2433519039687555, + "grad_norm": 0.6079069972038269, + "learning_rate": 0.0001, + "loss": 1.4584, + "step": 10824 + }, + { + "epoch": 1.2434667738785825, + "grad_norm": 0.519904613494873, + "learning_rate": 0.0001, + "loss": 1.4182, + "step": 10825 + }, + { + "epoch": 1.2435816437884095, + "grad_norm": 0.5554570555686951, + "learning_rate": 0.0001, + "loss": 1.5727, + "step": 10826 + }, + { + "epoch": 1.2436965136982367, + "grad_norm": 0.5826308131217957, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 10827 + }, + { + "epoch": 1.243811383608064, + "grad_norm": 0.5417545437812805, + "learning_rate": 0.0001, + "loss": 1.3831, + "step": 10828 + }, + { + "epoch": 1.243926253517891, + "grad_norm": 0.5289231538772583, + "learning_rate": 0.0001, + "loss": 1.5987, + "step": 10829 + }, + { + "epoch": 1.244041123427718, + "grad_norm": 0.5962126851081848, + "learning_rate": 0.0001, + "loss": 1.448, + "step": 10830 + }, + { + "epoch": 1.2441559933375452, + "grad_norm": 0.5800266861915588, + "learning_rate": 0.0001, + "loss": 1.5021, + "step": 10831 + }, + { + "epoch": 1.2442708632473725, + "grad_norm": 0.567566990852356, + "learning_rate": 0.0001, + "loss": 1.4349, + "step": 10832 + }, + { + "epoch": 1.2443857331571995, + "grad_norm": 0.6255682110786438, + "learning_rate": 0.0001, + "loss": 1.6459, + "step": 10833 + }, + { + "epoch": 1.2445006030670265, + "grad_norm": 0.5554635524749756, + "learning_rate": 0.0001, + "loss": 1.307, + "step": 10834 + }, + { + "epoch": 1.2446154729768537, + "grad_norm": 0.5715488195419312, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 10835 + }, + { + "epoch": 1.244730342886681, + "grad_norm": 0.5674731731414795, + "learning_rate": 0.0001, + "loss": 1.5265, + "step": 10836 + }, + { + "epoch": 1.244845212796508, + "grad_norm": 0.5632114410400391, + "learning_rate": 0.0001, + "loss": 1.4986, + "step": 10837 + }, + { + "epoch": 1.244960082706335, + "grad_norm": 0.5295623540878296, + "learning_rate": 0.0001, + "loss": 1.4137, + "step": 10838 + }, + { + "epoch": 1.2450749526161622, + "grad_norm": 0.5888751149177551, + "learning_rate": 0.0001, + "loss": 1.562, + "step": 10839 + }, + { + "epoch": 1.2451898225259894, + "grad_norm": 0.6099748015403748, + "learning_rate": 0.0001, + "loss": 1.4734, + "step": 10840 + }, + { + "epoch": 1.2453046924358164, + "grad_norm": 0.5546545386314392, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 10841 + }, + { + "epoch": 1.2454195623456434, + "grad_norm": 0.5606866478919983, + "learning_rate": 0.0001, + "loss": 1.5588, + "step": 10842 + }, + { + "epoch": 1.2455344322554707, + "grad_norm": 0.6139029860496521, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 10843 + }, + { + "epoch": 1.245649302165298, + "grad_norm": 0.6030558347702026, + "learning_rate": 0.0001, + "loss": 1.362, + "step": 10844 + }, + { + "epoch": 1.245764172075125, + "grad_norm": 0.5658564567565918, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 10845 + }, + { + "epoch": 1.245879041984952, + "grad_norm": 0.5419941544532776, + "learning_rate": 0.0001, + "loss": 1.3872, + "step": 10846 + }, + { + "epoch": 1.2459939118947791, + "grad_norm": 0.6134659647941589, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 10847 + }, + { + "epoch": 1.2461087818046064, + "grad_norm": 0.5531185269355774, + "learning_rate": 0.0001, + "loss": 1.4083, + "step": 10848 + }, + { + "epoch": 1.2462236517144334, + "grad_norm": 0.5249099731445312, + "learning_rate": 0.0001, + "loss": 1.4228, + "step": 10849 + }, + { + "epoch": 1.2463385216242606, + "grad_norm": 0.5550907254219055, + "learning_rate": 0.0001, + "loss": 1.314, + "step": 10850 + }, + { + "epoch": 1.2464533915340876, + "grad_norm": 0.6986422538757324, + "learning_rate": 0.0001, + "loss": 1.7078, + "step": 10851 + }, + { + "epoch": 1.2465682614439149, + "grad_norm": 0.5527095794677734, + "learning_rate": 0.0001, + "loss": 1.4435, + "step": 10852 + }, + { + "epoch": 1.2466831313537419, + "grad_norm": 0.6040549278259277, + "learning_rate": 0.0001, + "loss": 1.5648, + "step": 10853 + }, + { + "epoch": 1.246798001263569, + "grad_norm": 0.5868404507637024, + "learning_rate": 0.0001, + "loss": 1.3417, + "step": 10854 + }, + { + "epoch": 1.246912871173396, + "grad_norm": 0.5885109305381775, + "learning_rate": 0.0001, + "loss": 1.5384, + "step": 10855 + }, + { + "epoch": 1.2470277410832233, + "grad_norm": 0.6273307800292969, + "learning_rate": 0.0001, + "loss": 1.6859, + "step": 10856 + }, + { + "epoch": 1.2471426109930503, + "grad_norm": 0.6067867279052734, + "learning_rate": 0.0001, + "loss": 1.4738, + "step": 10857 + }, + { + "epoch": 1.2472574809028776, + "grad_norm": 0.6141306757926941, + "learning_rate": 0.0001, + "loss": 1.3835, + "step": 10858 + }, + { + "epoch": 1.2473723508127046, + "grad_norm": 0.5698774456977844, + "learning_rate": 0.0001, + "loss": 1.3689, + "step": 10859 + }, + { + "epoch": 1.2474872207225318, + "grad_norm": 0.6184090375900269, + "learning_rate": 0.0001, + "loss": 1.549, + "step": 10860 + }, + { + "epoch": 1.2476020906323588, + "grad_norm": 0.5597769021987915, + "learning_rate": 0.0001, + "loss": 1.3235, + "step": 10861 + }, + { + "epoch": 1.247716960542186, + "grad_norm": 0.5379227995872498, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 10862 + }, + { + "epoch": 1.247831830452013, + "grad_norm": 0.5401409864425659, + "learning_rate": 0.0001, + "loss": 1.3701, + "step": 10863 + }, + { + "epoch": 1.2479467003618403, + "grad_norm": 0.5571491718292236, + "learning_rate": 0.0001, + "loss": 1.4373, + "step": 10864 + }, + { + "epoch": 1.2480615702716673, + "grad_norm": 0.550502359867096, + "learning_rate": 0.0001, + "loss": 1.4155, + "step": 10865 + }, + { + "epoch": 1.2481764401814945, + "grad_norm": 0.5631131529808044, + "learning_rate": 0.0001, + "loss": 1.5354, + "step": 10866 + }, + { + "epoch": 1.2482913100913215, + "grad_norm": 0.5850018858909607, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 10867 + }, + { + "epoch": 1.2484061800011488, + "grad_norm": 0.6137135028839111, + "learning_rate": 0.0001, + "loss": 1.5496, + "step": 10868 + }, + { + "epoch": 1.2485210499109758, + "grad_norm": 0.5973943471908569, + "learning_rate": 0.0001, + "loss": 1.4159, + "step": 10869 + }, + { + "epoch": 1.248635919820803, + "grad_norm": 0.5690673589706421, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 10870 + }, + { + "epoch": 1.24875078973063, + "grad_norm": 0.5665297508239746, + "learning_rate": 0.0001, + "loss": 1.5149, + "step": 10871 + }, + { + "epoch": 1.2488656596404573, + "grad_norm": 0.5932878851890564, + "learning_rate": 0.0001, + "loss": 1.5818, + "step": 10872 + }, + { + "epoch": 1.2489805295502843, + "grad_norm": 0.634212076663971, + "learning_rate": 0.0001, + "loss": 1.4781, + "step": 10873 + }, + { + "epoch": 1.2490953994601115, + "grad_norm": 0.6295340061187744, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 10874 + }, + { + "epoch": 1.2492102693699385, + "grad_norm": 0.5564239025115967, + "learning_rate": 0.0001, + "loss": 1.577, + "step": 10875 + }, + { + "epoch": 1.2493251392797657, + "grad_norm": 0.591235339641571, + "learning_rate": 0.0001, + "loss": 1.5672, + "step": 10876 + }, + { + "epoch": 1.2494400091895927, + "grad_norm": 0.540838360786438, + "learning_rate": 0.0001, + "loss": 1.4963, + "step": 10877 + }, + { + "epoch": 1.24955487909942, + "grad_norm": 0.5274412631988525, + "learning_rate": 0.0001, + "loss": 1.3721, + "step": 10878 + }, + { + "epoch": 1.249669749009247, + "grad_norm": 0.532894492149353, + "learning_rate": 0.0001, + "loss": 1.5508, + "step": 10879 + }, + { + "epoch": 1.2497846189190742, + "grad_norm": 0.5777339339256287, + "learning_rate": 0.0001, + "loss": 1.4037, + "step": 10880 + }, + { + "epoch": 1.2498994888289012, + "grad_norm": 0.6504830718040466, + "learning_rate": 0.0001, + "loss": 1.4341, + "step": 10881 + }, + { + "epoch": 1.2500143587387285, + "grad_norm": 0.5571553111076355, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 10882 + }, + { + "epoch": 1.2501292286485555, + "grad_norm": 0.6024576425552368, + "learning_rate": 0.0001, + "loss": 1.5162, + "step": 10883 + }, + { + "epoch": 1.2502440985583827, + "grad_norm": 0.5811855792999268, + "learning_rate": 0.0001, + "loss": 1.3607, + "step": 10884 + }, + { + "epoch": 1.2503589684682097, + "grad_norm": 0.571258544921875, + "learning_rate": 0.0001, + "loss": 1.6321, + "step": 10885 + }, + { + "epoch": 1.250473838378037, + "grad_norm": 0.5692873001098633, + "learning_rate": 0.0001, + "loss": 1.4705, + "step": 10886 + }, + { + "epoch": 1.250588708287864, + "grad_norm": 0.6267750263214111, + "learning_rate": 0.0001, + "loss": 1.5673, + "step": 10887 + }, + { + "epoch": 1.2507035781976912, + "grad_norm": 0.6186516284942627, + "learning_rate": 0.0001, + "loss": 1.4483, + "step": 10888 + }, + { + "epoch": 1.2508184481075182, + "grad_norm": 0.5934524536132812, + "learning_rate": 0.0001, + "loss": 1.5474, + "step": 10889 + }, + { + "epoch": 1.2509333180173454, + "grad_norm": 0.6174366474151611, + "learning_rate": 0.0001, + "loss": 1.6715, + "step": 10890 + }, + { + "epoch": 1.2510481879271724, + "grad_norm": 0.5978173613548279, + "learning_rate": 0.0001, + "loss": 1.5807, + "step": 10891 + }, + { + "epoch": 1.2511630578369997, + "grad_norm": 0.563639760017395, + "learning_rate": 0.0001, + "loss": 1.5597, + "step": 10892 + }, + { + "epoch": 1.2512779277468267, + "grad_norm": 0.6402945518493652, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 10893 + }, + { + "epoch": 1.251392797656654, + "grad_norm": 0.589114248752594, + "learning_rate": 0.0001, + "loss": 1.5766, + "step": 10894 + }, + { + "epoch": 1.251507667566481, + "grad_norm": 0.5770435929298401, + "learning_rate": 0.0001, + "loss": 1.4116, + "step": 10895 + }, + { + "epoch": 1.2516225374763081, + "grad_norm": 0.6756162643432617, + "learning_rate": 0.0001, + "loss": 1.7188, + "step": 10896 + }, + { + "epoch": 1.2517374073861351, + "grad_norm": 0.5813696384429932, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 10897 + }, + { + "epoch": 1.2518522772959624, + "grad_norm": 0.5571926832199097, + "learning_rate": 0.0001, + "loss": 1.392, + "step": 10898 + }, + { + "epoch": 1.2519671472057894, + "grad_norm": 0.5992504954338074, + "learning_rate": 0.0001, + "loss": 1.6786, + "step": 10899 + }, + { + "epoch": 1.2520820171156166, + "grad_norm": 0.5757802724838257, + "learning_rate": 0.0001, + "loss": 1.4874, + "step": 10900 + }, + { + "epoch": 1.2521968870254436, + "grad_norm": 0.5644787549972534, + "learning_rate": 0.0001, + "loss": 1.3862, + "step": 10901 + }, + { + "epoch": 1.2523117569352709, + "grad_norm": 0.5546348690986633, + "learning_rate": 0.0001, + "loss": 1.4577, + "step": 10902 + }, + { + "epoch": 1.2524266268450979, + "grad_norm": 0.5690358281135559, + "learning_rate": 0.0001, + "loss": 1.3687, + "step": 10903 + }, + { + "epoch": 1.252541496754925, + "grad_norm": 0.5834867358207703, + "learning_rate": 0.0001, + "loss": 1.4238, + "step": 10904 + }, + { + "epoch": 1.252656366664752, + "grad_norm": 0.558188796043396, + "learning_rate": 0.0001, + "loss": 1.5329, + "step": 10905 + }, + { + "epoch": 1.2527712365745793, + "grad_norm": 0.5730266571044922, + "learning_rate": 0.0001, + "loss": 1.1854, + "step": 10906 + }, + { + "epoch": 1.2528861064844063, + "grad_norm": 0.6047393083572388, + "learning_rate": 0.0001, + "loss": 1.5067, + "step": 10907 + }, + { + "epoch": 1.2530009763942336, + "grad_norm": 0.5652978420257568, + "learning_rate": 0.0001, + "loss": 1.3756, + "step": 10908 + }, + { + "epoch": 1.2531158463040606, + "grad_norm": 0.5942120552062988, + "learning_rate": 0.0001, + "loss": 1.3795, + "step": 10909 + }, + { + "epoch": 1.2532307162138878, + "grad_norm": 0.5671000480651855, + "learning_rate": 0.0001, + "loss": 1.4261, + "step": 10910 + }, + { + "epoch": 1.2533455861237148, + "grad_norm": 0.5476558208465576, + "learning_rate": 0.0001, + "loss": 1.3019, + "step": 10911 + }, + { + "epoch": 1.253460456033542, + "grad_norm": 0.6260891556739807, + "learning_rate": 0.0001, + "loss": 1.7735, + "step": 10912 + }, + { + "epoch": 1.253575325943369, + "grad_norm": 0.6707521677017212, + "learning_rate": 0.0001, + "loss": 1.3852, + "step": 10913 + }, + { + "epoch": 1.2536901958531963, + "grad_norm": 0.5497795939445496, + "learning_rate": 0.0001, + "loss": 1.5263, + "step": 10914 + }, + { + "epoch": 1.2538050657630233, + "grad_norm": 0.6137018203735352, + "learning_rate": 0.0001, + "loss": 1.449, + "step": 10915 + }, + { + "epoch": 1.2539199356728505, + "grad_norm": 0.575927734375, + "learning_rate": 0.0001, + "loss": 1.5082, + "step": 10916 + }, + { + "epoch": 1.2540348055826775, + "grad_norm": 0.5461201667785645, + "learning_rate": 0.0001, + "loss": 1.571, + "step": 10917 + }, + { + "epoch": 1.2541496754925048, + "grad_norm": 0.5775430202484131, + "learning_rate": 0.0001, + "loss": 1.5978, + "step": 10918 + }, + { + "epoch": 1.2542645454023318, + "grad_norm": 0.596335768699646, + "learning_rate": 0.0001, + "loss": 1.2492, + "step": 10919 + }, + { + "epoch": 1.254379415312159, + "grad_norm": 0.552144467830658, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 10920 + }, + { + "epoch": 1.254494285221986, + "grad_norm": 0.6232661008834839, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 10921 + }, + { + "epoch": 1.2546091551318133, + "grad_norm": 0.579709529876709, + "learning_rate": 0.0001, + "loss": 1.3082, + "step": 10922 + }, + { + "epoch": 1.2547240250416403, + "grad_norm": 0.5315948724746704, + "learning_rate": 0.0001, + "loss": 1.3547, + "step": 10923 + }, + { + "epoch": 1.2548388949514675, + "grad_norm": 0.5723145008087158, + "learning_rate": 0.0001, + "loss": 1.4858, + "step": 10924 + }, + { + "epoch": 1.2549537648612945, + "grad_norm": 0.5732467770576477, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 10925 + }, + { + "epoch": 1.2550686347711217, + "grad_norm": 0.5439171195030212, + "learning_rate": 0.0001, + "loss": 1.6146, + "step": 10926 + }, + { + "epoch": 1.2551835046809487, + "grad_norm": 0.5616052150726318, + "learning_rate": 0.0001, + "loss": 1.5706, + "step": 10927 + }, + { + "epoch": 1.255298374590776, + "grad_norm": 0.5952640175819397, + "learning_rate": 0.0001, + "loss": 1.3995, + "step": 10928 + }, + { + "epoch": 1.255413244500603, + "grad_norm": 0.5831541419029236, + "learning_rate": 0.0001, + "loss": 1.6304, + "step": 10929 + }, + { + "epoch": 1.2555281144104302, + "grad_norm": 0.5609625577926636, + "learning_rate": 0.0001, + "loss": 1.4818, + "step": 10930 + }, + { + "epoch": 1.2556429843202572, + "grad_norm": 0.5512393116950989, + "learning_rate": 0.0001, + "loss": 1.2979, + "step": 10931 + }, + { + "epoch": 1.2557578542300845, + "grad_norm": 0.5595287084579468, + "learning_rate": 0.0001, + "loss": 1.4539, + "step": 10932 + }, + { + "epoch": 1.2558727241399115, + "grad_norm": 0.5650918483734131, + "learning_rate": 0.0001, + "loss": 1.4097, + "step": 10933 + }, + { + "epoch": 1.2559875940497387, + "grad_norm": 0.5434353351593018, + "learning_rate": 0.0001, + "loss": 1.5182, + "step": 10934 + }, + { + "epoch": 1.2561024639595657, + "grad_norm": 0.565133810043335, + "learning_rate": 0.0001, + "loss": 1.5029, + "step": 10935 + }, + { + "epoch": 1.256217333869393, + "grad_norm": 0.6342281699180603, + "learning_rate": 0.0001, + "loss": 1.4796, + "step": 10936 + }, + { + "epoch": 1.2563322037792202, + "grad_norm": 0.5778996348381042, + "learning_rate": 0.0001, + "loss": 1.4856, + "step": 10937 + }, + { + "epoch": 1.2564470736890472, + "grad_norm": 0.5874305367469788, + "learning_rate": 0.0001, + "loss": 1.6482, + "step": 10938 + }, + { + "epoch": 1.2565619435988742, + "grad_norm": 0.5764653086662292, + "learning_rate": 0.0001, + "loss": 1.4978, + "step": 10939 + }, + { + "epoch": 1.2566768135087014, + "grad_norm": 0.5709093809127808, + "learning_rate": 0.0001, + "loss": 1.3895, + "step": 10940 + }, + { + "epoch": 1.2567916834185286, + "grad_norm": 0.610541045665741, + "learning_rate": 0.0001, + "loss": 1.5036, + "step": 10941 + }, + { + "epoch": 1.2569065533283557, + "grad_norm": 0.6028667688369751, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 10942 + }, + { + "epoch": 1.2570214232381827, + "grad_norm": 0.5337639451026917, + "learning_rate": 0.0001, + "loss": 1.4331, + "step": 10943 + }, + { + "epoch": 1.25713629314801, + "grad_norm": 0.6048827767372131, + "learning_rate": 0.0001, + "loss": 1.4235, + "step": 10944 + }, + { + "epoch": 1.2572511630578371, + "grad_norm": 0.5719192028045654, + "learning_rate": 0.0001, + "loss": 1.4289, + "step": 10945 + }, + { + "epoch": 1.2573660329676641, + "grad_norm": 0.5911000967025757, + "learning_rate": 0.0001, + "loss": 1.5606, + "step": 10946 + }, + { + "epoch": 1.2574809028774911, + "grad_norm": 0.5696734189987183, + "learning_rate": 0.0001, + "loss": 1.2368, + "step": 10947 + }, + { + "epoch": 1.2575957727873184, + "grad_norm": 0.5841726064682007, + "learning_rate": 0.0001, + "loss": 1.5245, + "step": 10948 + }, + { + "epoch": 1.2577106426971456, + "grad_norm": 0.6017117500305176, + "learning_rate": 0.0001, + "loss": 1.3813, + "step": 10949 + }, + { + "epoch": 1.2578255126069726, + "grad_norm": 0.5943053364753723, + "learning_rate": 0.0001, + "loss": 1.3885, + "step": 10950 + }, + { + "epoch": 1.2579403825167996, + "grad_norm": 0.5891920924186707, + "learning_rate": 0.0001, + "loss": 1.4698, + "step": 10951 + }, + { + "epoch": 1.2580552524266269, + "grad_norm": 0.5655304193496704, + "learning_rate": 0.0001, + "loss": 1.4617, + "step": 10952 + }, + { + "epoch": 1.258170122336454, + "grad_norm": 0.6333246231079102, + "learning_rate": 0.0001, + "loss": 1.5318, + "step": 10953 + }, + { + "epoch": 1.258284992246281, + "grad_norm": 0.6080989837646484, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 10954 + }, + { + "epoch": 1.258399862156108, + "grad_norm": 0.5369704961776733, + "learning_rate": 0.0001, + "loss": 1.418, + "step": 10955 + }, + { + "epoch": 1.2585147320659353, + "grad_norm": 0.5709648132324219, + "learning_rate": 0.0001, + "loss": 1.4451, + "step": 10956 + }, + { + "epoch": 1.2586296019757626, + "grad_norm": 0.5845744609832764, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 10957 + }, + { + "epoch": 1.2587444718855896, + "grad_norm": 0.588251531124115, + "learning_rate": 0.0001, + "loss": 1.5864, + "step": 10958 + }, + { + "epoch": 1.2588593417954166, + "grad_norm": 0.5609616041183472, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 10959 + }, + { + "epoch": 1.2589742117052438, + "grad_norm": 0.5940161347389221, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 10960 + }, + { + "epoch": 1.259089081615071, + "grad_norm": 0.6487417817115784, + "learning_rate": 0.0001, + "loss": 1.6715, + "step": 10961 + }, + { + "epoch": 1.259203951524898, + "grad_norm": 0.5806244015693665, + "learning_rate": 0.0001, + "loss": 1.3649, + "step": 10962 + }, + { + "epoch": 1.259318821434725, + "grad_norm": 0.5590049624443054, + "learning_rate": 0.0001, + "loss": 1.3849, + "step": 10963 + }, + { + "epoch": 1.2594336913445523, + "grad_norm": 0.57107013463974, + "learning_rate": 0.0001, + "loss": 1.3517, + "step": 10964 + }, + { + "epoch": 1.2595485612543795, + "grad_norm": 0.5718011260032654, + "learning_rate": 0.0001, + "loss": 1.3671, + "step": 10965 + }, + { + "epoch": 1.2596634311642065, + "grad_norm": 0.530774712562561, + "learning_rate": 0.0001, + "loss": 1.3329, + "step": 10966 + }, + { + "epoch": 1.2597783010740335, + "grad_norm": 0.5350990891456604, + "learning_rate": 0.0001, + "loss": 1.3828, + "step": 10967 + }, + { + "epoch": 1.2598931709838608, + "grad_norm": 0.5452684760093689, + "learning_rate": 0.0001, + "loss": 1.46, + "step": 10968 + }, + { + "epoch": 1.260008040893688, + "grad_norm": 0.5853266716003418, + "learning_rate": 0.0001, + "loss": 1.6132, + "step": 10969 + }, + { + "epoch": 1.260122910803515, + "grad_norm": 0.6207892298698425, + "learning_rate": 0.0001, + "loss": 1.5122, + "step": 10970 + }, + { + "epoch": 1.260237780713342, + "grad_norm": 0.6362786293029785, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 10971 + }, + { + "epoch": 1.2603526506231693, + "grad_norm": 0.5912286043167114, + "learning_rate": 0.0001, + "loss": 1.5615, + "step": 10972 + }, + { + "epoch": 1.2604675205329965, + "grad_norm": 0.5660232305526733, + "learning_rate": 0.0001, + "loss": 1.4575, + "step": 10973 + }, + { + "epoch": 1.2605823904428235, + "grad_norm": 0.5991741418838501, + "learning_rate": 0.0001, + "loss": 1.4294, + "step": 10974 + }, + { + "epoch": 1.2606972603526505, + "grad_norm": 0.6811667680740356, + "learning_rate": 0.0001, + "loss": 1.2061, + "step": 10975 + }, + { + "epoch": 1.2608121302624777, + "grad_norm": 0.5778743028640747, + "learning_rate": 0.0001, + "loss": 1.4363, + "step": 10976 + }, + { + "epoch": 1.260927000172305, + "grad_norm": 0.593360960483551, + "learning_rate": 0.0001, + "loss": 1.5062, + "step": 10977 + }, + { + "epoch": 1.261041870082132, + "grad_norm": 0.5722994804382324, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 10978 + }, + { + "epoch": 1.261156739991959, + "grad_norm": 0.5691931843757629, + "learning_rate": 0.0001, + "loss": 1.539, + "step": 10979 + }, + { + "epoch": 1.2612716099017862, + "grad_norm": 0.5399587750434875, + "learning_rate": 0.0001, + "loss": 1.3153, + "step": 10980 + }, + { + "epoch": 1.2613864798116134, + "grad_norm": 0.5468644499778748, + "learning_rate": 0.0001, + "loss": 1.4229, + "step": 10981 + }, + { + "epoch": 1.2615013497214405, + "grad_norm": 0.5714578032493591, + "learning_rate": 0.0001, + "loss": 1.5986, + "step": 10982 + }, + { + "epoch": 1.2616162196312675, + "grad_norm": 0.5381194353103638, + "learning_rate": 0.0001, + "loss": 1.4761, + "step": 10983 + }, + { + "epoch": 1.2617310895410947, + "grad_norm": 0.5348224639892578, + "learning_rate": 0.0001, + "loss": 1.328, + "step": 10984 + }, + { + "epoch": 1.261845959450922, + "grad_norm": 0.570966899394989, + "learning_rate": 0.0001, + "loss": 1.5432, + "step": 10985 + }, + { + "epoch": 1.261960829360749, + "grad_norm": 0.5382642149925232, + "learning_rate": 0.0001, + "loss": 1.4547, + "step": 10986 + }, + { + "epoch": 1.262075699270576, + "grad_norm": 0.5181246995925903, + "learning_rate": 0.0001, + "loss": 1.3591, + "step": 10987 + }, + { + "epoch": 1.2621905691804032, + "grad_norm": 0.5006334781646729, + "learning_rate": 0.0001, + "loss": 1.2591, + "step": 10988 + }, + { + "epoch": 1.2623054390902304, + "grad_norm": 0.5715638995170593, + "learning_rate": 0.0001, + "loss": 1.5684, + "step": 10989 + }, + { + "epoch": 1.2624203090000574, + "grad_norm": 0.5791033506393433, + "learning_rate": 0.0001, + "loss": 1.4059, + "step": 10990 + }, + { + "epoch": 1.2625351789098844, + "grad_norm": 0.5769622325897217, + "learning_rate": 0.0001, + "loss": 1.474, + "step": 10991 + }, + { + "epoch": 1.2626500488197117, + "grad_norm": 0.5328226685523987, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 10992 + }, + { + "epoch": 1.2627649187295389, + "grad_norm": 0.5586057901382446, + "learning_rate": 0.0001, + "loss": 1.3952, + "step": 10993 + }, + { + "epoch": 1.262879788639366, + "grad_norm": 0.5298300385475159, + "learning_rate": 0.0001, + "loss": 1.2045, + "step": 10994 + }, + { + "epoch": 1.262994658549193, + "grad_norm": 0.5825002193450928, + "learning_rate": 0.0001, + "loss": 1.585, + "step": 10995 + }, + { + "epoch": 1.2631095284590201, + "grad_norm": 0.5873317122459412, + "learning_rate": 0.0001, + "loss": 1.5954, + "step": 10996 + }, + { + "epoch": 1.2632243983688474, + "grad_norm": 0.5621634125709534, + "learning_rate": 0.0001, + "loss": 1.4467, + "step": 10997 + }, + { + "epoch": 1.2633392682786744, + "grad_norm": 0.6157379150390625, + "learning_rate": 0.0001, + "loss": 1.4748, + "step": 10998 + }, + { + "epoch": 1.2634541381885014, + "grad_norm": 0.5493670105934143, + "learning_rate": 0.0001, + "loss": 1.3428, + "step": 10999 + }, + { + "epoch": 1.2635690080983286, + "grad_norm": 0.5555403828620911, + "learning_rate": 0.0001, + "loss": 1.2846, + "step": 11000 + }, + { + "epoch": 1.2636838780081558, + "grad_norm": 0.5692359209060669, + "learning_rate": 0.0001, + "loss": 1.5709, + "step": 11001 + }, + { + "epoch": 1.2637987479179829, + "grad_norm": 0.5852733850479126, + "learning_rate": 0.0001, + "loss": 1.5246, + "step": 11002 + }, + { + "epoch": 1.26391361782781, + "grad_norm": 0.5792815685272217, + "learning_rate": 0.0001, + "loss": 1.41, + "step": 11003 + }, + { + "epoch": 1.264028487737637, + "grad_norm": 0.6380968689918518, + "learning_rate": 0.0001, + "loss": 1.3436, + "step": 11004 + }, + { + "epoch": 1.2641433576474643, + "grad_norm": 0.6811634302139282, + "learning_rate": 0.0001, + "loss": 1.5123, + "step": 11005 + }, + { + "epoch": 1.2642582275572913, + "grad_norm": 0.6941441297531128, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 11006 + }, + { + "epoch": 1.2643730974671186, + "grad_norm": 0.5787838697433472, + "learning_rate": 0.0001, + "loss": 1.4647, + "step": 11007 + }, + { + "epoch": 1.2644879673769456, + "grad_norm": 0.5790321230888367, + "learning_rate": 0.0001, + "loss": 1.477, + "step": 11008 + }, + { + "epoch": 1.2646028372867728, + "grad_norm": 0.55388343334198, + "learning_rate": 0.0001, + "loss": 1.4449, + "step": 11009 + }, + { + "epoch": 1.2647177071965998, + "grad_norm": 0.5964010953903198, + "learning_rate": 0.0001, + "loss": 1.3159, + "step": 11010 + }, + { + "epoch": 1.264832577106427, + "grad_norm": 0.5677118897438049, + "learning_rate": 0.0001, + "loss": 1.3851, + "step": 11011 + }, + { + "epoch": 1.264947447016254, + "grad_norm": 0.5453557372093201, + "learning_rate": 0.0001, + "loss": 1.3942, + "step": 11012 + }, + { + "epoch": 1.2650623169260813, + "grad_norm": 0.594939649105072, + "learning_rate": 0.0001, + "loss": 1.6116, + "step": 11013 + }, + { + "epoch": 1.2651771868359083, + "grad_norm": 0.5862032175064087, + "learning_rate": 0.0001, + "loss": 1.438, + "step": 11014 + }, + { + "epoch": 1.2652920567457355, + "grad_norm": 0.6072475910186768, + "learning_rate": 0.0001, + "loss": 1.4529, + "step": 11015 + }, + { + "epoch": 1.2654069266555625, + "grad_norm": 0.5791646242141724, + "learning_rate": 0.0001, + "loss": 1.4486, + "step": 11016 + }, + { + "epoch": 1.2655217965653898, + "grad_norm": 0.5972772240638733, + "learning_rate": 0.0001, + "loss": 1.6154, + "step": 11017 + }, + { + "epoch": 1.2656366664752168, + "grad_norm": 0.5665194988250732, + "learning_rate": 0.0001, + "loss": 1.2982, + "step": 11018 + }, + { + "epoch": 1.265751536385044, + "grad_norm": 0.5202224254608154, + "learning_rate": 0.0001, + "loss": 1.081, + "step": 11019 + }, + { + "epoch": 1.265866406294871, + "grad_norm": 0.5774416327476501, + "learning_rate": 0.0001, + "loss": 1.5648, + "step": 11020 + }, + { + "epoch": 1.2659812762046982, + "grad_norm": 0.6238007545471191, + "learning_rate": 0.0001, + "loss": 1.5547, + "step": 11021 + }, + { + "epoch": 1.2660961461145253, + "grad_norm": 0.6121378540992737, + "learning_rate": 0.0001, + "loss": 1.3435, + "step": 11022 + }, + { + "epoch": 1.2662110160243525, + "grad_norm": 0.6211857795715332, + "learning_rate": 0.0001, + "loss": 1.5317, + "step": 11023 + }, + { + "epoch": 1.2663258859341795, + "grad_norm": 0.6037236452102661, + "learning_rate": 0.0001, + "loss": 1.5751, + "step": 11024 + }, + { + "epoch": 1.2664407558440067, + "grad_norm": 0.5778083801269531, + "learning_rate": 0.0001, + "loss": 1.4495, + "step": 11025 + }, + { + "epoch": 1.2665556257538337, + "grad_norm": 0.5363320112228394, + "learning_rate": 0.0001, + "loss": 1.3852, + "step": 11026 + }, + { + "epoch": 1.266670495663661, + "grad_norm": 0.5823166966438293, + "learning_rate": 0.0001, + "loss": 1.5257, + "step": 11027 + }, + { + "epoch": 1.266785365573488, + "grad_norm": 0.5804757475852966, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 11028 + }, + { + "epoch": 1.2669002354833152, + "grad_norm": 0.5479066967964172, + "learning_rate": 0.0001, + "loss": 1.3971, + "step": 11029 + }, + { + "epoch": 1.2670151053931422, + "grad_norm": 0.5440239310264587, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 11030 + }, + { + "epoch": 1.2671299753029694, + "grad_norm": 0.5420060157775879, + "learning_rate": 0.0001, + "loss": 1.4851, + "step": 11031 + }, + { + "epoch": 1.2672448452127965, + "grad_norm": 0.551177442073822, + "learning_rate": 0.0001, + "loss": 1.3622, + "step": 11032 + }, + { + "epoch": 1.2673597151226237, + "grad_norm": 0.6188027262687683, + "learning_rate": 0.0001, + "loss": 1.4396, + "step": 11033 + }, + { + "epoch": 1.2674745850324507, + "grad_norm": 0.5277479887008667, + "learning_rate": 0.0001, + "loss": 1.4829, + "step": 11034 + }, + { + "epoch": 1.267589454942278, + "grad_norm": 0.5837298631668091, + "learning_rate": 0.0001, + "loss": 1.5163, + "step": 11035 + }, + { + "epoch": 1.267704324852105, + "grad_norm": 0.5999343991279602, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 11036 + }, + { + "epoch": 1.2678191947619322, + "grad_norm": 0.5406692624092102, + "learning_rate": 0.0001, + "loss": 1.3183, + "step": 11037 + }, + { + "epoch": 1.2679340646717592, + "grad_norm": 0.5225005745887756, + "learning_rate": 0.0001, + "loss": 1.3585, + "step": 11038 + }, + { + "epoch": 1.2680489345815864, + "grad_norm": 0.5517762899398804, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 11039 + }, + { + "epoch": 1.2681638044914134, + "grad_norm": 0.5250986218452454, + "learning_rate": 0.0001, + "loss": 1.4083, + "step": 11040 + }, + { + "epoch": 1.2682786744012406, + "grad_norm": 0.5599402189254761, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 11041 + }, + { + "epoch": 1.2683935443110677, + "grad_norm": 0.6003100872039795, + "learning_rate": 0.0001, + "loss": 1.3209, + "step": 11042 + }, + { + "epoch": 1.2685084142208949, + "grad_norm": 0.6202194690704346, + "learning_rate": 0.0001, + "loss": 1.3633, + "step": 11043 + }, + { + "epoch": 1.268623284130722, + "grad_norm": 0.5758548974990845, + "learning_rate": 0.0001, + "loss": 1.4419, + "step": 11044 + }, + { + "epoch": 1.2687381540405491, + "grad_norm": 0.6529328227043152, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 11045 + }, + { + "epoch": 1.2688530239503761, + "grad_norm": 0.617214560508728, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 11046 + }, + { + "epoch": 1.2689678938602034, + "grad_norm": 0.5643380880355835, + "learning_rate": 0.0001, + "loss": 1.5033, + "step": 11047 + }, + { + "epoch": 1.2690827637700304, + "grad_norm": 0.5501247048377991, + "learning_rate": 0.0001, + "loss": 1.5196, + "step": 11048 + }, + { + "epoch": 1.2691976336798576, + "grad_norm": 0.5517723560333252, + "learning_rate": 0.0001, + "loss": 1.2386, + "step": 11049 + }, + { + "epoch": 1.2693125035896846, + "grad_norm": 0.6187138557434082, + "learning_rate": 0.0001, + "loss": 1.5318, + "step": 11050 + }, + { + "epoch": 1.2694273734995118, + "grad_norm": 0.5243611931800842, + "learning_rate": 0.0001, + "loss": 1.2427, + "step": 11051 + }, + { + "epoch": 1.2695422434093389, + "grad_norm": 0.5593788623809814, + "learning_rate": 0.0001, + "loss": 1.3313, + "step": 11052 + }, + { + "epoch": 1.269657113319166, + "grad_norm": 0.6037552356719971, + "learning_rate": 0.0001, + "loss": 1.2074, + "step": 11053 + }, + { + "epoch": 1.269771983228993, + "grad_norm": 0.5670240521430969, + "learning_rate": 0.0001, + "loss": 1.327, + "step": 11054 + }, + { + "epoch": 1.2698868531388203, + "grad_norm": 0.6029371023178101, + "learning_rate": 0.0001, + "loss": 1.6885, + "step": 11055 + }, + { + "epoch": 1.2700017230486473, + "grad_norm": 0.6077609062194824, + "learning_rate": 0.0001, + "loss": 1.5042, + "step": 11056 + }, + { + "epoch": 1.2701165929584746, + "grad_norm": 0.5685033202171326, + "learning_rate": 0.0001, + "loss": 1.5165, + "step": 11057 + }, + { + "epoch": 1.2702314628683016, + "grad_norm": 0.6237457990646362, + "learning_rate": 0.0001, + "loss": 1.5755, + "step": 11058 + }, + { + "epoch": 1.2703463327781288, + "grad_norm": 0.5337412357330322, + "learning_rate": 0.0001, + "loss": 1.2787, + "step": 11059 + }, + { + "epoch": 1.2704612026879558, + "grad_norm": 0.5789129137992859, + "learning_rate": 0.0001, + "loss": 1.5369, + "step": 11060 + }, + { + "epoch": 1.270576072597783, + "grad_norm": 0.6278301477432251, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 11061 + }, + { + "epoch": 1.27069094250761, + "grad_norm": 0.5445199608802795, + "learning_rate": 0.0001, + "loss": 1.4081, + "step": 11062 + }, + { + "epoch": 1.2708058124174373, + "grad_norm": 0.5890607833862305, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 11063 + }, + { + "epoch": 1.2709206823272643, + "grad_norm": 0.5365835428237915, + "learning_rate": 0.0001, + "loss": 1.4548, + "step": 11064 + }, + { + "epoch": 1.2710355522370915, + "grad_norm": 0.5734820365905762, + "learning_rate": 0.0001, + "loss": 1.6119, + "step": 11065 + }, + { + "epoch": 1.2711504221469185, + "grad_norm": 0.5551184415817261, + "learning_rate": 0.0001, + "loss": 1.4606, + "step": 11066 + }, + { + "epoch": 1.2712652920567458, + "grad_norm": 0.5175613760948181, + "learning_rate": 0.0001, + "loss": 1.4638, + "step": 11067 + }, + { + "epoch": 1.2713801619665728, + "grad_norm": 0.5665346384048462, + "learning_rate": 0.0001, + "loss": 1.3524, + "step": 11068 + }, + { + "epoch": 1.2714950318764, + "grad_norm": 0.6107744574546814, + "learning_rate": 0.0001, + "loss": 1.528, + "step": 11069 + }, + { + "epoch": 1.271609901786227, + "grad_norm": 0.5816113948822021, + "learning_rate": 0.0001, + "loss": 1.4268, + "step": 11070 + }, + { + "epoch": 1.2717247716960542, + "grad_norm": 0.5861494541168213, + "learning_rate": 0.0001, + "loss": 1.5562, + "step": 11071 + }, + { + "epoch": 1.2718396416058813, + "grad_norm": 0.5912983417510986, + "learning_rate": 0.0001, + "loss": 1.4563, + "step": 11072 + }, + { + "epoch": 1.2719545115157085, + "grad_norm": 0.5545825362205505, + "learning_rate": 0.0001, + "loss": 1.2985, + "step": 11073 + }, + { + "epoch": 1.2720693814255357, + "grad_norm": 0.5643607974052429, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 11074 + }, + { + "epoch": 1.2721842513353627, + "grad_norm": 0.5997006297111511, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 11075 + }, + { + "epoch": 1.2722991212451897, + "grad_norm": 0.5586094260215759, + "learning_rate": 0.0001, + "loss": 1.453, + "step": 11076 + }, + { + "epoch": 1.272413991155017, + "grad_norm": 0.5602713227272034, + "learning_rate": 0.0001, + "loss": 1.4295, + "step": 11077 + }, + { + "epoch": 1.2725288610648442, + "grad_norm": 0.543787956237793, + "learning_rate": 0.0001, + "loss": 1.4595, + "step": 11078 + }, + { + "epoch": 1.2726437309746712, + "grad_norm": 0.5545603036880493, + "learning_rate": 0.0001, + "loss": 1.5192, + "step": 11079 + }, + { + "epoch": 1.2727586008844982, + "grad_norm": 0.6172767281532288, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 11080 + }, + { + "epoch": 1.2728734707943254, + "grad_norm": 0.5919816493988037, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 11081 + }, + { + "epoch": 1.2729883407041527, + "grad_norm": 0.5594301819801331, + "learning_rate": 0.0001, + "loss": 1.5123, + "step": 11082 + }, + { + "epoch": 1.2731032106139797, + "grad_norm": 0.6125577092170715, + "learning_rate": 0.0001, + "loss": 1.468, + "step": 11083 + }, + { + "epoch": 1.2732180805238067, + "grad_norm": 0.5834252834320068, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 11084 + }, + { + "epoch": 1.273332950433634, + "grad_norm": 0.5693926215171814, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 11085 + }, + { + "epoch": 1.2734478203434612, + "grad_norm": 0.5403253436088562, + "learning_rate": 0.0001, + "loss": 1.4777, + "step": 11086 + }, + { + "epoch": 1.2735626902532882, + "grad_norm": 0.5751697421073914, + "learning_rate": 0.0001, + "loss": 1.5373, + "step": 11087 + }, + { + "epoch": 1.2736775601631152, + "grad_norm": 0.5498262643814087, + "learning_rate": 0.0001, + "loss": 1.5295, + "step": 11088 + }, + { + "epoch": 1.2737924300729424, + "grad_norm": 0.5573551654815674, + "learning_rate": 0.0001, + "loss": 1.5071, + "step": 11089 + }, + { + "epoch": 1.2739072999827696, + "grad_norm": 0.5851514339447021, + "learning_rate": 0.0001, + "loss": 1.5744, + "step": 11090 + }, + { + "epoch": 1.2740221698925966, + "grad_norm": 0.5665884017944336, + "learning_rate": 0.0001, + "loss": 1.5061, + "step": 11091 + }, + { + "epoch": 1.2741370398024237, + "grad_norm": 0.5661911368370056, + "learning_rate": 0.0001, + "loss": 1.4927, + "step": 11092 + }, + { + "epoch": 1.2742519097122509, + "grad_norm": 0.5676097869873047, + "learning_rate": 0.0001, + "loss": 1.5236, + "step": 11093 + }, + { + "epoch": 1.2743667796220781, + "grad_norm": 0.62990403175354, + "learning_rate": 0.0001, + "loss": 1.6448, + "step": 11094 + }, + { + "epoch": 1.2744816495319051, + "grad_norm": 0.5617504715919495, + "learning_rate": 0.0001, + "loss": 1.3303, + "step": 11095 + }, + { + "epoch": 1.2745965194417321, + "grad_norm": 0.603344738483429, + "learning_rate": 0.0001, + "loss": 1.3515, + "step": 11096 + }, + { + "epoch": 1.2747113893515594, + "grad_norm": 0.5757971405982971, + "learning_rate": 0.0001, + "loss": 1.53, + "step": 11097 + }, + { + "epoch": 1.2748262592613866, + "grad_norm": 0.5930181741714478, + "learning_rate": 0.0001, + "loss": 1.4135, + "step": 11098 + }, + { + "epoch": 1.2749411291712136, + "grad_norm": 0.5848099589347839, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 11099 + }, + { + "epoch": 1.2750559990810406, + "grad_norm": 0.585987389087677, + "learning_rate": 0.0001, + "loss": 1.5055, + "step": 11100 + }, + { + "epoch": 1.2751708689908678, + "grad_norm": 0.6561366319656372, + "learning_rate": 0.0001, + "loss": 1.7621, + "step": 11101 + }, + { + "epoch": 1.275285738900695, + "grad_norm": 0.6223732233047485, + "learning_rate": 0.0001, + "loss": 1.5776, + "step": 11102 + }, + { + "epoch": 1.275400608810522, + "grad_norm": 0.5525573492050171, + "learning_rate": 0.0001, + "loss": 1.4327, + "step": 11103 + }, + { + "epoch": 1.275515478720349, + "grad_norm": 0.5294277667999268, + "learning_rate": 0.0001, + "loss": 1.3882, + "step": 11104 + }, + { + "epoch": 1.2756303486301763, + "grad_norm": 0.5459890961647034, + "learning_rate": 0.0001, + "loss": 1.3725, + "step": 11105 + }, + { + "epoch": 1.2757452185400036, + "grad_norm": 0.5299192667007446, + "learning_rate": 0.0001, + "loss": 1.4483, + "step": 11106 + }, + { + "epoch": 1.2758600884498306, + "grad_norm": 0.5696981549263, + "learning_rate": 0.0001, + "loss": 1.5508, + "step": 11107 + }, + { + "epoch": 1.2759749583596576, + "grad_norm": 0.5575034022331238, + "learning_rate": 0.0001, + "loss": 1.2812, + "step": 11108 + }, + { + "epoch": 1.2760898282694848, + "grad_norm": 0.570679783821106, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 11109 + }, + { + "epoch": 1.276204698179312, + "grad_norm": 0.5604519248008728, + "learning_rate": 0.0001, + "loss": 1.5221, + "step": 11110 + }, + { + "epoch": 1.276319568089139, + "grad_norm": 0.5516342520713806, + "learning_rate": 0.0001, + "loss": 1.4681, + "step": 11111 + }, + { + "epoch": 1.276434437998966, + "grad_norm": 0.5541993379592896, + "learning_rate": 0.0001, + "loss": 1.6564, + "step": 11112 + }, + { + "epoch": 1.2765493079087933, + "grad_norm": 0.6293795108795166, + "learning_rate": 0.0001, + "loss": 1.6179, + "step": 11113 + }, + { + "epoch": 1.2766641778186205, + "grad_norm": 0.5466127395629883, + "learning_rate": 0.0001, + "loss": 1.4961, + "step": 11114 + }, + { + "epoch": 1.2767790477284475, + "grad_norm": 0.5696339011192322, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 11115 + }, + { + "epoch": 1.2768939176382745, + "grad_norm": 0.5600918531417847, + "learning_rate": 0.0001, + "loss": 1.3857, + "step": 11116 + }, + { + "epoch": 1.2770087875481018, + "grad_norm": 0.5669732093811035, + "learning_rate": 0.0001, + "loss": 1.3401, + "step": 11117 + }, + { + "epoch": 1.277123657457929, + "grad_norm": 0.5944907665252686, + "learning_rate": 0.0001, + "loss": 1.4226, + "step": 11118 + }, + { + "epoch": 1.277238527367756, + "grad_norm": 0.5800361633300781, + "learning_rate": 0.0001, + "loss": 1.4727, + "step": 11119 + }, + { + "epoch": 1.277353397277583, + "grad_norm": 0.5681383609771729, + "learning_rate": 0.0001, + "loss": 1.4153, + "step": 11120 + }, + { + "epoch": 1.2774682671874102, + "grad_norm": 0.5693331956863403, + "learning_rate": 0.0001, + "loss": 1.7115, + "step": 11121 + }, + { + "epoch": 1.2775831370972375, + "grad_norm": 0.5463958978652954, + "learning_rate": 0.0001, + "loss": 1.5031, + "step": 11122 + }, + { + "epoch": 1.2776980070070645, + "grad_norm": 0.5928342342376709, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 11123 + }, + { + "epoch": 1.2778128769168915, + "grad_norm": 0.5508242845535278, + "learning_rate": 0.0001, + "loss": 1.4824, + "step": 11124 + }, + { + "epoch": 1.2779277468267187, + "grad_norm": 0.5686891078948975, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 11125 + }, + { + "epoch": 1.278042616736546, + "grad_norm": 0.5789403915405273, + "learning_rate": 0.0001, + "loss": 1.4, + "step": 11126 + }, + { + "epoch": 1.278157486646373, + "grad_norm": 0.6012910604476929, + "learning_rate": 0.0001, + "loss": 1.5804, + "step": 11127 + }, + { + "epoch": 1.2782723565562, + "grad_norm": 0.5584916472434998, + "learning_rate": 0.0001, + "loss": 1.5045, + "step": 11128 + }, + { + "epoch": 1.2783872264660272, + "grad_norm": 0.6591640710830688, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 11129 + }, + { + "epoch": 1.2785020963758544, + "grad_norm": 0.611392080783844, + "learning_rate": 0.0001, + "loss": 1.3733, + "step": 11130 + }, + { + "epoch": 1.2786169662856814, + "grad_norm": 0.5938810706138611, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 11131 + }, + { + "epoch": 1.2787318361955085, + "grad_norm": 0.5487352609634399, + "learning_rate": 0.0001, + "loss": 1.2881, + "step": 11132 + }, + { + "epoch": 1.2788467061053357, + "grad_norm": 0.5437043905258179, + "learning_rate": 0.0001, + "loss": 1.394, + "step": 11133 + }, + { + "epoch": 1.278961576015163, + "grad_norm": 0.6171312928199768, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 11134 + }, + { + "epoch": 1.27907644592499, + "grad_norm": 0.5747122168540955, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 11135 + }, + { + "epoch": 1.279191315834817, + "grad_norm": 0.5911594033241272, + "learning_rate": 0.0001, + "loss": 1.4412, + "step": 11136 + }, + { + "epoch": 1.2793061857446442, + "grad_norm": 0.5997012257575989, + "learning_rate": 0.0001, + "loss": 1.3998, + "step": 11137 + }, + { + "epoch": 1.2794210556544714, + "grad_norm": 0.5709753632545471, + "learning_rate": 0.0001, + "loss": 1.37, + "step": 11138 + }, + { + "epoch": 1.2795359255642984, + "grad_norm": 0.5752970576286316, + "learning_rate": 0.0001, + "loss": 1.2787, + "step": 11139 + }, + { + "epoch": 1.2796507954741256, + "grad_norm": 0.7025997042655945, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 11140 + }, + { + "epoch": 1.2797656653839526, + "grad_norm": 0.5793277025222778, + "learning_rate": 0.0001, + "loss": 1.4309, + "step": 11141 + }, + { + "epoch": 1.2798805352937799, + "grad_norm": 0.550739049911499, + "learning_rate": 0.0001, + "loss": 1.471, + "step": 11142 + }, + { + "epoch": 1.2799954052036069, + "grad_norm": 0.5863038897514343, + "learning_rate": 0.0001, + "loss": 1.5955, + "step": 11143 + }, + { + "epoch": 1.2801102751134341, + "grad_norm": 0.5753504633903503, + "learning_rate": 0.0001, + "loss": 1.3984, + "step": 11144 + }, + { + "epoch": 1.2802251450232611, + "grad_norm": 0.566723644733429, + "learning_rate": 0.0001, + "loss": 1.3528, + "step": 11145 + }, + { + "epoch": 1.2803400149330884, + "grad_norm": 0.600090503692627, + "learning_rate": 0.0001, + "loss": 1.5385, + "step": 11146 + }, + { + "epoch": 1.2804548848429154, + "grad_norm": 0.6001926064491272, + "learning_rate": 0.0001, + "loss": 1.3591, + "step": 11147 + }, + { + "epoch": 1.2805697547527426, + "grad_norm": 0.5610960721969604, + "learning_rate": 0.0001, + "loss": 1.4732, + "step": 11148 + }, + { + "epoch": 1.2806846246625696, + "grad_norm": 0.5722346305847168, + "learning_rate": 0.0001, + "loss": 1.5572, + "step": 11149 + }, + { + "epoch": 1.2807994945723968, + "grad_norm": 0.6043455600738525, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 11150 + }, + { + "epoch": 1.2809143644822238, + "grad_norm": 0.6156902313232422, + "learning_rate": 0.0001, + "loss": 1.6608, + "step": 11151 + }, + { + "epoch": 1.281029234392051, + "grad_norm": 0.531518280506134, + "learning_rate": 0.0001, + "loss": 1.4802, + "step": 11152 + }, + { + "epoch": 1.281144104301878, + "grad_norm": 0.5361231565475464, + "learning_rate": 0.0001, + "loss": 1.4514, + "step": 11153 + }, + { + "epoch": 1.2812589742117053, + "grad_norm": 0.6165813207626343, + "learning_rate": 0.0001, + "loss": 1.73, + "step": 11154 + }, + { + "epoch": 1.2813738441215323, + "grad_norm": 0.5662755966186523, + "learning_rate": 0.0001, + "loss": 1.5448, + "step": 11155 + }, + { + "epoch": 1.2814887140313596, + "grad_norm": 0.5308471322059631, + "learning_rate": 0.0001, + "loss": 1.3735, + "step": 11156 + }, + { + "epoch": 1.2816035839411866, + "grad_norm": 0.7272038459777832, + "learning_rate": 0.0001, + "loss": 1.5976, + "step": 11157 + }, + { + "epoch": 1.2817184538510138, + "grad_norm": 0.606468141078949, + "learning_rate": 0.0001, + "loss": 1.4419, + "step": 11158 + }, + { + "epoch": 1.2818333237608408, + "grad_norm": 0.5892067551612854, + "learning_rate": 0.0001, + "loss": 1.4458, + "step": 11159 + }, + { + "epoch": 1.281948193670668, + "grad_norm": 0.5960896015167236, + "learning_rate": 0.0001, + "loss": 1.4519, + "step": 11160 + }, + { + "epoch": 1.282063063580495, + "grad_norm": 0.5746586322784424, + "learning_rate": 0.0001, + "loss": 1.5134, + "step": 11161 + }, + { + "epoch": 1.2821779334903223, + "grad_norm": 0.5626453757286072, + "learning_rate": 0.0001, + "loss": 1.3905, + "step": 11162 + }, + { + "epoch": 1.2822928034001493, + "grad_norm": 0.615481972694397, + "learning_rate": 0.0001, + "loss": 1.5979, + "step": 11163 + }, + { + "epoch": 1.2824076733099765, + "grad_norm": 0.56350177526474, + "learning_rate": 0.0001, + "loss": 1.5846, + "step": 11164 + }, + { + "epoch": 1.2825225432198035, + "grad_norm": 0.5434756278991699, + "learning_rate": 0.0001, + "loss": 1.2041, + "step": 11165 + }, + { + "epoch": 1.2826374131296308, + "grad_norm": 0.5499529838562012, + "learning_rate": 0.0001, + "loss": 1.3228, + "step": 11166 + }, + { + "epoch": 1.2827522830394578, + "grad_norm": 0.6841591000556946, + "learning_rate": 0.0001, + "loss": 1.5102, + "step": 11167 + }, + { + "epoch": 1.282867152949285, + "grad_norm": 0.5592834949493408, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 11168 + }, + { + "epoch": 1.282982022859112, + "grad_norm": 0.5657499432563782, + "learning_rate": 0.0001, + "loss": 1.3523, + "step": 11169 + }, + { + "epoch": 1.2830968927689392, + "grad_norm": 0.7231560945510864, + "learning_rate": 0.0001, + "loss": 1.4121, + "step": 11170 + }, + { + "epoch": 1.2832117626787662, + "grad_norm": 0.6643901467323303, + "learning_rate": 0.0001, + "loss": 1.723, + "step": 11171 + }, + { + "epoch": 1.2833266325885935, + "grad_norm": 0.5481140613555908, + "learning_rate": 0.0001, + "loss": 1.4084, + "step": 11172 + }, + { + "epoch": 1.2834415024984205, + "grad_norm": 0.5619171857833862, + "learning_rate": 0.0001, + "loss": 1.5306, + "step": 11173 + }, + { + "epoch": 1.2835563724082477, + "grad_norm": 0.6013187170028687, + "learning_rate": 0.0001, + "loss": 1.3116, + "step": 11174 + }, + { + "epoch": 1.2836712423180747, + "grad_norm": 0.5309972167015076, + "learning_rate": 0.0001, + "loss": 1.3814, + "step": 11175 + }, + { + "epoch": 1.283786112227902, + "grad_norm": 0.5112704038619995, + "learning_rate": 0.0001, + "loss": 1.4198, + "step": 11176 + }, + { + "epoch": 1.283900982137729, + "grad_norm": 0.5378767251968384, + "learning_rate": 0.0001, + "loss": 1.4499, + "step": 11177 + }, + { + "epoch": 1.2840158520475562, + "grad_norm": 0.5731784105300903, + "learning_rate": 0.0001, + "loss": 1.4214, + "step": 11178 + }, + { + "epoch": 1.2841307219573832, + "grad_norm": 0.520551860332489, + "learning_rate": 0.0001, + "loss": 1.3355, + "step": 11179 + }, + { + "epoch": 1.2842455918672104, + "grad_norm": 0.6834689378738403, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 11180 + }, + { + "epoch": 1.2843604617770374, + "grad_norm": 0.6506545543670654, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 11181 + }, + { + "epoch": 1.2844753316868647, + "grad_norm": 0.5278921723365784, + "learning_rate": 0.0001, + "loss": 1.2855, + "step": 11182 + }, + { + "epoch": 1.2845902015966917, + "grad_norm": 0.6012974977493286, + "learning_rate": 0.0001, + "loss": 1.3572, + "step": 11183 + }, + { + "epoch": 1.284705071506519, + "grad_norm": 0.59954434633255, + "learning_rate": 0.0001, + "loss": 1.5348, + "step": 11184 + }, + { + "epoch": 1.284819941416346, + "grad_norm": 0.5746206045150757, + "learning_rate": 0.0001, + "loss": 1.5353, + "step": 11185 + }, + { + "epoch": 1.2849348113261732, + "grad_norm": 0.5760904550552368, + "learning_rate": 0.0001, + "loss": 1.4865, + "step": 11186 + }, + { + "epoch": 1.2850496812360002, + "grad_norm": 0.6236897110939026, + "learning_rate": 0.0001, + "loss": 1.446, + "step": 11187 + }, + { + "epoch": 1.2851645511458274, + "grad_norm": 0.5215889811515808, + "learning_rate": 0.0001, + "loss": 1.4459, + "step": 11188 + }, + { + "epoch": 1.2852794210556544, + "grad_norm": 0.5659036040306091, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 11189 + }, + { + "epoch": 1.2853942909654816, + "grad_norm": 0.510826587677002, + "learning_rate": 0.0001, + "loss": 1.3339, + "step": 11190 + }, + { + "epoch": 1.2855091608753086, + "grad_norm": 0.5990018844604492, + "learning_rate": 0.0001, + "loss": 1.673, + "step": 11191 + }, + { + "epoch": 1.2856240307851359, + "grad_norm": 0.6071823835372925, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 11192 + }, + { + "epoch": 1.2857389006949629, + "grad_norm": 0.6266348958015442, + "learning_rate": 0.0001, + "loss": 1.7044, + "step": 11193 + }, + { + "epoch": 1.2858537706047901, + "grad_norm": 0.6025568246841431, + "learning_rate": 0.0001, + "loss": 1.2896, + "step": 11194 + }, + { + "epoch": 1.2859686405146171, + "grad_norm": 0.6044636368751526, + "learning_rate": 0.0001, + "loss": 1.4106, + "step": 11195 + }, + { + "epoch": 1.2860835104244444, + "grad_norm": 0.7049621939659119, + "learning_rate": 0.0001, + "loss": 1.8122, + "step": 11196 + }, + { + "epoch": 1.2861983803342714, + "grad_norm": 0.5462698936462402, + "learning_rate": 0.0001, + "loss": 1.484, + "step": 11197 + }, + { + "epoch": 1.2863132502440986, + "grad_norm": 0.5640134811401367, + "learning_rate": 0.0001, + "loss": 1.5136, + "step": 11198 + }, + { + "epoch": 1.2864281201539256, + "grad_norm": 0.5560702681541443, + "learning_rate": 0.0001, + "loss": 1.4912, + "step": 11199 + }, + { + "epoch": 1.2865429900637528, + "grad_norm": 0.5284111499786377, + "learning_rate": 0.0001, + "loss": 1.2599, + "step": 11200 + }, + { + "epoch": 1.2866578599735798, + "grad_norm": 0.5436888933181763, + "learning_rate": 0.0001, + "loss": 1.3779, + "step": 11201 + }, + { + "epoch": 1.286772729883407, + "grad_norm": 0.5959984660148621, + "learning_rate": 0.0001, + "loss": 1.7252, + "step": 11202 + }, + { + "epoch": 1.286887599793234, + "grad_norm": 0.5684484839439392, + "learning_rate": 0.0001, + "loss": 1.4591, + "step": 11203 + }, + { + "epoch": 1.2870024697030613, + "grad_norm": 0.5544134974479675, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 11204 + }, + { + "epoch": 1.2871173396128883, + "grad_norm": 0.5449683666229248, + "learning_rate": 0.0001, + "loss": 1.4292, + "step": 11205 + }, + { + "epoch": 1.2872322095227156, + "grad_norm": 0.5800986289978027, + "learning_rate": 0.0001, + "loss": 1.5465, + "step": 11206 + }, + { + "epoch": 1.2873470794325426, + "grad_norm": 0.5813232064247131, + "learning_rate": 0.0001, + "loss": 1.5989, + "step": 11207 + }, + { + "epoch": 1.2874619493423698, + "grad_norm": 0.5666254758834839, + "learning_rate": 0.0001, + "loss": 1.5075, + "step": 11208 + }, + { + "epoch": 1.2875768192521968, + "grad_norm": 0.6252794861793518, + "learning_rate": 0.0001, + "loss": 1.6814, + "step": 11209 + }, + { + "epoch": 1.287691689162024, + "grad_norm": 0.5321157574653625, + "learning_rate": 0.0001, + "loss": 1.3024, + "step": 11210 + }, + { + "epoch": 1.2878065590718513, + "grad_norm": 0.5458829998970032, + "learning_rate": 0.0001, + "loss": 1.3577, + "step": 11211 + }, + { + "epoch": 1.2879214289816783, + "grad_norm": 0.5970067977905273, + "learning_rate": 0.0001, + "loss": 1.4105, + "step": 11212 + }, + { + "epoch": 1.2880362988915053, + "grad_norm": 0.5711007118225098, + "learning_rate": 0.0001, + "loss": 1.4712, + "step": 11213 + }, + { + "epoch": 1.2881511688013325, + "grad_norm": 0.5796260833740234, + "learning_rate": 0.0001, + "loss": 1.5902, + "step": 11214 + }, + { + "epoch": 1.2882660387111597, + "grad_norm": 0.6137135028839111, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 11215 + }, + { + "epoch": 1.2883809086209868, + "grad_norm": 0.6448894739151001, + "learning_rate": 0.0001, + "loss": 1.4776, + "step": 11216 + }, + { + "epoch": 1.2884957785308138, + "grad_norm": 0.5258020758628845, + "learning_rate": 0.0001, + "loss": 1.396, + "step": 11217 + }, + { + "epoch": 1.288610648440641, + "grad_norm": 0.5507315397262573, + "learning_rate": 0.0001, + "loss": 1.3656, + "step": 11218 + }, + { + "epoch": 1.2887255183504682, + "grad_norm": 0.5842088460922241, + "learning_rate": 0.0001, + "loss": 1.5746, + "step": 11219 + }, + { + "epoch": 1.2888403882602952, + "grad_norm": 0.6149991154670715, + "learning_rate": 0.0001, + "loss": 1.5401, + "step": 11220 + }, + { + "epoch": 1.2889552581701222, + "grad_norm": 0.5865837335586548, + "learning_rate": 0.0001, + "loss": 1.3607, + "step": 11221 + }, + { + "epoch": 1.2890701280799495, + "grad_norm": 0.5908994078636169, + "learning_rate": 0.0001, + "loss": 1.403, + "step": 11222 + }, + { + "epoch": 1.2891849979897767, + "grad_norm": 0.5459616184234619, + "learning_rate": 0.0001, + "loss": 1.3524, + "step": 11223 + }, + { + "epoch": 1.2892998678996037, + "grad_norm": 0.6645961403846741, + "learning_rate": 0.0001, + "loss": 1.2631, + "step": 11224 + }, + { + "epoch": 1.2894147378094307, + "grad_norm": 0.6288965344429016, + "learning_rate": 0.0001, + "loss": 1.5521, + "step": 11225 + }, + { + "epoch": 1.289529607719258, + "grad_norm": 0.6147708892822266, + "learning_rate": 0.0001, + "loss": 1.4303, + "step": 11226 + }, + { + "epoch": 1.2896444776290852, + "grad_norm": 0.6296586394309998, + "learning_rate": 0.0001, + "loss": 1.5909, + "step": 11227 + }, + { + "epoch": 1.2897593475389122, + "grad_norm": 0.5921497344970703, + "learning_rate": 0.0001, + "loss": 1.4274, + "step": 11228 + }, + { + "epoch": 1.2898742174487392, + "grad_norm": 0.6173571348190308, + "learning_rate": 0.0001, + "loss": 1.6724, + "step": 11229 + }, + { + "epoch": 1.2899890873585664, + "grad_norm": 0.6189887523651123, + "learning_rate": 0.0001, + "loss": 1.3725, + "step": 11230 + }, + { + "epoch": 1.2901039572683937, + "grad_norm": 0.569510817527771, + "learning_rate": 0.0001, + "loss": 1.4343, + "step": 11231 + }, + { + "epoch": 1.2902188271782207, + "grad_norm": 0.622203528881073, + "learning_rate": 0.0001, + "loss": 1.4382, + "step": 11232 + }, + { + "epoch": 1.2903336970880477, + "grad_norm": 0.6754775643348694, + "learning_rate": 0.0001, + "loss": 1.6161, + "step": 11233 + }, + { + "epoch": 1.290448566997875, + "grad_norm": 0.5515174865722656, + "learning_rate": 0.0001, + "loss": 1.5448, + "step": 11234 + }, + { + "epoch": 1.2905634369077021, + "grad_norm": 0.6426066160202026, + "learning_rate": 0.0001, + "loss": 1.5509, + "step": 11235 + }, + { + "epoch": 1.2906783068175292, + "grad_norm": 0.6196022033691406, + "learning_rate": 0.0001, + "loss": 1.5919, + "step": 11236 + }, + { + "epoch": 1.2907931767273562, + "grad_norm": 0.5864408612251282, + "learning_rate": 0.0001, + "loss": 1.4135, + "step": 11237 + }, + { + "epoch": 1.2909080466371834, + "grad_norm": 0.607479989528656, + "learning_rate": 0.0001, + "loss": 1.5987, + "step": 11238 + }, + { + "epoch": 1.2910229165470106, + "grad_norm": 0.5427273511886597, + "learning_rate": 0.0001, + "loss": 1.374, + "step": 11239 + }, + { + "epoch": 1.2911377864568376, + "grad_norm": 0.5896614789962769, + "learning_rate": 0.0001, + "loss": 1.5488, + "step": 11240 + }, + { + "epoch": 1.2912526563666646, + "grad_norm": 0.5615909695625305, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 11241 + }, + { + "epoch": 1.2913675262764919, + "grad_norm": 0.6087133884429932, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 11242 + }, + { + "epoch": 1.291482396186319, + "grad_norm": 0.5094661712646484, + "learning_rate": 0.0001, + "loss": 1.2658, + "step": 11243 + }, + { + "epoch": 1.2915972660961461, + "grad_norm": 0.6043559908866882, + "learning_rate": 0.0001, + "loss": 1.4343, + "step": 11244 + }, + { + "epoch": 1.2917121360059731, + "grad_norm": 0.586310625076294, + "learning_rate": 0.0001, + "loss": 1.607, + "step": 11245 + }, + { + "epoch": 1.2918270059158004, + "grad_norm": 0.5929206013679504, + "learning_rate": 0.0001, + "loss": 1.2632, + "step": 11246 + }, + { + "epoch": 1.2919418758256276, + "grad_norm": 0.5981222987174988, + "learning_rate": 0.0001, + "loss": 1.5347, + "step": 11247 + }, + { + "epoch": 1.2920567457354546, + "grad_norm": 0.5489922761917114, + "learning_rate": 0.0001, + "loss": 1.3284, + "step": 11248 + }, + { + "epoch": 1.2921716156452816, + "grad_norm": 0.5560479164123535, + "learning_rate": 0.0001, + "loss": 1.2838, + "step": 11249 + }, + { + "epoch": 1.2922864855551088, + "grad_norm": 0.5626586675643921, + "learning_rate": 0.0001, + "loss": 1.5256, + "step": 11250 + }, + { + "epoch": 1.292401355464936, + "grad_norm": 0.5712599158287048, + "learning_rate": 0.0001, + "loss": 1.598, + "step": 11251 + }, + { + "epoch": 1.292516225374763, + "grad_norm": 0.5926998257637024, + "learning_rate": 0.0001, + "loss": 1.4374, + "step": 11252 + }, + { + "epoch": 1.29263109528459, + "grad_norm": 0.5249360799789429, + "learning_rate": 0.0001, + "loss": 1.2447, + "step": 11253 + }, + { + "epoch": 1.2927459651944173, + "grad_norm": 0.5579542517662048, + "learning_rate": 0.0001, + "loss": 1.3727, + "step": 11254 + }, + { + "epoch": 1.2928608351042445, + "grad_norm": 0.5543383955955505, + "learning_rate": 0.0001, + "loss": 1.3233, + "step": 11255 + }, + { + "epoch": 1.2929757050140716, + "grad_norm": 0.600277304649353, + "learning_rate": 0.0001, + "loss": 1.5099, + "step": 11256 + }, + { + "epoch": 1.2930905749238986, + "grad_norm": 0.5769601464271545, + "learning_rate": 0.0001, + "loss": 1.1063, + "step": 11257 + }, + { + "epoch": 1.2932054448337258, + "grad_norm": 0.5800927877426147, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 11258 + }, + { + "epoch": 1.293320314743553, + "grad_norm": 0.5866220593452454, + "learning_rate": 0.0001, + "loss": 1.4052, + "step": 11259 + }, + { + "epoch": 1.29343518465338, + "grad_norm": 0.5848959684371948, + "learning_rate": 0.0001, + "loss": 1.3863, + "step": 11260 + }, + { + "epoch": 1.293550054563207, + "grad_norm": 0.5541538000106812, + "learning_rate": 0.0001, + "loss": 1.325, + "step": 11261 + }, + { + "epoch": 1.2936649244730343, + "grad_norm": 0.6487392783164978, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 11262 + }, + { + "epoch": 1.2937797943828615, + "grad_norm": 0.6783632040023804, + "learning_rate": 0.0001, + "loss": 1.7223, + "step": 11263 + }, + { + "epoch": 1.2938946642926885, + "grad_norm": 0.5799053907394409, + "learning_rate": 0.0001, + "loss": 1.6532, + "step": 11264 + }, + { + "epoch": 1.2940095342025155, + "grad_norm": 0.5588035583496094, + "learning_rate": 0.0001, + "loss": 1.3215, + "step": 11265 + }, + { + "epoch": 1.2941244041123428, + "grad_norm": 0.5953567028045654, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 11266 + }, + { + "epoch": 1.29423927402217, + "grad_norm": 0.5587005615234375, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 11267 + }, + { + "epoch": 1.294354143931997, + "grad_norm": 0.6000686287879944, + "learning_rate": 0.0001, + "loss": 1.3503, + "step": 11268 + }, + { + "epoch": 1.294469013841824, + "grad_norm": 0.5800769329071045, + "learning_rate": 0.0001, + "loss": 1.5284, + "step": 11269 + }, + { + "epoch": 1.2945838837516512, + "grad_norm": 0.5538628101348877, + "learning_rate": 0.0001, + "loss": 1.2169, + "step": 11270 + }, + { + "epoch": 1.2946987536614785, + "grad_norm": 0.5448545217514038, + "learning_rate": 0.0001, + "loss": 1.5361, + "step": 11271 + }, + { + "epoch": 1.2948136235713055, + "grad_norm": 0.5209142565727234, + "learning_rate": 0.0001, + "loss": 1.3355, + "step": 11272 + }, + { + "epoch": 1.2949284934811325, + "grad_norm": 0.5593119859695435, + "learning_rate": 0.0001, + "loss": 1.4323, + "step": 11273 + }, + { + "epoch": 1.2950433633909597, + "grad_norm": 0.5502519607543945, + "learning_rate": 0.0001, + "loss": 1.4902, + "step": 11274 + }, + { + "epoch": 1.295158233300787, + "grad_norm": 0.5910590887069702, + "learning_rate": 0.0001, + "loss": 1.4953, + "step": 11275 + }, + { + "epoch": 1.295273103210614, + "grad_norm": 0.5811395645141602, + "learning_rate": 0.0001, + "loss": 1.6886, + "step": 11276 + }, + { + "epoch": 1.2953879731204412, + "grad_norm": 0.582584798336029, + "learning_rate": 0.0001, + "loss": 1.4012, + "step": 11277 + }, + { + "epoch": 1.2955028430302682, + "grad_norm": 0.5751772522926331, + "learning_rate": 0.0001, + "loss": 1.3962, + "step": 11278 + }, + { + "epoch": 1.2956177129400954, + "grad_norm": 0.5952954888343811, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 11279 + }, + { + "epoch": 1.2957325828499224, + "grad_norm": 0.5689733028411865, + "learning_rate": 0.0001, + "loss": 1.43, + "step": 11280 + }, + { + "epoch": 1.2958474527597497, + "grad_norm": 0.5460832715034485, + "learning_rate": 0.0001, + "loss": 1.4803, + "step": 11281 + }, + { + "epoch": 1.2959623226695767, + "grad_norm": 0.604256808757782, + "learning_rate": 0.0001, + "loss": 1.5998, + "step": 11282 + }, + { + "epoch": 1.296077192579404, + "grad_norm": 0.5800360441207886, + "learning_rate": 0.0001, + "loss": 1.4641, + "step": 11283 + }, + { + "epoch": 1.296192062489231, + "grad_norm": 0.529690146446228, + "learning_rate": 0.0001, + "loss": 1.3187, + "step": 11284 + }, + { + "epoch": 1.2963069323990581, + "grad_norm": 0.5543114542961121, + "learning_rate": 0.0001, + "loss": 1.5448, + "step": 11285 + }, + { + "epoch": 1.2964218023088852, + "grad_norm": 0.5886980295181274, + "learning_rate": 0.0001, + "loss": 1.5753, + "step": 11286 + }, + { + "epoch": 1.2965366722187124, + "grad_norm": 0.5955923795700073, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 11287 + }, + { + "epoch": 1.2966515421285394, + "grad_norm": 0.6192274689674377, + "learning_rate": 0.0001, + "loss": 1.5773, + "step": 11288 + }, + { + "epoch": 1.2967664120383666, + "grad_norm": 0.5242689251899719, + "learning_rate": 0.0001, + "loss": 1.3975, + "step": 11289 + }, + { + "epoch": 1.2968812819481936, + "grad_norm": 0.6103929877281189, + "learning_rate": 0.0001, + "loss": 1.5521, + "step": 11290 + }, + { + "epoch": 1.2969961518580209, + "grad_norm": 0.5915534496307373, + "learning_rate": 0.0001, + "loss": 1.4537, + "step": 11291 + }, + { + "epoch": 1.2971110217678479, + "grad_norm": 0.5651288032531738, + "learning_rate": 0.0001, + "loss": 1.2593, + "step": 11292 + }, + { + "epoch": 1.297225891677675, + "grad_norm": 0.5925031304359436, + "learning_rate": 0.0001, + "loss": 1.4754, + "step": 11293 + }, + { + "epoch": 1.2973407615875021, + "grad_norm": 0.5318403840065002, + "learning_rate": 0.0001, + "loss": 1.3499, + "step": 11294 + }, + { + "epoch": 1.2974556314973293, + "grad_norm": 0.5483918786048889, + "learning_rate": 0.0001, + "loss": 1.45, + "step": 11295 + }, + { + "epoch": 1.2975705014071564, + "grad_norm": 0.561732828617096, + "learning_rate": 0.0001, + "loss": 1.4525, + "step": 11296 + }, + { + "epoch": 1.2976853713169836, + "grad_norm": 0.5497708916664124, + "learning_rate": 0.0001, + "loss": 1.2997, + "step": 11297 + }, + { + "epoch": 1.2978002412268106, + "grad_norm": 0.5718294382095337, + "learning_rate": 0.0001, + "loss": 1.5627, + "step": 11298 + }, + { + "epoch": 1.2979151111366378, + "grad_norm": 0.5569791793823242, + "learning_rate": 0.0001, + "loss": 1.5249, + "step": 11299 + }, + { + "epoch": 1.2980299810464648, + "grad_norm": 0.6240097284317017, + "learning_rate": 0.0001, + "loss": 1.7817, + "step": 11300 + }, + { + "epoch": 1.298144850956292, + "grad_norm": 0.5455806851387024, + "learning_rate": 0.0001, + "loss": 1.517, + "step": 11301 + }, + { + "epoch": 1.298259720866119, + "grad_norm": 0.6025298833847046, + "learning_rate": 0.0001, + "loss": 1.4755, + "step": 11302 + }, + { + "epoch": 1.2983745907759463, + "grad_norm": 0.5894249677658081, + "learning_rate": 0.0001, + "loss": 1.5376, + "step": 11303 + }, + { + "epoch": 1.2984894606857733, + "grad_norm": 0.5962689518928528, + "learning_rate": 0.0001, + "loss": 1.6333, + "step": 11304 + }, + { + "epoch": 1.2986043305956005, + "grad_norm": 0.6713701486587524, + "learning_rate": 0.0001, + "loss": 1.5085, + "step": 11305 + }, + { + "epoch": 1.2987192005054276, + "grad_norm": 0.5443199276924133, + "learning_rate": 0.0001, + "loss": 1.4608, + "step": 11306 + }, + { + "epoch": 1.2988340704152548, + "grad_norm": 0.5849419236183167, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 11307 + }, + { + "epoch": 1.2989489403250818, + "grad_norm": 0.5818788409233093, + "learning_rate": 0.0001, + "loss": 1.5971, + "step": 11308 + }, + { + "epoch": 1.299063810234909, + "grad_norm": 0.5864542126655579, + "learning_rate": 0.0001, + "loss": 1.545, + "step": 11309 + }, + { + "epoch": 1.299178680144736, + "grad_norm": 0.5585055947303772, + "learning_rate": 0.0001, + "loss": 1.3616, + "step": 11310 + }, + { + "epoch": 1.2992935500545633, + "grad_norm": 0.725004255771637, + "learning_rate": 0.0001, + "loss": 1.7103, + "step": 11311 + }, + { + "epoch": 1.2994084199643903, + "grad_norm": 0.6212374567985535, + "learning_rate": 0.0001, + "loss": 1.6987, + "step": 11312 + }, + { + "epoch": 1.2995232898742175, + "grad_norm": 0.5447000861167908, + "learning_rate": 0.0001, + "loss": 1.474, + "step": 11313 + }, + { + "epoch": 1.2996381597840445, + "grad_norm": 0.6027454137802124, + "learning_rate": 0.0001, + "loss": 1.5891, + "step": 11314 + }, + { + "epoch": 1.2997530296938717, + "grad_norm": 0.568374514579773, + "learning_rate": 0.0001, + "loss": 1.5025, + "step": 11315 + }, + { + "epoch": 1.2998678996036988, + "grad_norm": 0.6000039577484131, + "learning_rate": 0.0001, + "loss": 1.5641, + "step": 11316 + }, + { + "epoch": 1.299982769513526, + "grad_norm": 0.5799726247787476, + "learning_rate": 0.0001, + "loss": 1.416, + "step": 11317 + }, + { + "epoch": 1.300097639423353, + "grad_norm": 0.5564817190170288, + "learning_rate": 0.0001, + "loss": 1.3677, + "step": 11318 + }, + { + "epoch": 1.3002125093331802, + "grad_norm": 0.537472665309906, + "learning_rate": 0.0001, + "loss": 1.3931, + "step": 11319 + }, + { + "epoch": 1.3003273792430072, + "grad_norm": 0.577150821685791, + "learning_rate": 0.0001, + "loss": 1.42, + "step": 11320 + }, + { + "epoch": 1.3004422491528345, + "grad_norm": 0.572995126247406, + "learning_rate": 0.0001, + "loss": 1.3415, + "step": 11321 + }, + { + "epoch": 1.3005571190626615, + "grad_norm": 0.5590890645980835, + "learning_rate": 0.0001, + "loss": 1.321, + "step": 11322 + }, + { + "epoch": 1.3006719889724887, + "grad_norm": 0.5591031908988953, + "learning_rate": 0.0001, + "loss": 1.4025, + "step": 11323 + }, + { + "epoch": 1.3007868588823157, + "grad_norm": 0.5289561748504639, + "learning_rate": 0.0001, + "loss": 1.4627, + "step": 11324 + }, + { + "epoch": 1.300901728792143, + "grad_norm": 0.6011853814125061, + "learning_rate": 0.0001, + "loss": 1.7713, + "step": 11325 + }, + { + "epoch": 1.30101659870197, + "grad_norm": 0.5395965576171875, + "learning_rate": 0.0001, + "loss": 1.4244, + "step": 11326 + }, + { + "epoch": 1.3011314686117972, + "grad_norm": 0.5598213076591492, + "learning_rate": 0.0001, + "loss": 1.5376, + "step": 11327 + }, + { + "epoch": 1.3012463385216242, + "grad_norm": 0.5253582000732422, + "learning_rate": 0.0001, + "loss": 1.3091, + "step": 11328 + }, + { + "epoch": 1.3013612084314514, + "grad_norm": 0.57694011926651, + "learning_rate": 0.0001, + "loss": 1.4816, + "step": 11329 + }, + { + "epoch": 1.3014760783412784, + "grad_norm": 0.6319670677185059, + "learning_rate": 0.0001, + "loss": 1.6043, + "step": 11330 + }, + { + "epoch": 1.3015909482511057, + "grad_norm": 0.6250157356262207, + "learning_rate": 0.0001, + "loss": 1.5442, + "step": 11331 + }, + { + "epoch": 1.3017058181609327, + "grad_norm": 0.5917470455169678, + "learning_rate": 0.0001, + "loss": 1.3269, + "step": 11332 + }, + { + "epoch": 1.30182068807076, + "grad_norm": 0.546095609664917, + "learning_rate": 0.0001, + "loss": 1.186, + "step": 11333 + }, + { + "epoch": 1.301935557980587, + "grad_norm": 0.6022743582725525, + "learning_rate": 0.0001, + "loss": 1.4801, + "step": 11334 + }, + { + "epoch": 1.3020504278904141, + "grad_norm": 0.6155171394348145, + "learning_rate": 0.0001, + "loss": 1.7134, + "step": 11335 + }, + { + "epoch": 1.3021652978002412, + "grad_norm": 0.6018640995025635, + "learning_rate": 0.0001, + "loss": 1.6124, + "step": 11336 + }, + { + "epoch": 1.3022801677100684, + "grad_norm": 0.5561271905899048, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 11337 + }, + { + "epoch": 1.3023950376198954, + "grad_norm": 0.5770154595375061, + "learning_rate": 0.0001, + "loss": 1.4814, + "step": 11338 + }, + { + "epoch": 1.3025099075297226, + "grad_norm": 0.6154848337173462, + "learning_rate": 0.0001, + "loss": 1.3091, + "step": 11339 + }, + { + "epoch": 1.3026247774395496, + "grad_norm": 0.5868239402770996, + "learning_rate": 0.0001, + "loss": 1.5571, + "step": 11340 + }, + { + "epoch": 1.3027396473493769, + "grad_norm": 0.6514554619789124, + "learning_rate": 0.0001, + "loss": 1.5351, + "step": 11341 + }, + { + "epoch": 1.3028545172592039, + "grad_norm": 0.60183185338974, + "learning_rate": 0.0001, + "loss": 1.6098, + "step": 11342 + }, + { + "epoch": 1.302969387169031, + "grad_norm": 0.5899741649627686, + "learning_rate": 0.0001, + "loss": 1.4028, + "step": 11343 + }, + { + "epoch": 1.303084257078858, + "grad_norm": 0.5583611726760864, + "learning_rate": 0.0001, + "loss": 1.3219, + "step": 11344 + }, + { + "epoch": 1.3031991269886853, + "grad_norm": 0.5439897179603577, + "learning_rate": 0.0001, + "loss": 1.4106, + "step": 11345 + }, + { + "epoch": 1.3033139968985124, + "grad_norm": 0.5689780712127686, + "learning_rate": 0.0001, + "loss": 1.5358, + "step": 11346 + }, + { + "epoch": 1.3034288668083396, + "grad_norm": 0.5775777697563171, + "learning_rate": 0.0001, + "loss": 1.562, + "step": 11347 + }, + { + "epoch": 1.3035437367181668, + "grad_norm": 0.6249666810035706, + "learning_rate": 0.0001, + "loss": 1.2675, + "step": 11348 + }, + { + "epoch": 1.3036586066279938, + "grad_norm": 0.6017664074897766, + "learning_rate": 0.0001, + "loss": 1.5117, + "step": 11349 + }, + { + "epoch": 1.3037734765378208, + "grad_norm": 0.5548145174980164, + "learning_rate": 0.0001, + "loss": 1.4848, + "step": 11350 + }, + { + "epoch": 1.303888346447648, + "grad_norm": 0.6580796241760254, + "learning_rate": 0.0001, + "loss": 1.7596, + "step": 11351 + }, + { + "epoch": 1.3040032163574753, + "grad_norm": 0.5205984711647034, + "learning_rate": 0.0001, + "loss": 1.3289, + "step": 11352 + }, + { + "epoch": 1.3041180862673023, + "grad_norm": 0.5620521903038025, + "learning_rate": 0.0001, + "loss": 1.4247, + "step": 11353 + }, + { + "epoch": 1.3042329561771293, + "grad_norm": 0.6419723033905029, + "learning_rate": 0.0001, + "loss": 1.6835, + "step": 11354 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.5205932855606079, + "learning_rate": 0.0001, + "loss": 1.3938, + "step": 11355 + }, + { + "epoch": 1.3044626959967838, + "grad_norm": 0.5548444390296936, + "learning_rate": 0.0001, + "loss": 1.4959, + "step": 11356 + }, + { + "epoch": 1.3045775659066108, + "grad_norm": 0.6600873470306396, + "learning_rate": 0.0001, + "loss": 1.6834, + "step": 11357 + }, + { + "epoch": 1.3046924358164378, + "grad_norm": 0.5854970812797546, + "learning_rate": 0.0001, + "loss": 1.4424, + "step": 11358 + }, + { + "epoch": 1.304807305726265, + "grad_norm": 0.5743281841278076, + "learning_rate": 0.0001, + "loss": 1.5258, + "step": 11359 + }, + { + "epoch": 1.3049221756360923, + "grad_norm": 0.5808170437812805, + "learning_rate": 0.0001, + "loss": 1.4149, + "step": 11360 + }, + { + "epoch": 1.3050370455459193, + "grad_norm": 0.5460790395736694, + "learning_rate": 0.0001, + "loss": 1.2765, + "step": 11361 + }, + { + "epoch": 1.3051519154557463, + "grad_norm": 0.5941900610923767, + "learning_rate": 0.0001, + "loss": 1.6998, + "step": 11362 + }, + { + "epoch": 1.3052667853655735, + "grad_norm": 0.5910804271697998, + "learning_rate": 0.0001, + "loss": 1.6016, + "step": 11363 + }, + { + "epoch": 1.3053816552754007, + "grad_norm": 0.5705919861793518, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 11364 + }, + { + "epoch": 1.3054965251852277, + "grad_norm": 0.5501276850700378, + "learning_rate": 0.0001, + "loss": 1.4298, + "step": 11365 + }, + { + "epoch": 1.3056113950950547, + "grad_norm": 0.5633885860443115, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 11366 + }, + { + "epoch": 1.305726265004882, + "grad_norm": 0.6157667636871338, + "learning_rate": 0.0001, + "loss": 1.4968, + "step": 11367 + }, + { + "epoch": 1.3058411349147092, + "grad_norm": 0.5917145609855652, + "learning_rate": 0.0001, + "loss": 1.3983, + "step": 11368 + }, + { + "epoch": 1.3059560048245362, + "grad_norm": 0.6143923997879028, + "learning_rate": 0.0001, + "loss": 1.172, + "step": 11369 + }, + { + "epoch": 1.3060708747343632, + "grad_norm": 0.5878526568412781, + "learning_rate": 0.0001, + "loss": 1.3639, + "step": 11370 + }, + { + "epoch": 1.3061857446441905, + "grad_norm": 0.5434131026268005, + "learning_rate": 0.0001, + "loss": 1.4643, + "step": 11371 + }, + { + "epoch": 1.3063006145540177, + "grad_norm": 0.5768989324569702, + "learning_rate": 0.0001, + "loss": 1.4185, + "step": 11372 + }, + { + "epoch": 1.3064154844638447, + "grad_norm": 0.5958837270736694, + "learning_rate": 0.0001, + "loss": 1.4573, + "step": 11373 + }, + { + "epoch": 1.3065303543736717, + "grad_norm": 0.5841070413589478, + "learning_rate": 0.0001, + "loss": 1.5135, + "step": 11374 + }, + { + "epoch": 1.306645224283499, + "grad_norm": 0.5326519012451172, + "learning_rate": 0.0001, + "loss": 1.1777, + "step": 11375 + }, + { + "epoch": 1.3067600941933262, + "grad_norm": 0.640945553779602, + "learning_rate": 0.0001, + "loss": 1.5267, + "step": 11376 + }, + { + "epoch": 1.3068749641031532, + "grad_norm": 0.546720564365387, + "learning_rate": 0.0001, + "loss": 1.2742, + "step": 11377 + }, + { + "epoch": 1.3069898340129802, + "grad_norm": 0.546595573425293, + "learning_rate": 0.0001, + "loss": 1.1787, + "step": 11378 + }, + { + "epoch": 1.3071047039228074, + "grad_norm": 0.6050646901130676, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 11379 + }, + { + "epoch": 1.3072195738326347, + "grad_norm": 0.5861079096794128, + "learning_rate": 0.0001, + "loss": 1.6063, + "step": 11380 + }, + { + "epoch": 1.3073344437424617, + "grad_norm": 0.6234222650527954, + "learning_rate": 0.0001, + "loss": 1.5707, + "step": 11381 + }, + { + "epoch": 1.3074493136522887, + "grad_norm": 0.5523126721382141, + "learning_rate": 0.0001, + "loss": 1.3926, + "step": 11382 + }, + { + "epoch": 1.307564183562116, + "grad_norm": 0.6148357391357422, + "learning_rate": 0.0001, + "loss": 1.7076, + "step": 11383 + }, + { + "epoch": 1.3076790534719431, + "grad_norm": 0.5839380025863647, + "learning_rate": 0.0001, + "loss": 1.4059, + "step": 11384 + }, + { + "epoch": 1.3077939233817701, + "grad_norm": 0.559718132019043, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 11385 + }, + { + "epoch": 1.3079087932915971, + "grad_norm": 0.5904785990715027, + "learning_rate": 0.0001, + "loss": 1.4762, + "step": 11386 + }, + { + "epoch": 1.3080236632014244, + "grad_norm": 0.5947718620300293, + "learning_rate": 0.0001, + "loss": 1.5009, + "step": 11387 + }, + { + "epoch": 1.3081385331112516, + "grad_norm": 0.5415857434272766, + "learning_rate": 0.0001, + "loss": 1.1498, + "step": 11388 + }, + { + "epoch": 1.3082534030210786, + "grad_norm": 0.5549929738044739, + "learning_rate": 0.0001, + "loss": 1.256, + "step": 11389 + }, + { + "epoch": 1.3083682729309056, + "grad_norm": 0.5811049938201904, + "learning_rate": 0.0001, + "loss": 1.4635, + "step": 11390 + }, + { + "epoch": 1.3084831428407329, + "grad_norm": 0.597584068775177, + "learning_rate": 0.0001, + "loss": 1.3548, + "step": 11391 + }, + { + "epoch": 1.30859801275056, + "grad_norm": 0.6055376529693604, + "learning_rate": 0.0001, + "loss": 1.6609, + "step": 11392 + }, + { + "epoch": 1.308712882660387, + "grad_norm": 0.5904808044433594, + "learning_rate": 0.0001, + "loss": 1.6138, + "step": 11393 + }, + { + "epoch": 1.308827752570214, + "grad_norm": 0.6280949711799622, + "learning_rate": 0.0001, + "loss": 1.5001, + "step": 11394 + }, + { + "epoch": 1.3089426224800413, + "grad_norm": 0.6043780446052551, + "learning_rate": 0.0001, + "loss": 1.5045, + "step": 11395 + }, + { + "epoch": 1.3090574923898686, + "grad_norm": 0.5859163403511047, + "learning_rate": 0.0001, + "loss": 1.4466, + "step": 11396 + }, + { + "epoch": 1.3091723622996956, + "grad_norm": 0.5889775156974792, + "learning_rate": 0.0001, + "loss": 1.4596, + "step": 11397 + }, + { + "epoch": 1.3092872322095226, + "grad_norm": 0.5822187066078186, + "learning_rate": 0.0001, + "loss": 1.4784, + "step": 11398 + }, + { + "epoch": 1.3094021021193498, + "grad_norm": 0.5596886873245239, + "learning_rate": 0.0001, + "loss": 1.4668, + "step": 11399 + }, + { + "epoch": 1.309516972029177, + "grad_norm": 0.6127359867095947, + "learning_rate": 0.0001, + "loss": 1.4651, + "step": 11400 + }, + { + "epoch": 1.309631841939004, + "grad_norm": 0.5832725167274475, + "learning_rate": 0.0001, + "loss": 1.4728, + "step": 11401 + }, + { + "epoch": 1.309746711848831, + "grad_norm": 0.5246071219444275, + "learning_rate": 0.0001, + "loss": 1.3994, + "step": 11402 + }, + { + "epoch": 1.3098615817586583, + "grad_norm": 0.5589280724525452, + "learning_rate": 0.0001, + "loss": 1.3416, + "step": 11403 + }, + { + "epoch": 1.3099764516684855, + "grad_norm": 0.5817440748214722, + "learning_rate": 0.0001, + "loss": 1.357, + "step": 11404 + }, + { + "epoch": 1.3100913215783125, + "grad_norm": 0.5823844075202942, + "learning_rate": 0.0001, + "loss": 1.5579, + "step": 11405 + }, + { + "epoch": 1.3102061914881395, + "grad_norm": 0.590492308139801, + "learning_rate": 0.0001, + "loss": 1.3188, + "step": 11406 + }, + { + "epoch": 1.3103210613979668, + "grad_norm": 0.5652885437011719, + "learning_rate": 0.0001, + "loss": 1.3556, + "step": 11407 + }, + { + "epoch": 1.310435931307794, + "grad_norm": 0.5821486711502075, + "learning_rate": 0.0001, + "loss": 1.5111, + "step": 11408 + }, + { + "epoch": 1.310550801217621, + "grad_norm": 0.585404098033905, + "learning_rate": 0.0001, + "loss": 1.506, + "step": 11409 + }, + { + "epoch": 1.310665671127448, + "grad_norm": 0.5636382699012756, + "learning_rate": 0.0001, + "loss": 1.3113, + "step": 11410 + }, + { + "epoch": 1.3107805410372753, + "grad_norm": 0.5605599880218506, + "learning_rate": 0.0001, + "loss": 1.3142, + "step": 11411 + }, + { + "epoch": 1.3108954109471025, + "grad_norm": 0.6051444411277771, + "learning_rate": 0.0001, + "loss": 1.5978, + "step": 11412 + }, + { + "epoch": 1.3110102808569295, + "grad_norm": 0.6009870171546936, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 11413 + }, + { + "epoch": 1.3111251507667567, + "grad_norm": 0.5619489550590515, + "learning_rate": 0.0001, + "loss": 1.5581, + "step": 11414 + }, + { + "epoch": 1.3112400206765837, + "grad_norm": 0.5233682990074158, + "learning_rate": 0.0001, + "loss": 1.3234, + "step": 11415 + }, + { + "epoch": 1.311354890586411, + "grad_norm": 0.643240749835968, + "learning_rate": 0.0001, + "loss": 1.6655, + "step": 11416 + }, + { + "epoch": 1.311469760496238, + "grad_norm": 0.633753776550293, + "learning_rate": 0.0001, + "loss": 1.569, + "step": 11417 + }, + { + "epoch": 1.3115846304060652, + "grad_norm": 0.6189075708389282, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 11418 + }, + { + "epoch": 1.3116995003158922, + "grad_norm": 0.5767892003059387, + "learning_rate": 0.0001, + "loss": 1.4837, + "step": 11419 + }, + { + "epoch": 1.3118143702257195, + "grad_norm": 0.5835198163986206, + "learning_rate": 0.0001, + "loss": 1.5653, + "step": 11420 + }, + { + "epoch": 1.3119292401355465, + "grad_norm": 0.5322223901748657, + "learning_rate": 0.0001, + "loss": 1.4651, + "step": 11421 + }, + { + "epoch": 1.3120441100453737, + "grad_norm": 0.5825353264808655, + "learning_rate": 0.0001, + "loss": 1.6025, + "step": 11422 + }, + { + "epoch": 1.3121589799552007, + "grad_norm": 0.5791360139846802, + "learning_rate": 0.0001, + "loss": 1.3845, + "step": 11423 + }, + { + "epoch": 1.312273849865028, + "grad_norm": 0.637932538986206, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 11424 + }, + { + "epoch": 1.312388719774855, + "grad_norm": 0.6340711116790771, + "learning_rate": 0.0001, + "loss": 1.6321, + "step": 11425 + }, + { + "epoch": 1.3125035896846822, + "grad_norm": 0.5915501117706299, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 11426 + }, + { + "epoch": 1.3126184595945092, + "grad_norm": 0.6027012467384338, + "learning_rate": 0.0001, + "loss": 1.502, + "step": 11427 + }, + { + "epoch": 1.3127333295043364, + "grad_norm": 0.5210409760475159, + "learning_rate": 0.0001, + "loss": 1.2863, + "step": 11428 + }, + { + "epoch": 1.3128481994141634, + "grad_norm": 0.5848100185394287, + "learning_rate": 0.0001, + "loss": 1.4554, + "step": 11429 + }, + { + "epoch": 1.3129630693239907, + "grad_norm": 0.552432119846344, + "learning_rate": 0.0001, + "loss": 1.4561, + "step": 11430 + }, + { + "epoch": 1.3130779392338177, + "grad_norm": 0.5485931038856506, + "learning_rate": 0.0001, + "loss": 1.2885, + "step": 11431 + }, + { + "epoch": 1.313192809143645, + "grad_norm": 0.5956612825393677, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 11432 + }, + { + "epoch": 1.313307679053472, + "grad_norm": 0.5833698511123657, + "learning_rate": 0.0001, + "loss": 1.338, + "step": 11433 + }, + { + "epoch": 1.3134225489632991, + "grad_norm": 0.6002306342124939, + "learning_rate": 0.0001, + "loss": 1.4549, + "step": 11434 + }, + { + "epoch": 1.3135374188731261, + "grad_norm": 0.6598058938980103, + "learning_rate": 0.0001, + "loss": 1.5414, + "step": 11435 + }, + { + "epoch": 1.3136522887829534, + "grad_norm": 0.6079245805740356, + "learning_rate": 0.0001, + "loss": 1.4909, + "step": 11436 + }, + { + "epoch": 1.3137671586927804, + "grad_norm": 0.5811184048652649, + "learning_rate": 0.0001, + "loss": 1.4411, + "step": 11437 + }, + { + "epoch": 1.3138820286026076, + "grad_norm": 0.5552648305892944, + "learning_rate": 0.0001, + "loss": 1.4865, + "step": 11438 + }, + { + "epoch": 1.3139968985124346, + "grad_norm": 0.5896119475364685, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 11439 + }, + { + "epoch": 1.3141117684222619, + "grad_norm": 0.5828573703765869, + "learning_rate": 0.0001, + "loss": 1.5706, + "step": 11440 + }, + { + "epoch": 1.3142266383320889, + "grad_norm": 0.5936744809150696, + "learning_rate": 0.0001, + "loss": 1.5319, + "step": 11441 + }, + { + "epoch": 1.314341508241916, + "grad_norm": 0.672229528427124, + "learning_rate": 0.0001, + "loss": 1.7404, + "step": 11442 + }, + { + "epoch": 1.314456378151743, + "grad_norm": 0.5854552388191223, + "learning_rate": 0.0001, + "loss": 1.4428, + "step": 11443 + }, + { + "epoch": 1.3145712480615703, + "grad_norm": 0.5884828567504883, + "learning_rate": 0.0001, + "loss": 1.34, + "step": 11444 + }, + { + "epoch": 1.3146861179713973, + "grad_norm": 0.6672284007072449, + "learning_rate": 0.0001, + "loss": 1.3559, + "step": 11445 + }, + { + "epoch": 1.3148009878812246, + "grad_norm": 0.5890071988105774, + "learning_rate": 0.0001, + "loss": 1.4041, + "step": 11446 + }, + { + "epoch": 1.3149158577910516, + "grad_norm": 0.573523998260498, + "learning_rate": 0.0001, + "loss": 1.2297, + "step": 11447 + }, + { + "epoch": 1.3150307277008788, + "grad_norm": 0.6235065460205078, + "learning_rate": 0.0001, + "loss": 1.4743, + "step": 11448 + }, + { + "epoch": 1.3151455976107058, + "grad_norm": 0.5744754672050476, + "learning_rate": 0.0001, + "loss": 1.3518, + "step": 11449 + }, + { + "epoch": 1.315260467520533, + "grad_norm": 0.5936892032623291, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 11450 + }, + { + "epoch": 1.31537533743036, + "grad_norm": 0.5716831684112549, + "learning_rate": 0.0001, + "loss": 1.4176, + "step": 11451 + }, + { + "epoch": 1.3154902073401873, + "grad_norm": 0.5791821479797363, + "learning_rate": 0.0001, + "loss": 1.5311, + "step": 11452 + }, + { + "epoch": 1.3156050772500143, + "grad_norm": 0.591475248336792, + "learning_rate": 0.0001, + "loss": 1.6355, + "step": 11453 + }, + { + "epoch": 1.3157199471598415, + "grad_norm": 0.5741580128669739, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 11454 + }, + { + "epoch": 1.3158348170696685, + "grad_norm": 0.5884466767311096, + "learning_rate": 0.0001, + "loss": 1.4848, + "step": 11455 + }, + { + "epoch": 1.3159496869794958, + "grad_norm": 0.5626681447029114, + "learning_rate": 0.0001, + "loss": 1.3815, + "step": 11456 + }, + { + "epoch": 1.3160645568893228, + "grad_norm": 0.6762535572052002, + "learning_rate": 0.0001, + "loss": 1.5877, + "step": 11457 + }, + { + "epoch": 1.31617942679915, + "grad_norm": 0.5966523885726929, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 11458 + }, + { + "epoch": 1.316294296708977, + "grad_norm": 0.5997388958930969, + "learning_rate": 0.0001, + "loss": 1.5986, + "step": 11459 + }, + { + "epoch": 1.3164091666188042, + "grad_norm": 0.6016896367073059, + "learning_rate": 0.0001, + "loss": 1.1723, + "step": 11460 + }, + { + "epoch": 1.3165240365286313, + "grad_norm": 0.5532729625701904, + "learning_rate": 0.0001, + "loss": 1.2546, + "step": 11461 + }, + { + "epoch": 1.3166389064384585, + "grad_norm": 0.5814374685287476, + "learning_rate": 0.0001, + "loss": 1.4586, + "step": 11462 + }, + { + "epoch": 1.3167537763482855, + "grad_norm": 0.5885840058326721, + "learning_rate": 0.0001, + "loss": 1.4208, + "step": 11463 + }, + { + "epoch": 1.3168686462581127, + "grad_norm": 0.5459346175193787, + "learning_rate": 0.0001, + "loss": 1.2433, + "step": 11464 + }, + { + "epoch": 1.3169835161679397, + "grad_norm": 0.633172333240509, + "learning_rate": 0.0001, + "loss": 1.5716, + "step": 11465 + }, + { + "epoch": 1.317098386077767, + "grad_norm": 0.5619357228279114, + "learning_rate": 0.0001, + "loss": 1.3453, + "step": 11466 + }, + { + "epoch": 1.317213255987594, + "grad_norm": 0.6191542744636536, + "learning_rate": 0.0001, + "loss": 1.6639, + "step": 11467 + }, + { + "epoch": 1.3173281258974212, + "grad_norm": 0.6198133230209351, + "learning_rate": 0.0001, + "loss": 1.3868, + "step": 11468 + }, + { + "epoch": 1.3174429958072482, + "grad_norm": 0.6131882667541504, + "learning_rate": 0.0001, + "loss": 1.5273, + "step": 11469 + }, + { + "epoch": 1.3175578657170754, + "grad_norm": 0.6336104869842529, + "learning_rate": 0.0001, + "loss": 1.5089, + "step": 11470 + }, + { + "epoch": 1.3176727356269025, + "grad_norm": 0.5683992505073547, + "learning_rate": 0.0001, + "loss": 1.6024, + "step": 11471 + }, + { + "epoch": 1.3177876055367297, + "grad_norm": 0.5928875207901001, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 11472 + }, + { + "epoch": 1.3179024754465567, + "grad_norm": 0.5696129202842712, + "learning_rate": 0.0001, + "loss": 1.427, + "step": 11473 + }, + { + "epoch": 1.318017345356384, + "grad_norm": 0.5560370683670044, + "learning_rate": 0.0001, + "loss": 1.3691, + "step": 11474 + }, + { + "epoch": 1.318132215266211, + "grad_norm": 0.6248264312744141, + "learning_rate": 0.0001, + "loss": 1.4913, + "step": 11475 + }, + { + "epoch": 1.3182470851760382, + "grad_norm": 0.6087796688079834, + "learning_rate": 0.0001, + "loss": 1.4265, + "step": 11476 + }, + { + "epoch": 1.3183619550858652, + "grad_norm": 0.5595715641975403, + "learning_rate": 0.0001, + "loss": 1.4056, + "step": 11477 + }, + { + "epoch": 1.3184768249956924, + "grad_norm": 0.5776646733283997, + "learning_rate": 0.0001, + "loss": 1.5342, + "step": 11478 + }, + { + "epoch": 1.3185916949055194, + "grad_norm": 0.6008871793746948, + "learning_rate": 0.0001, + "loss": 1.5584, + "step": 11479 + }, + { + "epoch": 1.3187065648153466, + "grad_norm": 0.6645510792732239, + "learning_rate": 0.0001, + "loss": 1.7724, + "step": 11480 + }, + { + "epoch": 1.3188214347251737, + "grad_norm": 0.5654270052909851, + "learning_rate": 0.0001, + "loss": 1.3849, + "step": 11481 + }, + { + "epoch": 1.318936304635001, + "grad_norm": 0.6282428503036499, + "learning_rate": 0.0001, + "loss": 1.6624, + "step": 11482 + }, + { + "epoch": 1.319051174544828, + "grad_norm": 0.5395941138267517, + "learning_rate": 0.0001, + "loss": 1.4415, + "step": 11483 + }, + { + "epoch": 1.3191660444546551, + "grad_norm": 0.6053428649902344, + "learning_rate": 0.0001, + "loss": 1.4817, + "step": 11484 + }, + { + "epoch": 1.3192809143644824, + "grad_norm": 0.6166926622390747, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 11485 + }, + { + "epoch": 1.3193957842743094, + "grad_norm": 0.5764713883399963, + "learning_rate": 0.0001, + "loss": 1.2562, + "step": 11486 + }, + { + "epoch": 1.3195106541841364, + "grad_norm": 0.5882161855697632, + "learning_rate": 0.0001, + "loss": 1.5924, + "step": 11487 + }, + { + "epoch": 1.3196255240939636, + "grad_norm": 0.5338685512542725, + "learning_rate": 0.0001, + "loss": 1.3349, + "step": 11488 + }, + { + "epoch": 1.3197403940037908, + "grad_norm": 0.6274675130844116, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 11489 + }, + { + "epoch": 1.3198552639136178, + "grad_norm": 0.6399273872375488, + "learning_rate": 0.0001, + "loss": 1.5503, + "step": 11490 + }, + { + "epoch": 1.3199701338234449, + "grad_norm": 0.5669889450073242, + "learning_rate": 0.0001, + "loss": 1.4638, + "step": 11491 + }, + { + "epoch": 1.320085003733272, + "grad_norm": 0.6101594567298889, + "learning_rate": 0.0001, + "loss": 1.6224, + "step": 11492 + }, + { + "epoch": 1.3201998736430993, + "grad_norm": 0.5899582505226135, + "learning_rate": 0.0001, + "loss": 1.5129, + "step": 11493 + }, + { + "epoch": 1.3203147435529263, + "grad_norm": 0.6240655779838562, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 11494 + }, + { + "epoch": 1.3204296134627533, + "grad_norm": 0.5994274616241455, + "learning_rate": 0.0001, + "loss": 1.6436, + "step": 11495 + }, + { + "epoch": 1.3205444833725806, + "grad_norm": 0.5786739587783813, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 11496 + }, + { + "epoch": 1.3206593532824078, + "grad_norm": 0.5860279202461243, + "learning_rate": 0.0001, + "loss": 1.3348, + "step": 11497 + }, + { + "epoch": 1.3207742231922348, + "grad_norm": 0.5858715772628784, + "learning_rate": 0.0001, + "loss": 1.3238, + "step": 11498 + }, + { + "epoch": 1.3208890931020618, + "grad_norm": 0.6237417459487915, + "learning_rate": 0.0001, + "loss": 1.7704, + "step": 11499 + }, + { + "epoch": 1.321003963011889, + "grad_norm": 0.5686200857162476, + "learning_rate": 0.0001, + "loss": 1.493, + "step": 11500 + }, + { + "epoch": 1.3211188329217163, + "grad_norm": 0.6311827301979065, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 11501 + }, + { + "epoch": 1.3212337028315433, + "grad_norm": 0.5898725986480713, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 11502 + }, + { + "epoch": 1.3213485727413703, + "grad_norm": 0.543509304523468, + "learning_rate": 0.0001, + "loss": 1.4023, + "step": 11503 + }, + { + "epoch": 1.3214634426511975, + "grad_norm": 0.6843796372413635, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 11504 + }, + { + "epoch": 1.3215783125610248, + "grad_norm": 0.5604381561279297, + "learning_rate": 0.0001, + "loss": 1.4211, + "step": 11505 + }, + { + "epoch": 1.3216931824708518, + "grad_norm": 0.6264784336090088, + "learning_rate": 0.0001, + "loss": 1.6592, + "step": 11506 + }, + { + "epoch": 1.3218080523806788, + "grad_norm": 0.5538937449455261, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 11507 + }, + { + "epoch": 1.321922922290506, + "grad_norm": 0.5639132857322693, + "learning_rate": 0.0001, + "loss": 1.3514, + "step": 11508 + }, + { + "epoch": 1.3220377922003332, + "grad_norm": 0.5447933077812195, + "learning_rate": 0.0001, + "loss": 1.3798, + "step": 11509 + }, + { + "epoch": 1.3221526621101602, + "grad_norm": 0.5847629308700562, + "learning_rate": 0.0001, + "loss": 1.5987, + "step": 11510 + }, + { + "epoch": 1.3222675320199873, + "grad_norm": 0.5697048306465149, + "learning_rate": 0.0001, + "loss": 1.5463, + "step": 11511 + }, + { + "epoch": 1.3223824019298145, + "grad_norm": 0.6301547884941101, + "learning_rate": 0.0001, + "loss": 1.4724, + "step": 11512 + }, + { + "epoch": 1.3224972718396417, + "grad_norm": 0.5512596368789673, + "learning_rate": 0.0001, + "loss": 1.5218, + "step": 11513 + }, + { + "epoch": 1.3226121417494687, + "grad_norm": 0.5693661570549011, + "learning_rate": 0.0001, + "loss": 1.3942, + "step": 11514 + }, + { + "epoch": 1.3227270116592957, + "grad_norm": 0.5459357500076294, + "learning_rate": 0.0001, + "loss": 1.4969, + "step": 11515 + }, + { + "epoch": 1.322841881569123, + "grad_norm": 0.5842916965484619, + "learning_rate": 0.0001, + "loss": 1.4668, + "step": 11516 + }, + { + "epoch": 1.3229567514789502, + "grad_norm": 0.5588902235031128, + "learning_rate": 0.0001, + "loss": 1.397, + "step": 11517 + }, + { + "epoch": 1.3230716213887772, + "grad_norm": 0.5641937255859375, + "learning_rate": 0.0001, + "loss": 1.2773, + "step": 11518 + }, + { + "epoch": 1.3231864912986042, + "grad_norm": 0.5515016317367554, + "learning_rate": 0.0001, + "loss": 1.4645, + "step": 11519 + }, + { + "epoch": 1.3233013612084314, + "grad_norm": 0.6019710898399353, + "learning_rate": 0.0001, + "loss": 1.4588, + "step": 11520 + }, + { + "epoch": 1.3234162311182587, + "grad_norm": 0.6003029942512512, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 11521 + }, + { + "epoch": 1.3235311010280857, + "grad_norm": 0.5781972408294678, + "learning_rate": 0.0001, + "loss": 1.4488, + "step": 11522 + }, + { + "epoch": 1.3236459709379127, + "grad_norm": 0.597213089466095, + "learning_rate": 0.0001, + "loss": 1.4833, + "step": 11523 + }, + { + "epoch": 1.32376084084774, + "grad_norm": 0.6153642535209656, + "learning_rate": 0.0001, + "loss": 1.5441, + "step": 11524 + }, + { + "epoch": 1.3238757107575672, + "grad_norm": 0.5358846783638, + "learning_rate": 0.0001, + "loss": 1.2119, + "step": 11525 + }, + { + "epoch": 1.3239905806673942, + "grad_norm": 0.6239076852798462, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 11526 + }, + { + "epoch": 1.3241054505772212, + "grad_norm": 0.5684323906898499, + "learning_rate": 0.0001, + "loss": 1.5562, + "step": 11527 + }, + { + "epoch": 1.3242203204870484, + "grad_norm": 0.5422741174697876, + "learning_rate": 0.0001, + "loss": 1.3541, + "step": 11528 + }, + { + "epoch": 1.3243351903968756, + "grad_norm": 0.5979777574539185, + "learning_rate": 0.0001, + "loss": 1.5381, + "step": 11529 + }, + { + "epoch": 1.3244500603067026, + "grad_norm": 0.5492232441902161, + "learning_rate": 0.0001, + "loss": 1.2141, + "step": 11530 + }, + { + "epoch": 1.3245649302165297, + "grad_norm": 0.5755845904350281, + "learning_rate": 0.0001, + "loss": 1.4904, + "step": 11531 + }, + { + "epoch": 1.3246798001263569, + "grad_norm": 0.5299710035324097, + "learning_rate": 0.0001, + "loss": 1.3358, + "step": 11532 + }, + { + "epoch": 1.3247946700361841, + "grad_norm": 0.5345584154129028, + "learning_rate": 0.0001, + "loss": 1.4269, + "step": 11533 + }, + { + "epoch": 1.3249095399460111, + "grad_norm": 0.5578746199607849, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 11534 + }, + { + "epoch": 1.3250244098558381, + "grad_norm": 0.6171008944511414, + "learning_rate": 0.0001, + "loss": 1.5046, + "step": 11535 + }, + { + "epoch": 1.3251392797656654, + "grad_norm": 0.5888642072677612, + "learning_rate": 0.0001, + "loss": 1.4443, + "step": 11536 + }, + { + "epoch": 1.3252541496754926, + "grad_norm": 0.5797832012176514, + "learning_rate": 0.0001, + "loss": 1.5776, + "step": 11537 + }, + { + "epoch": 1.3253690195853196, + "grad_norm": 0.5411096215248108, + "learning_rate": 0.0001, + "loss": 1.2369, + "step": 11538 + }, + { + "epoch": 1.3254838894951466, + "grad_norm": 0.5486440062522888, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 11539 + }, + { + "epoch": 1.3255987594049738, + "grad_norm": 0.6069177985191345, + "learning_rate": 0.0001, + "loss": 1.3719, + "step": 11540 + }, + { + "epoch": 1.325713629314801, + "grad_norm": 0.5846525430679321, + "learning_rate": 0.0001, + "loss": 1.4982, + "step": 11541 + }, + { + "epoch": 1.325828499224628, + "grad_norm": 0.5994956493377686, + "learning_rate": 0.0001, + "loss": 1.6159, + "step": 11542 + }, + { + "epoch": 1.325943369134455, + "grad_norm": 0.6577488780021667, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 11543 + }, + { + "epoch": 1.3260582390442823, + "grad_norm": 0.5551082491874695, + "learning_rate": 0.0001, + "loss": 1.4653, + "step": 11544 + }, + { + "epoch": 1.3261731089541096, + "grad_norm": 0.7000524401664734, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 11545 + }, + { + "epoch": 1.3262879788639366, + "grad_norm": 0.5763112902641296, + "learning_rate": 0.0001, + "loss": 1.5938, + "step": 11546 + }, + { + "epoch": 1.3264028487737636, + "grad_norm": 0.6176822185516357, + "learning_rate": 0.0001, + "loss": 1.586, + "step": 11547 + }, + { + "epoch": 1.3265177186835908, + "grad_norm": 0.5590544939041138, + "learning_rate": 0.0001, + "loss": 1.3263, + "step": 11548 + }, + { + "epoch": 1.326632588593418, + "grad_norm": 0.5652593374252319, + "learning_rate": 0.0001, + "loss": 1.3331, + "step": 11549 + }, + { + "epoch": 1.326747458503245, + "grad_norm": 0.657227635383606, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 11550 + }, + { + "epoch": 1.3268623284130723, + "grad_norm": 0.62965327501297, + "learning_rate": 0.0001, + "loss": 1.4074, + "step": 11551 + }, + { + "epoch": 1.3269771983228993, + "grad_norm": 0.5422511100769043, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 11552 + }, + { + "epoch": 1.3270920682327265, + "grad_norm": 0.5476226806640625, + "learning_rate": 0.0001, + "loss": 1.4102, + "step": 11553 + }, + { + "epoch": 1.3272069381425535, + "grad_norm": 0.6080157160758972, + "learning_rate": 0.0001, + "loss": 1.4651, + "step": 11554 + }, + { + "epoch": 1.3273218080523808, + "grad_norm": 0.6048029661178589, + "learning_rate": 0.0001, + "loss": 1.4795, + "step": 11555 + }, + { + "epoch": 1.3274366779622078, + "grad_norm": 0.5942696332931519, + "learning_rate": 0.0001, + "loss": 1.4198, + "step": 11556 + }, + { + "epoch": 1.327551547872035, + "grad_norm": 0.6302083134651184, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 11557 + }, + { + "epoch": 1.327666417781862, + "grad_norm": 0.6311014294624329, + "learning_rate": 0.0001, + "loss": 1.3774, + "step": 11558 + }, + { + "epoch": 1.3277812876916892, + "grad_norm": 0.6695850491523743, + "learning_rate": 0.0001, + "loss": 1.7421, + "step": 11559 + }, + { + "epoch": 1.3278961576015162, + "grad_norm": 0.5893605947494507, + "learning_rate": 0.0001, + "loss": 1.5078, + "step": 11560 + }, + { + "epoch": 1.3280110275113435, + "grad_norm": 0.7168806195259094, + "learning_rate": 0.0001, + "loss": 1.4677, + "step": 11561 + }, + { + "epoch": 1.3281258974211705, + "grad_norm": 0.5717088580131531, + "learning_rate": 0.0001, + "loss": 1.4689, + "step": 11562 + }, + { + "epoch": 1.3282407673309977, + "grad_norm": 0.5938601493835449, + "learning_rate": 0.0001, + "loss": 1.489, + "step": 11563 + }, + { + "epoch": 1.3283556372408247, + "grad_norm": 0.6174132227897644, + "learning_rate": 0.0001, + "loss": 1.4657, + "step": 11564 + }, + { + "epoch": 1.328470507150652, + "grad_norm": 0.6349033713340759, + "learning_rate": 0.0001, + "loss": 1.4751, + "step": 11565 + }, + { + "epoch": 1.328585377060479, + "grad_norm": 0.5308579206466675, + "learning_rate": 0.0001, + "loss": 1.4448, + "step": 11566 + }, + { + "epoch": 1.3287002469703062, + "grad_norm": 0.5638580322265625, + "learning_rate": 0.0001, + "loss": 1.5103, + "step": 11567 + }, + { + "epoch": 1.3288151168801332, + "grad_norm": 0.5843721628189087, + "learning_rate": 0.0001, + "loss": 1.4057, + "step": 11568 + }, + { + "epoch": 1.3289299867899604, + "grad_norm": 0.5811317563056946, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 11569 + }, + { + "epoch": 1.3290448566997874, + "grad_norm": 0.5734536051750183, + "learning_rate": 0.0001, + "loss": 1.5023, + "step": 11570 + }, + { + "epoch": 1.3291597266096147, + "grad_norm": 0.5713332295417786, + "learning_rate": 0.0001, + "loss": 1.3056, + "step": 11571 + }, + { + "epoch": 1.3292745965194417, + "grad_norm": 0.570878803730011, + "learning_rate": 0.0001, + "loss": 1.303, + "step": 11572 + }, + { + "epoch": 1.329389466429269, + "grad_norm": 0.5729346871376038, + "learning_rate": 0.0001, + "loss": 1.2979, + "step": 11573 + }, + { + "epoch": 1.329504336339096, + "grad_norm": 0.5870584845542908, + "learning_rate": 0.0001, + "loss": 1.4395, + "step": 11574 + }, + { + "epoch": 1.3296192062489232, + "grad_norm": 0.5481575131416321, + "learning_rate": 0.0001, + "loss": 1.3214, + "step": 11575 + }, + { + "epoch": 1.3297340761587502, + "grad_norm": 0.5822715759277344, + "learning_rate": 0.0001, + "loss": 1.3497, + "step": 11576 + }, + { + "epoch": 1.3298489460685774, + "grad_norm": 0.5590030550956726, + "learning_rate": 0.0001, + "loss": 1.3659, + "step": 11577 + }, + { + "epoch": 1.3299638159784044, + "grad_norm": 0.5739883184432983, + "learning_rate": 0.0001, + "loss": 1.553, + "step": 11578 + }, + { + "epoch": 1.3300786858882316, + "grad_norm": 0.5733195543289185, + "learning_rate": 0.0001, + "loss": 1.6149, + "step": 11579 + }, + { + "epoch": 1.3301935557980586, + "grad_norm": 0.5605318546295166, + "learning_rate": 0.0001, + "loss": 1.3656, + "step": 11580 + }, + { + "epoch": 1.3303084257078859, + "grad_norm": 0.574635922908783, + "learning_rate": 0.0001, + "loss": 1.3818, + "step": 11581 + }, + { + "epoch": 1.3304232956177129, + "grad_norm": 0.5863053202629089, + "learning_rate": 0.0001, + "loss": 1.5685, + "step": 11582 + }, + { + "epoch": 1.3305381655275401, + "grad_norm": 0.6203168630599976, + "learning_rate": 0.0001, + "loss": 1.5662, + "step": 11583 + }, + { + "epoch": 1.3306530354373671, + "grad_norm": 0.5553234815597534, + "learning_rate": 0.0001, + "loss": 1.2708, + "step": 11584 + }, + { + "epoch": 1.3307679053471944, + "grad_norm": 0.5634999871253967, + "learning_rate": 0.0001, + "loss": 1.2638, + "step": 11585 + }, + { + "epoch": 1.3308827752570214, + "grad_norm": 0.5584697723388672, + "learning_rate": 0.0001, + "loss": 1.3846, + "step": 11586 + }, + { + "epoch": 1.3309976451668486, + "grad_norm": 0.5859621167182922, + "learning_rate": 0.0001, + "loss": 1.4662, + "step": 11587 + }, + { + "epoch": 1.3311125150766756, + "grad_norm": 0.5909789800643921, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 11588 + }, + { + "epoch": 1.3312273849865028, + "grad_norm": 0.674757182598114, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 11589 + }, + { + "epoch": 1.3313422548963298, + "grad_norm": 0.5663519501686096, + "learning_rate": 0.0001, + "loss": 1.3768, + "step": 11590 + }, + { + "epoch": 1.331457124806157, + "grad_norm": 0.5656293630599976, + "learning_rate": 0.0001, + "loss": 1.3759, + "step": 11591 + }, + { + "epoch": 1.331571994715984, + "grad_norm": 0.652201771736145, + "learning_rate": 0.0001, + "loss": 1.6337, + "step": 11592 + }, + { + "epoch": 1.3316868646258113, + "grad_norm": 0.6089327931404114, + "learning_rate": 0.0001, + "loss": 1.5429, + "step": 11593 + }, + { + "epoch": 1.3318017345356383, + "grad_norm": 0.5640665888786316, + "learning_rate": 0.0001, + "loss": 1.4744, + "step": 11594 + }, + { + "epoch": 1.3319166044454656, + "grad_norm": 0.5926685333251953, + "learning_rate": 0.0001, + "loss": 1.6095, + "step": 11595 + }, + { + "epoch": 1.3320314743552926, + "grad_norm": 0.5491368770599365, + "learning_rate": 0.0001, + "loss": 1.4922, + "step": 11596 + }, + { + "epoch": 1.3321463442651198, + "grad_norm": 0.6609952449798584, + "learning_rate": 0.0001, + "loss": 1.6714, + "step": 11597 + }, + { + "epoch": 1.3322612141749468, + "grad_norm": 0.6160357594490051, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 11598 + }, + { + "epoch": 1.332376084084774, + "grad_norm": 0.5720334649085999, + "learning_rate": 0.0001, + "loss": 1.4616, + "step": 11599 + }, + { + "epoch": 1.332490953994601, + "grad_norm": 0.6039928197860718, + "learning_rate": 0.0001, + "loss": 1.5472, + "step": 11600 + }, + { + "epoch": 1.3326058239044283, + "grad_norm": 0.6017964482307434, + "learning_rate": 0.0001, + "loss": 1.3536, + "step": 11601 + }, + { + "epoch": 1.3327206938142553, + "grad_norm": 0.5808287858963013, + "learning_rate": 0.0001, + "loss": 1.5072, + "step": 11602 + }, + { + "epoch": 1.3328355637240825, + "grad_norm": 0.6469992995262146, + "learning_rate": 0.0001, + "loss": 1.6845, + "step": 11603 + }, + { + "epoch": 1.3329504336339095, + "grad_norm": 0.6133192777633667, + "learning_rate": 0.0001, + "loss": 1.385, + "step": 11604 + }, + { + "epoch": 1.3330653035437368, + "grad_norm": 0.575235903263092, + "learning_rate": 0.0001, + "loss": 1.465, + "step": 11605 + }, + { + "epoch": 1.3331801734535638, + "grad_norm": 0.5712215900421143, + "learning_rate": 0.0001, + "loss": 1.3618, + "step": 11606 + }, + { + "epoch": 1.333295043363391, + "grad_norm": 0.5588753819465637, + "learning_rate": 0.0001, + "loss": 1.4052, + "step": 11607 + }, + { + "epoch": 1.333409913273218, + "grad_norm": 0.5291929244995117, + "learning_rate": 0.0001, + "loss": 1.3567, + "step": 11608 + }, + { + "epoch": 1.3335247831830452, + "grad_norm": 0.5625868439674377, + "learning_rate": 0.0001, + "loss": 1.3899, + "step": 11609 + }, + { + "epoch": 1.3336396530928722, + "grad_norm": 0.5754292607307434, + "learning_rate": 0.0001, + "loss": 1.3608, + "step": 11610 + }, + { + "epoch": 1.3337545230026995, + "grad_norm": 0.6050917506217957, + "learning_rate": 0.0001, + "loss": 1.4607, + "step": 11611 + }, + { + "epoch": 1.3338693929125265, + "grad_norm": 0.5786774754524231, + "learning_rate": 0.0001, + "loss": 1.6412, + "step": 11612 + }, + { + "epoch": 1.3339842628223537, + "grad_norm": 0.6983840465545654, + "learning_rate": 0.0001, + "loss": 1.7368, + "step": 11613 + }, + { + "epoch": 1.3340991327321807, + "grad_norm": 0.5841389298439026, + "learning_rate": 0.0001, + "loss": 1.4559, + "step": 11614 + }, + { + "epoch": 1.334214002642008, + "grad_norm": 0.6447943449020386, + "learning_rate": 0.0001, + "loss": 1.6709, + "step": 11615 + }, + { + "epoch": 1.334328872551835, + "grad_norm": 0.5992538332939148, + "learning_rate": 0.0001, + "loss": 1.4572, + "step": 11616 + }, + { + "epoch": 1.3344437424616622, + "grad_norm": 0.6163121461868286, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 11617 + }, + { + "epoch": 1.3345586123714892, + "grad_norm": 0.5886964797973633, + "learning_rate": 0.0001, + "loss": 1.3598, + "step": 11618 + }, + { + "epoch": 1.3346734822813164, + "grad_norm": 0.5401976704597473, + "learning_rate": 0.0001, + "loss": 1.3276, + "step": 11619 + }, + { + "epoch": 1.3347883521911434, + "grad_norm": 0.543461799621582, + "learning_rate": 0.0001, + "loss": 1.4984, + "step": 11620 + }, + { + "epoch": 1.3349032221009707, + "grad_norm": 0.6902713179588318, + "learning_rate": 0.0001, + "loss": 1.5798, + "step": 11621 + }, + { + "epoch": 1.335018092010798, + "grad_norm": 0.6112170219421387, + "learning_rate": 0.0001, + "loss": 1.4523, + "step": 11622 + }, + { + "epoch": 1.335132961920625, + "grad_norm": 0.6658702492713928, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 11623 + }, + { + "epoch": 1.335247831830452, + "grad_norm": 0.5938554406166077, + "learning_rate": 0.0001, + "loss": 1.5521, + "step": 11624 + }, + { + "epoch": 1.3353627017402792, + "grad_norm": 0.5592337250709534, + "learning_rate": 0.0001, + "loss": 1.5078, + "step": 11625 + }, + { + "epoch": 1.3354775716501064, + "grad_norm": 0.5694896578788757, + "learning_rate": 0.0001, + "loss": 1.2984, + "step": 11626 + }, + { + "epoch": 1.3355924415599334, + "grad_norm": 0.6331267356872559, + "learning_rate": 0.0001, + "loss": 1.4092, + "step": 11627 + }, + { + "epoch": 1.3357073114697604, + "grad_norm": 0.5441679358482361, + "learning_rate": 0.0001, + "loss": 1.4533, + "step": 11628 + }, + { + "epoch": 1.3358221813795876, + "grad_norm": 0.5393032431602478, + "learning_rate": 0.0001, + "loss": 1.4713, + "step": 11629 + }, + { + "epoch": 1.3359370512894149, + "grad_norm": 0.5676888227462769, + "learning_rate": 0.0001, + "loss": 1.428, + "step": 11630 + }, + { + "epoch": 1.3360519211992419, + "grad_norm": 0.6380576491355896, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 11631 + }, + { + "epoch": 1.3361667911090689, + "grad_norm": 0.6264674663543701, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 11632 + }, + { + "epoch": 1.3362816610188961, + "grad_norm": 0.660614550113678, + "learning_rate": 0.0001, + "loss": 1.3874, + "step": 11633 + }, + { + "epoch": 1.3363965309287233, + "grad_norm": 0.5967418551445007, + "learning_rate": 0.0001, + "loss": 1.4457, + "step": 11634 + }, + { + "epoch": 1.3365114008385504, + "grad_norm": 0.5896640419960022, + "learning_rate": 0.0001, + "loss": 1.3916, + "step": 11635 + }, + { + "epoch": 1.3366262707483774, + "grad_norm": 0.5509446859359741, + "learning_rate": 0.0001, + "loss": 1.5138, + "step": 11636 + }, + { + "epoch": 1.3367411406582046, + "grad_norm": 0.5520437955856323, + "learning_rate": 0.0001, + "loss": 1.4022, + "step": 11637 + }, + { + "epoch": 1.3368560105680318, + "grad_norm": 0.6015188694000244, + "learning_rate": 0.0001, + "loss": 1.4852, + "step": 11638 + }, + { + "epoch": 1.3369708804778588, + "grad_norm": 0.6142217516899109, + "learning_rate": 0.0001, + "loss": 1.5066, + "step": 11639 + }, + { + "epoch": 1.3370857503876858, + "grad_norm": 0.5661596059799194, + "learning_rate": 0.0001, + "loss": 1.3181, + "step": 11640 + }, + { + "epoch": 1.337200620297513, + "grad_norm": 0.5863280296325684, + "learning_rate": 0.0001, + "loss": 1.5477, + "step": 11641 + }, + { + "epoch": 1.3373154902073403, + "grad_norm": 0.6303154826164246, + "learning_rate": 0.0001, + "loss": 1.437, + "step": 11642 + }, + { + "epoch": 1.3374303601171673, + "grad_norm": 0.6611382961273193, + "learning_rate": 0.0001, + "loss": 1.5357, + "step": 11643 + }, + { + "epoch": 1.3375452300269943, + "grad_norm": 0.5862720012664795, + "learning_rate": 0.0001, + "loss": 1.4288, + "step": 11644 + }, + { + "epoch": 1.3376600999368216, + "grad_norm": 0.6255212426185608, + "learning_rate": 0.0001, + "loss": 1.513, + "step": 11645 + }, + { + "epoch": 1.3377749698466488, + "grad_norm": 0.6159241199493408, + "learning_rate": 0.0001, + "loss": 1.5834, + "step": 11646 + }, + { + "epoch": 1.3378898397564758, + "grad_norm": 0.6486058235168457, + "learning_rate": 0.0001, + "loss": 1.6822, + "step": 11647 + }, + { + "epoch": 1.3380047096663028, + "grad_norm": 0.577648401260376, + "learning_rate": 0.0001, + "loss": 1.5894, + "step": 11648 + }, + { + "epoch": 1.33811957957613, + "grad_norm": 0.554535448551178, + "learning_rate": 0.0001, + "loss": 1.328, + "step": 11649 + }, + { + "epoch": 1.3382344494859573, + "grad_norm": 0.6508966088294983, + "learning_rate": 0.0001, + "loss": 1.475, + "step": 11650 + }, + { + "epoch": 1.3383493193957843, + "grad_norm": 0.558700442314148, + "learning_rate": 0.0001, + "loss": 1.502, + "step": 11651 + }, + { + "epoch": 1.3384641893056113, + "grad_norm": 0.5516871809959412, + "learning_rate": 0.0001, + "loss": 1.3675, + "step": 11652 + }, + { + "epoch": 1.3385790592154385, + "grad_norm": 0.5952174663543701, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 11653 + }, + { + "epoch": 1.3386939291252657, + "grad_norm": 0.5594809651374817, + "learning_rate": 0.0001, + "loss": 1.2648, + "step": 11654 + }, + { + "epoch": 1.3388087990350928, + "grad_norm": 0.6401504278182983, + "learning_rate": 0.0001, + "loss": 1.6669, + "step": 11655 + }, + { + "epoch": 1.3389236689449198, + "grad_norm": 0.5477997660636902, + "learning_rate": 0.0001, + "loss": 1.3875, + "step": 11656 + }, + { + "epoch": 1.339038538854747, + "grad_norm": 0.6509205102920532, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 11657 + }, + { + "epoch": 1.3391534087645742, + "grad_norm": 0.6122426986694336, + "learning_rate": 0.0001, + "loss": 1.4431, + "step": 11658 + }, + { + "epoch": 1.3392682786744012, + "grad_norm": 0.6294790506362915, + "learning_rate": 0.0001, + "loss": 1.3445, + "step": 11659 + }, + { + "epoch": 1.3393831485842282, + "grad_norm": 0.6401886940002441, + "learning_rate": 0.0001, + "loss": 1.5471, + "step": 11660 + }, + { + "epoch": 1.3394980184940555, + "grad_norm": 0.6541320085525513, + "learning_rate": 0.0001, + "loss": 1.7049, + "step": 11661 + }, + { + "epoch": 1.3396128884038827, + "grad_norm": 0.6253013014793396, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 11662 + }, + { + "epoch": 1.3397277583137097, + "grad_norm": 0.5853136777877808, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 11663 + }, + { + "epoch": 1.3398426282235367, + "grad_norm": 0.638195276260376, + "learning_rate": 0.0001, + "loss": 1.4023, + "step": 11664 + }, + { + "epoch": 1.339957498133364, + "grad_norm": 0.5563250184059143, + "learning_rate": 0.0001, + "loss": 1.4894, + "step": 11665 + }, + { + "epoch": 1.3400723680431912, + "grad_norm": 0.6137783527374268, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 11666 + }, + { + "epoch": 1.3401872379530182, + "grad_norm": 0.5672292709350586, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 11667 + }, + { + "epoch": 1.3403021078628452, + "grad_norm": 0.6190178394317627, + "learning_rate": 0.0001, + "loss": 1.3538, + "step": 11668 + }, + { + "epoch": 1.3404169777726724, + "grad_norm": 0.5791795253753662, + "learning_rate": 0.0001, + "loss": 1.5544, + "step": 11669 + }, + { + "epoch": 1.3405318476824997, + "grad_norm": 0.5504187345504761, + "learning_rate": 0.0001, + "loss": 1.5983, + "step": 11670 + }, + { + "epoch": 1.3406467175923267, + "grad_norm": 0.5945659875869751, + "learning_rate": 0.0001, + "loss": 1.5295, + "step": 11671 + }, + { + "epoch": 1.3407615875021537, + "grad_norm": 0.6134802103042603, + "learning_rate": 0.0001, + "loss": 1.5271, + "step": 11672 + }, + { + "epoch": 1.340876457411981, + "grad_norm": 0.5988081097602844, + "learning_rate": 0.0001, + "loss": 1.6372, + "step": 11673 + }, + { + "epoch": 1.3409913273218081, + "grad_norm": 0.5827198028564453, + "learning_rate": 0.0001, + "loss": 1.5544, + "step": 11674 + }, + { + "epoch": 1.3411061972316352, + "grad_norm": 0.5701010227203369, + "learning_rate": 0.0001, + "loss": 1.3642, + "step": 11675 + }, + { + "epoch": 1.3412210671414622, + "grad_norm": 0.6469464898109436, + "learning_rate": 0.0001, + "loss": 1.5485, + "step": 11676 + }, + { + "epoch": 1.3413359370512894, + "grad_norm": 0.5903701782226562, + "learning_rate": 0.0001, + "loss": 1.5042, + "step": 11677 + }, + { + "epoch": 1.3414508069611166, + "grad_norm": 0.5560135245323181, + "learning_rate": 0.0001, + "loss": 1.4042, + "step": 11678 + }, + { + "epoch": 1.3415656768709436, + "grad_norm": 0.5839285850524902, + "learning_rate": 0.0001, + "loss": 1.5252, + "step": 11679 + }, + { + "epoch": 1.3416805467807706, + "grad_norm": 0.6008885502815247, + "learning_rate": 0.0001, + "loss": 1.7412, + "step": 11680 + }, + { + "epoch": 1.3417954166905979, + "grad_norm": 0.542387843132019, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 11681 + }, + { + "epoch": 1.341910286600425, + "grad_norm": 0.5818324685096741, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 11682 + }, + { + "epoch": 1.3420251565102521, + "grad_norm": 0.5558339953422546, + "learning_rate": 0.0001, + "loss": 1.3248, + "step": 11683 + }, + { + "epoch": 1.3421400264200791, + "grad_norm": 0.5538620948791504, + "learning_rate": 0.0001, + "loss": 1.4956, + "step": 11684 + }, + { + "epoch": 1.3422548963299064, + "grad_norm": 0.5701099038124084, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 11685 + }, + { + "epoch": 1.3423697662397336, + "grad_norm": 0.5815318822860718, + "learning_rate": 0.0001, + "loss": 1.4229, + "step": 11686 + }, + { + "epoch": 1.3424846361495606, + "grad_norm": 0.5730632543563843, + "learning_rate": 0.0001, + "loss": 1.252, + "step": 11687 + }, + { + "epoch": 1.3425995060593878, + "grad_norm": 0.587175726890564, + "learning_rate": 0.0001, + "loss": 1.5821, + "step": 11688 + }, + { + "epoch": 1.3427143759692148, + "grad_norm": 0.6182733774185181, + "learning_rate": 0.0001, + "loss": 1.493, + "step": 11689 + }, + { + "epoch": 1.342829245879042, + "grad_norm": 0.636389434337616, + "learning_rate": 0.0001, + "loss": 1.4714, + "step": 11690 + }, + { + "epoch": 1.342944115788869, + "grad_norm": 0.5567188858985901, + "learning_rate": 0.0001, + "loss": 1.2964, + "step": 11691 + }, + { + "epoch": 1.3430589856986963, + "grad_norm": 0.7240939140319824, + "learning_rate": 0.0001, + "loss": 1.4965, + "step": 11692 + }, + { + "epoch": 1.3431738556085233, + "grad_norm": 0.5807350277900696, + "learning_rate": 0.0001, + "loss": 1.4122, + "step": 11693 + }, + { + "epoch": 1.3432887255183505, + "grad_norm": 0.5738909840583801, + "learning_rate": 0.0001, + "loss": 1.5089, + "step": 11694 + }, + { + "epoch": 1.3434035954281776, + "grad_norm": 0.6102243661880493, + "learning_rate": 0.0001, + "loss": 1.501, + "step": 11695 + }, + { + "epoch": 1.3435184653380048, + "grad_norm": 0.5855363607406616, + "learning_rate": 0.0001, + "loss": 1.4738, + "step": 11696 + }, + { + "epoch": 1.3436333352478318, + "grad_norm": 0.6184929609298706, + "learning_rate": 0.0001, + "loss": 1.3653, + "step": 11697 + }, + { + "epoch": 1.343748205157659, + "grad_norm": 0.6024360060691833, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 11698 + }, + { + "epoch": 1.343863075067486, + "grad_norm": 0.6664621233940125, + "learning_rate": 0.0001, + "loss": 1.4674, + "step": 11699 + }, + { + "epoch": 1.3439779449773133, + "grad_norm": 0.59235018491745, + "learning_rate": 0.0001, + "loss": 1.4277, + "step": 11700 + }, + { + "epoch": 1.3440928148871403, + "grad_norm": 0.5766885876655579, + "learning_rate": 0.0001, + "loss": 1.3778, + "step": 11701 + }, + { + "epoch": 1.3442076847969675, + "grad_norm": 0.6111820936203003, + "learning_rate": 0.0001, + "loss": 1.5498, + "step": 11702 + }, + { + "epoch": 1.3443225547067945, + "grad_norm": 0.5713093280792236, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 11703 + }, + { + "epoch": 1.3444374246166217, + "grad_norm": 0.5723089575767517, + "learning_rate": 0.0001, + "loss": 1.4658, + "step": 11704 + }, + { + "epoch": 1.3445522945264488, + "grad_norm": 0.5831340551376343, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 11705 + }, + { + "epoch": 1.344667164436276, + "grad_norm": 0.5750026702880859, + "learning_rate": 0.0001, + "loss": 1.4922, + "step": 11706 + }, + { + "epoch": 1.344782034346103, + "grad_norm": 0.6101966500282288, + "learning_rate": 0.0001, + "loss": 1.3965, + "step": 11707 + }, + { + "epoch": 1.3448969042559302, + "grad_norm": 0.5940831303596497, + "learning_rate": 0.0001, + "loss": 1.3699, + "step": 11708 + }, + { + "epoch": 1.3450117741657572, + "grad_norm": 0.5816580057144165, + "learning_rate": 0.0001, + "loss": 1.2569, + "step": 11709 + }, + { + "epoch": 1.3451266440755845, + "grad_norm": 0.651119589805603, + "learning_rate": 0.0001, + "loss": 1.454, + "step": 11710 + }, + { + "epoch": 1.3452415139854115, + "grad_norm": 0.5532911419868469, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 11711 + }, + { + "epoch": 1.3453563838952387, + "grad_norm": 0.5616252422332764, + "learning_rate": 0.0001, + "loss": 1.5337, + "step": 11712 + }, + { + "epoch": 1.3454712538050657, + "grad_norm": 0.6075287461280823, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 11713 + }, + { + "epoch": 1.345586123714893, + "grad_norm": 0.5586172938346863, + "learning_rate": 0.0001, + "loss": 1.4328, + "step": 11714 + }, + { + "epoch": 1.34570099362472, + "grad_norm": 0.6126081943511963, + "learning_rate": 0.0001, + "loss": 1.3231, + "step": 11715 + }, + { + "epoch": 1.3458158635345472, + "grad_norm": 0.619485080242157, + "learning_rate": 0.0001, + "loss": 1.5135, + "step": 11716 + }, + { + "epoch": 1.3459307334443742, + "grad_norm": 0.5817553997039795, + "learning_rate": 0.0001, + "loss": 1.5455, + "step": 11717 + }, + { + "epoch": 1.3460456033542014, + "grad_norm": 0.6091306209564209, + "learning_rate": 0.0001, + "loss": 1.1376, + "step": 11718 + }, + { + "epoch": 1.3461604732640284, + "grad_norm": 0.6346269845962524, + "learning_rate": 0.0001, + "loss": 1.5875, + "step": 11719 + }, + { + "epoch": 1.3462753431738557, + "grad_norm": 0.6019834876060486, + "learning_rate": 0.0001, + "loss": 1.1867, + "step": 11720 + }, + { + "epoch": 1.3463902130836827, + "grad_norm": 0.5959239602088928, + "learning_rate": 0.0001, + "loss": 1.5195, + "step": 11721 + }, + { + "epoch": 1.34650508299351, + "grad_norm": 0.5814604759216309, + "learning_rate": 0.0001, + "loss": 1.4918, + "step": 11722 + }, + { + "epoch": 1.346619952903337, + "grad_norm": 0.5841962695121765, + "learning_rate": 0.0001, + "loss": 1.553, + "step": 11723 + }, + { + "epoch": 1.3467348228131641, + "grad_norm": 0.626183032989502, + "learning_rate": 0.0001, + "loss": 1.4611, + "step": 11724 + }, + { + "epoch": 1.3468496927229912, + "grad_norm": 0.5390433073043823, + "learning_rate": 0.0001, + "loss": 1.5152, + "step": 11725 + }, + { + "epoch": 1.3469645626328184, + "grad_norm": 0.6256446242332458, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 11726 + }, + { + "epoch": 1.3470794325426454, + "grad_norm": 0.5755500793457031, + "learning_rate": 0.0001, + "loss": 1.4643, + "step": 11727 + }, + { + "epoch": 1.3471943024524726, + "grad_norm": 0.627149224281311, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 11728 + }, + { + "epoch": 1.3473091723622996, + "grad_norm": 0.5608257055282593, + "learning_rate": 0.0001, + "loss": 1.4637, + "step": 11729 + }, + { + "epoch": 1.3474240422721269, + "grad_norm": 0.5481683611869812, + "learning_rate": 0.0001, + "loss": 1.3497, + "step": 11730 + }, + { + "epoch": 1.3475389121819539, + "grad_norm": 0.5451379418373108, + "learning_rate": 0.0001, + "loss": 1.4324, + "step": 11731 + }, + { + "epoch": 1.347653782091781, + "grad_norm": 0.594100832939148, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 11732 + }, + { + "epoch": 1.3477686520016081, + "grad_norm": 0.5474432706832886, + "learning_rate": 0.0001, + "loss": 1.4132, + "step": 11733 + }, + { + "epoch": 1.3478835219114353, + "grad_norm": 0.5873172283172607, + "learning_rate": 0.0001, + "loss": 1.5379, + "step": 11734 + }, + { + "epoch": 1.3479983918212624, + "grad_norm": 0.5776464939117432, + "learning_rate": 0.0001, + "loss": 1.4039, + "step": 11735 + }, + { + "epoch": 1.3481132617310896, + "grad_norm": 0.5772274732589722, + "learning_rate": 0.0001, + "loss": 1.2844, + "step": 11736 + }, + { + "epoch": 1.3482281316409166, + "grad_norm": 0.6043945550918579, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 11737 + }, + { + "epoch": 1.3483430015507438, + "grad_norm": 0.5803393125534058, + "learning_rate": 0.0001, + "loss": 1.3332, + "step": 11738 + }, + { + "epoch": 1.3484578714605708, + "grad_norm": 0.5934081673622131, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 11739 + }, + { + "epoch": 1.348572741370398, + "grad_norm": 0.746346652507782, + "learning_rate": 0.0001, + "loss": 1.4929, + "step": 11740 + }, + { + "epoch": 1.348687611280225, + "grad_norm": 0.5849841833114624, + "learning_rate": 0.0001, + "loss": 1.4559, + "step": 11741 + }, + { + "epoch": 1.3488024811900523, + "grad_norm": 0.54823237657547, + "learning_rate": 0.0001, + "loss": 1.3391, + "step": 11742 + }, + { + "epoch": 1.3489173510998793, + "grad_norm": 0.5797605514526367, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 11743 + }, + { + "epoch": 1.3490322210097065, + "grad_norm": 0.6208351850509644, + "learning_rate": 0.0001, + "loss": 1.7252, + "step": 11744 + }, + { + "epoch": 1.3491470909195336, + "grad_norm": 0.599469780921936, + "learning_rate": 0.0001, + "loss": 1.4858, + "step": 11745 + }, + { + "epoch": 1.3492619608293608, + "grad_norm": 0.5936621427536011, + "learning_rate": 0.0001, + "loss": 1.4009, + "step": 11746 + }, + { + "epoch": 1.3493768307391878, + "grad_norm": 0.5813693404197693, + "learning_rate": 0.0001, + "loss": 1.4898, + "step": 11747 + }, + { + "epoch": 1.349491700649015, + "grad_norm": 0.686309278011322, + "learning_rate": 0.0001, + "loss": 1.4567, + "step": 11748 + }, + { + "epoch": 1.349606570558842, + "grad_norm": 0.5535640120506287, + "learning_rate": 0.0001, + "loss": 1.3404, + "step": 11749 + }, + { + "epoch": 1.3497214404686693, + "grad_norm": 0.6416987776756287, + "learning_rate": 0.0001, + "loss": 1.6725, + "step": 11750 + }, + { + "epoch": 1.3498363103784963, + "grad_norm": 0.5824869275093079, + "learning_rate": 0.0001, + "loss": 1.3471, + "step": 11751 + }, + { + "epoch": 1.3499511802883235, + "grad_norm": 0.5436984896659851, + "learning_rate": 0.0001, + "loss": 1.4902, + "step": 11752 + }, + { + "epoch": 1.3500660501981505, + "grad_norm": 0.5425761342048645, + "learning_rate": 0.0001, + "loss": 1.424, + "step": 11753 + }, + { + "epoch": 1.3501809201079777, + "grad_norm": 0.5270239114761353, + "learning_rate": 0.0001, + "loss": 1.1909, + "step": 11754 + }, + { + "epoch": 1.3502957900178048, + "grad_norm": 0.5511675477027893, + "learning_rate": 0.0001, + "loss": 1.5399, + "step": 11755 + }, + { + "epoch": 1.350410659927632, + "grad_norm": 0.5438040494918823, + "learning_rate": 0.0001, + "loss": 1.4739, + "step": 11756 + }, + { + "epoch": 1.350525529837459, + "grad_norm": 0.5267738699913025, + "learning_rate": 0.0001, + "loss": 1.4464, + "step": 11757 + }, + { + "epoch": 1.3506403997472862, + "grad_norm": 0.5484282374382019, + "learning_rate": 0.0001, + "loss": 1.4149, + "step": 11758 + }, + { + "epoch": 1.3507552696571135, + "grad_norm": 0.5937207341194153, + "learning_rate": 0.0001, + "loss": 1.3918, + "step": 11759 + }, + { + "epoch": 1.3508701395669405, + "grad_norm": 0.5717691779136658, + "learning_rate": 0.0001, + "loss": 1.392, + "step": 11760 + }, + { + "epoch": 1.3509850094767675, + "grad_norm": 0.5394260287284851, + "learning_rate": 0.0001, + "loss": 1.426, + "step": 11761 + }, + { + "epoch": 1.3510998793865947, + "grad_norm": 0.6104713082313538, + "learning_rate": 0.0001, + "loss": 1.4614, + "step": 11762 + }, + { + "epoch": 1.351214749296422, + "grad_norm": 0.5929358601570129, + "learning_rate": 0.0001, + "loss": 1.4999, + "step": 11763 + }, + { + "epoch": 1.351329619206249, + "grad_norm": 0.581203818321228, + "learning_rate": 0.0001, + "loss": 1.4168, + "step": 11764 + }, + { + "epoch": 1.351444489116076, + "grad_norm": 0.5509240031242371, + "learning_rate": 0.0001, + "loss": 1.3605, + "step": 11765 + }, + { + "epoch": 1.3515593590259032, + "grad_norm": 0.5640619993209839, + "learning_rate": 0.0001, + "loss": 1.4869, + "step": 11766 + }, + { + "epoch": 1.3516742289357304, + "grad_norm": 0.5916575193405151, + "learning_rate": 0.0001, + "loss": 1.4334, + "step": 11767 + }, + { + "epoch": 1.3517890988455574, + "grad_norm": 0.573228120803833, + "learning_rate": 0.0001, + "loss": 1.4595, + "step": 11768 + }, + { + "epoch": 1.3519039687553844, + "grad_norm": 0.662548840045929, + "learning_rate": 0.0001, + "loss": 1.4985, + "step": 11769 + }, + { + "epoch": 1.3520188386652117, + "grad_norm": 0.5832405090332031, + "learning_rate": 0.0001, + "loss": 1.3511, + "step": 11770 + }, + { + "epoch": 1.352133708575039, + "grad_norm": 0.6026256084442139, + "learning_rate": 0.0001, + "loss": 1.4318, + "step": 11771 + }, + { + "epoch": 1.352248578484866, + "grad_norm": 0.5654007792472839, + "learning_rate": 0.0001, + "loss": 1.3918, + "step": 11772 + }, + { + "epoch": 1.352363448394693, + "grad_norm": 0.6701568961143494, + "learning_rate": 0.0001, + "loss": 1.6647, + "step": 11773 + }, + { + "epoch": 1.3524783183045201, + "grad_norm": 0.5737709999084473, + "learning_rate": 0.0001, + "loss": 1.291, + "step": 11774 + }, + { + "epoch": 1.3525931882143474, + "grad_norm": 0.6998119950294495, + "learning_rate": 0.0001, + "loss": 1.7295, + "step": 11775 + }, + { + "epoch": 1.3527080581241744, + "grad_norm": 0.645850658416748, + "learning_rate": 0.0001, + "loss": 1.4265, + "step": 11776 + }, + { + "epoch": 1.3528229280340014, + "grad_norm": 0.5715954899787903, + "learning_rate": 0.0001, + "loss": 1.282, + "step": 11777 + }, + { + "epoch": 1.3529377979438286, + "grad_norm": 0.5338268280029297, + "learning_rate": 0.0001, + "loss": 1.3373, + "step": 11778 + }, + { + "epoch": 1.3530526678536559, + "grad_norm": 0.5903444290161133, + "learning_rate": 0.0001, + "loss": 1.4409, + "step": 11779 + }, + { + "epoch": 1.3531675377634829, + "grad_norm": 0.5709140300750732, + "learning_rate": 0.0001, + "loss": 1.4397, + "step": 11780 + }, + { + "epoch": 1.3532824076733099, + "grad_norm": 0.6345384120941162, + "learning_rate": 0.0001, + "loss": 1.4992, + "step": 11781 + }, + { + "epoch": 1.353397277583137, + "grad_norm": 0.6136090755462646, + "learning_rate": 0.0001, + "loss": 1.416, + "step": 11782 + }, + { + "epoch": 1.3535121474929643, + "grad_norm": 0.6243228316307068, + "learning_rate": 0.0001, + "loss": 1.4065, + "step": 11783 + }, + { + "epoch": 1.3536270174027913, + "grad_norm": 0.6512452960014343, + "learning_rate": 0.0001, + "loss": 1.5381, + "step": 11784 + }, + { + "epoch": 1.3537418873126184, + "grad_norm": 0.5914151668548584, + "learning_rate": 0.0001, + "loss": 1.3474, + "step": 11785 + }, + { + "epoch": 1.3538567572224456, + "grad_norm": 0.5646949410438538, + "learning_rate": 0.0001, + "loss": 1.4334, + "step": 11786 + }, + { + "epoch": 1.3539716271322728, + "grad_norm": 0.5643793940544128, + "learning_rate": 0.0001, + "loss": 1.3393, + "step": 11787 + }, + { + "epoch": 1.3540864970420998, + "grad_norm": 0.5716279745101929, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 11788 + }, + { + "epoch": 1.3542013669519268, + "grad_norm": 0.5285089612007141, + "learning_rate": 0.0001, + "loss": 1.2825, + "step": 11789 + }, + { + "epoch": 1.354316236861754, + "grad_norm": 0.5500566363334656, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 11790 + }, + { + "epoch": 1.3544311067715813, + "grad_norm": 0.556171715259552, + "learning_rate": 0.0001, + "loss": 1.4946, + "step": 11791 + }, + { + "epoch": 1.3545459766814083, + "grad_norm": 0.62319415807724, + "learning_rate": 0.0001, + "loss": 1.5568, + "step": 11792 + }, + { + "epoch": 1.3546608465912353, + "grad_norm": 0.5419564843177795, + "learning_rate": 0.0001, + "loss": 1.2894, + "step": 11793 + }, + { + "epoch": 1.3547757165010625, + "grad_norm": 0.581043541431427, + "learning_rate": 0.0001, + "loss": 1.4787, + "step": 11794 + }, + { + "epoch": 1.3548905864108898, + "grad_norm": 0.5013282895088196, + "learning_rate": 0.0001, + "loss": 1.1919, + "step": 11795 + }, + { + "epoch": 1.3550054563207168, + "grad_norm": 0.6230014562606812, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 11796 + }, + { + "epoch": 1.3551203262305438, + "grad_norm": 0.5700322389602661, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 11797 + }, + { + "epoch": 1.355235196140371, + "grad_norm": 0.5606763362884521, + "learning_rate": 0.0001, + "loss": 1.4711, + "step": 11798 + }, + { + "epoch": 1.3553500660501983, + "grad_norm": 0.5901811122894287, + "learning_rate": 0.0001, + "loss": 1.7085, + "step": 11799 + }, + { + "epoch": 1.3554649359600253, + "grad_norm": 0.5982941389083862, + "learning_rate": 0.0001, + "loss": 1.4593, + "step": 11800 + }, + { + "epoch": 1.3555798058698523, + "grad_norm": 0.5848938822746277, + "learning_rate": 0.0001, + "loss": 1.466, + "step": 11801 + }, + { + "epoch": 1.3556946757796795, + "grad_norm": 0.6245958209037781, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 11802 + }, + { + "epoch": 1.3558095456895067, + "grad_norm": 0.6128213405609131, + "learning_rate": 0.0001, + "loss": 1.5102, + "step": 11803 + }, + { + "epoch": 1.3559244155993337, + "grad_norm": 0.5603303909301758, + "learning_rate": 0.0001, + "loss": 1.3408, + "step": 11804 + }, + { + "epoch": 1.3560392855091608, + "grad_norm": 0.6169303059577942, + "learning_rate": 0.0001, + "loss": 1.5766, + "step": 11805 + }, + { + "epoch": 1.356154155418988, + "grad_norm": 0.5828307867050171, + "learning_rate": 0.0001, + "loss": 1.35, + "step": 11806 + }, + { + "epoch": 1.3562690253288152, + "grad_norm": 0.6106387972831726, + "learning_rate": 0.0001, + "loss": 1.4773, + "step": 11807 + }, + { + "epoch": 1.3563838952386422, + "grad_norm": 0.6269760131835938, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 11808 + }, + { + "epoch": 1.3564987651484692, + "grad_norm": 0.6034385561943054, + "learning_rate": 0.0001, + "loss": 1.5904, + "step": 11809 + }, + { + "epoch": 1.3566136350582965, + "grad_norm": 0.5654463768005371, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 11810 + }, + { + "epoch": 1.3567285049681237, + "grad_norm": 0.5962846875190735, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 11811 + }, + { + "epoch": 1.3568433748779507, + "grad_norm": 0.6377612352371216, + "learning_rate": 0.0001, + "loss": 1.7367, + "step": 11812 + }, + { + "epoch": 1.3569582447877777, + "grad_norm": 0.5528261065483093, + "learning_rate": 0.0001, + "loss": 1.3851, + "step": 11813 + }, + { + "epoch": 1.357073114697605, + "grad_norm": 0.5887748599052429, + "learning_rate": 0.0001, + "loss": 1.4795, + "step": 11814 + }, + { + "epoch": 1.3571879846074322, + "grad_norm": 0.6080367565155029, + "learning_rate": 0.0001, + "loss": 1.6172, + "step": 11815 + }, + { + "epoch": 1.3573028545172592, + "grad_norm": 0.5874114632606506, + "learning_rate": 0.0001, + "loss": 1.3564, + "step": 11816 + }, + { + "epoch": 1.3574177244270862, + "grad_norm": 0.5622687339782715, + "learning_rate": 0.0001, + "loss": 1.4871, + "step": 11817 + }, + { + "epoch": 1.3575325943369134, + "grad_norm": 0.5545808672904968, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 11818 + }, + { + "epoch": 1.3576474642467407, + "grad_norm": 0.5163676738739014, + "learning_rate": 0.0001, + "loss": 1.3761, + "step": 11819 + }, + { + "epoch": 1.3577623341565677, + "grad_norm": 0.5534794330596924, + "learning_rate": 0.0001, + "loss": 1.4561, + "step": 11820 + }, + { + "epoch": 1.3578772040663947, + "grad_norm": 0.582586407661438, + "learning_rate": 0.0001, + "loss": 1.7145, + "step": 11821 + }, + { + "epoch": 1.357992073976222, + "grad_norm": 0.5760107040405273, + "learning_rate": 0.0001, + "loss": 1.329, + "step": 11822 + }, + { + "epoch": 1.3581069438860491, + "grad_norm": 0.5939583778381348, + "learning_rate": 0.0001, + "loss": 1.3137, + "step": 11823 + }, + { + "epoch": 1.3582218137958761, + "grad_norm": 0.6262675523757935, + "learning_rate": 0.0001, + "loss": 1.5452, + "step": 11824 + }, + { + "epoch": 1.3583366837057034, + "grad_norm": 0.5676589012145996, + "learning_rate": 0.0001, + "loss": 1.525, + "step": 11825 + }, + { + "epoch": 1.3584515536155304, + "grad_norm": 0.5600330233573914, + "learning_rate": 0.0001, + "loss": 1.3734, + "step": 11826 + }, + { + "epoch": 1.3585664235253576, + "grad_norm": 0.6240646243095398, + "learning_rate": 0.0001, + "loss": 1.5052, + "step": 11827 + }, + { + "epoch": 1.3586812934351846, + "grad_norm": 0.6038419008255005, + "learning_rate": 0.0001, + "loss": 1.4135, + "step": 11828 + }, + { + "epoch": 1.3587961633450119, + "grad_norm": 0.5694374442100525, + "learning_rate": 0.0001, + "loss": 1.5767, + "step": 11829 + }, + { + "epoch": 1.3589110332548389, + "grad_norm": 0.5696335434913635, + "learning_rate": 0.0001, + "loss": 1.3895, + "step": 11830 + }, + { + "epoch": 1.359025903164666, + "grad_norm": 0.6053335666656494, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 11831 + }, + { + "epoch": 1.359140773074493, + "grad_norm": 0.5585938096046448, + "learning_rate": 0.0001, + "loss": 1.4728, + "step": 11832 + }, + { + "epoch": 1.3592556429843203, + "grad_norm": 0.5684964060783386, + "learning_rate": 0.0001, + "loss": 1.4404, + "step": 11833 + }, + { + "epoch": 1.3593705128941473, + "grad_norm": 0.5710092782974243, + "learning_rate": 0.0001, + "loss": 1.5633, + "step": 11834 + }, + { + "epoch": 1.3594853828039746, + "grad_norm": 0.545071542263031, + "learning_rate": 0.0001, + "loss": 1.3686, + "step": 11835 + }, + { + "epoch": 1.3596002527138016, + "grad_norm": 0.5682846307754517, + "learning_rate": 0.0001, + "loss": 1.4138, + "step": 11836 + }, + { + "epoch": 1.3597151226236288, + "grad_norm": 0.6198849081993103, + "learning_rate": 0.0001, + "loss": 1.4155, + "step": 11837 + }, + { + "epoch": 1.3598299925334558, + "grad_norm": 0.6224766969680786, + "learning_rate": 0.0001, + "loss": 1.4831, + "step": 11838 + }, + { + "epoch": 1.359944862443283, + "grad_norm": 0.5848830938339233, + "learning_rate": 0.0001, + "loss": 1.3544, + "step": 11839 + }, + { + "epoch": 1.36005973235311, + "grad_norm": 0.6136349439620972, + "learning_rate": 0.0001, + "loss": 1.5558, + "step": 11840 + }, + { + "epoch": 1.3601746022629373, + "grad_norm": 0.6168116927146912, + "learning_rate": 0.0001, + "loss": 1.4871, + "step": 11841 + }, + { + "epoch": 1.3602894721727643, + "grad_norm": 0.5960310697555542, + "learning_rate": 0.0001, + "loss": 1.4425, + "step": 11842 + }, + { + "epoch": 1.3604043420825915, + "grad_norm": 0.731585681438446, + "learning_rate": 0.0001, + "loss": 1.6861, + "step": 11843 + }, + { + "epoch": 1.3605192119924185, + "grad_norm": 0.6656792759895325, + "learning_rate": 0.0001, + "loss": 1.5898, + "step": 11844 + }, + { + "epoch": 1.3606340819022458, + "grad_norm": 0.5910264849662781, + "learning_rate": 0.0001, + "loss": 1.3898, + "step": 11845 + }, + { + "epoch": 1.3607489518120728, + "grad_norm": 0.6095188856124878, + "learning_rate": 0.0001, + "loss": 1.5024, + "step": 11846 + }, + { + "epoch": 1.3608638217219, + "grad_norm": 0.5570575594902039, + "learning_rate": 0.0001, + "loss": 1.3486, + "step": 11847 + }, + { + "epoch": 1.360978691631727, + "grad_norm": 0.5648567080497742, + "learning_rate": 0.0001, + "loss": 1.4618, + "step": 11848 + }, + { + "epoch": 1.3610935615415543, + "grad_norm": 0.5962167978286743, + "learning_rate": 0.0001, + "loss": 1.5203, + "step": 11849 + }, + { + "epoch": 1.3612084314513813, + "grad_norm": 0.6409988403320312, + "learning_rate": 0.0001, + "loss": 1.367, + "step": 11850 + }, + { + "epoch": 1.3613233013612085, + "grad_norm": 0.6274490356445312, + "learning_rate": 0.0001, + "loss": 1.4525, + "step": 11851 + }, + { + "epoch": 1.3614381712710355, + "grad_norm": 0.541801393032074, + "learning_rate": 0.0001, + "loss": 1.4423, + "step": 11852 + }, + { + "epoch": 1.3615530411808627, + "grad_norm": 0.6285209655761719, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 11853 + }, + { + "epoch": 1.3616679110906897, + "grad_norm": 0.5430086255073547, + "learning_rate": 0.0001, + "loss": 1.4768, + "step": 11854 + }, + { + "epoch": 1.361782781000517, + "grad_norm": 0.5744656324386597, + "learning_rate": 0.0001, + "loss": 1.3223, + "step": 11855 + }, + { + "epoch": 1.361897650910344, + "grad_norm": 0.6095870733261108, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 11856 + }, + { + "epoch": 1.3620125208201712, + "grad_norm": 0.6219245791435242, + "learning_rate": 0.0001, + "loss": 1.5718, + "step": 11857 + }, + { + "epoch": 1.3621273907299982, + "grad_norm": 0.5849238038063049, + "learning_rate": 0.0001, + "loss": 1.5043, + "step": 11858 + }, + { + "epoch": 1.3622422606398255, + "grad_norm": 0.6130156517028809, + "learning_rate": 0.0001, + "loss": 1.3665, + "step": 11859 + }, + { + "epoch": 1.3623571305496525, + "grad_norm": 0.6351000070571899, + "learning_rate": 0.0001, + "loss": 1.5092, + "step": 11860 + }, + { + "epoch": 1.3624720004594797, + "grad_norm": 0.5915348529815674, + "learning_rate": 0.0001, + "loss": 1.4129, + "step": 11861 + }, + { + "epoch": 1.3625868703693067, + "grad_norm": 0.6348772644996643, + "learning_rate": 0.0001, + "loss": 1.5171, + "step": 11862 + }, + { + "epoch": 1.362701740279134, + "grad_norm": 0.6160379648208618, + "learning_rate": 0.0001, + "loss": 1.4824, + "step": 11863 + }, + { + "epoch": 1.362816610188961, + "grad_norm": 0.6016219258308411, + "learning_rate": 0.0001, + "loss": 1.4674, + "step": 11864 + }, + { + "epoch": 1.3629314800987882, + "grad_norm": 0.6062735915184021, + "learning_rate": 0.0001, + "loss": 1.3464, + "step": 11865 + }, + { + "epoch": 1.3630463500086152, + "grad_norm": 0.6335691809654236, + "learning_rate": 0.0001, + "loss": 1.2892, + "step": 11866 + }, + { + "epoch": 1.3631612199184424, + "grad_norm": 0.597547173500061, + "learning_rate": 0.0001, + "loss": 1.398, + "step": 11867 + }, + { + "epoch": 1.3632760898282694, + "grad_norm": 0.6174020767211914, + "learning_rate": 0.0001, + "loss": 1.561, + "step": 11868 + }, + { + "epoch": 1.3633909597380967, + "grad_norm": 0.5814570784568787, + "learning_rate": 0.0001, + "loss": 1.2317, + "step": 11869 + }, + { + "epoch": 1.3635058296479237, + "grad_norm": 0.5660645365715027, + "learning_rate": 0.0001, + "loss": 1.3388, + "step": 11870 + }, + { + "epoch": 1.363620699557751, + "grad_norm": 0.5966171026229858, + "learning_rate": 0.0001, + "loss": 1.4789, + "step": 11871 + }, + { + "epoch": 1.363735569467578, + "grad_norm": 0.6484749913215637, + "learning_rate": 0.0001, + "loss": 1.5419, + "step": 11872 + }, + { + "epoch": 1.3638504393774051, + "grad_norm": 0.6023862361907959, + "learning_rate": 0.0001, + "loss": 1.4739, + "step": 11873 + }, + { + "epoch": 1.3639653092872321, + "grad_norm": 0.6137213706970215, + "learning_rate": 0.0001, + "loss": 1.5059, + "step": 11874 + }, + { + "epoch": 1.3640801791970594, + "grad_norm": 0.5899903774261475, + "learning_rate": 0.0001, + "loss": 1.6593, + "step": 11875 + }, + { + "epoch": 1.3641950491068864, + "grad_norm": 0.6372878551483154, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 11876 + }, + { + "epoch": 1.3643099190167136, + "grad_norm": 0.6417999863624573, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 11877 + }, + { + "epoch": 1.3644247889265406, + "grad_norm": 0.6401478052139282, + "learning_rate": 0.0001, + "loss": 1.5189, + "step": 11878 + }, + { + "epoch": 1.3645396588363679, + "grad_norm": 0.5522693395614624, + "learning_rate": 0.0001, + "loss": 1.4322, + "step": 11879 + }, + { + "epoch": 1.3646545287461949, + "grad_norm": 0.5718218088150024, + "learning_rate": 0.0001, + "loss": 1.445, + "step": 11880 + }, + { + "epoch": 1.364769398656022, + "grad_norm": 0.5563505291938782, + "learning_rate": 0.0001, + "loss": 1.5453, + "step": 11881 + }, + { + "epoch": 1.364884268565849, + "grad_norm": 0.5796937346458435, + "learning_rate": 0.0001, + "loss": 1.6481, + "step": 11882 + }, + { + "epoch": 1.3649991384756763, + "grad_norm": 0.5500465035438538, + "learning_rate": 0.0001, + "loss": 1.3196, + "step": 11883 + }, + { + "epoch": 1.3651140083855033, + "grad_norm": 0.5704468488693237, + "learning_rate": 0.0001, + "loss": 1.4476, + "step": 11884 + }, + { + "epoch": 1.3652288782953306, + "grad_norm": 0.6012165546417236, + "learning_rate": 0.0001, + "loss": 1.3108, + "step": 11885 + }, + { + "epoch": 1.3653437482051576, + "grad_norm": 0.5745958089828491, + "learning_rate": 0.0001, + "loss": 1.5241, + "step": 11886 + }, + { + "epoch": 1.3654586181149848, + "grad_norm": 0.5989466905593872, + "learning_rate": 0.0001, + "loss": 1.4294, + "step": 11887 + }, + { + "epoch": 1.3655734880248118, + "grad_norm": 0.5610129237174988, + "learning_rate": 0.0001, + "loss": 1.4047, + "step": 11888 + }, + { + "epoch": 1.365688357934639, + "grad_norm": 0.5722390413284302, + "learning_rate": 0.0001, + "loss": 1.5388, + "step": 11889 + }, + { + "epoch": 1.365803227844466, + "grad_norm": 0.5391660332679749, + "learning_rate": 0.0001, + "loss": 1.4419, + "step": 11890 + }, + { + "epoch": 1.3659180977542933, + "grad_norm": 0.6333369016647339, + "learning_rate": 0.0001, + "loss": 1.5286, + "step": 11891 + }, + { + "epoch": 1.3660329676641203, + "grad_norm": 0.5749605298042297, + "learning_rate": 0.0001, + "loss": 1.5987, + "step": 11892 + }, + { + "epoch": 1.3661478375739475, + "grad_norm": 0.5995118021965027, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 11893 + }, + { + "epoch": 1.3662627074837745, + "grad_norm": 0.6667972207069397, + "learning_rate": 0.0001, + "loss": 1.7452, + "step": 11894 + }, + { + "epoch": 1.3663775773936018, + "grad_norm": 0.5708993077278137, + "learning_rate": 0.0001, + "loss": 1.5707, + "step": 11895 + }, + { + "epoch": 1.366492447303429, + "grad_norm": 0.5501135587692261, + "learning_rate": 0.0001, + "loss": 1.4129, + "step": 11896 + }, + { + "epoch": 1.366607317213256, + "grad_norm": 0.5960387587547302, + "learning_rate": 0.0001, + "loss": 1.4311, + "step": 11897 + }, + { + "epoch": 1.366722187123083, + "grad_norm": 0.6244403123855591, + "learning_rate": 0.0001, + "loss": 1.5404, + "step": 11898 + }, + { + "epoch": 1.3668370570329103, + "grad_norm": 0.5725081562995911, + "learning_rate": 0.0001, + "loss": 1.4793, + "step": 11899 + }, + { + "epoch": 1.3669519269427375, + "grad_norm": 0.5716732144355774, + "learning_rate": 0.0001, + "loss": 1.2992, + "step": 11900 + }, + { + "epoch": 1.3670667968525645, + "grad_norm": 0.6741322875022888, + "learning_rate": 0.0001, + "loss": 1.6103, + "step": 11901 + }, + { + "epoch": 1.3671816667623915, + "grad_norm": 0.5683753490447998, + "learning_rate": 0.0001, + "loss": 1.4798, + "step": 11902 + }, + { + "epoch": 1.3672965366722187, + "grad_norm": 0.5858237743377686, + "learning_rate": 0.0001, + "loss": 1.493, + "step": 11903 + }, + { + "epoch": 1.367411406582046, + "grad_norm": 0.6018313765525818, + "learning_rate": 0.0001, + "loss": 1.3942, + "step": 11904 + }, + { + "epoch": 1.367526276491873, + "grad_norm": 0.5790035724639893, + "learning_rate": 0.0001, + "loss": 1.3989, + "step": 11905 + }, + { + "epoch": 1.3676411464017, + "grad_norm": 0.5737671256065369, + "learning_rate": 0.0001, + "loss": 1.3992, + "step": 11906 + }, + { + "epoch": 1.3677560163115272, + "grad_norm": 0.5786615610122681, + "learning_rate": 0.0001, + "loss": 1.4582, + "step": 11907 + }, + { + "epoch": 1.3678708862213544, + "grad_norm": 0.6264104247093201, + "learning_rate": 0.0001, + "loss": 1.429, + "step": 11908 + }, + { + "epoch": 1.3679857561311815, + "grad_norm": 0.5502702593803406, + "learning_rate": 0.0001, + "loss": 1.4506, + "step": 11909 + }, + { + "epoch": 1.3681006260410085, + "grad_norm": 0.5532498359680176, + "learning_rate": 0.0001, + "loss": 1.3379, + "step": 11910 + }, + { + "epoch": 1.3682154959508357, + "grad_norm": 0.5454950332641602, + "learning_rate": 0.0001, + "loss": 1.3526, + "step": 11911 + }, + { + "epoch": 1.368330365860663, + "grad_norm": 0.6295995712280273, + "learning_rate": 0.0001, + "loss": 1.4195, + "step": 11912 + }, + { + "epoch": 1.36844523577049, + "grad_norm": 0.6341611742973328, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 11913 + }, + { + "epoch": 1.368560105680317, + "grad_norm": 0.620488703250885, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 11914 + }, + { + "epoch": 1.3686749755901442, + "grad_norm": 0.5347122550010681, + "learning_rate": 0.0001, + "loss": 1.46, + "step": 11915 + }, + { + "epoch": 1.3687898454999714, + "grad_norm": 0.5783818960189819, + "learning_rate": 0.0001, + "loss": 1.2346, + "step": 11916 + }, + { + "epoch": 1.3689047154097984, + "grad_norm": 0.5772425532341003, + "learning_rate": 0.0001, + "loss": 1.4085, + "step": 11917 + }, + { + "epoch": 1.3690195853196254, + "grad_norm": 0.5653284788131714, + "learning_rate": 0.0001, + "loss": 1.5086, + "step": 11918 + }, + { + "epoch": 1.3691344552294527, + "grad_norm": 0.6505599021911621, + "learning_rate": 0.0001, + "loss": 1.4088, + "step": 11919 + }, + { + "epoch": 1.3692493251392799, + "grad_norm": 0.6298074126243591, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 11920 + }, + { + "epoch": 1.369364195049107, + "grad_norm": 0.5957285761833191, + "learning_rate": 0.0001, + "loss": 1.4399, + "step": 11921 + }, + { + "epoch": 1.369479064958934, + "grad_norm": 0.5449360609054565, + "learning_rate": 0.0001, + "loss": 1.2528, + "step": 11922 + }, + { + "epoch": 1.3695939348687611, + "grad_norm": 0.6365380883216858, + "learning_rate": 0.0001, + "loss": 1.1535, + "step": 11923 + }, + { + "epoch": 1.3697088047785884, + "grad_norm": 0.668751060962677, + "learning_rate": 0.0001, + "loss": 1.4752, + "step": 11924 + }, + { + "epoch": 1.3698236746884154, + "grad_norm": 0.631753146648407, + "learning_rate": 0.0001, + "loss": 1.1487, + "step": 11925 + }, + { + "epoch": 1.3699385445982424, + "grad_norm": 0.5405638813972473, + "learning_rate": 0.0001, + "loss": 1.1319, + "step": 11926 + }, + { + "epoch": 1.3700534145080696, + "grad_norm": 0.6844695806503296, + "learning_rate": 0.0001, + "loss": 1.4908, + "step": 11927 + }, + { + "epoch": 1.3701682844178968, + "grad_norm": 0.6165083050727844, + "learning_rate": 0.0001, + "loss": 1.4513, + "step": 11928 + }, + { + "epoch": 1.3702831543277239, + "grad_norm": 0.5696147084236145, + "learning_rate": 0.0001, + "loss": 1.4323, + "step": 11929 + }, + { + "epoch": 1.3703980242375509, + "grad_norm": 0.6744541525840759, + "learning_rate": 0.0001, + "loss": 1.6197, + "step": 11930 + }, + { + "epoch": 1.370512894147378, + "grad_norm": 0.5944900512695312, + "learning_rate": 0.0001, + "loss": 1.3649, + "step": 11931 + }, + { + "epoch": 1.3706277640572053, + "grad_norm": 0.60816490650177, + "learning_rate": 0.0001, + "loss": 1.5909, + "step": 11932 + }, + { + "epoch": 1.3707426339670323, + "grad_norm": 0.5886834859848022, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 11933 + }, + { + "epoch": 1.3708575038768593, + "grad_norm": 0.5649728178977966, + "learning_rate": 0.0001, + "loss": 1.4151, + "step": 11934 + }, + { + "epoch": 1.3709723737866866, + "grad_norm": 0.5838742852210999, + "learning_rate": 0.0001, + "loss": 1.4892, + "step": 11935 + }, + { + "epoch": 1.3710872436965138, + "grad_norm": 0.556206464767456, + "learning_rate": 0.0001, + "loss": 1.4125, + "step": 11936 + }, + { + "epoch": 1.3712021136063408, + "grad_norm": 0.5563154816627502, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 11937 + }, + { + "epoch": 1.3713169835161678, + "grad_norm": 0.5660498738288879, + "learning_rate": 0.0001, + "loss": 1.4787, + "step": 11938 + }, + { + "epoch": 1.371431853425995, + "grad_norm": 0.604303777217865, + "learning_rate": 0.0001, + "loss": 1.5178, + "step": 11939 + }, + { + "epoch": 1.3715467233358223, + "grad_norm": 0.624233067035675, + "learning_rate": 0.0001, + "loss": 1.4388, + "step": 11940 + }, + { + "epoch": 1.3716615932456493, + "grad_norm": 0.5636304020881653, + "learning_rate": 0.0001, + "loss": 1.2747, + "step": 11941 + }, + { + "epoch": 1.3717764631554763, + "grad_norm": 0.543554425239563, + "learning_rate": 0.0001, + "loss": 1.4925, + "step": 11942 + }, + { + "epoch": 1.3718913330653035, + "grad_norm": 0.5257536768913269, + "learning_rate": 0.0001, + "loss": 1.1757, + "step": 11943 + }, + { + "epoch": 1.3720062029751308, + "grad_norm": 0.6228072047233582, + "learning_rate": 0.0001, + "loss": 1.3553, + "step": 11944 + }, + { + "epoch": 1.3721210728849578, + "grad_norm": 0.7739028334617615, + "learning_rate": 0.0001, + "loss": 1.355, + "step": 11945 + }, + { + "epoch": 1.3722359427947848, + "grad_norm": 0.6044634580612183, + "learning_rate": 0.0001, + "loss": 1.5408, + "step": 11946 + }, + { + "epoch": 1.372350812704612, + "grad_norm": 0.6621956825256348, + "learning_rate": 0.0001, + "loss": 1.6615, + "step": 11947 + }, + { + "epoch": 1.3724656826144392, + "grad_norm": 0.5978637933731079, + "learning_rate": 0.0001, + "loss": 1.504, + "step": 11948 + }, + { + "epoch": 1.3725805525242663, + "grad_norm": 0.5536312460899353, + "learning_rate": 0.0001, + "loss": 1.4278, + "step": 11949 + }, + { + "epoch": 1.3726954224340933, + "grad_norm": 0.5836893320083618, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 11950 + }, + { + "epoch": 1.3728102923439205, + "grad_norm": 0.5607746243476868, + "learning_rate": 0.0001, + "loss": 1.5454, + "step": 11951 + }, + { + "epoch": 1.3729251622537477, + "grad_norm": 0.7326905727386475, + "learning_rate": 0.0001, + "loss": 1.7962, + "step": 11952 + }, + { + "epoch": 1.3730400321635747, + "grad_norm": 0.5599920749664307, + "learning_rate": 0.0001, + "loss": 1.4866, + "step": 11953 + }, + { + "epoch": 1.3731549020734017, + "grad_norm": 0.5913311839103699, + "learning_rate": 0.0001, + "loss": 1.5448, + "step": 11954 + }, + { + "epoch": 1.373269771983229, + "grad_norm": 0.596919059753418, + "learning_rate": 0.0001, + "loss": 1.3861, + "step": 11955 + }, + { + "epoch": 1.3733846418930562, + "grad_norm": 0.6204702854156494, + "learning_rate": 0.0001, + "loss": 1.572, + "step": 11956 + }, + { + "epoch": 1.3734995118028832, + "grad_norm": 0.6467247605323792, + "learning_rate": 0.0001, + "loss": 1.5979, + "step": 11957 + }, + { + "epoch": 1.3736143817127102, + "grad_norm": 0.6159607768058777, + "learning_rate": 0.0001, + "loss": 1.5928, + "step": 11958 + }, + { + "epoch": 1.3737292516225375, + "grad_norm": 0.5385196208953857, + "learning_rate": 0.0001, + "loss": 1.3476, + "step": 11959 + }, + { + "epoch": 1.3738441215323647, + "grad_norm": 0.5562816262245178, + "learning_rate": 0.0001, + "loss": 1.354, + "step": 11960 + }, + { + "epoch": 1.3739589914421917, + "grad_norm": 0.5553545355796814, + "learning_rate": 0.0001, + "loss": 1.5637, + "step": 11961 + }, + { + "epoch": 1.374073861352019, + "grad_norm": 0.5768440365791321, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 11962 + }, + { + "epoch": 1.374188731261846, + "grad_norm": 0.6480252146720886, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 11963 + }, + { + "epoch": 1.3743036011716732, + "grad_norm": 0.5714027881622314, + "learning_rate": 0.0001, + "loss": 1.651, + "step": 11964 + }, + { + "epoch": 1.3744184710815002, + "grad_norm": 0.5680522918701172, + "learning_rate": 0.0001, + "loss": 1.5401, + "step": 11965 + }, + { + "epoch": 1.3745333409913274, + "grad_norm": 0.6007116436958313, + "learning_rate": 0.0001, + "loss": 1.4423, + "step": 11966 + }, + { + "epoch": 1.3746482109011544, + "grad_norm": 0.5150221586227417, + "learning_rate": 0.0001, + "loss": 1.3199, + "step": 11967 + }, + { + "epoch": 1.3747630808109816, + "grad_norm": 0.5811969041824341, + "learning_rate": 0.0001, + "loss": 1.3401, + "step": 11968 + }, + { + "epoch": 1.3748779507208087, + "grad_norm": 0.5529438853263855, + "learning_rate": 0.0001, + "loss": 1.445, + "step": 11969 + }, + { + "epoch": 1.3749928206306359, + "grad_norm": 0.5678512454032898, + "learning_rate": 0.0001, + "loss": 1.3625, + "step": 11970 + }, + { + "epoch": 1.375107690540463, + "grad_norm": 0.5907145738601685, + "learning_rate": 0.0001, + "loss": 1.5975, + "step": 11971 + }, + { + "epoch": 1.3752225604502901, + "grad_norm": 0.5750979781150818, + "learning_rate": 0.0001, + "loss": 1.3721, + "step": 11972 + }, + { + "epoch": 1.3753374303601171, + "grad_norm": 0.5326782464981079, + "learning_rate": 0.0001, + "loss": 1.2268, + "step": 11973 + }, + { + "epoch": 1.3754523002699444, + "grad_norm": 0.6372673511505127, + "learning_rate": 0.0001, + "loss": 1.6161, + "step": 11974 + }, + { + "epoch": 1.3755671701797714, + "grad_norm": 0.702731192111969, + "learning_rate": 0.0001, + "loss": 1.5823, + "step": 11975 + }, + { + "epoch": 1.3756820400895986, + "grad_norm": 0.5595875978469849, + "learning_rate": 0.0001, + "loss": 1.3401, + "step": 11976 + }, + { + "epoch": 1.3757969099994256, + "grad_norm": 0.5655556917190552, + "learning_rate": 0.0001, + "loss": 1.4331, + "step": 11977 + }, + { + "epoch": 1.3759117799092528, + "grad_norm": 0.5677425265312195, + "learning_rate": 0.0001, + "loss": 1.435, + "step": 11978 + }, + { + "epoch": 1.3760266498190799, + "grad_norm": 0.5768133401870728, + "learning_rate": 0.0001, + "loss": 1.4443, + "step": 11979 + }, + { + "epoch": 1.376141519728907, + "grad_norm": 0.6946627497673035, + "learning_rate": 0.0001, + "loss": 1.437, + "step": 11980 + }, + { + "epoch": 1.376256389638734, + "grad_norm": 0.5521852374076843, + "learning_rate": 0.0001, + "loss": 1.3945, + "step": 11981 + }, + { + "epoch": 1.3763712595485613, + "grad_norm": 0.5932571291923523, + "learning_rate": 0.0001, + "loss": 1.3905, + "step": 11982 + }, + { + "epoch": 1.3764861294583883, + "grad_norm": 0.5667235851287842, + "learning_rate": 0.0001, + "loss": 1.4262, + "step": 11983 + }, + { + "epoch": 1.3766009993682156, + "grad_norm": 0.6410410404205322, + "learning_rate": 0.0001, + "loss": 1.5501, + "step": 11984 + }, + { + "epoch": 1.3767158692780426, + "grad_norm": 0.6235799193382263, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 11985 + }, + { + "epoch": 1.3768307391878698, + "grad_norm": 0.5973227024078369, + "learning_rate": 0.0001, + "loss": 1.4338, + "step": 11986 + }, + { + "epoch": 1.3769456090976968, + "grad_norm": 0.5862446427345276, + "learning_rate": 0.0001, + "loss": 1.5087, + "step": 11987 + }, + { + "epoch": 1.377060479007524, + "grad_norm": 0.6002928018569946, + "learning_rate": 0.0001, + "loss": 1.4927, + "step": 11988 + }, + { + "epoch": 1.377175348917351, + "grad_norm": 0.6475585699081421, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 11989 + }, + { + "epoch": 1.3772902188271783, + "grad_norm": 0.6113678216934204, + "learning_rate": 0.0001, + "loss": 1.5051, + "step": 11990 + }, + { + "epoch": 1.3774050887370053, + "grad_norm": 0.5376540422439575, + "learning_rate": 0.0001, + "loss": 1.2808, + "step": 11991 + }, + { + "epoch": 1.3775199586468325, + "grad_norm": 0.6065835356712341, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 11992 + }, + { + "epoch": 1.3776348285566595, + "grad_norm": 0.5882073640823364, + "learning_rate": 0.0001, + "loss": 1.3892, + "step": 11993 + }, + { + "epoch": 1.3777496984664868, + "grad_norm": 0.5682032704353333, + "learning_rate": 0.0001, + "loss": 1.3817, + "step": 11994 + }, + { + "epoch": 1.3778645683763138, + "grad_norm": 0.6414339542388916, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 11995 + }, + { + "epoch": 1.377979438286141, + "grad_norm": 0.6156001091003418, + "learning_rate": 0.0001, + "loss": 1.5661, + "step": 11996 + }, + { + "epoch": 1.378094308195968, + "grad_norm": 0.5534403324127197, + "learning_rate": 0.0001, + "loss": 1.2606, + "step": 11997 + }, + { + "epoch": 1.3782091781057952, + "grad_norm": 0.5855156779289246, + "learning_rate": 0.0001, + "loss": 1.4816, + "step": 11998 + }, + { + "epoch": 1.3783240480156222, + "grad_norm": 0.5981359481811523, + "learning_rate": 0.0001, + "loss": 1.672, + "step": 11999 + }, + { + "epoch": 1.3784389179254495, + "grad_norm": 0.6743270754814148, + "learning_rate": 0.0001, + "loss": 1.558, + "step": 12000 + }, + { + "epoch": 1.3785537878352765, + "grad_norm": 0.6285853981971741, + "learning_rate": 0.0001, + "loss": 1.7823, + "step": 12001 + }, + { + "epoch": 1.3786686577451037, + "grad_norm": 0.6270994544029236, + "learning_rate": 0.0001, + "loss": 1.3792, + "step": 12002 + }, + { + "epoch": 1.3787835276549307, + "grad_norm": 0.5615768432617188, + "learning_rate": 0.0001, + "loss": 1.5357, + "step": 12003 + }, + { + "epoch": 1.378898397564758, + "grad_norm": 0.5567690134048462, + "learning_rate": 0.0001, + "loss": 1.5045, + "step": 12004 + }, + { + "epoch": 1.379013267474585, + "grad_norm": 0.6072371602058411, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 12005 + }, + { + "epoch": 1.3791281373844122, + "grad_norm": 0.6124165058135986, + "learning_rate": 0.0001, + "loss": 1.396, + "step": 12006 + }, + { + "epoch": 1.3792430072942392, + "grad_norm": 0.6033795475959778, + "learning_rate": 0.0001, + "loss": 1.5735, + "step": 12007 + }, + { + "epoch": 1.3793578772040664, + "grad_norm": 0.5871466994285583, + "learning_rate": 0.0001, + "loss": 1.5119, + "step": 12008 + }, + { + "epoch": 1.3794727471138934, + "grad_norm": 0.6192461252212524, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 12009 + }, + { + "epoch": 1.3795876170237207, + "grad_norm": 0.5672609806060791, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 12010 + }, + { + "epoch": 1.3797024869335477, + "grad_norm": 0.5758570432662964, + "learning_rate": 0.0001, + "loss": 1.6809, + "step": 12011 + }, + { + "epoch": 1.379817356843375, + "grad_norm": 0.5541750192642212, + "learning_rate": 0.0001, + "loss": 1.4669, + "step": 12012 + }, + { + "epoch": 1.379932226753202, + "grad_norm": 0.652824878692627, + "learning_rate": 0.0001, + "loss": 1.2463, + "step": 12013 + }, + { + "epoch": 1.3800470966630292, + "grad_norm": 0.6101529598236084, + "learning_rate": 0.0001, + "loss": 1.6506, + "step": 12014 + }, + { + "epoch": 1.3801619665728562, + "grad_norm": 0.5852307677268982, + "learning_rate": 0.0001, + "loss": 1.6664, + "step": 12015 + }, + { + "epoch": 1.3802768364826834, + "grad_norm": 0.5421605706214905, + "learning_rate": 0.0001, + "loss": 1.3547, + "step": 12016 + }, + { + "epoch": 1.3803917063925104, + "grad_norm": 0.5122795701026917, + "learning_rate": 0.0001, + "loss": 1.3986, + "step": 12017 + }, + { + "epoch": 1.3805065763023376, + "grad_norm": 0.5709700584411621, + "learning_rate": 0.0001, + "loss": 1.4771, + "step": 12018 + }, + { + "epoch": 1.3806214462121646, + "grad_norm": 0.568841814994812, + "learning_rate": 0.0001, + "loss": 1.2557, + "step": 12019 + }, + { + "epoch": 1.3807363161219919, + "grad_norm": 0.5416236519813538, + "learning_rate": 0.0001, + "loss": 1.5226, + "step": 12020 + }, + { + "epoch": 1.380851186031819, + "grad_norm": 0.5720876455307007, + "learning_rate": 0.0001, + "loss": 1.4735, + "step": 12021 + }, + { + "epoch": 1.3809660559416461, + "grad_norm": 0.5533315539360046, + "learning_rate": 0.0001, + "loss": 1.2757, + "step": 12022 + }, + { + "epoch": 1.3810809258514731, + "grad_norm": 0.6400322914123535, + "learning_rate": 0.0001, + "loss": 1.3811, + "step": 12023 + }, + { + "epoch": 1.3811957957613004, + "grad_norm": 0.6613686680793762, + "learning_rate": 0.0001, + "loss": 1.6539, + "step": 12024 + }, + { + "epoch": 1.3813106656711274, + "grad_norm": 0.6082622408866882, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 12025 + }, + { + "epoch": 1.3814255355809546, + "grad_norm": 0.6383165121078491, + "learning_rate": 0.0001, + "loss": 1.579, + "step": 12026 + }, + { + "epoch": 1.3815404054907816, + "grad_norm": 0.5541550517082214, + "learning_rate": 0.0001, + "loss": 1.4047, + "step": 12027 + }, + { + "epoch": 1.3816552754006088, + "grad_norm": 0.5779566168785095, + "learning_rate": 0.0001, + "loss": 1.3272, + "step": 12028 + }, + { + "epoch": 1.3817701453104358, + "grad_norm": 0.599031925201416, + "learning_rate": 0.0001, + "loss": 1.5207, + "step": 12029 + }, + { + "epoch": 1.381885015220263, + "grad_norm": 0.5647594928741455, + "learning_rate": 0.0001, + "loss": 1.4146, + "step": 12030 + }, + { + "epoch": 1.38199988513009, + "grad_norm": 0.6167514324188232, + "learning_rate": 0.0001, + "loss": 1.5239, + "step": 12031 + }, + { + "epoch": 1.3821147550399173, + "grad_norm": 0.5759591460227966, + "learning_rate": 0.0001, + "loss": 1.3562, + "step": 12032 + }, + { + "epoch": 1.3822296249497446, + "grad_norm": 0.6348143815994263, + "learning_rate": 0.0001, + "loss": 1.5969, + "step": 12033 + }, + { + "epoch": 1.3823444948595716, + "grad_norm": 0.6396414041519165, + "learning_rate": 0.0001, + "loss": 1.595, + "step": 12034 + }, + { + "epoch": 1.3824593647693986, + "grad_norm": 0.5942586064338684, + "learning_rate": 0.0001, + "loss": 1.4623, + "step": 12035 + }, + { + "epoch": 1.3825742346792258, + "grad_norm": 0.625024676322937, + "learning_rate": 0.0001, + "loss": 1.5357, + "step": 12036 + }, + { + "epoch": 1.382689104589053, + "grad_norm": 0.6331309676170349, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 12037 + }, + { + "epoch": 1.38280397449888, + "grad_norm": 0.6286697387695312, + "learning_rate": 0.0001, + "loss": 1.5249, + "step": 12038 + }, + { + "epoch": 1.382918844408707, + "grad_norm": 0.6172159910202026, + "learning_rate": 0.0001, + "loss": 1.3461, + "step": 12039 + }, + { + "epoch": 1.3830337143185343, + "grad_norm": 0.5521469712257385, + "learning_rate": 0.0001, + "loss": 1.5414, + "step": 12040 + }, + { + "epoch": 1.3831485842283615, + "grad_norm": 0.5680788159370422, + "learning_rate": 0.0001, + "loss": 1.392, + "step": 12041 + }, + { + "epoch": 1.3832634541381885, + "grad_norm": 0.5680851936340332, + "learning_rate": 0.0001, + "loss": 1.4833, + "step": 12042 + }, + { + "epoch": 1.3833783240480155, + "grad_norm": 0.6148812174797058, + "learning_rate": 0.0001, + "loss": 1.4922, + "step": 12043 + }, + { + "epoch": 1.3834931939578428, + "grad_norm": 0.5677898526191711, + "learning_rate": 0.0001, + "loss": 1.3504, + "step": 12044 + }, + { + "epoch": 1.38360806386767, + "grad_norm": 0.5935441255569458, + "learning_rate": 0.0001, + "loss": 1.3455, + "step": 12045 + }, + { + "epoch": 1.383722933777497, + "grad_norm": 0.5716426968574524, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 12046 + }, + { + "epoch": 1.383837803687324, + "grad_norm": 0.5644375085830688, + "learning_rate": 0.0001, + "loss": 1.4816, + "step": 12047 + }, + { + "epoch": 1.3839526735971512, + "grad_norm": 0.5767795443534851, + "learning_rate": 0.0001, + "loss": 1.5569, + "step": 12048 + }, + { + "epoch": 1.3840675435069785, + "grad_norm": 0.5734127759933472, + "learning_rate": 0.0001, + "loss": 1.3649, + "step": 12049 + }, + { + "epoch": 1.3841824134168055, + "grad_norm": 0.628250777721405, + "learning_rate": 0.0001, + "loss": 1.5152, + "step": 12050 + }, + { + "epoch": 1.3842972833266325, + "grad_norm": 0.622211217880249, + "learning_rate": 0.0001, + "loss": 1.4912, + "step": 12051 + }, + { + "epoch": 1.3844121532364597, + "grad_norm": 0.5941185355186462, + "learning_rate": 0.0001, + "loss": 1.4555, + "step": 12052 + }, + { + "epoch": 1.384527023146287, + "grad_norm": 0.6594216227531433, + "learning_rate": 0.0001, + "loss": 1.4542, + "step": 12053 + }, + { + "epoch": 1.384641893056114, + "grad_norm": 0.5944557785987854, + "learning_rate": 0.0001, + "loss": 1.4187, + "step": 12054 + }, + { + "epoch": 1.384756762965941, + "grad_norm": 0.612331748008728, + "learning_rate": 0.0001, + "loss": 1.4077, + "step": 12055 + }, + { + "epoch": 1.3848716328757682, + "grad_norm": 0.5708249807357788, + "learning_rate": 0.0001, + "loss": 1.3858, + "step": 12056 + }, + { + "epoch": 1.3849865027855954, + "grad_norm": 0.7231257557868958, + "learning_rate": 0.0001, + "loss": 1.5112, + "step": 12057 + }, + { + "epoch": 1.3851013726954224, + "grad_norm": 0.5759585499763489, + "learning_rate": 0.0001, + "loss": 1.4769, + "step": 12058 + }, + { + "epoch": 1.3852162426052494, + "grad_norm": 0.5974624752998352, + "learning_rate": 0.0001, + "loss": 1.5837, + "step": 12059 + }, + { + "epoch": 1.3853311125150767, + "grad_norm": 0.5672141313552856, + "learning_rate": 0.0001, + "loss": 1.3981, + "step": 12060 + }, + { + "epoch": 1.385445982424904, + "grad_norm": 0.5883855819702148, + "learning_rate": 0.0001, + "loss": 1.2928, + "step": 12061 + }, + { + "epoch": 1.385560852334731, + "grad_norm": 0.6936026215553284, + "learning_rate": 0.0001, + "loss": 1.6683, + "step": 12062 + }, + { + "epoch": 1.385675722244558, + "grad_norm": 0.5789870619773865, + "learning_rate": 0.0001, + "loss": 1.6135, + "step": 12063 + }, + { + "epoch": 1.3857905921543852, + "grad_norm": 0.6692520380020142, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 12064 + }, + { + "epoch": 1.3859054620642124, + "grad_norm": 0.6601521372795105, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 12065 + }, + { + "epoch": 1.3860203319740394, + "grad_norm": 0.6020188927650452, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 12066 + }, + { + "epoch": 1.3861352018838664, + "grad_norm": 0.5850477814674377, + "learning_rate": 0.0001, + "loss": 1.5276, + "step": 12067 + }, + { + "epoch": 1.3862500717936936, + "grad_norm": 0.6538852453231812, + "learning_rate": 0.0001, + "loss": 1.6425, + "step": 12068 + }, + { + "epoch": 1.3863649417035209, + "grad_norm": 0.5990716218948364, + "learning_rate": 0.0001, + "loss": 1.4191, + "step": 12069 + }, + { + "epoch": 1.3864798116133479, + "grad_norm": 0.6036950945854187, + "learning_rate": 0.0001, + "loss": 1.5392, + "step": 12070 + }, + { + "epoch": 1.3865946815231749, + "grad_norm": 0.5441368818283081, + "learning_rate": 0.0001, + "loss": 1.4551, + "step": 12071 + }, + { + "epoch": 1.3867095514330021, + "grad_norm": 0.6289240121841431, + "learning_rate": 0.0001, + "loss": 1.274, + "step": 12072 + }, + { + "epoch": 1.3868244213428293, + "grad_norm": 0.5667722225189209, + "learning_rate": 0.0001, + "loss": 1.3627, + "step": 12073 + }, + { + "epoch": 1.3869392912526564, + "grad_norm": 0.5763718485832214, + "learning_rate": 0.0001, + "loss": 1.6001, + "step": 12074 + }, + { + "epoch": 1.3870541611624834, + "grad_norm": 0.5512832999229431, + "learning_rate": 0.0001, + "loss": 1.4434, + "step": 12075 + }, + { + "epoch": 1.3871690310723106, + "grad_norm": 0.5643221735954285, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 12076 + }, + { + "epoch": 1.3872839009821378, + "grad_norm": 0.5980443954467773, + "learning_rate": 0.0001, + "loss": 1.3511, + "step": 12077 + }, + { + "epoch": 1.3873987708919648, + "grad_norm": 0.6038445234298706, + "learning_rate": 0.0001, + "loss": 1.6927, + "step": 12078 + }, + { + "epoch": 1.3875136408017918, + "grad_norm": 0.6184426546096802, + "learning_rate": 0.0001, + "loss": 1.5709, + "step": 12079 + }, + { + "epoch": 1.387628510711619, + "grad_norm": 0.6515767574310303, + "learning_rate": 0.0001, + "loss": 1.684, + "step": 12080 + }, + { + "epoch": 1.3877433806214463, + "grad_norm": 0.6065769195556641, + "learning_rate": 0.0001, + "loss": 1.5179, + "step": 12081 + }, + { + "epoch": 1.3878582505312733, + "grad_norm": 0.6202980875968933, + "learning_rate": 0.0001, + "loss": 1.3916, + "step": 12082 + }, + { + "epoch": 1.3879731204411003, + "grad_norm": 0.6353384852409363, + "learning_rate": 0.0001, + "loss": 1.3461, + "step": 12083 + }, + { + "epoch": 1.3880879903509276, + "grad_norm": 0.6570160984992981, + "learning_rate": 0.0001, + "loss": 1.3667, + "step": 12084 + }, + { + "epoch": 1.3882028602607548, + "grad_norm": 0.5832175612449646, + "learning_rate": 0.0001, + "loss": 1.3499, + "step": 12085 + }, + { + "epoch": 1.3883177301705818, + "grad_norm": 0.5629262924194336, + "learning_rate": 0.0001, + "loss": 1.5559, + "step": 12086 + }, + { + "epoch": 1.3884326000804088, + "grad_norm": 0.586079478263855, + "learning_rate": 0.0001, + "loss": 1.432, + "step": 12087 + }, + { + "epoch": 1.388547469990236, + "grad_norm": 0.5707273483276367, + "learning_rate": 0.0001, + "loss": 1.4262, + "step": 12088 + }, + { + "epoch": 1.3886623399000633, + "grad_norm": 0.6245285868644714, + "learning_rate": 0.0001, + "loss": 1.5261, + "step": 12089 + }, + { + "epoch": 1.3887772098098903, + "grad_norm": 0.6213446855545044, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 12090 + }, + { + "epoch": 1.3888920797197173, + "grad_norm": 0.6027031540870667, + "learning_rate": 0.0001, + "loss": 1.5837, + "step": 12091 + }, + { + "epoch": 1.3890069496295445, + "grad_norm": 0.5713696479797363, + "learning_rate": 0.0001, + "loss": 1.4307, + "step": 12092 + }, + { + "epoch": 1.3891218195393717, + "grad_norm": 0.6017079949378967, + "learning_rate": 0.0001, + "loss": 1.3979, + "step": 12093 + }, + { + "epoch": 1.3892366894491988, + "grad_norm": 0.5776708722114563, + "learning_rate": 0.0001, + "loss": 1.4307, + "step": 12094 + }, + { + "epoch": 1.3893515593590258, + "grad_norm": 0.5756542086601257, + "learning_rate": 0.0001, + "loss": 1.431, + "step": 12095 + }, + { + "epoch": 1.389466429268853, + "grad_norm": 0.6287577152252197, + "learning_rate": 0.0001, + "loss": 1.3661, + "step": 12096 + }, + { + "epoch": 1.3895812991786802, + "grad_norm": 0.5801504850387573, + "learning_rate": 0.0001, + "loss": 1.4198, + "step": 12097 + }, + { + "epoch": 1.3896961690885072, + "grad_norm": 0.6163144111633301, + "learning_rate": 0.0001, + "loss": 1.3697, + "step": 12098 + }, + { + "epoch": 1.3898110389983345, + "grad_norm": 0.623882532119751, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 12099 + }, + { + "epoch": 1.3899259089081615, + "grad_norm": 0.6868638396263123, + "learning_rate": 0.0001, + "loss": 1.7235, + "step": 12100 + }, + { + "epoch": 1.3900407788179887, + "grad_norm": 0.5804413557052612, + "learning_rate": 0.0001, + "loss": 1.4517, + "step": 12101 + }, + { + "epoch": 1.3901556487278157, + "grad_norm": 0.6150039434432983, + "learning_rate": 0.0001, + "loss": 1.4018, + "step": 12102 + }, + { + "epoch": 1.390270518637643, + "grad_norm": 0.6017144322395325, + "learning_rate": 0.0001, + "loss": 1.326, + "step": 12103 + }, + { + "epoch": 1.39038538854747, + "grad_norm": 0.5922073721885681, + "learning_rate": 0.0001, + "loss": 1.4635, + "step": 12104 + }, + { + "epoch": 1.3905002584572972, + "grad_norm": 0.6403117775917053, + "learning_rate": 0.0001, + "loss": 1.5479, + "step": 12105 + }, + { + "epoch": 1.3906151283671242, + "grad_norm": 0.5838686227798462, + "learning_rate": 0.0001, + "loss": 1.5511, + "step": 12106 + }, + { + "epoch": 1.3907299982769514, + "grad_norm": 0.575851559638977, + "learning_rate": 0.0001, + "loss": 1.4667, + "step": 12107 + }, + { + "epoch": 1.3908448681867784, + "grad_norm": 0.5583367347717285, + "learning_rate": 0.0001, + "loss": 1.3506, + "step": 12108 + }, + { + "epoch": 1.3909597380966057, + "grad_norm": 0.6150028109550476, + "learning_rate": 0.0001, + "loss": 1.5256, + "step": 12109 + }, + { + "epoch": 1.3910746080064327, + "grad_norm": 0.5967298746109009, + "learning_rate": 0.0001, + "loss": 1.4282, + "step": 12110 + }, + { + "epoch": 1.39118947791626, + "grad_norm": 0.6403521299362183, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 12111 + }, + { + "epoch": 1.391304347826087, + "grad_norm": 0.579937219619751, + "learning_rate": 0.0001, + "loss": 1.4695, + "step": 12112 + }, + { + "epoch": 1.3914192177359141, + "grad_norm": 0.5739872455596924, + "learning_rate": 0.0001, + "loss": 1.6012, + "step": 12113 + }, + { + "epoch": 1.3915340876457412, + "grad_norm": 0.5874963402748108, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 12114 + }, + { + "epoch": 1.3916489575555684, + "grad_norm": 0.5656614899635315, + "learning_rate": 0.0001, + "loss": 1.4357, + "step": 12115 + }, + { + "epoch": 1.3917638274653954, + "grad_norm": 0.6302921175956726, + "learning_rate": 0.0001, + "loss": 1.5072, + "step": 12116 + }, + { + "epoch": 1.3918786973752226, + "grad_norm": 0.6151950359344482, + "learning_rate": 0.0001, + "loss": 1.4652, + "step": 12117 + }, + { + "epoch": 1.3919935672850496, + "grad_norm": 0.6326630711555481, + "learning_rate": 0.0001, + "loss": 1.4749, + "step": 12118 + }, + { + "epoch": 1.3921084371948769, + "grad_norm": 0.6131033897399902, + "learning_rate": 0.0001, + "loss": 1.382, + "step": 12119 + }, + { + "epoch": 1.3922233071047039, + "grad_norm": 0.6083925366401672, + "learning_rate": 0.0001, + "loss": 1.47, + "step": 12120 + }, + { + "epoch": 1.392338177014531, + "grad_norm": 0.5470343232154846, + "learning_rate": 0.0001, + "loss": 1.3606, + "step": 12121 + }, + { + "epoch": 1.3924530469243581, + "grad_norm": 0.6349021196365356, + "learning_rate": 0.0001, + "loss": 1.6476, + "step": 12122 + }, + { + "epoch": 1.3925679168341853, + "grad_norm": 0.5835825204849243, + "learning_rate": 0.0001, + "loss": 1.3953, + "step": 12123 + }, + { + "epoch": 1.3926827867440124, + "grad_norm": 0.6109292507171631, + "learning_rate": 0.0001, + "loss": 1.5743, + "step": 12124 + }, + { + "epoch": 1.3927976566538396, + "grad_norm": 0.5654585957527161, + "learning_rate": 0.0001, + "loss": 1.4229, + "step": 12125 + }, + { + "epoch": 1.3929125265636666, + "grad_norm": 0.5883094072341919, + "learning_rate": 0.0001, + "loss": 1.5037, + "step": 12126 + }, + { + "epoch": 1.3930273964734938, + "grad_norm": 0.6077958345413208, + "learning_rate": 0.0001, + "loss": 1.4381, + "step": 12127 + }, + { + "epoch": 1.3931422663833208, + "grad_norm": 0.6568667888641357, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 12128 + }, + { + "epoch": 1.393257136293148, + "grad_norm": 0.5563843250274658, + "learning_rate": 0.0001, + "loss": 1.2562, + "step": 12129 + }, + { + "epoch": 1.393372006202975, + "grad_norm": 0.5851126313209534, + "learning_rate": 0.0001, + "loss": 1.3322, + "step": 12130 + }, + { + "epoch": 1.3934868761128023, + "grad_norm": 0.579210102558136, + "learning_rate": 0.0001, + "loss": 1.3495, + "step": 12131 + }, + { + "epoch": 1.3936017460226293, + "grad_norm": 0.6247128844261169, + "learning_rate": 0.0001, + "loss": 1.455, + "step": 12132 + }, + { + "epoch": 1.3937166159324565, + "grad_norm": 0.6491114497184753, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 12133 + }, + { + "epoch": 1.3938314858422836, + "grad_norm": 0.5388944149017334, + "learning_rate": 0.0001, + "loss": 1.3398, + "step": 12134 + }, + { + "epoch": 1.3939463557521108, + "grad_norm": 0.5601046681404114, + "learning_rate": 0.0001, + "loss": 1.3384, + "step": 12135 + }, + { + "epoch": 1.3940612256619378, + "grad_norm": 0.5893126726150513, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 12136 + }, + { + "epoch": 1.394176095571765, + "grad_norm": 0.6289069056510925, + "learning_rate": 0.0001, + "loss": 1.5355, + "step": 12137 + }, + { + "epoch": 1.394290965481592, + "grad_norm": 0.6383712887763977, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 12138 + }, + { + "epoch": 1.3944058353914193, + "grad_norm": 0.6105307340621948, + "learning_rate": 0.0001, + "loss": 1.5548, + "step": 12139 + }, + { + "epoch": 1.3945207053012463, + "grad_norm": 0.5872035622596741, + "learning_rate": 0.0001, + "loss": 1.4613, + "step": 12140 + }, + { + "epoch": 1.3946355752110735, + "grad_norm": 0.5394282937049866, + "learning_rate": 0.0001, + "loss": 1.4145, + "step": 12141 + }, + { + "epoch": 1.3947504451209005, + "grad_norm": 0.6004125475883484, + "learning_rate": 0.0001, + "loss": 1.3922, + "step": 12142 + }, + { + "epoch": 1.3948653150307277, + "grad_norm": 0.6273020505905151, + "learning_rate": 0.0001, + "loss": 1.5747, + "step": 12143 + }, + { + "epoch": 1.3949801849405548, + "grad_norm": 0.5593479871749878, + "learning_rate": 0.0001, + "loss": 1.4331, + "step": 12144 + }, + { + "epoch": 1.395095054850382, + "grad_norm": 0.5342775583267212, + "learning_rate": 0.0001, + "loss": 1.3331, + "step": 12145 + }, + { + "epoch": 1.395209924760209, + "grad_norm": 0.5714641213417053, + "learning_rate": 0.0001, + "loss": 1.1656, + "step": 12146 + }, + { + "epoch": 1.3953247946700362, + "grad_norm": 0.6640602350234985, + "learning_rate": 0.0001, + "loss": 1.4388, + "step": 12147 + }, + { + "epoch": 1.3954396645798632, + "grad_norm": 0.5924851298332214, + "learning_rate": 0.0001, + "loss": 1.2908, + "step": 12148 + }, + { + "epoch": 1.3955545344896905, + "grad_norm": 0.6022685170173645, + "learning_rate": 0.0001, + "loss": 1.4481, + "step": 12149 + }, + { + "epoch": 1.3956694043995175, + "grad_norm": 0.6584579348564148, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 12150 + }, + { + "epoch": 1.3957842743093447, + "grad_norm": 0.6461522579193115, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 12151 + }, + { + "epoch": 1.3958991442191717, + "grad_norm": 0.6562781929969788, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 12152 + }, + { + "epoch": 1.396014014128999, + "grad_norm": 0.593710720539093, + "learning_rate": 0.0001, + "loss": 1.5505, + "step": 12153 + }, + { + "epoch": 1.396128884038826, + "grad_norm": 0.6154862642288208, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 12154 + }, + { + "epoch": 1.3962437539486532, + "grad_norm": 0.5625523328781128, + "learning_rate": 0.0001, + "loss": 1.4415, + "step": 12155 + }, + { + "epoch": 1.3963586238584802, + "grad_norm": 0.5562409162521362, + "learning_rate": 0.0001, + "loss": 1.3692, + "step": 12156 + }, + { + "epoch": 1.3964734937683074, + "grad_norm": 0.6092026829719543, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 12157 + }, + { + "epoch": 1.3965883636781344, + "grad_norm": 0.5918128490447998, + "learning_rate": 0.0001, + "loss": 1.4551, + "step": 12158 + }, + { + "epoch": 1.3967032335879617, + "grad_norm": 0.6120827794075012, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 12159 + }, + { + "epoch": 1.3968181034977887, + "grad_norm": 0.6138426065444946, + "learning_rate": 0.0001, + "loss": 1.4106, + "step": 12160 + }, + { + "epoch": 1.396932973407616, + "grad_norm": 0.641486406326294, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 12161 + }, + { + "epoch": 1.397047843317443, + "grad_norm": 0.6255931258201599, + "learning_rate": 0.0001, + "loss": 1.4398, + "step": 12162 + }, + { + "epoch": 1.3971627132272701, + "grad_norm": 0.6870328187942505, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 12163 + }, + { + "epoch": 1.3972775831370972, + "grad_norm": 0.5824881196022034, + "learning_rate": 0.0001, + "loss": 1.4912, + "step": 12164 + }, + { + "epoch": 1.3973924530469244, + "grad_norm": 0.557790994644165, + "learning_rate": 0.0001, + "loss": 1.4021, + "step": 12165 + }, + { + "epoch": 1.3975073229567514, + "grad_norm": 0.6548678874969482, + "learning_rate": 0.0001, + "loss": 1.3511, + "step": 12166 + }, + { + "epoch": 1.3976221928665786, + "grad_norm": 0.5523027777671814, + "learning_rate": 0.0001, + "loss": 1.1925, + "step": 12167 + }, + { + "epoch": 1.3977370627764056, + "grad_norm": 0.5813341736793518, + "learning_rate": 0.0001, + "loss": 1.477, + "step": 12168 + }, + { + "epoch": 1.3978519326862329, + "grad_norm": 0.5378603935241699, + "learning_rate": 0.0001, + "loss": 1.2486, + "step": 12169 + }, + { + "epoch": 1.39796680259606, + "grad_norm": 0.5774914622306824, + "learning_rate": 0.0001, + "loss": 1.361, + "step": 12170 + }, + { + "epoch": 1.398081672505887, + "grad_norm": 0.602354884147644, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 12171 + }, + { + "epoch": 1.3981965424157141, + "grad_norm": 0.559851884841919, + "learning_rate": 0.0001, + "loss": 1.488, + "step": 12172 + }, + { + "epoch": 1.3983114123255413, + "grad_norm": 0.6218519806861877, + "learning_rate": 0.0001, + "loss": 1.355, + "step": 12173 + }, + { + "epoch": 1.3984262822353686, + "grad_norm": 0.6444374918937683, + "learning_rate": 0.0001, + "loss": 1.5331, + "step": 12174 + }, + { + "epoch": 1.3985411521451956, + "grad_norm": 0.5984706282615662, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 12175 + }, + { + "epoch": 1.3986560220550226, + "grad_norm": 0.6040021777153015, + "learning_rate": 0.0001, + "loss": 1.4337, + "step": 12176 + }, + { + "epoch": 1.3987708919648498, + "grad_norm": 0.5464459657669067, + "learning_rate": 0.0001, + "loss": 1.4417, + "step": 12177 + }, + { + "epoch": 1.398885761874677, + "grad_norm": 0.5808142423629761, + "learning_rate": 0.0001, + "loss": 1.5646, + "step": 12178 + }, + { + "epoch": 1.399000631784504, + "grad_norm": 0.5869821906089783, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 12179 + }, + { + "epoch": 1.399115501694331, + "grad_norm": 0.5694946050643921, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 12180 + }, + { + "epoch": 1.3992303716041583, + "grad_norm": 0.5913307666778564, + "learning_rate": 0.0001, + "loss": 1.6787, + "step": 12181 + }, + { + "epoch": 1.3993452415139855, + "grad_norm": 0.5427361130714417, + "learning_rate": 0.0001, + "loss": 1.4013, + "step": 12182 + }, + { + "epoch": 1.3994601114238125, + "grad_norm": 0.5571943521499634, + "learning_rate": 0.0001, + "loss": 1.4897, + "step": 12183 + }, + { + "epoch": 1.3995749813336396, + "grad_norm": 0.5840848684310913, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 12184 + }, + { + "epoch": 1.3996898512434668, + "grad_norm": 0.5792492628097534, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 12185 + }, + { + "epoch": 1.399804721153294, + "grad_norm": 0.5461111068725586, + "learning_rate": 0.0001, + "loss": 1.4224, + "step": 12186 + }, + { + "epoch": 1.399919591063121, + "grad_norm": 0.6092454195022583, + "learning_rate": 0.0001, + "loss": 1.5321, + "step": 12187 + }, + { + "epoch": 1.400034460972948, + "grad_norm": 0.5773031711578369, + "learning_rate": 0.0001, + "loss": 1.4072, + "step": 12188 + }, + { + "epoch": 1.4001493308827753, + "grad_norm": 0.6374520063400269, + "learning_rate": 0.0001, + "loss": 1.5028, + "step": 12189 + }, + { + "epoch": 1.4002642007926025, + "grad_norm": 0.5584102272987366, + "learning_rate": 0.0001, + "loss": 1.2906, + "step": 12190 + }, + { + "epoch": 1.4003790707024295, + "grad_norm": 0.6517335772514343, + "learning_rate": 0.0001, + "loss": 1.7582, + "step": 12191 + }, + { + "epoch": 1.4004939406122565, + "grad_norm": 0.5977898240089417, + "learning_rate": 0.0001, + "loss": 1.517, + "step": 12192 + }, + { + "epoch": 1.4006088105220837, + "grad_norm": 0.5880120992660522, + "learning_rate": 0.0001, + "loss": 1.6576, + "step": 12193 + }, + { + "epoch": 1.400723680431911, + "grad_norm": 0.5685825943946838, + "learning_rate": 0.0001, + "loss": 1.4966, + "step": 12194 + }, + { + "epoch": 1.400838550341738, + "grad_norm": 0.5807334780693054, + "learning_rate": 0.0001, + "loss": 1.3646, + "step": 12195 + }, + { + "epoch": 1.400953420251565, + "grad_norm": 0.602315366268158, + "learning_rate": 0.0001, + "loss": 1.2503, + "step": 12196 + }, + { + "epoch": 1.4010682901613922, + "grad_norm": 0.6140511631965637, + "learning_rate": 0.0001, + "loss": 1.4578, + "step": 12197 + }, + { + "epoch": 1.4011831600712195, + "grad_norm": 0.6030164361000061, + "learning_rate": 0.0001, + "loss": 1.4811, + "step": 12198 + }, + { + "epoch": 1.4012980299810465, + "grad_norm": 0.6865645051002502, + "learning_rate": 0.0001, + "loss": 1.5387, + "step": 12199 + }, + { + "epoch": 1.4014128998908735, + "grad_norm": 0.5891702175140381, + "learning_rate": 0.0001, + "loss": 1.3482, + "step": 12200 + }, + { + "epoch": 1.4015277698007007, + "grad_norm": 0.6280384063720703, + "learning_rate": 0.0001, + "loss": 1.4052, + "step": 12201 + }, + { + "epoch": 1.401642639710528, + "grad_norm": 0.5294430255889893, + "learning_rate": 0.0001, + "loss": 1.4011, + "step": 12202 + }, + { + "epoch": 1.401757509620355, + "grad_norm": 0.5137497782707214, + "learning_rate": 0.0001, + "loss": 1.3008, + "step": 12203 + }, + { + "epoch": 1.401872379530182, + "grad_norm": 0.6277523636817932, + "learning_rate": 0.0001, + "loss": 1.3415, + "step": 12204 + }, + { + "epoch": 1.4019872494400092, + "grad_norm": 0.5599936842918396, + "learning_rate": 0.0001, + "loss": 1.524, + "step": 12205 + }, + { + "epoch": 1.4021021193498364, + "grad_norm": 0.5398606657981873, + "learning_rate": 0.0001, + "loss": 1.3835, + "step": 12206 + }, + { + "epoch": 1.4022169892596634, + "grad_norm": 0.5670443773269653, + "learning_rate": 0.0001, + "loss": 1.4954, + "step": 12207 + }, + { + "epoch": 1.4023318591694904, + "grad_norm": 0.5825430154800415, + "learning_rate": 0.0001, + "loss": 1.4757, + "step": 12208 + }, + { + "epoch": 1.4024467290793177, + "grad_norm": 0.6514697670936584, + "learning_rate": 0.0001, + "loss": 1.4789, + "step": 12209 + }, + { + "epoch": 1.402561598989145, + "grad_norm": 0.5851296782493591, + "learning_rate": 0.0001, + "loss": 1.4696, + "step": 12210 + }, + { + "epoch": 1.402676468898972, + "grad_norm": 0.6211963295936584, + "learning_rate": 0.0001, + "loss": 1.192, + "step": 12211 + }, + { + "epoch": 1.402791338808799, + "grad_norm": 0.5917556881904602, + "learning_rate": 0.0001, + "loss": 1.3838, + "step": 12212 + }, + { + "epoch": 1.4029062087186261, + "grad_norm": 0.637103259563446, + "learning_rate": 0.0001, + "loss": 1.5705, + "step": 12213 + }, + { + "epoch": 1.4030210786284534, + "grad_norm": 0.6678568720817566, + "learning_rate": 0.0001, + "loss": 1.7563, + "step": 12214 + }, + { + "epoch": 1.4031359485382804, + "grad_norm": 0.6561994552612305, + "learning_rate": 0.0001, + "loss": 1.485, + "step": 12215 + }, + { + "epoch": 1.4032508184481074, + "grad_norm": 0.590522825717926, + "learning_rate": 0.0001, + "loss": 1.6996, + "step": 12216 + }, + { + "epoch": 1.4033656883579346, + "grad_norm": 0.5511714816093445, + "learning_rate": 0.0001, + "loss": 1.405, + "step": 12217 + }, + { + "epoch": 1.4034805582677619, + "grad_norm": 0.567674458026886, + "learning_rate": 0.0001, + "loss": 1.2435, + "step": 12218 + }, + { + "epoch": 1.4035954281775889, + "grad_norm": 0.5764709711074829, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 12219 + }, + { + "epoch": 1.4037102980874159, + "grad_norm": 0.5814299583435059, + "learning_rate": 0.0001, + "loss": 1.561, + "step": 12220 + }, + { + "epoch": 1.403825167997243, + "grad_norm": 0.6138479709625244, + "learning_rate": 0.0001, + "loss": 1.3847, + "step": 12221 + }, + { + "epoch": 1.4039400379070703, + "grad_norm": 0.577967643737793, + "learning_rate": 0.0001, + "loss": 1.4667, + "step": 12222 + }, + { + "epoch": 1.4040549078168973, + "grad_norm": 0.5953364372253418, + "learning_rate": 0.0001, + "loss": 1.433, + "step": 12223 + }, + { + "epoch": 1.4041697777267244, + "grad_norm": 0.6030754446983337, + "learning_rate": 0.0001, + "loss": 1.3456, + "step": 12224 + }, + { + "epoch": 1.4042846476365516, + "grad_norm": 0.6069220304489136, + "learning_rate": 0.0001, + "loss": 1.4485, + "step": 12225 + }, + { + "epoch": 1.4043995175463788, + "grad_norm": 0.5867301821708679, + "learning_rate": 0.0001, + "loss": 1.4798, + "step": 12226 + }, + { + "epoch": 1.4045143874562058, + "grad_norm": 0.6797637343406677, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 12227 + }, + { + "epoch": 1.4046292573660328, + "grad_norm": 0.6498664617538452, + "learning_rate": 0.0001, + "loss": 1.2396, + "step": 12228 + }, + { + "epoch": 1.40474412727586, + "grad_norm": 0.645907461643219, + "learning_rate": 0.0001, + "loss": 1.6652, + "step": 12229 + }, + { + "epoch": 1.4048589971856873, + "grad_norm": 0.5839877128601074, + "learning_rate": 0.0001, + "loss": 1.366, + "step": 12230 + }, + { + "epoch": 1.4049738670955143, + "grad_norm": 0.5825426578521729, + "learning_rate": 0.0001, + "loss": 1.3383, + "step": 12231 + }, + { + "epoch": 1.4050887370053413, + "grad_norm": 0.6786919236183167, + "learning_rate": 0.0001, + "loss": 1.6, + "step": 12232 + }, + { + "epoch": 1.4052036069151685, + "grad_norm": 0.572919487953186, + "learning_rate": 0.0001, + "loss": 1.5307, + "step": 12233 + }, + { + "epoch": 1.4053184768249958, + "grad_norm": 0.5826205611228943, + "learning_rate": 0.0001, + "loss": 1.227, + "step": 12234 + }, + { + "epoch": 1.4054333467348228, + "grad_norm": 0.5886760354042053, + "learning_rate": 0.0001, + "loss": 1.3468, + "step": 12235 + }, + { + "epoch": 1.40554821664465, + "grad_norm": 0.579044759273529, + "learning_rate": 0.0001, + "loss": 1.3405, + "step": 12236 + }, + { + "epoch": 1.405663086554477, + "grad_norm": 0.5720400810241699, + "learning_rate": 0.0001, + "loss": 1.5135, + "step": 12237 + }, + { + "epoch": 1.4057779564643043, + "grad_norm": 0.5574225783348083, + "learning_rate": 0.0001, + "loss": 1.3995, + "step": 12238 + }, + { + "epoch": 1.4058928263741313, + "grad_norm": 0.5698866844177246, + "learning_rate": 0.0001, + "loss": 1.5275, + "step": 12239 + }, + { + "epoch": 1.4060076962839585, + "grad_norm": 0.5897749066352844, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 12240 + }, + { + "epoch": 1.4061225661937855, + "grad_norm": 0.5804178714752197, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 12241 + }, + { + "epoch": 1.4062374361036127, + "grad_norm": 0.5859702229499817, + "learning_rate": 0.0001, + "loss": 1.431, + "step": 12242 + }, + { + "epoch": 1.4063523060134397, + "grad_norm": 0.5918383598327637, + "learning_rate": 0.0001, + "loss": 1.611, + "step": 12243 + }, + { + "epoch": 1.406467175923267, + "grad_norm": 0.5748845934867859, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 12244 + }, + { + "epoch": 1.406582045833094, + "grad_norm": 0.6131268739700317, + "learning_rate": 0.0001, + "loss": 1.5947, + "step": 12245 + }, + { + "epoch": 1.4066969157429212, + "grad_norm": 0.6054122447967529, + "learning_rate": 0.0001, + "loss": 1.535, + "step": 12246 + }, + { + "epoch": 1.4068117856527482, + "grad_norm": 0.6443667411804199, + "learning_rate": 0.0001, + "loss": 1.5172, + "step": 12247 + }, + { + "epoch": 1.4069266555625755, + "grad_norm": 0.6411632895469666, + "learning_rate": 0.0001, + "loss": 1.4362, + "step": 12248 + }, + { + "epoch": 1.4070415254724025, + "grad_norm": 0.5433965921401978, + "learning_rate": 0.0001, + "loss": 1.4244, + "step": 12249 + }, + { + "epoch": 1.4071563953822297, + "grad_norm": 0.6453516483306885, + "learning_rate": 0.0001, + "loss": 1.6362, + "step": 12250 + }, + { + "epoch": 1.4072712652920567, + "grad_norm": 0.5545192360877991, + "learning_rate": 0.0001, + "loss": 1.1791, + "step": 12251 + }, + { + "epoch": 1.407386135201884, + "grad_norm": 0.5485440492630005, + "learning_rate": 0.0001, + "loss": 1.2924, + "step": 12252 + }, + { + "epoch": 1.407501005111711, + "grad_norm": 0.614592432975769, + "learning_rate": 0.0001, + "loss": 1.2284, + "step": 12253 + }, + { + "epoch": 1.4076158750215382, + "grad_norm": 0.5941545963287354, + "learning_rate": 0.0001, + "loss": 1.4719, + "step": 12254 + }, + { + "epoch": 1.4077307449313652, + "grad_norm": 0.5332536697387695, + "learning_rate": 0.0001, + "loss": 1.4855, + "step": 12255 + }, + { + "epoch": 1.4078456148411924, + "grad_norm": 0.6020111441612244, + "learning_rate": 0.0001, + "loss": 1.603, + "step": 12256 + }, + { + "epoch": 1.4079604847510194, + "grad_norm": 0.6182122826576233, + "learning_rate": 0.0001, + "loss": 1.4545, + "step": 12257 + }, + { + "epoch": 1.4080753546608467, + "grad_norm": 0.5976362824440002, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 12258 + }, + { + "epoch": 1.4081902245706737, + "grad_norm": 0.5798993706703186, + "learning_rate": 0.0001, + "loss": 1.3952, + "step": 12259 + }, + { + "epoch": 1.408305094480501, + "grad_norm": 0.5920160412788391, + "learning_rate": 0.0001, + "loss": 1.4257, + "step": 12260 + }, + { + "epoch": 1.408419964390328, + "grad_norm": 0.6999684572219849, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 12261 + }, + { + "epoch": 1.4085348343001551, + "grad_norm": 0.6646612286567688, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 12262 + }, + { + "epoch": 1.4086497042099821, + "grad_norm": 0.6494892239570618, + "learning_rate": 0.0001, + "loss": 1.5008, + "step": 12263 + }, + { + "epoch": 1.4087645741198094, + "grad_norm": 0.6330276727676392, + "learning_rate": 0.0001, + "loss": 1.5038, + "step": 12264 + }, + { + "epoch": 1.4088794440296364, + "grad_norm": 0.6663432717323303, + "learning_rate": 0.0001, + "loss": 1.3514, + "step": 12265 + }, + { + "epoch": 1.4089943139394636, + "grad_norm": 0.6159442067146301, + "learning_rate": 0.0001, + "loss": 1.5268, + "step": 12266 + }, + { + "epoch": 1.4091091838492906, + "grad_norm": 0.5885686278343201, + "learning_rate": 0.0001, + "loss": 1.4262, + "step": 12267 + }, + { + "epoch": 1.4092240537591179, + "grad_norm": 0.6478608250617981, + "learning_rate": 0.0001, + "loss": 1.3103, + "step": 12268 + }, + { + "epoch": 1.4093389236689449, + "grad_norm": 0.598627507686615, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 12269 + }, + { + "epoch": 1.409453793578772, + "grad_norm": 0.5440186262130737, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 12270 + }, + { + "epoch": 1.409568663488599, + "grad_norm": 0.5913492441177368, + "learning_rate": 0.0001, + "loss": 1.5455, + "step": 12271 + }, + { + "epoch": 1.4096835333984263, + "grad_norm": 0.6720054149627686, + "learning_rate": 0.0001, + "loss": 1.6878, + "step": 12272 + }, + { + "epoch": 1.4097984033082533, + "grad_norm": 0.550347089767456, + "learning_rate": 0.0001, + "loss": 1.4825, + "step": 12273 + }, + { + "epoch": 1.4099132732180806, + "grad_norm": 0.5974934697151184, + "learning_rate": 0.0001, + "loss": 1.5707, + "step": 12274 + }, + { + "epoch": 1.4100281431279076, + "grad_norm": 0.568218469619751, + "learning_rate": 0.0001, + "loss": 1.3921, + "step": 12275 + }, + { + "epoch": 1.4101430130377348, + "grad_norm": 0.6132494211196899, + "learning_rate": 0.0001, + "loss": 1.3798, + "step": 12276 + }, + { + "epoch": 1.4102578829475618, + "grad_norm": 0.5562697649002075, + "learning_rate": 0.0001, + "loss": 1.5631, + "step": 12277 + }, + { + "epoch": 1.410372752857389, + "grad_norm": 0.5881964564323425, + "learning_rate": 0.0001, + "loss": 1.4807, + "step": 12278 + }, + { + "epoch": 1.410487622767216, + "grad_norm": 0.6052611470222473, + "learning_rate": 0.0001, + "loss": 1.3232, + "step": 12279 + }, + { + "epoch": 1.4106024926770433, + "grad_norm": 0.568783164024353, + "learning_rate": 0.0001, + "loss": 1.4518, + "step": 12280 + }, + { + "epoch": 1.4107173625868703, + "grad_norm": 0.5390822291374207, + "learning_rate": 0.0001, + "loss": 1.5032, + "step": 12281 + }, + { + "epoch": 1.4108322324966975, + "grad_norm": 0.6221309900283813, + "learning_rate": 0.0001, + "loss": 1.4751, + "step": 12282 + }, + { + "epoch": 1.4109471024065245, + "grad_norm": 0.6314241886138916, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 12283 + }, + { + "epoch": 1.4110619723163518, + "grad_norm": 0.5789543390274048, + "learning_rate": 0.0001, + "loss": 1.4302, + "step": 12284 + }, + { + "epoch": 1.4111768422261788, + "grad_norm": 0.6005547642707825, + "learning_rate": 0.0001, + "loss": 1.2483, + "step": 12285 + }, + { + "epoch": 1.411291712136006, + "grad_norm": 0.6064243912696838, + "learning_rate": 0.0001, + "loss": 1.5456, + "step": 12286 + }, + { + "epoch": 1.411406582045833, + "grad_norm": 0.6371053457260132, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 12287 + }, + { + "epoch": 1.4115214519556603, + "grad_norm": 0.6137773394584656, + "learning_rate": 0.0001, + "loss": 1.4473, + "step": 12288 + }, + { + "epoch": 1.4116363218654873, + "grad_norm": 0.6478999853134155, + "learning_rate": 0.0001, + "loss": 1.6225, + "step": 12289 + }, + { + "epoch": 1.4117511917753145, + "grad_norm": 0.5467931032180786, + "learning_rate": 0.0001, + "loss": 1.3421, + "step": 12290 + }, + { + "epoch": 1.4118660616851415, + "grad_norm": 0.5850721001625061, + "learning_rate": 0.0001, + "loss": 1.3787, + "step": 12291 + }, + { + "epoch": 1.4119809315949687, + "grad_norm": 0.5739957690238953, + "learning_rate": 0.0001, + "loss": 1.4527, + "step": 12292 + }, + { + "epoch": 1.4120958015047957, + "grad_norm": 0.6541691422462463, + "learning_rate": 0.0001, + "loss": 1.5428, + "step": 12293 + }, + { + "epoch": 1.412210671414623, + "grad_norm": 0.6030059456825256, + "learning_rate": 0.0001, + "loss": 1.4744, + "step": 12294 + }, + { + "epoch": 1.41232554132445, + "grad_norm": 0.5912495255470276, + "learning_rate": 0.0001, + "loss": 1.3846, + "step": 12295 + }, + { + "epoch": 1.4124404112342772, + "grad_norm": 0.6480311751365662, + "learning_rate": 0.0001, + "loss": 1.5656, + "step": 12296 + }, + { + "epoch": 1.4125552811441042, + "grad_norm": 0.6374021768569946, + "learning_rate": 0.0001, + "loss": 1.498, + "step": 12297 + }, + { + "epoch": 1.4126701510539315, + "grad_norm": 0.7061396241188049, + "learning_rate": 0.0001, + "loss": 1.5376, + "step": 12298 + }, + { + "epoch": 1.4127850209637585, + "grad_norm": 0.5619686841964722, + "learning_rate": 0.0001, + "loss": 1.4337, + "step": 12299 + }, + { + "epoch": 1.4128998908735857, + "grad_norm": 0.5579246282577515, + "learning_rate": 0.0001, + "loss": 1.4374, + "step": 12300 + }, + { + "epoch": 1.4130147607834127, + "grad_norm": 0.5994849801063538, + "learning_rate": 0.0001, + "loss": 1.4444, + "step": 12301 + }, + { + "epoch": 1.41312963069324, + "grad_norm": 0.5560767650604248, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 12302 + }, + { + "epoch": 1.413244500603067, + "grad_norm": 0.5769314765930176, + "learning_rate": 0.0001, + "loss": 1.3982, + "step": 12303 + }, + { + "epoch": 1.4133593705128942, + "grad_norm": 0.5469468235969543, + "learning_rate": 0.0001, + "loss": 1.3255, + "step": 12304 + }, + { + "epoch": 1.4134742404227212, + "grad_norm": 0.5225635170936584, + "learning_rate": 0.0001, + "loss": 1.4925, + "step": 12305 + }, + { + "epoch": 1.4135891103325484, + "grad_norm": 0.5575945377349854, + "learning_rate": 0.0001, + "loss": 1.4781, + "step": 12306 + }, + { + "epoch": 1.4137039802423756, + "grad_norm": 0.6048114895820618, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 12307 + }, + { + "epoch": 1.4138188501522027, + "grad_norm": 0.560836911201477, + "learning_rate": 0.0001, + "loss": 1.3568, + "step": 12308 + }, + { + "epoch": 1.4139337200620297, + "grad_norm": 0.5700927376747131, + "learning_rate": 0.0001, + "loss": 1.4198, + "step": 12309 + }, + { + "epoch": 1.414048589971857, + "grad_norm": 0.549271285533905, + "learning_rate": 0.0001, + "loss": 1.4586, + "step": 12310 + }, + { + "epoch": 1.4141634598816841, + "grad_norm": 0.6149371266365051, + "learning_rate": 0.0001, + "loss": 1.4705, + "step": 12311 + }, + { + "epoch": 1.4142783297915111, + "grad_norm": 0.6246234178543091, + "learning_rate": 0.0001, + "loss": 1.427, + "step": 12312 + }, + { + "epoch": 1.4143931997013381, + "grad_norm": 0.6859720349311829, + "learning_rate": 0.0001, + "loss": 1.4536, + "step": 12313 + }, + { + "epoch": 1.4145080696111654, + "grad_norm": 0.5962328910827637, + "learning_rate": 0.0001, + "loss": 1.4477, + "step": 12314 + }, + { + "epoch": 1.4146229395209926, + "grad_norm": 0.6179618239402771, + "learning_rate": 0.0001, + "loss": 1.3629, + "step": 12315 + }, + { + "epoch": 1.4147378094308196, + "grad_norm": 0.5705560445785522, + "learning_rate": 0.0001, + "loss": 1.4853, + "step": 12316 + }, + { + "epoch": 1.4148526793406466, + "grad_norm": 0.6366815567016602, + "learning_rate": 0.0001, + "loss": 1.5446, + "step": 12317 + }, + { + "epoch": 1.4149675492504739, + "grad_norm": 0.664026141166687, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 12318 + }, + { + "epoch": 1.415082419160301, + "grad_norm": 0.6808172464370728, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 12319 + }, + { + "epoch": 1.415197289070128, + "grad_norm": 0.5884132385253906, + "learning_rate": 0.0001, + "loss": 1.4446, + "step": 12320 + }, + { + "epoch": 1.415312158979955, + "grad_norm": 0.6115908026695251, + "learning_rate": 0.0001, + "loss": 1.6656, + "step": 12321 + }, + { + "epoch": 1.4154270288897823, + "grad_norm": 0.610191285610199, + "learning_rate": 0.0001, + "loss": 1.5849, + "step": 12322 + }, + { + "epoch": 1.4155418987996096, + "grad_norm": 0.5975642800331116, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 12323 + }, + { + "epoch": 1.4156567687094366, + "grad_norm": 0.5843347907066345, + "learning_rate": 0.0001, + "loss": 1.4948, + "step": 12324 + }, + { + "epoch": 1.4157716386192636, + "grad_norm": 0.5770195126533508, + "learning_rate": 0.0001, + "loss": 1.537, + "step": 12325 + }, + { + "epoch": 1.4158865085290908, + "grad_norm": 0.5760985016822815, + "learning_rate": 0.0001, + "loss": 1.4043, + "step": 12326 + }, + { + "epoch": 1.416001378438918, + "grad_norm": 0.5809157490730286, + "learning_rate": 0.0001, + "loss": 1.5153, + "step": 12327 + }, + { + "epoch": 1.416116248348745, + "grad_norm": 0.59894198179245, + "learning_rate": 0.0001, + "loss": 1.5943, + "step": 12328 + }, + { + "epoch": 1.416231118258572, + "grad_norm": 0.5635504722595215, + "learning_rate": 0.0001, + "loss": 1.4517, + "step": 12329 + }, + { + "epoch": 1.4163459881683993, + "grad_norm": 0.5939011573791504, + "learning_rate": 0.0001, + "loss": 1.6191, + "step": 12330 + }, + { + "epoch": 1.4164608580782265, + "grad_norm": 0.5841618776321411, + "learning_rate": 0.0001, + "loss": 1.4749, + "step": 12331 + }, + { + "epoch": 1.4165757279880535, + "grad_norm": 0.5628432631492615, + "learning_rate": 0.0001, + "loss": 1.3689, + "step": 12332 + }, + { + "epoch": 1.4166905978978805, + "grad_norm": 0.5952514410018921, + "learning_rate": 0.0001, + "loss": 1.3696, + "step": 12333 + }, + { + "epoch": 1.4168054678077078, + "grad_norm": 0.5918675065040588, + "learning_rate": 0.0001, + "loss": 1.4191, + "step": 12334 + }, + { + "epoch": 1.416920337717535, + "grad_norm": 0.629119873046875, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 12335 + }, + { + "epoch": 1.417035207627362, + "grad_norm": 0.5868558287620544, + "learning_rate": 0.0001, + "loss": 1.4134, + "step": 12336 + }, + { + "epoch": 1.417150077537189, + "grad_norm": 0.6055565476417542, + "learning_rate": 0.0001, + "loss": 1.3543, + "step": 12337 + }, + { + "epoch": 1.4172649474470163, + "grad_norm": 0.5665161609649658, + "learning_rate": 0.0001, + "loss": 1.4376, + "step": 12338 + }, + { + "epoch": 1.4173798173568435, + "grad_norm": 0.6178751587867737, + "learning_rate": 0.0001, + "loss": 1.4882, + "step": 12339 + }, + { + "epoch": 1.4174946872666705, + "grad_norm": 0.6303858160972595, + "learning_rate": 0.0001, + "loss": 1.5314, + "step": 12340 + }, + { + "epoch": 1.4176095571764975, + "grad_norm": 0.6005488634109497, + "learning_rate": 0.0001, + "loss": 1.4275, + "step": 12341 + }, + { + "epoch": 1.4177244270863247, + "grad_norm": 0.6242131590843201, + "learning_rate": 0.0001, + "loss": 1.2941, + "step": 12342 + }, + { + "epoch": 1.417839296996152, + "grad_norm": 0.6233993768692017, + "learning_rate": 0.0001, + "loss": 1.5075, + "step": 12343 + }, + { + "epoch": 1.417954166905979, + "grad_norm": 0.6324300169944763, + "learning_rate": 0.0001, + "loss": 1.5645, + "step": 12344 + }, + { + "epoch": 1.418069036815806, + "grad_norm": 0.6000421047210693, + "learning_rate": 0.0001, + "loss": 1.4319, + "step": 12345 + }, + { + "epoch": 1.4181839067256332, + "grad_norm": 0.6152875423431396, + "learning_rate": 0.0001, + "loss": 1.3411, + "step": 12346 + }, + { + "epoch": 1.4182987766354604, + "grad_norm": 0.6526452898979187, + "learning_rate": 0.0001, + "loss": 1.6113, + "step": 12347 + }, + { + "epoch": 1.4184136465452875, + "grad_norm": 0.6107121109962463, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 12348 + }, + { + "epoch": 1.4185285164551145, + "grad_norm": 0.5946810841560364, + "learning_rate": 0.0001, + "loss": 1.5623, + "step": 12349 + }, + { + "epoch": 1.4186433863649417, + "grad_norm": 0.5849339365959167, + "learning_rate": 0.0001, + "loss": 1.4287, + "step": 12350 + }, + { + "epoch": 1.418758256274769, + "grad_norm": 0.6404920816421509, + "learning_rate": 0.0001, + "loss": 1.596, + "step": 12351 + }, + { + "epoch": 1.418873126184596, + "grad_norm": 0.6268463134765625, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 12352 + }, + { + "epoch": 1.418987996094423, + "grad_norm": 0.5831562876701355, + "learning_rate": 0.0001, + "loss": 1.3506, + "step": 12353 + }, + { + "epoch": 1.4191028660042502, + "grad_norm": 0.6019381284713745, + "learning_rate": 0.0001, + "loss": 1.428, + "step": 12354 + }, + { + "epoch": 1.4192177359140774, + "grad_norm": 0.6480855941772461, + "learning_rate": 0.0001, + "loss": 1.4975, + "step": 12355 + }, + { + "epoch": 1.4193326058239044, + "grad_norm": 0.6324148774147034, + "learning_rate": 0.0001, + "loss": 1.3886, + "step": 12356 + }, + { + "epoch": 1.4194474757337314, + "grad_norm": 0.5891407132148743, + "learning_rate": 0.0001, + "loss": 1.4664, + "step": 12357 + }, + { + "epoch": 1.4195623456435587, + "grad_norm": 0.5745696425437927, + "learning_rate": 0.0001, + "loss": 1.4676, + "step": 12358 + }, + { + "epoch": 1.4196772155533859, + "grad_norm": 0.5865878462791443, + "learning_rate": 0.0001, + "loss": 1.4144, + "step": 12359 + }, + { + "epoch": 1.419792085463213, + "grad_norm": 0.5463162064552307, + "learning_rate": 0.0001, + "loss": 1.2957, + "step": 12360 + }, + { + "epoch": 1.41990695537304, + "grad_norm": 0.5702036619186401, + "learning_rate": 0.0001, + "loss": 1.3271, + "step": 12361 + }, + { + "epoch": 1.4200218252828671, + "grad_norm": 0.5767183899879456, + "learning_rate": 0.0001, + "loss": 1.5055, + "step": 12362 + }, + { + "epoch": 1.4201366951926944, + "grad_norm": 0.6075996160507202, + "learning_rate": 0.0001, + "loss": 1.5536, + "step": 12363 + }, + { + "epoch": 1.4202515651025214, + "grad_norm": 0.6220189332962036, + "learning_rate": 0.0001, + "loss": 1.4745, + "step": 12364 + }, + { + "epoch": 1.4203664350123484, + "grad_norm": 0.5823253393173218, + "learning_rate": 0.0001, + "loss": 1.4564, + "step": 12365 + }, + { + "epoch": 1.4204813049221756, + "grad_norm": 0.592589259147644, + "learning_rate": 0.0001, + "loss": 1.3409, + "step": 12366 + }, + { + "epoch": 1.4205961748320028, + "grad_norm": 0.624343991279602, + "learning_rate": 0.0001, + "loss": 1.4716, + "step": 12367 + }, + { + "epoch": 1.4207110447418299, + "grad_norm": 0.568588137626648, + "learning_rate": 0.0001, + "loss": 1.4421, + "step": 12368 + }, + { + "epoch": 1.4208259146516569, + "grad_norm": 0.6244280338287354, + "learning_rate": 0.0001, + "loss": 1.3673, + "step": 12369 + }, + { + "epoch": 1.420940784561484, + "grad_norm": 0.5823641419410706, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 12370 + }, + { + "epoch": 1.4210556544713113, + "grad_norm": 0.627984344959259, + "learning_rate": 0.0001, + "loss": 1.396, + "step": 12371 + }, + { + "epoch": 1.4211705243811383, + "grad_norm": 0.6518086194992065, + "learning_rate": 0.0001, + "loss": 1.1716, + "step": 12372 + }, + { + "epoch": 1.4212853942909656, + "grad_norm": 0.6243144273757935, + "learning_rate": 0.0001, + "loss": 1.5423, + "step": 12373 + }, + { + "epoch": 1.4214002642007926, + "grad_norm": 0.6055872440338135, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 12374 + }, + { + "epoch": 1.4215151341106198, + "grad_norm": 0.5985282063484192, + "learning_rate": 0.0001, + "loss": 1.4185, + "step": 12375 + }, + { + "epoch": 1.4216300040204468, + "grad_norm": 0.5503548383712769, + "learning_rate": 0.0001, + "loss": 1.524, + "step": 12376 + }, + { + "epoch": 1.421744873930274, + "grad_norm": 0.5615320205688477, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 12377 + }, + { + "epoch": 1.421859743840101, + "grad_norm": 0.5671427249908447, + "learning_rate": 0.0001, + "loss": 1.4848, + "step": 12378 + }, + { + "epoch": 1.4219746137499283, + "grad_norm": 0.6086657643318176, + "learning_rate": 0.0001, + "loss": 1.5969, + "step": 12379 + }, + { + "epoch": 1.4220894836597553, + "grad_norm": 0.567254364490509, + "learning_rate": 0.0001, + "loss": 1.4461, + "step": 12380 + }, + { + "epoch": 1.4222043535695825, + "grad_norm": 0.655489444732666, + "learning_rate": 0.0001, + "loss": 1.4207, + "step": 12381 + }, + { + "epoch": 1.4223192234794095, + "grad_norm": 0.5274955630302429, + "learning_rate": 0.0001, + "loss": 1.2654, + "step": 12382 + }, + { + "epoch": 1.4224340933892368, + "grad_norm": 0.574601948261261, + "learning_rate": 0.0001, + "loss": 1.4474, + "step": 12383 + }, + { + "epoch": 1.4225489632990638, + "grad_norm": 0.5798234343528748, + "learning_rate": 0.0001, + "loss": 1.4091, + "step": 12384 + }, + { + "epoch": 1.422663833208891, + "grad_norm": 0.6089479923248291, + "learning_rate": 0.0001, + "loss": 1.2936, + "step": 12385 + }, + { + "epoch": 1.422778703118718, + "grad_norm": 0.5854637026786804, + "learning_rate": 0.0001, + "loss": 1.4876, + "step": 12386 + }, + { + "epoch": 1.4228935730285452, + "grad_norm": 0.681310772895813, + "learning_rate": 0.0001, + "loss": 1.7594, + "step": 12387 + }, + { + "epoch": 1.4230084429383723, + "grad_norm": 0.5869307518005371, + "learning_rate": 0.0001, + "loss": 1.3828, + "step": 12388 + }, + { + "epoch": 1.4231233128481995, + "grad_norm": 0.5680341124534607, + "learning_rate": 0.0001, + "loss": 1.3795, + "step": 12389 + }, + { + "epoch": 1.4232381827580265, + "grad_norm": 0.5726298093795776, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 12390 + }, + { + "epoch": 1.4233530526678537, + "grad_norm": 0.6122094988822937, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 12391 + }, + { + "epoch": 1.4234679225776807, + "grad_norm": 0.6463402509689331, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 12392 + }, + { + "epoch": 1.423582792487508, + "grad_norm": 0.6010882258415222, + "learning_rate": 0.0001, + "loss": 1.3532, + "step": 12393 + }, + { + "epoch": 1.423697662397335, + "grad_norm": 0.5905262231826782, + "learning_rate": 0.0001, + "loss": 1.2767, + "step": 12394 + }, + { + "epoch": 1.4238125323071622, + "grad_norm": 0.5448614358901978, + "learning_rate": 0.0001, + "loss": 1.4319, + "step": 12395 + }, + { + "epoch": 1.4239274022169892, + "grad_norm": 0.5370798707008362, + "learning_rate": 0.0001, + "loss": 1.4836, + "step": 12396 + }, + { + "epoch": 1.4240422721268164, + "grad_norm": 0.5748822689056396, + "learning_rate": 0.0001, + "loss": 1.4327, + "step": 12397 + }, + { + "epoch": 1.4241571420366435, + "grad_norm": 0.6279718279838562, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 12398 + }, + { + "epoch": 1.4242720119464707, + "grad_norm": 0.612967848777771, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 12399 + }, + { + "epoch": 1.4243868818562977, + "grad_norm": 0.5825915336608887, + "learning_rate": 0.0001, + "loss": 1.4405, + "step": 12400 + }, + { + "epoch": 1.424501751766125, + "grad_norm": 0.5820374488830566, + "learning_rate": 0.0001, + "loss": 1.607, + "step": 12401 + }, + { + "epoch": 1.424616621675952, + "grad_norm": 0.5795537829399109, + "learning_rate": 0.0001, + "loss": 1.3931, + "step": 12402 + }, + { + "epoch": 1.4247314915857792, + "grad_norm": 0.5845256447792053, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 12403 + }, + { + "epoch": 1.4248463614956062, + "grad_norm": 0.6226243376731873, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 12404 + }, + { + "epoch": 1.4249612314054334, + "grad_norm": 0.5559086203575134, + "learning_rate": 0.0001, + "loss": 1.488, + "step": 12405 + }, + { + "epoch": 1.4250761013152604, + "grad_norm": 0.5699470043182373, + "learning_rate": 0.0001, + "loss": 1.4175, + "step": 12406 + }, + { + "epoch": 1.4251909712250876, + "grad_norm": 0.6197097301483154, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 12407 + }, + { + "epoch": 1.4253058411349147, + "grad_norm": 0.5672743320465088, + "learning_rate": 0.0001, + "loss": 1.4876, + "step": 12408 + }, + { + "epoch": 1.4254207110447419, + "grad_norm": 0.5639980435371399, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 12409 + }, + { + "epoch": 1.425535580954569, + "grad_norm": 0.6176483631134033, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 12410 + }, + { + "epoch": 1.4256504508643961, + "grad_norm": 0.599812388420105, + "learning_rate": 0.0001, + "loss": 1.3885, + "step": 12411 + }, + { + "epoch": 1.4257653207742231, + "grad_norm": 0.5804879069328308, + "learning_rate": 0.0001, + "loss": 1.4213, + "step": 12412 + }, + { + "epoch": 1.4258801906840504, + "grad_norm": 0.553686261177063, + "learning_rate": 0.0001, + "loss": 1.4293, + "step": 12413 + }, + { + "epoch": 1.4259950605938774, + "grad_norm": 0.5969316959381104, + "learning_rate": 0.0001, + "loss": 1.4132, + "step": 12414 + }, + { + "epoch": 1.4261099305037046, + "grad_norm": 0.6206128597259521, + "learning_rate": 0.0001, + "loss": 1.4204, + "step": 12415 + }, + { + "epoch": 1.4262248004135316, + "grad_norm": 0.6273772120475769, + "learning_rate": 0.0001, + "loss": 1.5474, + "step": 12416 + }, + { + "epoch": 1.4263396703233588, + "grad_norm": 0.5739526748657227, + "learning_rate": 0.0001, + "loss": 1.5694, + "step": 12417 + }, + { + "epoch": 1.4264545402331859, + "grad_norm": 0.5758925080299377, + "learning_rate": 0.0001, + "loss": 1.3505, + "step": 12418 + }, + { + "epoch": 1.426569410143013, + "grad_norm": 0.5880077481269836, + "learning_rate": 0.0001, + "loss": 1.4535, + "step": 12419 + }, + { + "epoch": 1.42668428005284, + "grad_norm": 0.6464236378669739, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 12420 + }, + { + "epoch": 1.4267991499626673, + "grad_norm": 0.6200028657913208, + "learning_rate": 0.0001, + "loss": 1.4498, + "step": 12421 + }, + { + "epoch": 1.4269140198724943, + "grad_norm": 0.5777352452278137, + "learning_rate": 0.0001, + "loss": 1.3672, + "step": 12422 + }, + { + "epoch": 1.4270288897823216, + "grad_norm": 0.6160234808921814, + "learning_rate": 0.0001, + "loss": 1.6537, + "step": 12423 + }, + { + "epoch": 1.4271437596921486, + "grad_norm": 0.5613859295845032, + "learning_rate": 0.0001, + "loss": 1.1053, + "step": 12424 + }, + { + "epoch": 1.4272586296019758, + "grad_norm": 0.5593403577804565, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 12425 + }, + { + "epoch": 1.4273734995118028, + "grad_norm": 0.5781793594360352, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 12426 + }, + { + "epoch": 1.42748836942163, + "grad_norm": 0.5942292809486389, + "learning_rate": 0.0001, + "loss": 1.4096, + "step": 12427 + }, + { + "epoch": 1.427603239331457, + "grad_norm": 0.5752783417701721, + "learning_rate": 0.0001, + "loss": 1.3625, + "step": 12428 + }, + { + "epoch": 1.4277181092412843, + "grad_norm": 0.7537838816642761, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 12429 + }, + { + "epoch": 1.4278329791511113, + "grad_norm": 0.5820350050926208, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 12430 + }, + { + "epoch": 1.4279478490609385, + "grad_norm": 0.5638639330863953, + "learning_rate": 0.0001, + "loss": 1.4653, + "step": 12431 + }, + { + "epoch": 1.4280627189707655, + "grad_norm": 0.6400308012962341, + "learning_rate": 0.0001, + "loss": 1.6973, + "step": 12432 + }, + { + "epoch": 1.4281775888805928, + "grad_norm": 0.5983318090438843, + "learning_rate": 0.0001, + "loss": 1.4876, + "step": 12433 + }, + { + "epoch": 1.4282924587904198, + "grad_norm": 0.6239731311798096, + "learning_rate": 0.0001, + "loss": 1.7114, + "step": 12434 + }, + { + "epoch": 1.428407328700247, + "grad_norm": 0.6159213781356812, + "learning_rate": 0.0001, + "loss": 1.4131, + "step": 12435 + }, + { + "epoch": 1.428522198610074, + "grad_norm": 0.5617234110832214, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 12436 + }, + { + "epoch": 1.4286370685199012, + "grad_norm": 0.5928561687469482, + "learning_rate": 0.0001, + "loss": 1.5274, + "step": 12437 + }, + { + "epoch": 1.4287519384297283, + "grad_norm": 0.6398497819900513, + "learning_rate": 0.0001, + "loss": 1.5243, + "step": 12438 + }, + { + "epoch": 1.4288668083395555, + "grad_norm": 0.6484475135803223, + "learning_rate": 0.0001, + "loss": 1.6719, + "step": 12439 + }, + { + "epoch": 1.4289816782493825, + "grad_norm": 0.6493231058120728, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 12440 + }, + { + "epoch": 1.4290965481592097, + "grad_norm": 0.5870835781097412, + "learning_rate": 0.0001, + "loss": 1.2109, + "step": 12441 + }, + { + "epoch": 1.4292114180690367, + "grad_norm": 0.6100109815597534, + "learning_rate": 0.0001, + "loss": 1.4725, + "step": 12442 + }, + { + "epoch": 1.429326287978864, + "grad_norm": 0.6027024388313293, + "learning_rate": 0.0001, + "loss": 1.262, + "step": 12443 + }, + { + "epoch": 1.4294411578886912, + "grad_norm": 0.635283887386322, + "learning_rate": 0.0001, + "loss": 1.4945, + "step": 12444 + }, + { + "epoch": 1.4295560277985182, + "grad_norm": 0.5338662266731262, + "learning_rate": 0.0001, + "loss": 1.2908, + "step": 12445 + }, + { + "epoch": 1.4296708977083452, + "grad_norm": 0.579879879951477, + "learning_rate": 0.0001, + "loss": 1.3954, + "step": 12446 + }, + { + "epoch": 1.4297857676181724, + "grad_norm": 0.5709049105644226, + "learning_rate": 0.0001, + "loss": 1.4578, + "step": 12447 + }, + { + "epoch": 1.4299006375279997, + "grad_norm": 0.5717498660087585, + "learning_rate": 0.0001, + "loss": 1.4321, + "step": 12448 + }, + { + "epoch": 1.4300155074378267, + "grad_norm": 0.6005428433418274, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 12449 + }, + { + "epoch": 1.4301303773476537, + "grad_norm": 0.5593380928039551, + "learning_rate": 0.0001, + "loss": 1.4149, + "step": 12450 + }, + { + "epoch": 1.430245247257481, + "grad_norm": 0.5910133719444275, + "learning_rate": 0.0001, + "loss": 1.3897, + "step": 12451 + }, + { + "epoch": 1.4303601171673082, + "grad_norm": 0.6213868856430054, + "learning_rate": 0.0001, + "loss": 1.4776, + "step": 12452 + }, + { + "epoch": 1.4304749870771352, + "grad_norm": 0.552069365978241, + "learning_rate": 0.0001, + "loss": 1.0546, + "step": 12453 + }, + { + "epoch": 1.4305898569869622, + "grad_norm": 0.5821637511253357, + "learning_rate": 0.0001, + "loss": 1.4505, + "step": 12454 + }, + { + "epoch": 1.4307047268967894, + "grad_norm": 0.6346561312675476, + "learning_rate": 0.0001, + "loss": 1.3883, + "step": 12455 + }, + { + "epoch": 1.4308195968066166, + "grad_norm": 0.5623376369476318, + "learning_rate": 0.0001, + "loss": 1.2515, + "step": 12456 + }, + { + "epoch": 1.4309344667164436, + "grad_norm": 0.5908299088478088, + "learning_rate": 0.0001, + "loss": 1.5804, + "step": 12457 + }, + { + "epoch": 1.4310493366262707, + "grad_norm": 0.5913065671920776, + "learning_rate": 0.0001, + "loss": 1.3443, + "step": 12458 + }, + { + "epoch": 1.4311642065360979, + "grad_norm": 0.6305943727493286, + "learning_rate": 0.0001, + "loss": 1.4428, + "step": 12459 + }, + { + "epoch": 1.4312790764459251, + "grad_norm": 0.611919105052948, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 12460 + }, + { + "epoch": 1.4313939463557521, + "grad_norm": 0.5658090710639954, + "learning_rate": 0.0001, + "loss": 1.4647, + "step": 12461 + }, + { + "epoch": 1.4315088162655791, + "grad_norm": 0.5929034948348999, + "learning_rate": 0.0001, + "loss": 1.4665, + "step": 12462 + }, + { + "epoch": 1.4316236861754064, + "grad_norm": 0.6326093673706055, + "learning_rate": 0.0001, + "loss": 1.4622, + "step": 12463 + }, + { + "epoch": 1.4317385560852336, + "grad_norm": 0.6289833784103394, + "learning_rate": 0.0001, + "loss": 1.5712, + "step": 12464 + }, + { + "epoch": 1.4318534259950606, + "grad_norm": 0.6180969476699829, + "learning_rate": 0.0001, + "loss": 1.5338, + "step": 12465 + }, + { + "epoch": 1.4319682959048876, + "grad_norm": 0.5622907876968384, + "learning_rate": 0.0001, + "loss": 1.4497, + "step": 12466 + }, + { + "epoch": 1.4320831658147148, + "grad_norm": 0.5641545653343201, + "learning_rate": 0.0001, + "loss": 1.5119, + "step": 12467 + }, + { + "epoch": 1.432198035724542, + "grad_norm": 0.6269649863243103, + "learning_rate": 0.0001, + "loss": 1.3424, + "step": 12468 + }, + { + "epoch": 1.432312905634369, + "grad_norm": 0.5639840960502625, + "learning_rate": 0.0001, + "loss": 1.4185, + "step": 12469 + }, + { + "epoch": 1.432427775544196, + "grad_norm": 0.630839467048645, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 12470 + }, + { + "epoch": 1.4325426454540233, + "grad_norm": 0.5446004867553711, + "learning_rate": 0.0001, + "loss": 1.2578, + "step": 12471 + }, + { + "epoch": 1.4326575153638506, + "grad_norm": 0.5677520036697388, + "learning_rate": 0.0001, + "loss": 1.5337, + "step": 12472 + }, + { + "epoch": 1.4327723852736776, + "grad_norm": 0.6485244035720825, + "learning_rate": 0.0001, + "loss": 1.336, + "step": 12473 + }, + { + "epoch": 1.4328872551835046, + "grad_norm": 0.6378061771392822, + "learning_rate": 0.0001, + "loss": 1.6011, + "step": 12474 + }, + { + "epoch": 1.4330021250933318, + "grad_norm": 0.5592242479324341, + "learning_rate": 0.0001, + "loss": 1.5086, + "step": 12475 + }, + { + "epoch": 1.433116995003159, + "grad_norm": 0.5973872542381287, + "learning_rate": 0.0001, + "loss": 1.655, + "step": 12476 + }, + { + "epoch": 1.433231864912986, + "grad_norm": 0.6042687892913818, + "learning_rate": 0.0001, + "loss": 1.4897, + "step": 12477 + }, + { + "epoch": 1.433346734822813, + "grad_norm": 0.6249580979347229, + "learning_rate": 0.0001, + "loss": 1.504, + "step": 12478 + }, + { + "epoch": 1.4334616047326403, + "grad_norm": 0.5797646045684814, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 12479 + }, + { + "epoch": 1.4335764746424675, + "grad_norm": 0.616908609867096, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 12480 + }, + { + "epoch": 1.4336913445522945, + "grad_norm": 0.5572378039360046, + "learning_rate": 0.0001, + "loss": 1.5465, + "step": 12481 + }, + { + "epoch": 1.4338062144621215, + "grad_norm": 0.6019822955131531, + "learning_rate": 0.0001, + "loss": 1.2772, + "step": 12482 + }, + { + "epoch": 1.4339210843719488, + "grad_norm": 0.5784163475036621, + "learning_rate": 0.0001, + "loss": 1.4937, + "step": 12483 + }, + { + "epoch": 1.434035954281776, + "grad_norm": 0.6143441796302795, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 12484 + }, + { + "epoch": 1.434150824191603, + "grad_norm": 0.6093927621841431, + "learning_rate": 0.0001, + "loss": 1.3583, + "step": 12485 + }, + { + "epoch": 1.43426569410143, + "grad_norm": 0.6261629462242126, + "learning_rate": 0.0001, + "loss": 1.4718, + "step": 12486 + }, + { + "epoch": 1.4343805640112572, + "grad_norm": 0.5928403735160828, + "learning_rate": 0.0001, + "loss": 1.4865, + "step": 12487 + }, + { + "epoch": 1.4344954339210845, + "grad_norm": 0.5965852737426758, + "learning_rate": 0.0001, + "loss": 1.5387, + "step": 12488 + }, + { + "epoch": 1.4346103038309115, + "grad_norm": 0.6321581602096558, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 12489 + }, + { + "epoch": 1.4347251737407385, + "grad_norm": 0.5445297956466675, + "learning_rate": 0.0001, + "loss": 1.2522, + "step": 12490 + }, + { + "epoch": 1.4348400436505657, + "grad_norm": 0.5668053030967712, + "learning_rate": 0.0001, + "loss": 1.2608, + "step": 12491 + }, + { + "epoch": 1.434954913560393, + "grad_norm": 0.606621265411377, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 12492 + }, + { + "epoch": 1.43506978347022, + "grad_norm": 0.5955449938774109, + "learning_rate": 0.0001, + "loss": 1.2963, + "step": 12493 + }, + { + "epoch": 1.435184653380047, + "grad_norm": 0.6068549156188965, + "learning_rate": 0.0001, + "loss": 1.6448, + "step": 12494 + }, + { + "epoch": 1.4352995232898742, + "grad_norm": 0.5606309175491333, + "learning_rate": 0.0001, + "loss": 1.3839, + "step": 12495 + }, + { + "epoch": 1.4354143931997014, + "grad_norm": 0.58128821849823, + "learning_rate": 0.0001, + "loss": 1.5119, + "step": 12496 + }, + { + "epoch": 1.4355292631095284, + "grad_norm": 0.5731803774833679, + "learning_rate": 0.0001, + "loss": 1.3732, + "step": 12497 + }, + { + "epoch": 1.4356441330193555, + "grad_norm": 0.5677974224090576, + "learning_rate": 0.0001, + "loss": 1.4033, + "step": 12498 + }, + { + "epoch": 1.4357590029291827, + "grad_norm": 0.6500546336174011, + "learning_rate": 0.0001, + "loss": 1.6548, + "step": 12499 + }, + { + "epoch": 1.43587387283901, + "grad_norm": 0.5387493968009949, + "learning_rate": 0.0001, + "loss": 1.3859, + "step": 12500 + }, + { + "epoch": 1.435988742748837, + "grad_norm": 0.630449652671814, + "learning_rate": 0.0001, + "loss": 1.5172, + "step": 12501 + }, + { + "epoch": 1.436103612658664, + "grad_norm": 0.5695238709449768, + "learning_rate": 0.0001, + "loss": 1.2597, + "step": 12502 + }, + { + "epoch": 1.4362184825684912, + "grad_norm": 0.611565113067627, + "learning_rate": 0.0001, + "loss": 1.4379, + "step": 12503 + }, + { + "epoch": 1.4363333524783184, + "grad_norm": 0.5593751668930054, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 12504 + }, + { + "epoch": 1.4364482223881454, + "grad_norm": 0.5889508724212646, + "learning_rate": 0.0001, + "loss": 1.5008, + "step": 12505 + }, + { + "epoch": 1.4365630922979724, + "grad_norm": 0.6173695921897888, + "learning_rate": 0.0001, + "loss": 1.5322, + "step": 12506 + }, + { + "epoch": 1.4366779622077996, + "grad_norm": 0.591052770614624, + "learning_rate": 0.0001, + "loss": 1.4149, + "step": 12507 + }, + { + "epoch": 1.4367928321176269, + "grad_norm": 0.5677117109298706, + "learning_rate": 0.0001, + "loss": 1.4339, + "step": 12508 + }, + { + "epoch": 1.4369077020274539, + "grad_norm": 0.5549595355987549, + "learning_rate": 0.0001, + "loss": 1.2642, + "step": 12509 + }, + { + "epoch": 1.4370225719372811, + "grad_norm": 0.5533115267753601, + "learning_rate": 0.0001, + "loss": 1.4143, + "step": 12510 + }, + { + "epoch": 1.4371374418471081, + "grad_norm": 0.5983604788780212, + "learning_rate": 0.0001, + "loss": 1.3387, + "step": 12511 + }, + { + "epoch": 1.4372523117569354, + "grad_norm": 0.6087487936019897, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 12512 + }, + { + "epoch": 1.4373671816667624, + "grad_norm": 0.629578709602356, + "learning_rate": 0.0001, + "loss": 1.4994, + "step": 12513 + }, + { + "epoch": 1.4374820515765896, + "grad_norm": 0.6290776133537292, + "learning_rate": 0.0001, + "loss": 1.5329, + "step": 12514 + }, + { + "epoch": 1.4375969214864166, + "grad_norm": 0.6354580521583557, + "learning_rate": 0.0001, + "loss": 1.5526, + "step": 12515 + }, + { + "epoch": 1.4377117913962438, + "grad_norm": 0.5979134440422058, + "learning_rate": 0.0001, + "loss": 1.5663, + "step": 12516 + }, + { + "epoch": 1.4378266613060708, + "grad_norm": 0.5574628114700317, + "learning_rate": 0.0001, + "loss": 1.2554, + "step": 12517 + }, + { + "epoch": 1.437941531215898, + "grad_norm": 0.5842434763908386, + "learning_rate": 0.0001, + "loss": 1.3712, + "step": 12518 + }, + { + "epoch": 1.438056401125725, + "grad_norm": 0.6743392944335938, + "learning_rate": 0.0001, + "loss": 1.7602, + "step": 12519 + }, + { + "epoch": 1.4381712710355523, + "grad_norm": 0.576398491859436, + "learning_rate": 0.0001, + "loss": 1.3749, + "step": 12520 + }, + { + "epoch": 1.4382861409453793, + "grad_norm": 0.5756230354309082, + "learning_rate": 0.0001, + "loss": 1.4367, + "step": 12521 + }, + { + "epoch": 1.4384010108552066, + "grad_norm": 0.6155824065208435, + "learning_rate": 0.0001, + "loss": 1.5771, + "step": 12522 + }, + { + "epoch": 1.4385158807650336, + "grad_norm": 0.6590107679367065, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 12523 + }, + { + "epoch": 1.4386307506748608, + "grad_norm": 0.6111704707145691, + "learning_rate": 0.0001, + "loss": 1.6088, + "step": 12524 + }, + { + "epoch": 1.4387456205846878, + "grad_norm": 0.6221973299980164, + "learning_rate": 0.0001, + "loss": 1.4813, + "step": 12525 + }, + { + "epoch": 1.438860490494515, + "grad_norm": 0.5589838624000549, + "learning_rate": 0.0001, + "loss": 1.2726, + "step": 12526 + }, + { + "epoch": 1.438975360404342, + "grad_norm": 0.6924046874046326, + "learning_rate": 0.0001, + "loss": 1.1983, + "step": 12527 + }, + { + "epoch": 1.4390902303141693, + "grad_norm": 0.5677695870399475, + "learning_rate": 0.0001, + "loss": 1.5127, + "step": 12528 + }, + { + "epoch": 1.4392051002239963, + "grad_norm": 0.6168467402458191, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 12529 + }, + { + "epoch": 1.4393199701338235, + "grad_norm": 0.5841947197914124, + "learning_rate": 0.0001, + "loss": 1.5834, + "step": 12530 + }, + { + "epoch": 1.4394348400436505, + "grad_norm": 0.6030665636062622, + "learning_rate": 0.0001, + "loss": 1.5023, + "step": 12531 + }, + { + "epoch": 1.4395497099534778, + "grad_norm": 0.6445423364639282, + "learning_rate": 0.0001, + "loss": 1.6561, + "step": 12532 + }, + { + "epoch": 1.4396645798633048, + "grad_norm": 0.5743504166603088, + "learning_rate": 0.0001, + "loss": 1.3297, + "step": 12533 + }, + { + "epoch": 1.439779449773132, + "grad_norm": 0.6461590528488159, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 12534 + }, + { + "epoch": 1.439894319682959, + "grad_norm": 0.535851001739502, + "learning_rate": 0.0001, + "loss": 1.343, + "step": 12535 + }, + { + "epoch": 1.4400091895927862, + "grad_norm": 0.6115723252296448, + "learning_rate": 0.0001, + "loss": 1.4831, + "step": 12536 + }, + { + "epoch": 1.4401240595026132, + "grad_norm": 0.5796070694923401, + "learning_rate": 0.0001, + "loss": 1.4744, + "step": 12537 + }, + { + "epoch": 1.4402389294124405, + "grad_norm": 0.6212785243988037, + "learning_rate": 0.0001, + "loss": 1.4577, + "step": 12538 + }, + { + "epoch": 1.4403537993222675, + "grad_norm": 0.6039834022521973, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 12539 + }, + { + "epoch": 1.4404686692320947, + "grad_norm": 0.6413995027542114, + "learning_rate": 0.0001, + "loss": 1.4666, + "step": 12540 + }, + { + "epoch": 1.4405835391419217, + "grad_norm": 0.6055036187171936, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 12541 + }, + { + "epoch": 1.440698409051749, + "grad_norm": 0.5628538727760315, + "learning_rate": 0.0001, + "loss": 1.442, + "step": 12542 + }, + { + "epoch": 1.440813278961576, + "grad_norm": 0.6067841649055481, + "learning_rate": 0.0001, + "loss": 1.5706, + "step": 12543 + }, + { + "epoch": 1.4409281488714032, + "grad_norm": 0.5712527632713318, + "learning_rate": 0.0001, + "loss": 1.421, + "step": 12544 + }, + { + "epoch": 1.4410430187812302, + "grad_norm": 0.5913591384887695, + "learning_rate": 0.0001, + "loss": 1.469, + "step": 12545 + }, + { + "epoch": 1.4411578886910574, + "grad_norm": 0.5872635841369629, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 12546 + }, + { + "epoch": 1.4412727586008844, + "grad_norm": 0.6234807372093201, + "learning_rate": 0.0001, + "loss": 1.4239, + "step": 12547 + }, + { + "epoch": 1.4413876285107117, + "grad_norm": 0.6241154074668884, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 12548 + }, + { + "epoch": 1.4415024984205387, + "grad_norm": 0.5826482772827148, + "learning_rate": 0.0001, + "loss": 1.3162, + "step": 12549 + }, + { + "epoch": 1.441617368330366, + "grad_norm": 0.5742958784103394, + "learning_rate": 0.0001, + "loss": 1.5757, + "step": 12550 + }, + { + "epoch": 1.441732238240193, + "grad_norm": 0.5949505567550659, + "learning_rate": 0.0001, + "loss": 1.3672, + "step": 12551 + }, + { + "epoch": 1.4418471081500202, + "grad_norm": 0.6136342287063599, + "learning_rate": 0.0001, + "loss": 1.4474, + "step": 12552 + }, + { + "epoch": 1.4419619780598472, + "grad_norm": 0.5760694146156311, + "learning_rate": 0.0001, + "loss": 1.3999, + "step": 12553 + }, + { + "epoch": 1.4420768479696744, + "grad_norm": 0.6037359833717346, + "learning_rate": 0.0001, + "loss": 1.7367, + "step": 12554 + }, + { + "epoch": 1.4421917178795014, + "grad_norm": 0.5782321095466614, + "learning_rate": 0.0001, + "loss": 1.3474, + "step": 12555 + }, + { + "epoch": 1.4423065877893286, + "grad_norm": 0.5973963737487793, + "learning_rate": 0.0001, + "loss": 1.5053, + "step": 12556 + }, + { + "epoch": 1.4424214576991556, + "grad_norm": 0.588340699672699, + "learning_rate": 0.0001, + "loss": 1.5116, + "step": 12557 + }, + { + "epoch": 1.4425363276089829, + "grad_norm": 0.5831438302993774, + "learning_rate": 0.0001, + "loss": 1.4753, + "step": 12558 + }, + { + "epoch": 1.4426511975188099, + "grad_norm": 0.6051815748214722, + "learning_rate": 0.0001, + "loss": 1.466, + "step": 12559 + }, + { + "epoch": 1.4427660674286371, + "grad_norm": 0.5373536348342896, + "learning_rate": 0.0001, + "loss": 1.049, + "step": 12560 + }, + { + "epoch": 1.4428809373384641, + "grad_norm": 0.6360757350921631, + "learning_rate": 0.0001, + "loss": 1.497, + "step": 12561 + }, + { + "epoch": 1.4429958072482914, + "grad_norm": 0.6303163170814514, + "learning_rate": 0.0001, + "loss": 1.5535, + "step": 12562 + }, + { + "epoch": 1.4431106771581184, + "grad_norm": 0.6021014451980591, + "learning_rate": 0.0001, + "loss": 1.3477, + "step": 12563 + }, + { + "epoch": 1.4432255470679456, + "grad_norm": 0.6191940307617188, + "learning_rate": 0.0001, + "loss": 1.5356, + "step": 12564 + }, + { + "epoch": 1.4433404169777726, + "grad_norm": 0.6756370067596436, + "learning_rate": 0.0001, + "loss": 1.5671, + "step": 12565 + }, + { + "epoch": 1.4434552868875998, + "grad_norm": 0.7018874883651733, + "learning_rate": 0.0001, + "loss": 1.863, + "step": 12566 + }, + { + "epoch": 1.4435701567974268, + "grad_norm": 0.5474680066108704, + "learning_rate": 0.0001, + "loss": 1.3566, + "step": 12567 + }, + { + "epoch": 1.443685026707254, + "grad_norm": 0.6140058040618896, + "learning_rate": 0.0001, + "loss": 1.2638, + "step": 12568 + }, + { + "epoch": 1.443799896617081, + "grad_norm": 0.5624337196350098, + "learning_rate": 0.0001, + "loss": 1.4342, + "step": 12569 + }, + { + "epoch": 1.4439147665269083, + "grad_norm": 0.6037470698356628, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 12570 + }, + { + "epoch": 1.4440296364367353, + "grad_norm": 0.594399094581604, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 12571 + }, + { + "epoch": 1.4441445063465626, + "grad_norm": 0.5606778264045715, + "learning_rate": 0.0001, + "loss": 1.3396, + "step": 12572 + }, + { + "epoch": 1.4442593762563896, + "grad_norm": 0.6168720126152039, + "learning_rate": 0.0001, + "loss": 1.4917, + "step": 12573 + }, + { + "epoch": 1.4443742461662168, + "grad_norm": 0.5519090294837952, + "learning_rate": 0.0001, + "loss": 1.5099, + "step": 12574 + }, + { + "epoch": 1.4444891160760438, + "grad_norm": 0.5982587933540344, + "learning_rate": 0.0001, + "loss": 1.5885, + "step": 12575 + }, + { + "epoch": 1.444603985985871, + "grad_norm": 0.6236387491226196, + "learning_rate": 0.0001, + "loss": 1.6722, + "step": 12576 + }, + { + "epoch": 1.444718855895698, + "grad_norm": 0.6040252447128296, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 12577 + }, + { + "epoch": 1.4448337258055253, + "grad_norm": 0.5864963531494141, + "learning_rate": 0.0001, + "loss": 1.4526, + "step": 12578 + }, + { + "epoch": 1.4449485957153523, + "grad_norm": 0.5800042152404785, + "learning_rate": 0.0001, + "loss": 1.3706, + "step": 12579 + }, + { + "epoch": 1.4450634656251795, + "grad_norm": 0.6318053007125854, + "learning_rate": 0.0001, + "loss": 1.5137, + "step": 12580 + }, + { + "epoch": 1.4451783355350067, + "grad_norm": 0.6059226989746094, + "learning_rate": 0.0001, + "loss": 1.3472, + "step": 12581 + }, + { + "epoch": 1.4452932054448338, + "grad_norm": 0.5987119078636169, + "learning_rate": 0.0001, + "loss": 1.4228, + "step": 12582 + }, + { + "epoch": 1.4454080753546608, + "grad_norm": 0.6017065048217773, + "learning_rate": 0.0001, + "loss": 1.3683, + "step": 12583 + }, + { + "epoch": 1.445522945264488, + "grad_norm": 0.5962862968444824, + "learning_rate": 0.0001, + "loss": 1.5466, + "step": 12584 + }, + { + "epoch": 1.4456378151743152, + "grad_norm": 0.6121211647987366, + "learning_rate": 0.0001, + "loss": 1.6054, + "step": 12585 + }, + { + "epoch": 1.4457526850841422, + "grad_norm": 0.5887848138809204, + "learning_rate": 0.0001, + "loss": 1.3906, + "step": 12586 + }, + { + "epoch": 1.4458675549939692, + "grad_norm": 0.5756409168243408, + "learning_rate": 0.0001, + "loss": 1.345, + "step": 12587 + }, + { + "epoch": 1.4459824249037965, + "grad_norm": 0.6187815070152283, + "learning_rate": 0.0001, + "loss": 1.2619, + "step": 12588 + }, + { + "epoch": 1.4460972948136237, + "grad_norm": 0.6031472086906433, + "learning_rate": 0.0001, + "loss": 1.4761, + "step": 12589 + }, + { + "epoch": 1.4462121647234507, + "grad_norm": 0.5487967729568481, + "learning_rate": 0.0001, + "loss": 1.258, + "step": 12590 + }, + { + "epoch": 1.4463270346332777, + "grad_norm": 0.5621272921562195, + "learning_rate": 0.0001, + "loss": 1.0927, + "step": 12591 + }, + { + "epoch": 1.446441904543105, + "grad_norm": 0.598263144493103, + "learning_rate": 0.0001, + "loss": 1.462, + "step": 12592 + }, + { + "epoch": 1.4465567744529322, + "grad_norm": 0.5940863490104675, + "learning_rate": 0.0001, + "loss": 1.426, + "step": 12593 + }, + { + "epoch": 1.4466716443627592, + "grad_norm": 0.6629210114479065, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 12594 + }, + { + "epoch": 1.4467865142725862, + "grad_norm": 0.5858151912689209, + "learning_rate": 0.0001, + "loss": 1.3906, + "step": 12595 + }, + { + "epoch": 1.4469013841824134, + "grad_norm": 0.6476069092750549, + "learning_rate": 0.0001, + "loss": 1.3094, + "step": 12596 + }, + { + "epoch": 1.4470162540922407, + "grad_norm": 0.6032461524009705, + "learning_rate": 0.0001, + "loss": 1.3134, + "step": 12597 + }, + { + "epoch": 1.4471311240020677, + "grad_norm": 0.591644823551178, + "learning_rate": 0.0001, + "loss": 1.3682, + "step": 12598 + }, + { + "epoch": 1.4472459939118947, + "grad_norm": 0.656358540058136, + "learning_rate": 0.0001, + "loss": 1.1411, + "step": 12599 + }, + { + "epoch": 1.447360863821722, + "grad_norm": 0.6357599496841431, + "learning_rate": 0.0001, + "loss": 1.5471, + "step": 12600 + }, + { + "epoch": 1.4474757337315491, + "grad_norm": 0.5842412710189819, + "learning_rate": 0.0001, + "loss": 1.4944, + "step": 12601 + }, + { + "epoch": 1.4475906036413762, + "grad_norm": 0.580599308013916, + "learning_rate": 0.0001, + "loss": 1.5245, + "step": 12602 + }, + { + "epoch": 1.4477054735512032, + "grad_norm": 0.6119430661201477, + "learning_rate": 0.0001, + "loss": 1.4649, + "step": 12603 + }, + { + "epoch": 1.4478203434610304, + "grad_norm": 0.5994784832000732, + "learning_rate": 0.0001, + "loss": 1.4999, + "step": 12604 + }, + { + "epoch": 1.4479352133708576, + "grad_norm": 0.5609249472618103, + "learning_rate": 0.0001, + "loss": 1.498, + "step": 12605 + }, + { + "epoch": 1.4480500832806846, + "grad_norm": 0.5915836095809937, + "learning_rate": 0.0001, + "loss": 1.3597, + "step": 12606 + }, + { + "epoch": 1.4481649531905116, + "grad_norm": 0.6126052141189575, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 12607 + }, + { + "epoch": 1.4482798231003389, + "grad_norm": 0.6269632577896118, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 12608 + }, + { + "epoch": 1.448394693010166, + "grad_norm": 0.5238544344902039, + "learning_rate": 0.0001, + "loss": 1.4742, + "step": 12609 + }, + { + "epoch": 1.448509562919993, + "grad_norm": 0.607177197933197, + "learning_rate": 0.0001, + "loss": 1.5231, + "step": 12610 + }, + { + "epoch": 1.4486244328298201, + "grad_norm": 0.5543795228004456, + "learning_rate": 0.0001, + "loss": 1.4268, + "step": 12611 + }, + { + "epoch": 1.4487393027396473, + "grad_norm": 0.6204040050506592, + "learning_rate": 0.0001, + "loss": 1.5602, + "step": 12612 + }, + { + "epoch": 1.4488541726494746, + "grad_norm": 0.5601338148117065, + "learning_rate": 0.0001, + "loss": 1.5038, + "step": 12613 + }, + { + "epoch": 1.4489690425593016, + "grad_norm": 0.5878952145576477, + "learning_rate": 0.0001, + "loss": 1.5171, + "step": 12614 + }, + { + "epoch": 1.4490839124691286, + "grad_norm": 0.5514255166053772, + "learning_rate": 0.0001, + "loss": 1.3956, + "step": 12615 + }, + { + "epoch": 1.4491987823789558, + "grad_norm": 0.6429269313812256, + "learning_rate": 0.0001, + "loss": 1.3746, + "step": 12616 + }, + { + "epoch": 1.449313652288783, + "grad_norm": 0.5910865068435669, + "learning_rate": 0.0001, + "loss": 1.4062, + "step": 12617 + }, + { + "epoch": 1.44942852219861, + "grad_norm": 0.6003382205963135, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 12618 + }, + { + "epoch": 1.449543392108437, + "grad_norm": 0.5780314803123474, + "learning_rate": 0.0001, + "loss": 1.4629, + "step": 12619 + }, + { + "epoch": 1.4496582620182643, + "grad_norm": 0.5906615257263184, + "learning_rate": 0.0001, + "loss": 1.414, + "step": 12620 + }, + { + "epoch": 1.4497731319280915, + "grad_norm": 0.6168799996376038, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 12621 + }, + { + "epoch": 1.4498880018379185, + "grad_norm": 0.5736768245697021, + "learning_rate": 0.0001, + "loss": 1.5504, + "step": 12622 + }, + { + "epoch": 1.4500028717477456, + "grad_norm": 0.5871389508247375, + "learning_rate": 0.0001, + "loss": 1.488, + "step": 12623 + }, + { + "epoch": 1.4501177416575728, + "grad_norm": 0.6376186013221741, + "learning_rate": 0.0001, + "loss": 1.4979, + "step": 12624 + }, + { + "epoch": 1.4502326115674, + "grad_norm": 0.5403526425361633, + "learning_rate": 0.0001, + "loss": 1.2595, + "step": 12625 + }, + { + "epoch": 1.450347481477227, + "grad_norm": 0.5680791139602661, + "learning_rate": 0.0001, + "loss": 1.4837, + "step": 12626 + }, + { + "epoch": 1.450462351387054, + "grad_norm": 0.5831477642059326, + "learning_rate": 0.0001, + "loss": 1.5409, + "step": 12627 + }, + { + "epoch": 1.4505772212968813, + "grad_norm": 0.5702657103538513, + "learning_rate": 0.0001, + "loss": 1.5092, + "step": 12628 + }, + { + "epoch": 1.4506920912067085, + "grad_norm": 0.5878103971481323, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 12629 + }, + { + "epoch": 1.4508069611165355, + "grad_norm": 0.6267426609992981, + "learning_rate": 0.0001, + "loss": 1.3794, + "step": 12630 + }, + { + "epoch": 1.4509218310263625, + "grad_norm": 0.578369140625, + "learning_rate": 0.0001, + "loss": 1.5242, + "step": 12631 + }, + { + "epoch": 1.4510367009361897, + "grad_norm": 0.601699709892273, + "learning_rate": 0.0001, + "loss": 1.5346, + "step": 12632 + }, + { + "epoch": 1.451151570846017, + "grad_norm": 0.5736028552055359, + "learning_rate": 0.0001, + "loss": 1.3779, + "step": 12633 + }, + { + "epoch": 1.451266440755844, + "grad_norm": 0.5805512070655823, + "learning_rate": 0.0001, + "loss": 1.4821, + "step": 12634 + }, + { + "epoch": 1.451381310665671, + "grad_norm": 0.6317859292030334, + "learning_rate": 0.0001, + "loss": 1.4541, + "step": 12635 + }, + { + "epoch": 1.4514961805754982, + "grad_norm": 0.5671908259391785, + "learning_rate": 0.0001, + "loss": 1.3133, + "step": 12636 + }, + { + "epoch": 1.4516110504853255, + "grad_norm": 0.5456831455230713, + "learning_rate": 0.0001, + "loss": 1.463, + "step": 12637 + }, + { + "epoch": 1.4517259203951525, + "grad_norm": 0.6048300266265869, + "learning_rate": 0.0001, + "loss": 1.4596, + "step": 12638 + }, + { + "epoch": 1.4518407903049795, + "grad_norm": 0.5492627024650574, + "learning_rate": 0.0001, + "loss": 1.372, + "step": 12639 + }, + { + "epoch": 1.4519556602148067, + "grad_norm": 0.6605719923973083, + "learning_rate": 0.0001, + "loss": 1.4796, + "step": 12640 + }, + { + "epoch": 1.452070530124634, + "grad_norm": 0.6105278730392456, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 12641 + }, + { + "epoch": 1.452185400034461, + "grad_norm": 0.6340276598930359, + "learning_rate": 0.0001, + "loss": 1.5242, + "step": 12642 + }, + { + "epoch": 1.452300269944288, + "grad_norm": 0.5779246687889099, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 12643 + }, + { + "epoch": 1.4524151398541152, + "grad_norm": 0.5638899803161621, + "learning_rate": 0.0001, + "loss": 1.3403, + "step": 12644 + }, + { + "epoch": 1.4525300097639424, + "grad_norm": 0.5835622549057007, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 12645 + }, + { + "epoch": 1.4526448796737694, + "grad_norm": 0.5531151294708252, + "learning_rate": 0.0001, + "loss": 1.3624, + "step": 12646 + }, + { + "epoch": 1.4527597495835967, + "grad_norm": 0.6040196418762207, + "learning_rate": 0.0001, + "loss": 1.5721, + "step": 12647 + }, + { + "epoch": 1.4528746194934237, + "grad_norm": 0.6672126054763794, + "learning_rate": 0.0001, + "loss": 1.3766, + "step": 12648 + }, + { + "epoch": 1.452989489403251, + "grad_norm": 0.5610442757606506, + "learning_rate": 0.0001, + "loss": 1.3557, + "step": 12649 + }, + { + "epoch": 1.453104359313078, + "grad_norm": 0.5616607666015625, + "learning_rate": 0.0001, + "loss": 1.3134, + "step": 12650 + }, + { + "epoch": 1.4532192292229051, + "grad_norm": 0.5792796015739441, + "learning_rate": 0.0001, + "loss": 1.7026, + "step": 12651 + }, + { + "epoch": 1.4533340991327321, + "grad_norm": 0.5875088572502136, + "learning_rate": 0.0001, + "loss": 1.6033, + "step": 12652 + }, + { + "epoch": 1.4534489690425594, + "grad_norm": 0.5907067060470581, + "learning_rate": 0.0001, + "loss": 1.5317, + "step": 12653 + }, + { + "epoch": 1.4535638389523864, + "grad_norm": 0.5939152836799622, + "learning_rate": 0.0001, + "loss": 1.4968, + "step": 12654 + }, + { + "epoch": 1.4536787088622136, + "grad_norm": 0.585138201713562, + "learning_rate": 0.0001, + "loss": 1.4367, + "step": 12655 + }, + { + "epoch": 1.4537935787720406, + "grad_norm": 0.634059727191925, + "learning_rate": 0.0001, + "loss": 1.3877, + "step": 12656 + }, + { + "epoch": 1.4539084486818679, + "grad_norm": 0.5916883945465088, + "learning_rate": 0.0001, + "loss": 1.455, + "step": 12657 + }, + { + "epoch": 1.4540233185916949, + "grad_norm": 0.6057246923446655, + "learning_rate": 0.0001, + "loss": 1.539, + "step": 12658 + }, + { + "epoch": 1.454138188501522, + "grad_norm": 0.5644753575325012, + "learning_rate": 0.0001, + "loss": 1.4298, + "step": 12659 + }, + { + "epoch": 1.454253058411349, + "grad_norm": 0.6254203915596008, + "learning_rate": 0.0001, + "loss": 1.501, + "step": 12660 + }, + { + "epoch": 1.4543679283211763, + "grad_norm": 0.6106442213058472, + "learning_rate": 0.0001, + "loss": 1.5069, + "step": 12661 + }, + { + "epoch": 1.4544827982310033, + "grad_norm": 0.5752881765365601, + "learning_rate": 0.0001, + "loss": 1.4354, + "step": 12662 + }, + { + "epoch": 1.4545976681408306, + "grad_norm": 0.6398709416389465, + "learning_rate": 0.0001, + "loss": 1.3857, + "step": 12663 + }, + { + "epoch": 1.4547125380506576, + "grad_norm": 0.6222221255302429, + "learning_rate": 0.0001, + "loss": 1.3953, + "step": 12664 + }, + { + "epoch": 1.4548274079604848, + "grad_norm": 0.5810672044754028, + "learning_rate": 0.0001, + "loss": 1.4166, + "step": 12665 + }, + { + "epoch": 1.4549422778703118, + "grad_norm": 0.5670533180236816, + "learning_rate": 0.0001, + "loss": 1.3454, + "step": 12666 + }, + { + "epoch": 1.455057147780139, + "grad_norm": 0.6999189853668213, + "learning_rate": 0.0001, + "loss": 1.6628, + "step": 12667 + }, + { + "epoch": 1.455172017689966, + "grad_norm": 0.6700528860092163, + "learning_rate": 0.0001, + "loss": 1.3558, + "step": 12668 + }, + { + "epoch": 1.4552868875997933, + "grad_norm": 0.6250571012496948, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 12669 + }, + { + "epoch": 1.4554017575096203, + "grad_norm": 0.573645293712616, + "learning_rate": 0.0001, + "loss": 1.4697, + "step": 12670 + }, + { + "epoch": 1.4555166274194475, + "grad_norm": 0.5824663043022156, + "learning_rate": 0.0001, + "loss": 1.3154, + "step": 12671 + }, + { + "epoch": 1.4556314973292745, + "grad_norm": 0.5397822260856628, + "learning_rate": 0.0001, + "loss": 1.3799, + "step": 12672 + }, + { + "epoch": 1.4557463672391018, + "grad_norm": 0.6227380633354187, + "learning_rate": 0.0001, + "loss": 1.5445, + "step": 12673 + }, + { + "epoch": 1.4558612371489288, + "grad_norm": 0.5699251294136047, + "learning_rate": 0.0001, + "loss": 1.272, + "step": 12674 + }, + { + "epoch": 1.455976107058756, + "grad_norm": 0.58524489402771, + "learning_rate": 0.0001, + "loss": 1.3419, + "step": 12675 + }, + { + "epoch": 1.456090976968583, + "grad_norm": 0.6593015789985657, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 12676 + }, + { + "epoch": 1.4562058468784103, + "grad_norm": 0.6179425120353699, + "learning_rate": 0.0001, + "loss": 1.3337, + "step": 12677 + }, + { + "epoch": 1.4563207167882373, + "grad_norm": 0.591990053653717, + "learning_rate": 0.0001, + "loss": 1.5066, + "step": 12678 + }, + { + "epoch": 1.4564355866980645, + "grad_norm": 0.5718491673469543, + "learning_rate": 0.0001, + "loss": 1.5063, + "step": 12679 + }, + { + "epoch": 1.4565504566078915, + "grad_norm": 0.609443187713623, + "learning_rate": 0.0001, + "loss": 1.4913, + "step": 12680 + }, + { + "epoch": 1.4566653265177187, + "grad_norm": 0.5620328187942505, + "learning_rate": 0.0001, + "loss": 1.4505, + "step": 12681 + }, + { + "epoch": 1.4567801964275457, + "grad_norm": 0.6245093941688538, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 12682 + }, + { + "epoch": 1.456895066337373, + "grad_norm": 0.5638138651847839, + "learning_rate": 0.0001, + "loss": 1.3709, + "step": 12683 + }, + { + "epoch": 1.4570099362472, + "grad_norm": 0.6003065705299377, + "learning_rate": 0.0001, + "loss": 1.4879, + "step": 12684 + }, + { + "epoch": 1.4571248061570272, + "grad_norm": 0.6570206880569458, + "learning_rate": 0.0001, + "loss": 1.3938, + "step": 12685 + }, + { + "epoch": 1.4572396760668542, + "grad_norm": 0.6485840678215027, + "learning_rate": 0.0001, + "loss": 1.3326, + "step": 12686 + }, + { + "epoch": 1.4573545459766815, + "grad_norm": 0.603420615196228, + "learning_rate": 0.0001, + "loss": 1.4235, + "step": 12687 + }, + { + "epoch": 1.4574694158865085, + "grad_norm": 0.6239102482795715, + "learning_rate": 0.0001, + "loss": 1.3894, + "step": 12688 + }, + { + "epoch": 1.4575842857963357, + "grad_norm": 0.6117807626724243, + "learning_rate": 0.0001, + "loss": 1.333, + "step": 12689 + }, + { + "epoch": 1.4576991557061627, + "grad_norm": 0.6549608707427979, + "learning_rate": 0.0001, + "loss": 1.4666, + "step": 12690 + }, + { + "epoch": 1.45781402561599, + "grad_norm": 0.6886609196662903, + "learning_rate": 0.0001, + "loss": 1.6555, + "step": 12691 + }, + { + "epoch": 1.457928895525817, + "grad_norm": 0.6125257611274719, + "learning_rate": 0.0001, + "loss": 1.3924, + "step": 12692 + }, + { + "epoch": 1.4580437654356442, + "grad_norm": 0.5616730451583862, + "learning_rate": 0.0001, + "loss": 1.4452, + "step": 12693 + }, + { + "epoch": 1.4581586353454712, + "grad_norm": 0.6304099559783936, + "learning_rate": 0.0001, + "loss": 1.556, + "step": 12694 + }, + { + "epoch": 1.4582735052552984, + "grad_norm": 0.6536027789115906, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 12695 + }, + { + "epoch": 1.4583883751651254, + "grad_norm": 0.6076638698577881, + "learning_rate": 0.0001, + "loss": 1.4965, + "step": 12696 + }, + { + "epoch": 1.4585032450749527, + "grad_norm": 0.6095083355903625, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 12697 + }, + { + "epoch": 1.4586181149847797, + "grad_norm": 0.5856631994247437, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 12698 + }, + { + "epoch": 1.458732984894607, + "grad_norm": 0.582143247127533, + "learning_rate": 0.0001, + "loss": 1.4553, + "step": 12699 + }, + { + "epoch": 1.458847854804434, + "grad_norm": 0.6474102735519409, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 12700 + }, + { + "epoch": 1.4589627247142611, + "grad_norm": 0.6195456981658936, + "learning_rate": 0.0001, + "loss": 1.4974, + "step": 12701 + }, + { + "epoch": 1.4590775946240881, + "grad_norm": 0.5719879269599915, + "learning_rate": 0.0001, + "loss": 1.3453, + "step": 12702 + }, + { + "epoch": 1.4591924645339154, + "grad_norm": 0.5978543162345886, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 12703 + }, + { + "epoch": 1.4593073344437424, + "grad_norm": 0.5765978097915649, + "learning_rate": 0.0001, + "loss": 1.2654, + "step": 12704 + }, + { + "epoch": 1.4594222043535696, + "grad_norm": 0.6159289479255676, + "learning_rate": 0.0001, + "loss": 1.3155, + "step": 12705 + }, + { + "epoch": 1.4595370742633966, + "grad_norm": 0.6160826683044434, + "learning_rate": 0.0001, + "loss": 1.4765, + "step": 12706 + }, + { + "epoch": 1.4596519441732239, + "grad_norm": 0.5803123712539673, + "learning_rate": 0.0001, + "loss": 1.3856, + "step": 12707 + }, + { + "epoch": 1.4597668140830509, + "grad_norm": 0.6179252862930298, + "learning_rate": 0.0001, + "loss": 1.3149, + "step": 12708 + }, + { + "epoch": 1.459881683992878, + "grad_norm": 0.6384448409080505, + "learning_rate": 0.0001, + "loss": 1.2182, + "step": 12709 + }, + { + "epoch": 1.459996553902705, + "grad_norm": 0.5776406526565552, + "learning_rate": 0.0001, + "loss": 1.3238, + "step": 12710 + }, + { + "epoch": 1.4601114238125323, + "grad_norm": 0.6758487820625305, + "learning_rate": 0.0001, + "loss": 1.4016, + "step": 12711 + }, + { + "epoch": 1.4602262937223593, + "grad_norm": 0.7246870398521423, + "learning_rate": 0.0001, + "loss": 1.6412, + "step": 12712 + }, + { + "epoch": 1.4603411636321866, + "grad_norm": 0.6004265546798706, + "learning_rate": 0.0001, + "loss": 1.2532, + "step": 12713 + }, + { + "epoch": 1.4604560335420136, + "grad_norm": 0.6100044846534729, + "learning_rate": 0.0001, + "loss": 1.4896, + "step": 12714 + }, + { + "epoch": 1.4605709034518408, + "grad_norm": 0.6924980282783508, + "learning_rate": 0.0001, + "loss": 1.7668, + "step": 12715 + }, + { + "epoch": 1.4606857733616678, + "grad_norm": 0.5812683701515198, + "learning_rate": 0.0001, + "loss": 1.4722, + "step": 12716 + }, + { + "epoch": 1.460800643271495, + "grad_norm": 0.6266067624092102, + "learning_rate": 0.0001, + "loss": 1.4486, + "step": 12717 + }, + { + "epoch": 1.4609155131813223, + "grad_norm": 0.5701542496681213, + "learning_rate": 0.0001, + "loss": 1.4599, + "step": 12718 + }, + { + "epoch": 1.4610303830911493, + "grad_norm": 0.6322028636932373, + "learning_rate": 0.0001, + "loss": 1.4134, + "step": 12719 + }, + { + "epoch": 1.4611452530009763, + "grad_norm": 0.6089679598808289, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 12720 + }, + { + "epoch": 1.4612601229108035, + "grad_norm": 0.5674778819084167, + "learning_rate": 0.0001, + "loss": 1.4329, + "step": 12721 + }, + { + "epoch": 1.4613749928206308, + "grad_norm": 0.5603479146957397, + "learning_rate": 0.0001, + "loss": 1.4397, + "step": 12722 + }, + { + "epoch": 1.4614898627304578, + "grad_norm": 0.5788118839263916, + "learning_rate": 0.0001, + "loss": 1.5433, + "step": 12723 + }, + { + "epoch": 1.4616047326402848, + "grad_norm": 0.5956910848617554, + "learning_rate": 0.0001, + "loss": 1.6042, + "step": 12724 + }, + { + "epoch": 1.461719602550112, + "grad_norm": 0.5729474425315857, + "learning_rate": 0.0001, + "loss": 1.4532, + "step": 12725 + }, + { + "epoch": 1.4618344724599392, + "grad_norm": 0.6295613646507263, + "learning_rate": 0.0001, + "loss": 1.6566, + "step": 12726 + }, + { + "epoch": 1.4619493423697663, + "grad_norm": 0.5930306911468506, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 12727 + }, + { + "epoch": 1.4620642122795933, + "grad_norm": 0.6040239930152893, + "learning_rate": 0.0001, + "loss": 1.3305, + "step": 12728 + }, + { + "epoch": 1.4621790821894205, + "grad_norm": 0.5989452004432678, + "learning_rate": 0.0001, + "loss": 1.4624, + "step": 12729 + }, + { + "epoch": 1.4622939520992477, + "grad_norm": 0.5778357982635498, + "learning_rate": 0.0001, + "loss": 1.5392, + "step": 12730 + }, + { + "epoch": 1.4624088220090747, + "grad_norm": 0.6063084602355957, + "learning_rate": 0.0001, + "loss": 1.3575, + "step": 12731 + }, + { + "epoch": 1.4625236919189017, + "grad_norm": 0.6218013763427734, + "learning_rate": 0.0001, + "loss": 1.2905, + "step": 12732 + }, + { + "epoch": 1.462638561828729, + "grad_norm": 0.6372556090354919, + "learning_rate": 0.0001, + "loss": 1.3767, + "step": 12733 + }, + { + "epoch": 1.4627534317385562, + "grad_norm": 0.624243438243866, + "learning_rate": 0.0001, + "loss": 1.3002, + "step": 12734 + }, + { + "epoch": 1.4628683016483832, + "grad_norm": 0.6607589721679688, + "learning_rate": 0.0001, + "loss": 1.3411, + "step": 12735 + }, + { + "epoch": 1.4629831715582102, + "grad_norm": 0.5998088121414185, + "learning_rate": 0.0001, + "loss": 1.3656, + "step": 12736 + }, + { + "epoch": 1.4630980414680375, + "grad_norm": 0.5681934356689453, + "learning_rate": 0.0001, + "loss": 1.3278, + "step": 12737 + }, + { + "epoch": 1.4632129113778647, + "grad_norm": 0.6242961883544922, + "learning_rate": 0.0001, + "loss": 1.7247, + "step": 12738 + }, + { + "epoch": 1.4633277812876917, + "grad_norm": 0.5514942407608032, + "learning_rate": 0.0001, + "loss": 1.6772, + "step": 12739 + }, + { + "epoch": 1.4634426511975187, + "grad_norm": 0.5704249739646912, + "learning_rate": 0.0001, + "loss": 1.2248, + "step": 12740 + }, + { + "epoch": 1.463557521107346, + "grad_norm": 0.6051421165466309, + "learning_rate": 0.0001, + "loss": 1.4941, + "step": 12741 + }, + { + "epoch": 1.4636723910171732, + "grad_norm": 0.59198397397995, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 12742 + }, + { + "epoch": 1.4637872609270002, + "grad_norm": 0.5700596570968628, + "learning_rate": 0.0001, + "loss": 1.2688, + "step": 12743 + }, + { + "epoch": 1.4639021308368272, + "grad_norm": 0.613609254360199, + "learning_rate": 0.0001, + "loss": 1.6539, + "step": 12744 + }, + { + "epoch": 1.4640170007466544, + "grad_norm": 0.6272452473640442, + "learning_rate": 0.0001, + "loss": 1.6001, + "step": 12745 + }, + { + "epoch": 1.4641318706564816, + "grad_norm": 0.5880221128463745, + "learning_rate": 0.0001, + "loss": 1.4157, + "step": 12746 + }, + { + "epoch": 1.4642467405663087, + "grad_norm": 0.6015984416007996, + "learning_rate": 0.0001, + "loss": 1.3987, + "step": 12747 + }, + { + "epoch": 1.4643616104761357, + "grad_norm": 0.6657059192657471, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 12748 + }, + { + "epoch": 1.464476480385963, + "grad_norm": 0.6863231062889099, + "learning_rate": 0.0001, + "loss": 1.3948, + "step": 12749 + }, + { + "epoch": 1.4645913502957901, + "grad_norm": 0.5913318395614624, + "learning_rate": 0.0001, + "loss": 1.412, + "step": 12750 + }, + { + "epoch": 1.4647062202056171, + "grad_norm": 0.5482379794120789, + "learning_rate": 0.0001, + "loss": 1.5154, + "step": 12751 + }, + { + "epoch": 1.4648210901154441, + "grad_norm": 0.5808722972869873, + "learning_rate": 0.0001, + "loss": 1.5977, + "step": 12752 + }, + { + "epoch": 1.4649359600252714, + "grad_norm": 0.553925633430481, + "learning_rate": 0.0001, + "loss": 1.2278, + "step": 12753 + }, + { + "epoch": 1.4650508299350986, + "grad_norm": 0.5533438920974731, + "learning_rate": 0.0001, + "loss": 1.401, + "step": 12754 + }, + { + "epoch": 1.4651656998449256, + "grad_norm": 0.6688501834869385, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 12755 + }, + { + "epoch": 1.4652805697547526, + "grad_norm": 0.6241793036460876, + "learning_rate": 0.0001, + "loss": 1.5635, + "step": 12756 + }, + { + "epoch": 1.4653954396645799, + "grad_norm": 0.6105805039405823, + "learning_rate": 0.0001, + "loss": 1.3178, + "step": 12757 + }, + { + "epoch": 1.465510309574407, + "grad_norm": 0.621698260307312, + "learning_rate": 0.0001, + "loss": 1.369, + "step": 12758 + }, + { + "epoch": 1.465625179484234, + "grad_norm": 0.6072282791137695, + "learning_rate": 0.0001, + "loss": 1.3693, + "step": 12759 + }, + { + "epoch": 1.465740049394061, + "grad_norm": 0.5882850885391235, + "learning_rate": 0.0001, + "loss": 1.4573, + "step": 12760 + }, + { + "epoch": 1.4658549193038883, + "grad_norm": 0.5509545207023621, + "learning_rate": 0.0001, + "loss": 1.4878, + "step": 12761 + }, + { + "epoch": 1.4659697892137156, + "grad_norm": 0.6017540693283081, + "learning_rate": 0.0001, + "loss": 1.2881, + "step": 12762 + }, + { + "epoch": 1.4660846591235426, + "grad_norm": 0.6178678870201111, + "learning_rate": 0.0001, + "loss": 1.4212, + "step": 12763 + }, + { + "epoch": 1.4661995290333696, + "grad_norm": 0.5732308626174927, + "learning_rate": 0.0001, + "loss": 1.4519, + "step": 12764 + }, + { + "epoch": 1.4663143989431968, + "grad_norm": 0.5819944739341736, + "learning_rate": 0.0001, + "loss": 1.4526, + "step": 12765 + }, + { + "epoch": 1.466429268853024, + "grad_norm": 0.6570119857788086, + "learning_rate": 0.0001, + "loss": 1.68, + "step": 12766 + }, + { + "epoch": 1.466544138762851, + "grad_norm": 0.6044562458992004, + "learning_rate": 0.0001, + "loss": 1.4818, + "step": 12767 + }, + { + "epoch": 1.466659008672678, + "grad_norm": 0.6516227126121521, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 12768 + }, + { + "epoch": 1.4667738785825053, + "grad_norm": 0.5833653211593628, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 12769 + }, + { + "epoch": 1.4668887484923325, + "grad_norm": 0.6204677820205688, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 12770 + }, + { + "epoch": 1.4670036184021595, + "grad_norm": 0.5666163563728333, + "learning_rate": 0.0001, + "loss": 1.4902, + "step": 12771 + }, + { + "epoch": 1.4671184883119865, + "grad_norm": 0.5615315437316895, + "learning_rate": 0.0001, + "loss": 1.3985, + "step": 12772 + }, + { + "epoch": 1.4672333582218138, + "grad_norm": 0.5774404406547546, + "learning_rate": 0.0001, + "loss": 1.2866, + "step": 12773 + }, + { + "epoch": 1.467348228131641, + "grad_norm": 0.6420813798904419, + "learning_rate": 0.0001, + "loss": 1.2587, + "step": 12774 + }, + { + "epoch": 1.467463098041468, + "grad_norm": 0.5721668601036072, + "learning_rate": 0.0001, + "loss": 1.4001, + "step": 12775 + }, + { + "epoch": 1.467577967951295, + "grad_norm": 0.5667369961738586, + "learning_rate": 0.0001, + "loss": 1.4447, + "step": 12776 + }, + { + "epoch": 1.4676928378611223, + "grad_norm": 0.5806662440299988, + "learning_rate": 0.0001, + "loss": 1.4093, + "step": 12777 + }, + { + "epoch": 1.4678077077709495, + "grad_norm": 0.5815864205360413, + "learning_rate": 0.0001, + "loss": 1.4261, + "step": 12778 + }, + { + "epoch": 1.4679225776807765, + "grad_norm": 0.5797282457351685, + "learning_rate": 0.0001, + "loss": 1.3684, + "step": 12779 + }, + { + "epoch": 1.4680374475906035, + "grad_norm": 0.686774492263794, + "learning_rate": 0.0001, + "loss": 1.6016, + "step": 12780 + }, + { + "epoch": 1.4681523175004307, + "grad_norm": 0.570772647857666, + "learning_rate": 0.0001, + "loss": 1.3664, + "step": 12781 + }, + { + "epoch": 1.468267187410258, + "grad_norm": 0.6441384553909302, + "learning_rate": 0.0001, + "loss": 1.5228, + "step": 12782 + }, + { + "epoch": 1.468382057320085, + "grad_norm": 0.6247745156288147, + "learning_rate": 0.0001, + "loss": 1.3402, + "step": 12783 + }, + { + "epoch": 1.468496927229912, + "grad_norm": 0.5949456095695496, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 12784 + }, + { + "epoch": 1.4686117971397392, + "grad_norm": 0.5714166760444641, + "learning_rate": 0.0001, + "loss": 1.4388, + "step": 12785 + }, + { + "epoch": 1.4687266670495664, + "grad_norm": 0.5946084856987, + "learning_rate": 0.0001, + "loss": 1.4548, + "step": 12786 + }, + { + "epoch": 1.4688415369593935, + "grad_norm": 0.607331395149231, + "learning_rate": 0.0001, + "loss": 1.3089, + "step": 12787 + }, + { + "epoch": 1.4689564068692207, + "grad_norm": 0.5990446209907532, + "learning_rate": 0.0001, + "loss": 1.4873, + "step": 12788 + }, + { + "epoch": 1.4690712767790477, + "grad_norm": 0.6370789408683777, + "learning_rate": 0.0001, + "loss": 1.5659, + "step": 12789 + }, + { + "epoch": 1.469186146688875, + "grad_norm": 0.6360231041908264, + "learning_rate": 0.0001, + "loss": 1.5793, + "step": 12790 + }, + { + "epoch": 1.469301016598702, + "grad_norm": 0.6127180457115173, + "learning_rate": 0.0001, + "loss": 1.5618, + "step": 12791 + }, + { + "epoch": 1.4694158865085292, + "grad_norm": 0.5970214009284973, + "learning_rate": 0.0001, + "loss": 1.2743, + "step": 12792 + }, + { + "epoch": 1.4695307564183562, + "grad_norm": 0.6505621075630188, + "learning_rate": 0.0001, + "loss": 1.7703, + "step": 12793 + }, + { + "epoch": 1.4696456263281834, + "grad_norm": 0.5832335352897644, + "learning_rate": 0.0001, + "loss": 1.3529, + "step": 12794 + }, + { + "epoch": 1.4697604962380104, + "grad_norm": 0.61517733335495, + "learning_rate": 0.0001, + "loss": 1.6536, + "step": 12795 + }, + { + "epoch": 1.4698753661478376, + "grad_norm": 0.5566610097885132, + "learning_rate": 0.0001, + "loss": 1.269, + "step": 12796 + }, + { + "epoch": 1.4699902360576647, + "grad_norm": 0.5564651489257812, + "learning_rate": 0.0001, + "loss": 1.2414, + "step": 12797 + }, + { + "epoch": 1.4701051059674919, + "grad_norm": 0.5723354816436768, + "learning_rate": 0.0001, + "loss": 1.4185, + "step": 12798 + }, + { + "epoch": 1.470219975877319, + "grad_norm": 0.6019946932792664, + "learning_rate": 0.0001, + "loss": 1.4871, + "step": 12799 + }, + { + "epoch": 1.4703348457871461, + "grad_norm": 0.6336812973022461, + "learning_rate": 0.0001, + "loss": 1.3159, + "step": 12800 + }, + { + "epoch": 1.4704497156969731, + "grad_norm": 0.602342426776886, + "learning_rate": 0.0001, + "loss": 1.4548, + "step": 12801 + }, + { + "epoch": 1.4705645856068004, + "grad_norm": 0.6818926334381104, + "learning_rate": 0.0001, + "loss": 1.4655, + "step": 12802 + }, + { + "epoch": 1.4706794555166274, + "grad_norm": 0.6180601716041565, + "learning_rate": 0.0001, + "loss": 1.457, + "step": 12803 + }, + { + "epoch": 1.4707943254264546, + "grad_norm": 0.6229212284088135, + "learning_rate": 0.0001, + "loss": 1.5556, + "step": 12804 + }, + { + "epoch": 1.4709091953362816, + "grad_norm": 0.6223541498184204, + "learning_rate": 0.0001, + "loss": 1.4241, + "step": 12805 + }, + { + "epoch": 1.4710240652461088, + "grad_norm": 0.5592056512832642, + "learning_rate": 0.0001, + "loss": 1.3271, + "step": 12806 + }, + { + "epoch": 1.4711389351559359, + "grad_norm": 0.569546639919281, + "learning_rate": 0.0001, + "loss": 1.4232, + "step": 12807 + }, + { + "epoch": 1.471253805065763, + "grad_norm": 0.5602309107780457, + "learning_rate": 0.0001, + "loss": 1.543, + "step": 12808 + }, + { + "epoch": 1.47136867497559, + "grad_norm": 0.7094202637672424, + "learning_rate": 0.0001, + "loss": 1.6989, + "step": 12809 + }, + { + "epoch": 1.4714835448854173, + "grad_norm": 0.6181465983390808, + "learning_rate": 0.0001, + "loss": 1.6225, + "step": 12810 + }, + { + "epoch": 1.4715984147952443, + "grad_norm": 0.5809864401817322, + "learning_rate": 0.0001, + "loss": 1.3377, + "step": 12811 + }, + { + "epoch": 1.4717132847050716, + "grad_norm": 0.5605827569961548, + "learning_rate": 0.0001, + "loss": 1.4811, + "step": 12812 + }, + { + "epoch": 1.4718281546148986, + "grad_norm": 0.5921470522880554, + "learning_rate": 0.0001, + "loss": 1.5827, + "step": 12813 + }, + { + "epoch": 1.4719430245247258, + "grad_norm": 0.5679218173027039, + "learning_rate": 0.0001, + "loss": 1.2956, + "step": 12814 + }, + { + "epoch": 1.4720578944345528, + "grad_norm": 0.6096241474151611, + "learning_rate": 0.0001, + "loss": 1.509, + "step": 12815 + }, + { + "epoch": 1.47217276434438, + "grad_norm": 0.5679489970207214, + "learning_rate": 0.0001, + "loss": 1.4101, + "step": 12816 + }, + { + "epoch": 1.472287634254207, + "grad_norm": 0.5891135931015015, + "learning_rate": 0.0001, + "loss": 1.4284, + "step": 12817 + }, + { + "epoch": 1.4724025041640343, + "grad_norm": 0.6344767212867737, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 12818 + }, + { + "epoch": 1.4725173740738613, + "grad_norm": 0.5692608952522278, + "learning_rate": 0.0001, + "loss": 1.3893, + "step": 12819 + }, + { + "epoch": 1.4726322439836885, + "grad_norm": 0.6075809001922607, + "learning_rate": 0.0001, + "loss": 1.391, + "step": 12820 + }, + { + "epoch": 1.4727471138935155, + "grad_norm": 0.5786142349243164, + "learning_rate": 0.0001, + "loss": 1.6002, + "step": 12821 + }, + { + "epoch": 1.4728619838033428, + "grad_norm": 0.5931715965270996, + "learning_rate": 0.0001, + "loss": 1.3525, + "step": 12822 + }, + { + "epoch": 1.4729768537131698, + "grad_norm": 0.5575855374336243, + "learning_rate": 0.0001, + "loss": 1.4526, + "step": 12823 + }, + { + "epoch": 1.473091723622997, + "grad_norm": 0.5985252857208252, + "learning_rate": 0.0001, + "loss": 1.4732, + "step": 12824 + }, + { + "epoch": 1.473206593532824, + "grad_norm": 0.5632007718086243, + "learning_rate": 0.0001, + "loss": 1.5949, + "step": 12825 + }, + { + "epoch": 1.4733214634426512, + "grad_norm": 0.5793201923370361, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 12826 + }, + { + "epoch": 1.4734363333524783, + "grad_norm": 0.5845635533332825, + "learning_rate": 0.0001, + "loss": 1.4624, + "step": 12827 + }, + { + "epoch": 1.4735512032623055, + "grad_norm": 0.6314181089401245, + "learning_rate": 0.0001, + "loss": 1.4328, + "step": 12828 + }, + { + "epoch": 1.4736660731721325, + "grad_norm": 0.6269012093544006, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 12829 + }, + { + "epoch": 1.4737809430819597, + "grad_norm": 0.6091486215591431, + "learning_rate": 0.0001, + "loss": 1.3224, + "step": 12830 + }, + { + "epoch": 1.4738958129917867, + "grad_norm": 0.6564414501190186, + "learning_rate": 0.0001, + "loss": 1.482, + "step": 12831 + }, + { + "epoch": 1.474010682901614, + "grad_norm": 0.5696612000465393, + "learning_rate": 0.0001, + "loss": 1.403, + "step": 12832 + }, + { + "epoch": 1.474125552811441, + "grad_norm": 0.612686276435852, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 12833 + }, + { + "epoch": 1.4742404227212682, + "grad_norm": 0.5725769996643066, + "learning_rate": 0.0001, + "loss": 1.474, + "step": 12834 + }, + { + "epoch": 1.4743552926310952, + "grad_norm": 0.6267582774162292, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 12835 + }, + { + "epoch": 1.4744701625409224, + "grad_norm": 0.6302061676979065, + "learning_rate": 0.0001, + "loss": 1.4283, + "step": 12836 + }, + { + "epoch": 1.4745850324507495, + "grad_norm": 0.5919556617736816, + "learning_rate": 0.0001, + "loss": 1.4403, + "step": 12837 + }, + { + "epoch": 1.4746999023605767, + "grad_norm": 0.6446247100830078, + "learning_rate": 0.0001, + "loss": 1.5849, + "step": 12838 + }, + { + "epoch": 1.4748147722704037, + "grad_norm": 0.6327805519104004, + "learning_rate": 0.0001, + "loss": 1.4543, + "step": 12839 + }, + { + "epoch": 1.474929642180231, + "grad_norm": 0.6169029474258423, + "learning_rate": 0.0001, + "loss": 1.5194, + "step": 12840 + }, + { + "epoch": 1.475044512090058, + "grad_norm": 0.576792299747467, + "learning_rate": 0.0001, + "loss": 1.2857, + "step": 12841 + }, + { + "epoch": 1.4751593819998852, + "grad_norm": 0.6225866675376892, + "learning_rate": 0.0001, + "loss": 1.5189, + "step": 12842 + }, + { + "epoch": 1.4752742519097122, + "grad_norm": 0.5920522212982178, + "learning_rate": 0.0001, + "loss": 1.4826, + "step": 12843 + }, + { + "epoch": 1.4753891218195394, + "grad_norm": 0.5702757835388184, + "learning_rate": 0.0001, + "loss": 1.4564, + "step": 12844 + }, + { + "epoch": 1.4755039917293664, + "grad_norm": 0.5631170272827148, + "learning_rate": 0.0001, + "loss": 1.3581, + "step": 12845 + }, + { + "epoch": 1.4756188616391936, + "grad_norm": 0.5859529376029968, + "learning_rate": 0.0001, + "loss": 1.5269, + "step": 12846 + }, + { + "epoch": 1.4757337315490207, + "grad_norm": 0.5922108888626099, + "learning_rate": 0.0001, + "loss": 1.4155, + "step": 12847 + }, + { + "epoch": 1.4758486014588479, + "grad_norm": 0.5865465998649597, + "learning_rate": 0.0001, + "loss": 1.4789, + "step": 12848 + }, + { + "epoch": 1.475963471368675, + "grad_norm": 0.5963805317878723, + "learning_rate": 0.0001, + "loss": 1.3117, + "step": 12849 + }, + { + "epoch": 1.4760783412785021, + "grad_norm": 0.6101550459861755, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 12850 + }, + { + "epoch": 1.4761932111883291, + "grad_norm": 0.5703292489051819, + "learning_rate": 0.0001, + "loss": 1.4169, + "step": 12851 + }, + { + "epoch": 1.4763080810981564, + "grad_norm": 0.726719856262207, + "learning_rate": 0.0001, + "loss": 1.3441, + "step": 12852 + }, + { + "epoch": 1.4764229510079834, + "grad_norm": 0.5899834036827087, + "learning_rate": 0.0001, + "loss": 1.3448, + "step": 12853 + }, + { + "epoch": 1.4765378209178106, + "grad_norm": 0.6098437905311584, + "learning_rate": 0.0001, + "loss": 1.5906, + "step": 12854 + }, + { + "epoch": 1.4766526908276378, + "grad_norm": 0.5887052416801453, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 12855 + }, + { + "epoch": 1.4767675607374648, + "grad_norm": 0.6153585910797119, + "learning_rate": 0.0001, + "loss": 1.4361, + "step": 12856 + }, + { + "epoch": 1.4768824306472919, + "grad_norm": 0.6315271854400635, + "learning_rate": 0.0001, + "loss": 1.3454, + "step": 12857 + }, + { + "epoch": 1.476997300557119, + "grad_norm": 0.6163341403007507, + "learning_rate": 0.0001, + "loss": 1.4341, + "step": 12858 + }, + { + "epoch": 1.4771121704669463, + "grad_norm": 0.6163727045059204, + "learning_rate": 0.0001, + "loss": 1.4261, + "step": 12859 + }, + { + "epoch": 1.4772270403767733, + "grad_norm": 0.5802782773971558, + "learning_rate": 0.0001, + "loss": 1.464, + "step": 12860 + }, + { + "epoch": 1.4773419102866003, + "grad_norm": 0.6903051733970642, + "learning_rate": 0.0001, + "loss": 1.642, + "step": 12861 + }, + { + "epoch": 1.4774567801964276, + "grad_norm": 0.5828868746757507, + "learning_rate": 0.0001, + "loss": 1.381, + "step": 12862 + }, + { + "epoch": 1.4775716501062548, + "grad_norm": 0.5923581719398499, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 12863 + }, + { + "epoch": 1.4776865200160818, + "grad_norm": 0.606850802898407, + "learning_rate": 0.0001, + "loss": 1.5009, + "step": 12864 + }, + { + "epoch": 1.4778013899259088, + "grad_norm": 0.5898141264915466, + "learning_rate": 0.0001, + "loss": 1.2978, + "step": 12865 + }, + { + "epoch": 1.477916259835736, + "grad_norm": 0.5947832465171814, + "learning_rate": 0.0001, + "loss": 1.5102, + "step": 12866 + }, + { + "epoch": 1.4780311297455633, + "grad_norm": 0.5990186929702759, + "learning_rate": 0.0001, + "loss": 1.4618, + "step": 12867 + }, + { + "epoch": 1.4781459996553903, + "grad_norm": 0.6089809536933899, + "learning_rate": 0.0001, + "loss": 1.5126, + "step": 12868 + }, + { + "epoch": 1.4782608695652173, + "grad_norm": 0.5526347756385803, + "learning_rate": 0.0001, + "loss": 1.3861, + "step": 12869 + }, + { + "epoch": 1.4783757394750445, + "grad_norm": 0.631928026676178, + "learning_rate": 0.0001, + "loss": 1.4951, + "step": 12870 + }, + { + "epoch": 1.4784906093848718, + "grad_norm": 0.6184859275817871, + "learning_rate": 0.0001, + "loss": 1.3549, + "step": 12871 + }, + { + "epoch": 1.4786054792946988, + "grad_norm": 0.5659068822860718, + "learning_rate": 0.0001, + "loss": 1.4886, + "step": 12872 + }, + { + "epoch": 1.4787203492045258, + "grad_norm": 0.5731703639030457, + "learning_rate": 0.0001, + "loss": 1.1864, + "step": 12873 + }, + { + "epoch": 1.478835219114353, + "grad_norm": 0.5723096132278442, + "learning_rate": 0.0001, + "loss": 1.5536, + "step": 12874 + }, + { + "epoch": 1.4789500890241802, + "grad_norm": 0.5880361199378967, + "learning_rate": 0.0001, + "loss": 1.4777, + "step": 12875 + }, + { + "epoch": 1.4790649589340072, + "grad_norm": 0.6019386053085327, + "learning_rate": 0.0001, + "loss": 1.4355, + "step": 12876 + }, + { + "epoch": 1.4791798288438343, + "grad_norm": 0.6749945282936096, + "learning_rate": 0.0001, + "loss": 1.1855, + "step": 12877 + }, + { + "epoch": 1.4792946987536615, + "grad_norm": 0.5709156394004822, + "learning_rate": 0.0001, + "loss": 1.4813, + "step": 12878 + }, + { + "epoch": 1.4794095686634887, + "grad_norm": 0.604844331741333, + "learning_rate": 0.0001, + "loss": 1.487, + "step": 12879 + }, + { + "epoch": 1.4795244385733157, + "grad_norm": 0.608182966709137, + "learning_rate": 0.0001, + "loss": 1.3741, + "step": 12880 + }, + { + "epoch": 1.4796393084831427, + "grad_norm": 0.5870488882064819, + "learning_rate": 0.0001, + "loss": 1.3744, + "step": 12881 + }, + { + "epoch": 1.47975417839297, + "grad_norm": 0.6059988737106323, + "learning_rate": 0.0001, + "loss": 1.3412, + "step": 12882 + }, + { + "epoch": 1.4798690483027972, + "grad_norm": 0.6506321430206299, + "learning_rate": 0.0001, + "loss": 1.6666, + "step": 12883 + }, + { + "epoch": 1.4799839182126242, + "grad_norm": 0.5870694518089294, + "learning_rate": 0.0001, + "loss": 1.3905, + "step": 12884 + }, + { + "epoch": 1.4800987881224512, + "grad_norm": 0.5936076641082764, + "learning_rate": 0.0001, + "loss": 1.5622, + "step": 12885 + }, + { + "epoch": 1.4802136580322784, + "grad_norm": 0.6202870607376099, + "learning_rate": 0.0001, + "loss": 1.4999, + "step": 12886 + }, + { + "epoch": 1.4803285279421057, + "grad_norm": 0.5941158533096313, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 12887 + }, + { + "epoch": 1.4804433978519327, + "grad_norm": 0.5812159180641174, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 12888 + }, + { + "epoch": 1.4805582677617597, + "grad_norm": 0.6922297477722168, + "learning_rate": 0.0001, + "loss": 1.6634, + "step": 12889 + }, + { + "epoch": 1.480673137671587, + "grad_norm": 0.6124345064163208, + "learning_rate": 0.0001, + "loss": 1.3621, + "step": 12890 + }, + { + "epoch": 1.4807880075814142, + "grad_norm": 0.5976356267929077, + "learning_rate": 0.0001, + "loss": 1.4017, + "step": 12891 + }, + { + "epoch": 1.4809028774912412, + "grad_norm": 0.6526122689247131, + "learning_rate": 0.0001, + "loss": 1.2287, + "step": 12892 + }, + { + "epoch": 1.4810177474010682, + "grad_norm": 0.5740790367126465, + "learning_rate": 0.0001, + "loss": 1.4163, + "step": 12893 + }, + { + "epoch": 1.4811326173108954, + "grad_norm": 0.6110181212425232, + "learning_rate": 0.0001, + "loss": 1.3558, + "step": 12894 + }, + { + "epoch": 1.4812474872207226, + "grad_norm": 0.62778240442276, + "learning_rate": 0.0001, + "loss": 1.4462, + "step": 12895 + }, + { + "epoch": 1.4813623571305496, + "grad_norm": 0.6331201195716858, + "learning_rate": 0.0001, + "loss": 1.4694, + "step": 12896 + }, + { + "epoch": 1.4814772270403767, + "grad_norm": 0.647627055644989, + "learning_rate": 0.0001, + "loss": 1.3772, + "step": 12897 + }, + { + "epoch": 1.4815920969502039, + "grad_norm": 0.5829721093177795, + "learning_rate": 0.0001, + "loss": 1.4, + "step": 12898 + }, + { + "epoch": 1.4817069668600311, + "grad_norm": 0.6361407041549683, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 12899 + }, + { + "epoch": 1.4818218367698581, + "grad_norm": 0.5852049589157104, + "learning_rate": 0.0001, + "loss": 1.3346, + "step": 12900 + }, + { + "epoch": 1.4819367066796851, + "grad_norm": 0.5944021344184875, + "learning_rate": 0.0001, + "loss": 1.5299, + "step": 12901 + }, + { + "epoch": 1.4820515765895124, + "grad_norm": 0.5802842378616333, + "learning_rate": 0.0001, + "loss": 1.4209, + "step": 12902 + }, + { + "epoch": 1.4821664464993396, + "grad_norm": 0.582327663898468, + "learning_rate": 0.0001, + "loss": 1.2857, + "step": 12903 + }, + { + "epoch": 1.4822813164091666, + "grad_norm": 0.6188213229179382, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 12904 + }, + { + "epoch": 1.4823961863189936, + "grad_norm": 0.6501442790031433, + "learning_rate": 0.0001, + "loss": 1.3503, + "step": 12905 + }, + { + "epoch": 1.4825110562288208, + "grad_norm": 0.6042030453681946, + "learning_rate": 0.0001, + "loss": 1.3655, + "step": 12906 + }, + { + "epoch": 1.482625926138648, + "grad_norm": 0.6173478960990906, + "learning_rate": 0.0001, + "loss": 1.519, + "step": 12907 + }, + { + "epoch": 1.482740796048475, + "grad_norm": 0.6603444814682007, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 12908 + }, + { + "epoch": 1.482855665958302, + "grad_norm": 0.5978038311004639, + "learning_rate": 0.0001, + "loss": 1.2935, + "step": 12909 + }, + { + "epoch": 1.4829705358681293, + "grad_norm": 0.6178515553474426, + "learning_rate": 0.0001, + "loss": 1.4831, + "step": 12910 + }, + { + "epoch": 1.4830854057779566, + "grad_norm": 0.6077051758766174, + "learning_rate": 0.0001, + "loss": 1.4124, + "step": 12911 + }, + { + "epoch": 1.4832002756877836, + "grad_norm": 0.6282074451446533, + "learning_rate": 0.0001, + "loss": 1.5461, + "step": 12912 + }, + { + "epoch": 1.4833151455976106, + "grad_norm": 0.5881144404411316, + "learning_rate": 0.0001, + "loss": 1.4419, + "step": 12913 + }, + { + "epoch": 1.4834300155074378, + "grad_norm": 0.5621463656425476, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 12914 + }, + { + "epoch": 1.483544885417265, + "grad_norm": 0.6100106239318848, + "learning_rate": 0.0001, + "loss": 1.5192, + "step": 12915 + }, + { + "epoch": 1.483659755327092, + "grad_norm": 0.5708780288696289, + "learning_rate": 0.0001, + "loss": 1.3741, + "step": 12916 + }, + { + "epoch": 1.483774625236919, + "grad_norm": 0.6632856726646423, + "learning_rate": 0.0001, + "loss": 1.6289, + "step": 12917 + }, + { + "epoch": 1.4838894951467463, + "grad_norm": 0.598359227180481, + "learning_rate": 0.0001, + "loss": 1.3337, + "step": 12918 + }, + { + "epoch": 1.4840043650565735, + "grad_norm": 0.6106839179992676, + "learning_rate": 0.0001, + "loss": 1.4881, + "step": 12919 + }, + { + "epoch": 1.4841192349664005, + "grad_norm": 0.6464507579803467, + "learning_rate": 0.0001, + "loss": 1.4999, + "step": 12920 + }, + { + "epoch": 1.4842341048762275, + "grad_norm": 0.6744695901870728, + "learning_rate": 0.0001, + "loss": 1.6106, + "step": 12921 + }, + { + "epoch": 1.4843489747860548, + "grad_norm": 0.6888026595115662, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 12922 + }, + { + "epoch": 1.484463844695882, + "grad_norm": 0.5786687731742859, + "learning_rate": 0.0001, + "loss": 1.4022, + "step": 12923 + }, + { + "epoch": 1.484578714605709, + "grad_norm": 0.5795042514801025, + "learning_rate": 0.0001, + "loss": 1.4461, + "step": 12924 + }, + { + "epoch": 1.4846935845155362, + "grad_norm": 0.5988849997520447, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 12925 + }, + { + "epoch": 1.4848084544253632, + "grad_norm": 0.5869229435920715, + "learning_rate": 0.0001, + "loss": 1.3634, + "step": 12926 + }, + { + "epoch": 1.4849233243351905, + "grad_norm": 0.6069849133491516, + "learning_rate": 0.0001, + "loss": 1.4881, + "step": 12927 + }, + { + "epoch": 1.4850381942450175, + "grad_norm": 0.6400530338287354, + "learning_rate": 0.0001, + "loss": 1.6889, + "step": 12928 + }, + { + "epoch": 1.4851530641548447, + "grad_norm": 0.605957567691803, + "learning_rate": 0.0001, + "loss": 1.3198, + "step": 12929 + }, + { + "epoch": 1.4852679340646717, + "grad_norm": 0.6827652454376221, + "learning_rate": 0.0001, + "loss": 1.6951, + "step": 12930 + }, + { + "epoch": 1.485382803974499, + "grad_norm": 0.6778180003166199, + "learning_rate": 0.0001, + "loss": 1.4847, + "step": 12931 + }, + { + "epoch": 1.485497673884326, + "grad_norm": 0.5894115567207336, + "learning_rate": 0.0001, + "loss": 1.5448, + "step": 12932 + }, + { + "epoch": 1.4856125437941532, + "grad_norm": 0.5633677244186401, + "learning_rate": 0.0001, + "loss": 1.4671, + "step": 12933 + }, + { + "epoch": 1.4857274137039802, + "grad_norm": 0.5746277570724487, + "learning_rate": 0.0001, + "loss": 1.3921, + "step": 12934 + }, + { + "epoch": 1.4858422836138074, + "grad_norm": 0.5946472883224487, + "learning_rate": 0.0001, + "loss": 1.4943, + "step": 12935 + }, + { + "epoch": 1.4859571535236344, + "grad_norm": 0.6017822027206421, + "learning_rate": 0.0001, + "loss": 1.3947, + "step": 12936 + }, + { + "epoch": 1.4860720234334617, + "grad_norm": 0.5574742555618286, + "learning_rate": 0.0001, + "loss": 1.3781, + "step": 12937 + }, + { + "epoch": 1.4861868933432887, + "grad_norm": 0.6060687899589539, + "learning_rate": 0.0001, + "loss": 1.6548, + "step": 12938 + }, + { + "epoch": 1.486301763253116, + "grad_norm": 0.5431711673736572, + "learning_rate": 0.0001, + "loss": 1.4442, + "step": 12939 + }, + { + "epoch": 1.486416633162943, + "grad_norm": 0.5980778336524963, + "learning_rate": 0.0001, + "loss": 1.4474, + "step": 12940 + }, + { + "epoch": 1.4865315030727702, + "grad_norm": 0.5813719034194946, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 12941 + }, + { + "epoch": 1.4866463729825972, + "grad_norm": 0.5789249539375305, + "learning_rate": 0.0001, + "loss": 1.4672, + "step": 12942 + }, + { + "epoch": 1.4867612428924244, + "grad_norm": 0.592308759689331, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 12943 + }, + { + "epoch": 1.4868761128022514, + "grad_norm": 0.5930988192558289, + "learning_rate": 0.0001, + "loss": 1.3819, + "step": 12944 + }, + { + "epoch": 1.4869909827120786, + "grad_norm": 0.5978574752807617, + "learning_rate": 0.0001, + "loss": 1.3716, + "step": 12945 + }, + { + "epoch": 1.4871058526219056, + "grad_norm": 0.554966151714325, + "learning_rate": 0.0001, + "loss": 1.2418, + "step": 12946 + }, + { + "epoch": 1.4872207225317329, + "grad_norm": 0.6341855525970459, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 12947 + }, + { + "epoch": 1.4873355924415599, + "grad_norm": 0.5823864340782166, + "learning_rate": 0.0001, + "loss": 1.461, + "step": 12948 + }, + { + "epoch": 1.4874504623513871, + "grad_norm": 0.5773583650588989, + "learning_rate": 0.0001, + "loss": 1.4707, + "step": 12949 + }, + { + "epoch": 1.4875653322612141, + "grad_norm": 0.6104335188865662, + "learning_rate": 0.0001, + "loss": 1.2777, + "step": 12950 + }, + { + "epoch": 1.4876802021710414, + "grad_norm": 0.5461379885673523, + "learning_rate": 0.0001, + "loss": 1.3399, + "step": 12951 + }, + { + "epoch": 1.4877950720808684, + "grad_norm": 0.5723500847816467, + "learning_rate": 0.0001, + "loss": 1.3066, + "step": 12952 + }, + { + "epoch": 1.4879099419906956, + "grad_norm": 0.6174570918083191, + "learning_rate": 0.0001, + "loss": 1.6148, + "step": 12953 + }, + { + "epoch": 1.4880248119005226, + "grad_norm": 0.5926865935325623, + "learning_rate": 0.0001, + "loss": 1.5758, + "step": 12954 + }, + { + "epoch": 1.4881396818103498, + "grad_norm": 0.593763530254364, + "learning_rate": 0.0001, + "loss": 1.4821, + "step": 12955 + }, + { + "epoch": 1.4882545517201768, + "grad_norm": 0.6111153364181519, + "learning_rate": 0.0001, + "loss": 1.5675, + "step": 12956 + }, + { + "epoch": 1.488369421630004, + "grad_norm": 0.6312853693962097, + "learning_rate": 0.0001, + "loss": 1.4476, + "step": 12957 + }, + { + "epoch": 1.488484291539831, + "grad_norm": 0.6247866153717041, + "learning_rate": 0.0001, + "loss": 1.3772, + "step": 12958 + }, + { + "epoch": 1.4885991614496583, + "grad_norm": 0.5472710728645325, + "learning_rate": 0.0001, + "loss": 1.1579, + "step": 12959 + }, + { + "epoch": 1.4887140313594853, + "grad_norm": 0.5936057567596436, + "learning_rate": 0.0001, + "loss": 1.4158, + "step": 12960 + }, + { + "epoch": 1.4888289012693126, + "grad_norm": 0.5668746829032898, + "learning_rate": 0.0001, + "loss": 1.5599, + "step": 12961 + }, + { + "epoch": 1.4889437711791396, + "grad_norm": 0.605015218257904, + "learning_rate": 0.0001, + "loss": 1.7382, + "step": 12962 + }, + { + "epoch": 1.4890586410889668, + "grad_norm": 0.6417834162712097, + "learning_rate": 0.0001, + "loss": 1.5262, + "step": 12963 + }, + { + "epoch": 1.4891735109987938, + "grad_norm": 0.6083729267120361, + "learning_rate": 0.0001, + "loss": 1.4273, + "step": 12964 + }, + { + "epoch": 1.489288380908621, + "grad_norm": 0.6126262545585632, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 12965 + }, + { + "epoch": 1.489403250818448, + "grad_norm": 0.6185763478279114, + "learning_rate": 0.0001, + "loss": 1.3363, + "step": 12966 + }, + { + "epoch": 1.4895181207282753, + "grad_norm": 0.6177452802658081, + "learning_rate": 0.0001, + "loss": 1.2466, + "step": 12967 + }, + { + "epoch": 1.4896329906381023, + "grad_norm": 0.6014622449874878, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 12968 + }, + { + "epoch": 1.4897478605479295, + "grad_norm": 0.6131553053855896, + "learning_rate": 0.0001, + "loss": 1.4818, + "step": 12969 + }, + { + "epoch": 1.4898627304577565, + "grad_norm": 0.6212412118911743, + "learning_rate": 0.0001, + "loss": 1.5509, + "step": 12970 + }, + { + "epoch": 1.4899776003675838, + "grad_norm": 0.595173180103302, + "learning_rate": 0.0001, + "loss": 1.4124, + "step": 12971 + }, + { + "epoch": 1.4900924702774108, + "grad_norm": 0.623760998249054, + "learning_rate": 0.0001, + "loss": 1.4035, + "step": 12972 + }, + { + "epoch": 1.490207340187238, + "grad_norm": 0.5903745889663696, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 12973 + }, + { + "epoch": 1.490322210097065, + "grad_norm": 0.5988485813140869, + "learning_rate": 0.0001, + "loss": 1.505, + "step": 12974 + }, + { + "epoch": 1.4904370800068922, + "grad_norm": 0.592363178730011, + "learning_rate": 0.0001, + "loss": 1.3643, + "step": 12975 + }, + { + "epoch": 1.4905519499167192, + "grad_norm": 0.6512883901596069, + "learning_rate": 0.0001, + "loss": 1.3463, + "step": 12976 + }, + { + "epoch": 1.4906668198265465, + "grad_norm": 0.5865827798843384, + "learning_rate": 0.0001, + "loss": 1.3116, + "step": 12977 + }, + { + "epoch": 1.4907816897363735, + "grad_norm": 0.5729326605796814, + "learning_rate": 0.0001, + "loss": 1.3622, + "step": 12978 + }, + { + "epoch": 1.4908965596462007, + "grad_norm": 0.599138617515564, + "learning_rate": 0.0001, + "loss": 1.5096, + "step": 12979 + }, + { + "epoch": 1.4910114295560277, + "grad_norm": 0.6441994905471802, + "learning_rate": 0.0001, + "loss": 1.5755, + "step": 12980 + }, + { + "epoch": 1.491126299465855, + "grad_norm": 0.5867522358894348, + "learning_rate": 0.0001, + "loss": 1.4342, + "step": 12981 + }, + { + "epoch": 1.491241169375682, + "grad_norm": 0.6497024297714233, + "learning_rate": 0.0001, + "loss": 1.5281, + "step": 12982 + }, + { + "epoch": 1.4913560392855092, + "grad_norm": 0.6006242632865906, + "learning_rate": 0.0001, + "loss": 1.5463, + "step": 12983 + }, + { + "epoch": 1.4914709091953362, + "grad_norm": 0.5676483511924744, + "learning_rate": 0.0001, + "loss": 1.2727, + "step": 12984 + }, + { + "epoch": 1.4915857791051634, + "grad_norm": 0.6318432092666626, + "learning_rate": 0.0001, + "loss": 1.56, + "step": 12985 + }, + { + "epoch": 1.4917006490149904, + "grad_norm": 0.5977509021759033, + "learning_rate": 0.0001, + "loss": 1.289, + "step": 12986 + }, + { + "epoch": 1.4918155189248177, + "grad_norm": 0.6188353896141052, + "learning_rate": 0.0001, + "loss": 1.3374, + "step": 12987 + }, + { + "epoch": 1.4919303888346447, + "grad_norm": 0.5568304657936096, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 12988 + }, + { + "epoch": 1.492045258744472, + "grad_norm": 0.5960107445716858, + "learning_rate": 0.0001, + "loss": 1.6009, + "step": 12989 + }, + { + "epoch": 1.492160128654299, + "grad_norm": 0.5843378305435181, + "learning_rate": 0.0001, + "loss": 1.4434, + "step": 12990 + }, + { + "epoch": 1.4922749985641262, + "grad_norm": 0.5970795154571533, + "learning_rate": 0.0001, + "loss": 1.6927, + "step": 12991 + }, + { + "epoch": 1.4923898684739534, + "grad_norm": 0.5971647500991821, + "learning_rate": 0.0001, + "loss": 1.5712, + "step": 12992 + }, + { + "epoch": 1.4925047383837804, + "grad_norm": 0.5765950679779053, + "learning_rate": 0.0001, + "loss": 1.4302, + "step": 12993 + }, + { + "epoch": 1.4926196082936074, + "grad_norm": 0.6342061758041382, + "learning_rate": 0.0001, + "loss": 1.4708, + "step": 12994 + }, + { + "epoch": 1.4927344782034346, + "grad_norm": 0.5808407664299011, + "learning_rate": 0.0001, + "loss": 1.4504, + "step": 12995 + }, + { + "epoch": 1.4928493481132619, + "grad_norm": 0.7014811635017395, + "learning_rate": 0.0001, + "loss": 1.6261, + "step": 12996 + }, + { + "epoch": 1.4929642180230889, + "grad_norm": 0.6212725639343262, + "learning_rate": 0.0001, + "loss": 1.6025, + "step": 12997 + }, + { + "epoch": 1.4930790879329159, + "grad_norm": 0.5805021524429321, + "learning_rate": 0.0001, + "loss": 1.4603, + "step": 12998 + }, + { + "epoch": 1.4931939578427431, + "grad_norm": 0.5734280943870544, + "learning_rate": 0.0001, + "loss": 1.5504, + "step": 12999 + }, + { + "epoch": 1.4933088277525703, + "grad_norm": 0.5856978297233582, + "learning_rate": 0.0001, + "loss": 1.3655, + "step": 13000 + }, + { + "epoch": 1.4934236976623974, + "grad_norm": 0.5353971123695374, + "learning_rate": 0.0001, + "loss": 1.2464, + "step": 13001 + }, + { + "epoch": 1.4935385675722244, + "grad_norm": 0.6052659749984741, + "learning_rate": 0.0001, + "loss": 1.6794, + "step": 13002 + }, + { + "epoch": 1.4936534374820516, + "grad_norm": 0.5781797170639038, + "learning_rate": 0.0001, + "loss": 1.3597, + "step": 13003 + }, + { + "epoch": 1.4937683073918788, + "grad_norm": 0.576927125453949, + "learning_rate": 0.0001, + "loss": 1.4228, + "step": 13004 + }, + { + "epoch": 1.4938831773017058, + "grad_norm": 0.6039324998855591, + "learning_rate": 0.0001, + "loss": 1.4246, + "step": 13005 + }, + { + "epoch": 1.4939980472115328, + "grad_norm": 0.5810524821281433, + "learning_rate": 0.0001, + "loss": 1.4884, + "step": 13006 + }, + { + "epoch": 1.49411291712136, + "grad_norm": 0.5944836735725403, + "learning_rate": 0.0001, + "loss": 1.4696, + "step": 13007 + }, + { + "epoch": 1.4942277870311873, + "grad_norm": 0.6197353601455688, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 13008 + }, + { + "epoch": 1.4943426569410143, + "grad_norm": 0.5848913788795471, + "learning_rate": 0.0001, + "loss": 1.4212, + "step": 13009 + }, + { + "epoch": 1.4944575268508413, + "grad_norm": 0.6071828603744507, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 13010 + }, + { + "epoch": 1.4945723967606686, + "grad_norm": 0.6387497186660767, + "learning_rate": 0.0001, + "loss": 1.5511, + "step": 13011 + }, + { + "epoch": 1.4946872666704958, + "grad_norm": 0.5986767411231995, + "learning_rate": 0.0001, + "loss": 1.461, + "step": 13012 + }, + { + "epoch": 1.4948021365803228, + "grad_norm": 0.6346409916877747, + "learning_rate": 0.0001, + "loss": 1.5136, + "step": 13013 + }, + { + "epoch": 1.4949170064901498, + "grad_norm": 0.6802158951759338, + "learning_rate": 0.0001, + "loss": 1.5238, + "step": 13014 + }, + { + "epoch": 1.495031876399977, + "grad_norm": 0.6537509560585022, + "learning_rate": 0.0001, + "loss": 1.1075, + "step": 13015 + }, + { + "epoch": 1.4951467463098043, + "grad_norm": 0.5992099642753601, + "learning_rate": 0.0001, + "loss": 1.3288, + "step": 13016 + }, + { + "epoch": 1.4952616162196313, + "grad_norm": 0.6179826855659485, + "learning_rate": 0.0001, + "loss": 1.5179, + "step": 13017 + }, + { + "epoch": 1.4953764861294583, + "grad_norm": 0.6495833992958069, + "learning_rate": 0.0001, + "loss": 1.7106, + "step": 13018 + }, + { + "epoch": 1.4954913560392855, + "grad_norm": 0.6056380271911621, + "learning_rate": 0.0001, + "loss": 1.4111, + "step": 13019 + }, + { + "epoch": 1.4956062259491127, + "grad_norm": 0.7197732925415039, + "learning_rate": 0.0001, + "loss": 1.8647, + "step": 13020 + }, + { + "epoch": 1.4957210958589398, + "grad_norm": 0.63360196352005, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 13021 + }, + { + "epoch": 1.4958359657687668, + "grad_norm": 0.6025167107582092, + "learning_rate": 0.0001, + "loss": 1.425, + "step": 13022 + }, + { + "epoch": 1.495950835678594, + "grad_norm": 0.596898078918457, + "learning_rate": 0.0001, + "loss": 1.6411, + "step": 13023 + }, + { + "epoch": 1.4960657055884212, + "grad_norm": 0.5755943059921265, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 13024 + }, + { + "epoch": 1.4961805754982482, + "grad_norm": 0.5723254084587097, + "learning_rate": 0.0001, + "loss": 1.5093, + "step": 13025 + }, + { + "epoch": 1.4962954454080752, + "grad_norm": 0.5714227557182312, + "learning_rate": 0.0001, + "loss": 1.3679, + "step": 13026 + }, + { + "epoch": 1.4964103153179025, + "grad_norm": 0.6034862399101257, + "learning_rate": 0.0001, + "loss": 1.5995, + "step": 13027 + }, + { + "epoch": 1.4965251852277297, + "grad_norm": 0.6068670749664307, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 13028 + }, + { + "epoch": 1.4966400551375567, + "grad_norm": 0.5679345726966858, + "learning_rate": 0.0001, + "loss": 1.236, + "step": 13029 + }, + { + "epoch": 1.4967549250473837, + "grad_norm": 0.56649249792099, + "learning_rate": 0.0001, + "loss": 1.3815, + "step": 13030 + }, + { + "epoch": 1.496869794957211, + "grad_norm": 0.617186963558197, + "learning_rate": 0.0001, + "loss": 1.4181, + "step": 13031 + }, + { + "epoch": 1.4969846648670382, + "grad_norm": 0.5779293179512024, + "learning_rate": 0.0001, + "loss": 1.4227, + "step": 13032 + }, + { + "epoch": 1.4970995347768652, + "grad_norm": 0.6814537644386292, + "learning_rate": 0.0001, + "loss": 1.4959, + "step": 13033 + }, + { + "epoch": 1.4972144046866922, + "grad_norm": 0.6161949038505554, + "learning_rate": 0.0001, + "loss": 1.4994, + "step": 13034 + }, + { + "epoch": 1.4973292745965194, + "grad_norm": 0.6186149716377258, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 13035 + }, + { + "epoch": 1.4974441445063467, + "grad_norm": 0.6470484137535095, + "learning_rate": 0.0001, + "loss": 1.6088, + "step": 13036 + }, + { + "epoch": 1.4975590144161737, + "grad_norm": 0.6378332376480103, + "learning_rate": 0.0001, + "loss": 1.5992, + "step": 13037 + }, + { + "epoch": 1.4976738843260007, + "grad_norm": 0.6506776809692383, + "learning_rate": 0.0001, + "loss": 1.6681, + "step": 13038 + }, + { + "epoch": 1.497788754235828, + "grad_norm": 0.6596044301986694, + "learning_rate": 0.0001, + "loss": 1.634, + "step": 13039 + }, + { + "epoch": 1.4979036241456551, + "grad_norm": 0.5984453558921814, + "learning_rate": 0.0001, + "loss": 1.3366, + "step": 13040 + }, + { + "epoch": 1.4980184940554822, + "grad_norm": 0.6259294748306274, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 13041 + }, + { + "epoch": 1.4981333639653092, + "grad_norm": 0.6096507906913757, + "learning_rate": 0.0001, + "loss": 1.4598, + "step": 13042 + }, + { + "epoch": 1.4982482338751364, + "grad_norm": 0.6044954657554626, + "learning_rate": 0.0001, + "loss": 1.4842, + "step": 13043 + }, + { + "epoch": 1.4983631037849636, + "grad_norm": 0.5996847748756409, + "learning_rate": 0.0001, + "loss": 1.4205, + "step": 13044 + }, + { + "epoch": 1.4984779736947906, + "grad_norm": 0.5740419030189514, + "learning_rate": 0.0001, + "loss": 1.3223, + "step": 13045 + }, + { + "epoch": 1.4985928436046176, + "grad_norm": 0.6097593903541565, + "learning_rate": 0.0001, + "loss": 1.4634, + "step": 13046 + }, + { + "epoch": 1.4987077135144449, + "grad_norm": 0.6650487780570984, + "learning_rate": 0.0001, + "loss": 1.3505, + "step": 13047 + }, + { + "epoch": 1.498822583424272, + "grad_norm": 0.5681633949279785, + "learning_rate": 0.0001, + "loss": 1.3723, + "step": 13048 + }, + { + "epoch": 1.4989374533340991, + "grad_norm": 0.6020470857620239, + "learning_rate": 0.0001, + "loss": 1.4584, + "step": 13049 + }, + { + "epoch": 1.4990523232439261, + "grad_norm": 0.6068710088729858, + "learning_rate": 0.0001, + "loss": 1.2623, + "step": 13050 + }, + { + "epoch": 1.4991671931537534, + "grad_norm": 0.6180760860443115, + "learning_rate": 0.0001, + "loss": 1.4976, + "step": 13051 + }, + { + "epoch": 1.4992820630635806, + "grad_norm": 0.6852160096168518, + "learning_rate": 0.0001, + "loss": 1.3216, + "step": 13052 + }, + { + "epoch": 1.4993969329734076, + "grad_norm": 0.6118769645690918, + "learning_rate": 0.0001, + "loss": 1.4306, + "step": 13053 + }, + { + "epoch": 1.4995118028832346, + "grad_norm": 0.5923478007316589, + "learning_rate": 0.0001, + "loss": 1.3799, + "step": 13054 + }, + { + "epoch": 1.4996266727930618, + "grad_norm": 0.5937912464141846, + "learning_rate": 0.0001, + "loss": 1.2611, + "step": 13055 + }, + { + "epoch": 1.499741542702889, + "grad_norm": 0.5981078147888184, + "learning_rate": 0.0001, + "loss": 1.363, + "step": 13056 + }, + { + "epoch": 1.499856412612716, + "grad_norm": 0.5924525260925293, + "learning_rate": 0.0001, + "loss": 1.2294, + "step": 13057 + }, + { + "epoch": 1.499971282522543, + "grad_norm": 0.6042497754096985, + "learning_rate": 0.0001, + "loss": 1.3568, + "step": 13058 + }, + { + "epoch": 1.5000861524323703, + "grad_norm": 0.6669368147850037, + "learning_rate": 0.0001, + "loss": 1.3832, + "step": 13059 + }, + { + "epoch": 1.5002010223421975, + "grad_norm": 0.6114850044250488, + "learning_rate": 0.0001, + "loss": 1.5816, + "step": 13060 + }, + { + "epoch": 1.5003158922520246, + "grad_norm": 0.6326112151145935, + "learning_rate": 0.0001, + "loss": 1.6052, + "step": 13061 + }, + { + "epoch": 1.5004307621618516, + "grad_norm": 0.648796796798706, + "learning_rate": 0.0001, + "loss": 1.5702, + "step": 13062 + }, + { + "epoch": 1.5005456320716788, + "grad_norm": 0.618161141872406, + "learning_rate": 0.0001, + "loss": 1.5032, + "step": 13063 + }, + { + "epoch": 1.500660501981506, + "grad_norm": 0.5921330451965332, + "learning_rate": 0.0001, + "loss": 1.3333, + "step": 13064 + }, + { + "epoch": 1.500775371891333, + "grad_norm": 0.6372246146202087, + "learning_rate": 0.0001, + "loss": 1.7721, + "step": 13065 + }, + { + "epoch": 1.50089024180116, + "grad_norm": 0.5890547037124634, + "learning_rate": 0.0001, + "loss": 1.5761, + "step": 13066 + }, + { + "epoch": 1.5010051117109873, + "grad_norm": 0.5621899366378784, + "learning_rate": 0.0001, + "loss": 1.5637, + "step": 13067 + }, + { + "epoch": 1.5011199816208145, + "grad_norm": 0.6191715598106384, + "learning_rate": 0.0001, + "loss": 1.5025, + "step": 13068 + }, + { + "epoch": 1.5012348515306415, + "grad_norm": 0.601288914680481, + "learning_rate": 0.0001, + "loss": 1.4675, + "step": 13069 + }, + { + "epoch": 1.5013497214404685, + "grad_norm": 0.5595393180847168, + "learning_rate": 0.0001, + "loss": 1.4565, + "step": 13070 + }, + { + "epoch": 1.5014645913502958, + "grad_norm": 0.6976054906845093, + "learning_rate": 0.0001, + "loss": 1.4749, + "step": 13071 + }, + { + "epoch": 1.501579461260123, + "grad_norm": 0.5838610529899597, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 13072 + }, + { + "epoch": 1.50169433116995, + "grad_norm": 0.6203344464302063, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 13073 + }, + { + "epoch": 1.501809201079777, + "grad_norm": 0.5664482712745667, + "learning_rate": 0.0001, + "loss": 1.4541, + "step": 13074 + }, + { + "epoch": 1.5019240709896042, + "grad_norm": 0.5609536170959473, + "learning_rate": 0.0001, + "loss": 1.5321, + "step": 13075 + }, + { + "epoch": 1.5020389408994315, + "grad_norm": 0.6260766386985779, + "learning_rate": 0.0001, + "loss": 1.5236, + "step": 13076 + }, + { + "epoch": 1.5021538108092585, + "grad_norm": 0.6110790967941284, + "learning_rate": 0.0001, + "loss": 1.5518, + "step": 13077 + }, + { + "epoch": 1.5022686807190855, + "grad_norm": 0.590969443321228, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 13078 + }, + { + "epoch": 1.5023835506289127, + "grad_norm": 0.5963428616523743, + "learning_rate": 0.0001, + "loss": 1.4506, + "step": 13079 + }, + { + "epoch": 1.50249842053874, + "grad_norm": 0.649579644203186, + "learning_rate": 0.0001, + "loss": 1.4977, + "step": 13080 + }, + { + "epoch": 1.502613290448567, + "grad_norm": 0.5847672820091248, + "learning_rate": 0.0001, + "loss": 1.4921, + "step": 13081 + }, + { + "epoch": 1.502728160358394, + "grad_norm": 0.6328948736190796, + "learning_rate": 0.0001, + "loss": 1.188, + "step": 13082 + }, + { + "epoch": 1.5028430302682212, + "grad_norm": 0.5965337753295898, + "learning_rate": 0.0001, + "loss": 1.413, + "step": 13083 + }, + { + "epoch": 1.5029579001780484, + "grad_norm": 0.6650028824806213, + "learning_rate": 0.0001, + "loss": 1.4197, + "step": 13084 + }, + { + "epoch": 1.5030727700878754, + "grad_norm": 0.6406525373458862, + "learning_rate": 0.0001, + "loss": 1.6736, + "step": 13085 + }, + { + "epoch": 1.5031876399977024, + "grad_norm": 0.5682651400566101, + "learning_rate": 0.0001, + "loss": 1.5361, + "step": 13086 + }, + { + "epoch": 1.5033025099075297, + "grad_norm": 0.6791881322860718, + "learning_rate": 0.0001, + "loss": 1.4621, + "step": 13087 + }, + { + "epoch": 1.503417379817357, + "grad_norm": 0.6566523909568787, + "learning_rate": 0.0001, + "loss": 1.469, + "step": 13088 + }, + { + "epoch": 1.503532249727184, + "grad_norm": 0.6123066544532776, + "learning_rate": 0.0001, + "loss": 1.3571, + "step": 13089 + }, + { + "epoch": 1.503647119637011, + "grad_norm": 0.6262611746788025, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 13090 + }, + { + "epoch": 1.5037619895468382, + "grad_norm": 0.5882257223129272, + "learning_rate": 0.0001, + "loss": 1.4777, + "step": 13091 + }, + { + "epoch": 1.5038768594566654, + "grad_norm": 0.6783078908920288, + "learning_rate": 0.0001, + "loss": 1.6693, + "step": 13092 + }, + { + "epoch": 1.5039917293664926, + "grad_norm": 0.6360931992530823, + "learning_rate": 0.0001, + "loss": 1.3838, + "step": 13093 + }, + { + "epoch": 1.5041065992763196, + "grad_norm": 0.5902203917503357, + "learning_rate": 0.0001, + "loss": 1.3392, + "step": 13094 + }, + { + "epoch": 1.5042214691861466, + "grad_norm": 0.5917932987213135, + "learning_rate": 0.0001, + "loss": 1.5356, + "step": 13095 + }, + { + "epoch": 1.5043363390959739, + "grad_norm": 0.5984362959861755, + "learning_rate": 0.0001, + "loss": 1.4222, + "step": 13096 + }, + { + "epoch": 1.504451209005801, + "grad_norm": 0.5655670166015625, + "learning_rate": 0.0001, + "loss": 1.3595, + "step": 13097 + }, + { + "epoch": 1.504566078915628, + "grad_norm": 0.5472081899642944, + "learning_rate": 0.0001, + "loss": 1.3729, + "step": 13098 + }, + { + "epoch": 1.5046809488254551, + "grad_norm": 0.6080209016799927, + "learning_rate": 0.0001, + "loss": 1.3025, + "step": 13099 + }, + { + "epoch": 1.5047958187352823, + "grad_norm": 0.6341947913169861, + "learning_rate": 0.0001, + "loss": 1.5839, + "step": 13100 + }, + { + "epoch": 1.5049106886451096, + "grad_norm": 0.6027140617370605, + "learning_rate": 0.0001, + "loss": 1.4337, + "step": 13101 + }, + { + "epoch": 1.5050255585549366, + "grad_norm": 0.6108810901641846, + "learning_rate": 0.0001, + "loss": 1.1485, + "step": 13102 + }, + { + "epoch": 1.5051404284647636, + "grad_norm": 0.6450017094612122, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 13103 + }, + { + "epoch": 1.5052552983745908, + "grad_norm": 0.612722635269165, + "learning_rate": 0.0001, + "loss": 1.4161, + "step": 13104 + }, + { + "epoch": 1.505370168284418, + "grad_norm": 0.6334837675094604, + "learning_rate": 0.0001, + "loss": 1.3841, + "step": 13105 + }, + { + "epoch": 1.505485038194245, + "grad_norm": 0.5804214477539062, + "learning_rate": 0.0001, + "loss": 1.4075, + "step": 13106 + }, + { + "epoch": 1.505599908104072, + "grad_norm": 0.590505063533783, + "learning_rate": 0.0001, + "loss": 1.3492, + "step": 13107 + }, + { + "epoch": 1.5057147780138993, + "grad_norm": 0.5730953216552734, + "learning_rate": 0.0001, + "loss": 1.6026, + "step": 13108 + }, + { + "epoch": 1.5058296479237265, + "grad_norm": 0.6718409061431885, + "learning_rate": 0.0001, + "loss": 1.4637, + "step": 13109 + }, + { + "epoch": 1.5059445178335535, + "grad_norm": 0.5778510570526123, + "learning_rate": 0.0001, + "loss": 1.419, + "step": 13110 + }, + { + "epoch": 1.5060593877433806, + "grad_norm": 0.5760291218757629, + "learning_rate": 0.0001, + "loss": 1.4774, + "step": 13111 + }, + { + "epoch": 1.5061742576532078, + "grad_norm": 0.6402103900909424, + "learning_rate": 0.0001, + "loss": 1.4435, + "step": 13112 + }, + { + "epoch": 1.506289127563035, + "grad_norm": 0.6180935502052307, + "learning_rate": 0.0001, + "loss": 1.5586, + "step": 13113 + }, + { + "epoch": 1.506403997472862, + "grad_norm": 0.5968201756477356, + "learning_rate": 0.0001, + "loss": 1.2183, + "step": 13114 + }, + { + "epoch": 1.506518867382689, + "grad_norm": 0.5823055505752563, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 13115 + }, + { + "epoch": 1.5066337372925163, + "grad_norm": 0.6111396551132202, + "learning_rate": 0.0001, + "loss": 1.4014, + "step": 13116 + }, + { + "epoch": 1.5067486072023435, + "grad_norm": 0.6117756366729736, + "learning_rate": 0.0001, + "loss": 1.3061, + "step": 13117 + }, + { + "epoch": 1.5068634771121705, + "grad_norm": 0.6762123703956604, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 13118 + }, + { + "epoch": 1.5069783470219975, + "grad_norm": 0.6174688339233398, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 13119 + }, + { + "epoch": 1.5070932169318247, + "grad_norm": 0.6349688172340393, + "learning_rate": 0.0001, + "loss": 1.4694, + "step": 13120 + }, + { + "epoch": 1.507208086841652, + "grad_norm": 0.6256809830665588, + "learning_rate": 0.0001, + "loss": 1.3266, + "step": 13121 + }, + { + "epoch": 1.507322956751479, + "grad_norm": 0.6662157773971558, + "learning_rate": 0.0001, + "loss": 1.5674, + "step": 13122 + }, + { + "epoch": 1.507437826661306, + "grad_norm": 0.6114436984062195, + "learning_rate": 0.0001, + "loss": 1.4598, + "step": 13123 + }, + { + "epoch": 1.5075526965711332, + "grad_norm": 0.6376405358314514, + "learning_rate": 0.0001, + "loss": 1.4969, + "step": 13124 + }, + { + "epoch": 1.5076675664809605, + "grad_norm": 0.6283373832702637, + "learning_rate": 0.0001, + "loss": 1.4329, + "step": 13125 + }, + { + "epoch": 1.5077824363907875, + "grad_norm": 0.6334078311920166, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 13126 + }, + { + "epoch": 1.5078973063006145, + "grad_norm": 0.606662929058075, + "learning_rate": 0.0001, + "loss": 1.3271, + "step": 13127 + }, + { + "epoch": 1.5080121762104417, + "grad_norm": 0.6204425096511841, + "learning_rate": 0.0001, + "loss": 1.6041, + "step": 13128 + }, + { + "epoch": 1.508127046120269, + "grad_norm": 0.6233339905738831, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 13129 + }, + { + "epoch": 1.508241916030096, + "grad_norm": 0.5537604093551636, + "learning_rate": 0.0001, + "loss": 1.4306, + "step": 13130 + }, + { + "epoch": 1.508356785939923, + "grad_norm": 0.5744999051094055, + "learning_rate": 0.0001, + "loss": 1.5209, + "step": 13131 + }, + { + "epoch": 1.5084716558497502, + "grad_norm": 0.5970391035079956, + "learning_rate": 0.0001, + "loss": 1.4262, + "step": 13132 + }, + { + "epoch": 1.5085865257595774, + "grad_norm": 0.5616945028305054, + "learning_rate": 0.0001, + "loss": 1.2999, + "step": 13133 + }, + { + "epoch": 1.5087013956694044, + "grad_norm": 0.566780686378479, + "learning_rate": 0.0001, + "loss": 1.3291, + "step": 13134 + }, + { + "epoch": 1.5088162655792314, + "grad_norm": 0.6132379174232483, + "learning_rate": 0.0001, + "loss": 1.3745, + "step": 13135 + }, + { + "epoch": 1.5089311354890587, + "grad_norm": 0.6889569163322449, + "learning_rate": 0.0001, + "loss": 1.4654, + "step": 13136 + }, + { + "epoch": 1.509046005398886, + "grad_norm": 0.5599787831306458, + "learning_rate": 0.0001, + "loss": 1.3133, + "step": 13137 + }, + { + "epoch": 1.509160875308713, + "grad_norm": 0.5910610556602478, + "learning_rate": 0.0001, + "loss": 1.4698, + "step": 13138 + }, + { + "epoch": 1.50927574521854, + "grad_norm": 0.6421205401420593, + "learning_rate": 0.0001, + "loss": 1.5509, + "step": 13139 + }, + { + "epoch": 1.5093906151283671, + "grad_norm": 0.6151227355003357, + "learning_rate": 0.0001, + "loss": 1.3632, + "step": 13140 + }, + { + "epoch": 1.5095054850381944, + "grad_norm": 0.569758951663971, + "learning_rate": 0.0001, + "loss": 1.3274, + "step": 13141 + }, + { + "epoch": 1.5096203549480214, + "grad_norm": 0.6561845541000366, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 13142 + }, + { + "epoch": 1.5097352248578484, + "grad_norm": 0.6225590705871582, + "learning_rate": 0.0001, + "loss": 1.6304, + "step": 13143 + }, + { + "epoch": 1.5098500947676756, + "grad_norm": 0.6006361842155457, + "learning_rate": 0.0001, + "loss": 1.4622, + "step": 13144 + }, + { + "epoch": 1.5099649646775029, + "grad_norm": 0.7036729454994202, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 13145 + }, + { + "epoch": 1.5100798345873299, + "grad_norm": 0.6505088806152344, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 13146 + }, + { + "epoch": 1.5101947044971569, + "grad_norm": 0.6297341585159302, + "learning_rate": 0.0001, + "loss": 1.5059, + "step": 13147 + }, + { + "epoch": 1.510309574406984, + "grad_norm": 0.6522087454795837, + "learning_rate": 0.0001, + "loss": 1.4154, + "step": 13148 + }, + { + "epoch": 1.5104244443168113, + "grad_norm": 0.6192685961723328, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 13149 + }, + { + "epoch": 1.5105393142266383, + "grad_norm": 0.6250871419906616, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 13150 + }, + { + "epoch": 1.5106541841364654, + "grad_norm": 0.5791242122650146, + "learning_rate": 0.0001, + "loss": 1.4324, + "step": 13151 + }, + { + "epoch": 1.5107690540462926, + "grad_norm": 0.6033053994178772, + "learning_rate": 0.0001, + "loss": 1.441, + "step": 13152 + }, + { + "epoch": 1.5108839239561198, + "grad_norm": 0.5738871693611145, + "learning_rate": 0.0001, + "loss": 1.5468, + "step": 13153 + }, + { + "epoch": 1.5109987938659468, + "grad_norm": 0.6556652188301086, + "learning_rate": 0.0001, + "loss": 1.6179, + "step": 13154 + }, + { + "epoch": 1.5111136637757738, + "grad_norm": 0.6059442758560181, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 13155 + }, + { + "epoch": 1.511228533685601, + "grad_norm": 0.6075924038887024, + "learning_rate": 0.0001, + "loss": 1.3265, + "step": 13156 + }, + { + "epoch": 1.5113434035954283, + "grad_norm": 0.5933572053909302, + "learning_rate": 0.0001, + "loss": 1.5268, + "step": 13157 + }, + { + "epoch": 1.5114582735052553, + "grad_norm": 0.6054621934890747, + "learning_rate": 0.0001, + "loss": 1.66, + "step": 13158 + }, + { + "epoch": 1.5115731434150823, + "grad_norm": 0.5910248756408691, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 13159 + }, + { + "epoch": 1.5116880133249095, + "grad_norm": 0.6314256191253662, + "learning_rate": 0.0001, + "loss": 1.7058, + "step": 13160 + }, + { + "epoch": 1.5118028832347368, + "grad_norm": 0.5735428929328918, + "learning_rate": 0.0001, + "loss": 1.3867, + "step": 13161 + }, + { + "epoch": 1.5119177531445638, + "grad_norm": 0.5875412821769714, + "learning_rate": 0.0001, + "loss": 1.2807, + "step": 13162 + }, + { + "epoch": 1.5120326230543908, + "grad_norm": 0.5720763206481934, + "learning_rate": 0.0001, + "loss": 1.3435, + "step": 13163 + }, + { + "epoch": 1.512147492964218, + "grad_norm": 0.5954453349113464, + "learning_rate": 0.0001, + "loss": 1.3863, + "step": 13164 + }, + { + "epoch": 1.5122623628740453, + "grad_norm": 0.577720582485199, + "learning_rate": 0.0001, + "loss": 1.2367, + "step": 13165 + }, + { + "epoch": 1.5123772327838723, + "grad_norm": 0.5905912518501282, + "learning_rate": 0.0001, + "loss": 1.3191, + "step": 13166 + }, + { + "epoch": 1.5124921026936993, + "grad_norm": 0.5706838369369507, + "learning_rate": 0.0001, + "loss": 1.2935, + "step": 13167 + }, + { + "epoch": 1.5126069726035265, + "grad_norm": 0.6354959011077881, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 13168 + }, + { + "epoch": 1.5127218425133537, + "grad_norm": 0.6335857510566711, + "learning_rate": 0.0001, + "loss": 1.4592, + "step": 13169 + }, + { + "epoch": 1.5128367124231807, + "grad_norm": 0.6097238063812256, + "learning_rate": 0.0001, + "loss": 1.464, + "step": 13170 + }, + { + "epoch": 1.5129515823330077, + "grad_norm": 0.5822547674179077, + "learning_rate": 0.0001, + "loss": 1.41, + "step": 13171 + }, + { + "epoch": 1.513066452242835, + "grad_norm": 0.6244901418685913, + "learning_rate": 0.0001, + "loss": 1.4444, + "step": 13172 + }, + { + "epoch": 1.5131813221526622, + "grad_norm": 0.5513818860054016, + "learning_rate": 0.0001, + "loss": 1.3421, + "step": 13173 + }, + { + "epoch": 1.5132961920624892, + "grad_norm": 0.582343578338623, + "learning_rate": 0.0001, + "loss": 1.317, + "step": 13174 + }, + { + "epoch": 1.5134110619723162, + "grad_norm": 0.6151304841041565, + "learning_rate": 0.0001, + "loss": 1.4798, + "step": 13175 + }, + { + "epoch": 1.5135259318821435, + "grad_norm": 0.5942151546478271, + "learning_rate": 0.0001, + "loss": 1.546, + "step": 13176 + }, + { + "epoch": 1.5136408017919707, + "grad_norm": 0.589524507522583, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 13177 + }, + { + "epoch": 1.5137556717017977, + "grad_norm": 0.5877421498298645, + "learning_rate": 0.0001, + "loss": 1.4425, + "step": 13178 + }, + { + "epoch": 1.5138705416116247, + "grad_norm": 0.5954683423042297, + "learning_rate": 0.0001, + "loss": 1.5715, + "step": 13179 + }, + { + "epoch": 1.513985411521452, + "grad_norm": 0.5939478874206543, + "learning_rate": 0.0001, + "loss": 1.4957, + "step": 13180 + }, + { + "epoch": 1.5141002814312792, + "grad_norm": 0.6020224690437317, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 13181 + }, + { + "epoch": 1.5142151513411062, + "grad_norm": 0.6681519746780396, + "learning_rate": 0.0001, + "loss": 1.2384, + "step": 13182 + }, + { + "epoch": 1.5143300212509332, + "grad_norm": 0.622969925403595, + "learning_rate": 0.0001, + "loss": 1.423, + "step": 13183 + }, + { + "epoch": 1.5144448911607604, + "grad_norm": 0.5562835931777954, + "learning_rate": 0.0001, + "loss": 1.3621, + "step": 13184 + }, + { + "epoch": 1.5145597610705877, + "grad_norm": 0.6770687103271484, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 13185 + }, + { + "epoch": 1.5146746309804147, + "grad_norm": 0.6299651861190796, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 13186 + }, + { + "epoch": 1.5147895008902417, + "grad_norm": 0.566347599029541, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 13187 + }, + { + "epoch": 1.514904370800069, + "grad_norm": 0.5551129579544067, + "learning_rate": 0.0001, + "loss": 1.4055, + "step": 13188 + }, + { + "epoch": 1.5150192407098961, + "grad_norm": 0.5839925408363342, + "learning_rate": 0.0001, + "loss": 1.5315, + "step": 13189 + }, + { + "epoch": 1.5151341106197231, + "grad_norm": 0.5784244537353516, + "learning_rate": 0.0001, + "loss": 1.515, + "step": 13190 + }, + { + "epoch": 1.5152489805295501, + "grad_norm": 0.5645365118980408, + "learning_rate": 0.0001, + "loss": 1.342, + "step": 13191 + }, + { + "epoch": 1.5153638504393774, + "grad_norm": 0.6406526565551758, + "learning_rate": 0.0001, + "loss": 1.3734, + "step": 13192 + }, + { + "epoch": 1.5154787203492046, + "grad_norm": 0.6612091064453125, + "learning_rate": 0.0001, + "loss": 1.6727, + "step": 13193 + }, + { + "epoch": 1.5155935902590316, + "grad_norm": 0.582217812538147, + "learning_rate": 0.0001, + "loss": 1.4829, + "step": 13194 + }, + { + "epoch": 1.5157084601688586, + "grad_norm": 0.6395711302757263, + "learning_rate": 0.0001, + "loss": 1.4942, + "step": 13195 + }, + { + "epoch": 1.5158233300786859, + "grad_norm": 0.6635745763778687, + "learning_rate": 0.0001, + "loss": 1.4659, + "step": 13196 + }, + { + "epoch": 1.515938199988513, + "grad_norm": 0.615152895450592, + "learning_rate": 0.0001, + "loss": 1.3869, + "step": 13197 + }, + { + "epoch": 1.51605306989834, + "grad_norm": 0.6119014620780945, + "learning_rate": 0.0001, + "loss": 1.5678, + "step": 13198 + }, + { + "epoch": 1.516167939808167, + "grad_norm": 0.6164930462837219, + "learning_rate": 0.0001, + "loss": 1.5306, + "step": 13199 + }, + { + "epoch": 1.5162828097179943, + "grad_norm": 0.5925434231758118, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 13200 + }, + { + "epoch": 1.5163976796278216, + "grad_norm": 0.6095196008682251, + "learning_rate": 0.0001, + "loss": 1.5921, + "step": 13201 + }, + { + "epoch": 1.5165125495376486, + "grad_norm": 0.6188748478889465, + "learning_rate": 0.0001, + "loss": 1.6986, + "step": 13202 + }, + { + "epoch": 1.5166274194474756, + "grad_norm": 0.5641734004020691, + "learning_rate": 0.0001, + "loss": 1.516, + "step": 13203 + }, + { + "epoch": 1.5167422893573028, + "grad_norm": 0.7535280585289001, + "learning_rate": 0.0001, + "loss": 1.5137, + "step": 13204 + }, + { + "epoch": 1.51685715926713, + "grad_norm": 0.6287049055099487, + "learning_rate": 0.0001, + "loss": 1.53, + "step": 13205 + }, + { + "epoch": 1.516972029176957, + "grad_norm": 0.6373167037963867, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 13206 + }, + { + "epoch": 1.517086899086784, + "grad_norm": 0.5664235949516296, + "learning_rate": 0.0001, + "loss": 1.4884, + "step": 13207 + }, + { + "epoch": 1.5172017689966113, + "grad_norm": 0.5645571351051331, + "learning_rate": 0.0001, + "loss": 1.2935, + "step": 13208 + }, + { + "epoch": 1.5173166389064385, + "grad_norm": 0.6005182862281799, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 13209 + }, + { + "epoch": 1.5174315088162655, + "grad_norm": 0.595731794834137, + "learning_rate": 0.0001, + "loss": 1.342, + "step": 13210 + }, + { + "epoch": 1.5175463787260925, + "grad_norm": 0.6699162721633911, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 13211 + }, + { + "epoch": 1.5176612486359198, + "grad_norm": 0.5972051620483398, + "learning_rate": 0.0001, + "loss": 1.438, + "step": 13212 + }, + { + "epoch": 1.517776118545747, + "grad_norm": 0.6613103747367859, + "learning_rate": 0.0001, + "loss": 1.6691, + "step": 13213 + }, + { + "epoch": 1.517890988455574, + "grad_norm": 0.5806509852409363, + "learning_rate": 0.0001, + "loss": 1.5286, + "step": 13214 + }, + { + "epoch": 1.518005858365401, + "grad_norm": 0.6001474857330322, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 13215 + }, + { + "epoch": 1.5181207282752283, + "grad_norm": 0.6385689377784729, + "learning_rate": 0.0001, + "loss": 1.6138, + "step": 13216 + }, + { + "epoch": 1.5182355981850555, + "grad_norm": 0.6373685002326965, + "learning_rate": 0.0001, + "loss": 1.478, + "step": 13217 + }, + { + "epoch": 1.5183504680948825, + "grad_norm": 0.5941466689109802, + "learning_rate": 0.0001, + "loss": 1.384, + "step": 13218 + }, + { + "epoch": 1.5184653380047095, + "grad_norm": 0.5440075993537903, + "learning_rate": 0.0001, + "loss": 1.388, + "step": 13219 + }, + { + "epoch": 1.5185802079145367, + "grad_norm": 0.5798068046569824, + "learning_rate": 0.0001, + "loss": 1.4018, + "step": 13220 + }, + { + "epoch": 1.518695077824364, + "grad_norm": 0.6054274439811707, + "learning_rate": 0.0001, + "loss": 1.5795, + "step": 13221 + }, + { + "epoch": 1.518809947734191, + "grad_norm": 0.623768150806427, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 13222 + }, + { + "epoch": 1.518924817644018, + "grad_norm": 0.5744849443435669, + "learning_rate": 0.0001, + "loss": 1.3231, + "step": 13223 + }, + { + "epoch": 1.5190396875538452, + "grad_norm": 0.6288904547691345, + "learning_rate": 0.0001, + "loss": 1.411, + "step": 13224 + }, + { + "epoch": 1.5191545574636725, + "grad_norm": 0.58237624168396, + "learning_rate": 0.0001, + "loss": 1.2721, + "step": 13225 + }, + { + "epoch": 1.5192694273734995, + "grad_norm": 0.5864928364753723, + "learning_rate": 0.0001, + "loss": 1.4331, + "step": 13226 + }, + { + "epoch": 1.5193842972833265, + "grad_norm": 0.5528786778450012, + "learning_rate": 0.0001, + "loss": 1.4004, + "step": 13227 + }, + { + "epoch": 1.5194991671931537, + "grad_norm": 0.639892578125, + "learning_rate": 0.0001, + "loss": 1.3726, + "step": 13228 + }, + { + "epoch": 1.519614037102981, + "grad_norm": 0.5863984227180481, + "learning_rate": 0.0001, + "loss": 1.4963, + "step": 13229 + }, + { + "epoch": 1.5197289070128082, + "grad_norm": 0.6669512391090393, + "learning_rate": 0.0001, + "loss": 1.5823, + "step": 13230 + }, + { + "epoch": 1.5198437769226352, + "grad_norm": 0.628280520439148, + "learning_rate": 0.0001, + "loss": 1.5261, + "step": 13231 + }, + { + "epoch": 1.5199586468324622, + "grad_norm": 0.6582902669906616, + "learning_rate": 0.0001, + "loss": 1.4258, + "step": 13232 + }, + { + "epoch": 1.5200735167422894, + "grad_norm": 0.5654552578926086, + "learning_rate": 0.0001, + "loss": 1.3741, + "step": 13233 + }, + { + "epoch": 1.5201883866521166, + "grad_norm": 0.6104324460029602, + "learning_rate": 0.0001, + "loss": 1.6584, + "step": 13234 + }, + { + "epoch": 1.5203032565619437, + "grad_norm": 0.6294476389884949, + "learning_rate": 0.0001, + "loss": 1.5948, + "step": 13235 + }, + { + "epoch": 1.5204181264717707, + "grad_norm": 0.6104316115379333, + "learning_rate": 0.0001, + "loss": 1.3727, + "step": 13236 + }, + { + "epoch": 1.520532996381598, + "grad_norm": 0.6369021534919739, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 13237 + }, + { + "epoch": 1.5206478662914251, + "grad_norm": 0.5884815454483032, + "learning_rate": 0.0001, + "loss": 1.5229, + "step": 13238 + }, + { + "epoch": 1.5207627362012521, + "grad_norm": 0.6035114526748657, + "learning_rate": 0.0001, + "loss": 1.5668, + "step": 13239 + }, + { + "epoch": 1.5208776061110791, + "grad_norm": 0.5970353484153748, + "learning_rate": 0.0001, + "loss": 1.4528, + "step": 13240 + }, + { + "epoch": 1.5209924760209064, + "grad_norm": 0.6134264469146729, + "learning_rate": 0.0001, + "loss": 1.5959, + "step": 13241 + }, + { + "epoch": 1.5211073459307336, + "grad_norm": 0.6086723208427429, + "learning_rate": 0.0001, + "loss": 1.4763, + "step": 13242 + }, + { + "epoch": 1.5212222158405606, + "grad_norm": 0.6190268397331238, + "learning_rate": 0.0001, + "loss": 1.4347, + "step": 13243 + }, + { + "epoch": 1.5213370857503876, + "grad_norm": 0.5801183581352234, + "learning_rate": 0.0001, + "loss": 1.4204, + "step": 13244 + }, + { + "epoch": 1.5214519556602148, + "grad_norm": 0.5259165167808533, + "learning_rate": 0.0001, + "loss": 1.2951, + "step": 13245 + }, + { + "epoch": 1.521566825570042, + "grad_norm": 0.6810782551765442, + "learning_rate": 0.0001, + "loss": 1.5469, + "step": 13246 + }, + { + "epoch": 1.521681695479869, + "grad_norm": 0.6331126093864441, + "learning_rate": 0.0001, + "loss": 1.4176, + "step": 13247 + }, + { + "epoch": 1.521796565389696, + "grad_norm": 0.561797022819519, + "learning_rate": 0.0001, + "loss": 1.5015, + "step": 13248 + }, + { + "epoch": 1.5219114352995233, + "grad_norm": 0.6428273320198059, + "learning_rate": 0.0001, + "loss": 1.2756, + "step": 13249 + }, + { + "epoch": 1.5220263052093506, + "grad_norm": 0.5670008659362793, + "learning_rate": 0.0001, + "loss": 1.4275, + "step": 13250 + }, + { + "epoch": 1.5221411751191776, + "grad_norm": 0.6418376564979553, + "learning_rate": 0.0001, + "loss": 1.5854, + "step": 13251 + }, + { + "epoch": 1.5222560450290046, + "grad_norm": 0.6684396266937256, + "learning_rate": 0.0001, + "loss": 1.4903, + "step": 13252 + }, + { + "epoch": 1.5223709149388318, + "grad_norm": 0.5767911672592163, + "learning_rate": 0.0001, + "loss": 1.2792, + "step": 13253 + }, + { + "epoch": 1.522485784848659, + "grad_norm": 0.620569109916687, + "learning_rate": 0.0001, + "loss": 1.4702, + "step": 13254 + }, + { + "epoch": 1.522600654758486, + "grad_norm": 0.6131866574287415, + "learning_rate": 0.0001, + "loss": 1.4637, + "step": 13255 + }, + { + "epoch": 1.522715524668313, + "grad_norm": 0.6050616502761841, + "learning_rate": 0.0001, + "loss": 1.5023, + "step": 13256 + }, + { + "epoch": 1.5228303945781403, + "grad_norm": 0.6381790637969971, + "learning_rate": 0.0001, + "loss": 1.5207, + "step": 13257 + }, + { + "epoch": 1.5229452644879675, + "grad_norm": 0.6472856998443604, + "learning_rate": 0.0001, + "loss": 1.3212, + "step": 13258 + }, + { + "epoch": 1.5230601343977945, + "grad_norm": 0.5918861031532288, + "learning_rate": 0.0001, + "loss": 1.4754, + "step": 13259 + }, + { + "epoch": 1.5231750043076215, + "grad_norm": 0.6339123249053955, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 13260 + }, + { + "epoch": 1.5232898742174488, + "grad_norm": 0.5977100133895874, + "learning_rate": 0.0001, + "loss": 1.3237, + "step": 13261 + }, + { + "epoch": 1.523404744127276, + "grad_norm": 0.6018535494804382, + "learning_rate": 0.0001, + "loss": 1.354, + "step": 13262 + }, + { + "epoch": 1.523519614037103, + "grad_norm": 0.5871580243110657, + "learning_rate": 0.0001, + "loss": 1.5308, + "step": 13263 + }, + { + "epoch": 1.52363448394693, + "grad_norm": 0.6061954498291016, + "learning_rate": 0.0001, + "loss": 1.3217, + "step": 13264 + }, + { + "epoch": 1.5237493538567572, + "grad_norm": 0.637395977973938, + "learning_rate": 0.0001, + "loss": 1.5238, + "step": 13265 + }, + { + "epoch": 1.5238642237665845, + "grad_norm": 0.7003656029701233, + "learning_rate": 0.0001, + "loss": 1.7488, + "step": 13266 + }, + { + "epoch": 1.5239790936764115, + "grad_norm": 0.6120108366012573, + "learning_rate": 0.0001, + "loss": 1.4945, + "step": 13267 + }, + { + "epoch": 1.5240939635862385, + "grad_norm": 0.5899313688278198, + "learning_rate": 0.0001, + "loss": 1.5474, + "step": 13268 + }, + { + "epoch": 1.5242088334960657, + "grad_norm": 0.6798549890518188, + "learning_rate": 0.0001, + "loss": 1.5794, + "step": 13269 + }, + { + "epoch": 1.524323703405893, + "grad_norm": 0.6134473085403442, + "learning_rate": 0.0001, + "loss": 1.4846, + "step": 13270 + }, + { + "epoch": 1.52443857331572, + "grad_norm": 0.5412236452102661, + "learning_rate": 0.0001, + "loss": 1.5211, + "step": 13271 + }, + { + "epoch": 1.524553443225547, + "grad_norm": 0.5635609030723572, + "learning_rate": 0.0001, + "loss": 1.3889, + "step": 13272 + }, + { + "epoch": 1.5246683131353742, + "grad_norm": 0.5932890772819519, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 13273 + }, + { + "epoch": 1.5247831830452014, + "grad_norm": 0.5923974514007568, + "learning_rate": 0.0001, + "loss": 1.6642, + "step": 13274 + }, + { + "epoch": 1.5248980529550284, + "grad_norm": 0.6257017254829407, + "learning_rate": 0.0001, + "loss": 1.599, + "step": 13275 + }, + { + "epoch": 1.5250129228648555, + "grad_norm": 0.5884189605712891, + "learning_rate": 0.0001, + "loss": 1.5512, + "step": 13276 + }, + { + "epoch": 1.5251277927746827, + "grad_norm": 0.5920361876487732, + "learning_rate": 0.0001, + "loss": 1.5368, + "step": 13277 + }, + { + "epoch": 1.52524266268451, + "grad_norm": 0.6029490232467651, + "learning_rate": 0.0001, + "loss": 1.5597, + "step": 13278 + }, + { + "epoch": 1.525357532594337, + "grad_norm": 0.5848401784896851, + "learning_rate": 0.0001, + "loss": 1.4009, + "step": 13279 + }, + { + "epoch": 1.525472402504164, + "grad_norm": 0.6184453964233398, + "learning_rate": 0.0001, + "loss": 1.588, + "step": 13280 + }, + { + "epoch": 1.5255872724139912, + "grad_norm": 0.7568985223770142, + "learning_rate": 0.0001, + "loss": 1.642, + "step": 13281 + }, + { + "epoch": 1.5257021423238184, + "grad_norm": 0.5871482491493225, + "learning_rate": 0.0001, + "loss": 1.3935, + "step": 13282 + }, + { + "epoch": 1.5258170122336454, + "grad_norm": 0.6043814420700073, + "learning_rate": 0.0001, + "loss": 1.2961, + "step": 13283 + }, + { + "epoch": 1.5259318821434724, + "grad_norm": 0.5470103621482849, + "learning_rate": 0.0001, + "loss": 1.3294, + "step": 13284 + }, + { + "epoch": 1.5260467520532996, + "grad_norm": 0.5336215496063232, + "learning_rate": 0.0001, + "loss": 1.1915, + "step": 13285 + }, + { + "epoch": 1.5261616219631269, + "grad_norm": 0.6141431331634521, + "learning_rate": 0.0001, + "loss": 1.469, + "step": 13286 + }, + { + "epoch": 1.5262764918729539, + "grad_norm": 0.5851207375526428, + "learning_rate": 0.0001, + "loss": 1.424, + "step": 13287 + }, + { + "epoch": 1.526391361782781, + "grad_norm": 0.5804892182350159, + "learning_rate": 0.0001, + "loss": 1.3296, + "step": 13288 + }, + { + "epoch": 1.5265062316926081, + "grad_norm": 0.6511141061782837, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 13289 + }, + { + "epoch": 1.5266211016024354, + "grad_norm": 0.6962987184524536, + "learning_rate": 0.0001, + "loss": 1.5535, + "step": 13290 + }, + { + "epoch": 1.5267359715122624, + "grad_norm": 0.5867136120796204, + "learning_rate": 0.0001, + "loss": 1.2938, + "step": 13291 + }, + { + "epoch": 1.5268508414220894, + "grad_norm": 0.6161892414093018, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 13292 + }, + { + "epoch": 1.5269657113319166, + "grad_norm": 0.5810278058052063, + "learning_rate": 0.0001, + "loss": 1.197, + "step": 13293 + }, + { + "epoch": 1.5270805812417438, + "grad_norm": 0.6159192323684692, + "learning_rate": 0.0001, + "loss": 1.3235, + "step": 13294 + }, + { + "epoch": 1.5271954511515708, + "grad_norm": 0.6228445768356323, + "learning_rate": 0.0001, + "loss": 1.3081, + "step": 13295 + }, + { + "epoch": 1.5273103210613979, + "grad_norm": 0.6097662448883057, + "learning_rate": 0.0001, + "loss": 1.468, + "step": 13296 + }, + { + "epoch": 1.527425190971225, + "grad_norm": 0.6655218005180359, + "learning_rate": 0.0001, + "loss": 1.4785, + "step": 13297 + }, + { + "epoch": 1.5275400608810523, + "grad_norm": 0.5990704298019409, + "learning_rate": 0.0001, + "loss": 1.6224, + "step": 13298 + }, + { + "epoch": 1.5276549307908793, + "grad_norm": 0.6513421535491943, + "learning_rate": 0.0001, + "loss": 1.6009, + "step": 13299 + }, + { + "epoch": 1.5277698007007063, + "grad_norm": 0.6456665992736816, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 13300 + }, + { + "epoch": 1.5278846706105336, + "grad_norm": 0.6521137952804565, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 13301 + }, + { + "epoch": 1.5279995405203608, + "grad_norm": 0.5967424511909485, + "learning_rate": 0.0001, + "loss": 1.4281, + "step": 13302 + }, + { + "epoch": 1.5281144104301878, + "grad_norm": 0.6816303730010986, + "learning_rate": 0.0001, + "loss": 1.5269, + "step": 13303 + }, + { + "epoch": 1.5282292803400148, + "grad_norm": 0.5861914753913879, + "learning_rate": 0.0001, + "loss": 1.4389, + "step": 13304 + }, + { + "epoch": 1.528344150249842, + "grad_norm": 0.6135704517364502, + "learning_rate": 0.0001, + "loss": 1.3657, + "step": 13305 + }, + { + "epoch": 1.5284590201596693, + "grad_norm": 0.6036303043365479, + "learning_rate": 0.0001, + "loss": 1.4097, + "step": 13306 + }, + { + "epoch": 1.5285738900694963, + "grad_norm": 0.5533670783042908, + "learning_rate": 0.0001, + "loss": 1.3476, + "step": 13307 + }, + { + "epoch": 1.5286887599793233, + "grad_norm": 0.5623646974563599, + "learning_rate": 0.0001, + "loss": 1.5436, + "step": 13308 + }, + { + "epoch": 1.5288036298891505, + "grad_norm": 0.5962461829185486, + "learning_rate": 0.0001, + "loss": 1.5308, + "step": 13309 + }, + { + "epoch": 1.5289184997989778, + "grad_norm": 0.626592218875885, + "learning_rate": 0.0001, + "loss": 1.4646, + "step": 13310 + }, + { + "epoch": 1.5290333697088048, + "grad_norm": 0.6010507941246033, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 13311 + }, + { + "epoch": 1.5291482396186318, + "grad_norm": 0.5912574529647827, + "learning_rate": 0.0001, + "loss": 1.4557, + "step": 13312 + }, + { + "epoch": 1.529263109528459, + "grad_norm": 0.6870297789573669, + "learning_rate": 0.0001, + "loss": 1.7032, + "step": 13313 + }, + { + "epoch": 1.5293779794382862, + "grad_norm": 0.5987645983695984, + "learning_rate": 0.0001, + "loss": 1.5357, + "step": 13314 + }, + { + "epoch": 1.5294928493481132, + "grad_norm": 0.6424558162689209, + "learning_rate": 0.0001, + "loss": 1.2024, + "step": 13315 + }, + { + "epoch": 1.5296077192579403, + "grad_norm": 0.6005625128746033, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 13316 + }, + { + "epoch": 1.5297225891677675, + "grad_norm": 0.6098487973213196, + "learning_rate": 0.0001, + "loss": 1.3273, + "step": 13317 + }, + { + "epoch": 1.5298374590775947, + "grad_norm": 0.625301718711853, + "learning_rate": 0.0001, + "loss": 1.3964, + "step": 13318 + }, + { + "epoch": 1.5299523289874217, + "grad_norm": 0.6038113236427307, + "learning_rate": 0.0001, + "loss": 1.4294, + "step": 13319 + }, + { + "epoch": 1.5300671988972487, + "grad_norm": 0.5868397951126099, + "learning_rate": 0.0001, + "loss": 1.4582, + "step": 13320 + }, + { + "epoch": 1.530182068807076, + "grad_norm": 0.6464180946350098, + "learning_rate": 0.0001, + "loss": 1.5435, + "step": 13321 + }, + { + "epoch": 1.5302969387169032, + "grad_norm": 0.604279637336731, + "learning_rate": 0.0001, + "loss": 1.5115, + "step": 13322 + }, + { + "epoch": 1.5304118086267302, + "grad_norm": 0.5744287371635437, + "learning_rate": 0.0001, + "loss": 1.3912, + "step": 13323 + }, + { + "epoch": 1.5305266785365572, + "grad_norm": 0.6124972105026245, + "learning_rate": 0.0001, + "loss": 1.432, + "step": 13324 + }, + { + "epoch": 1.5306415484463844, + "grad_norm": 0.5574883818626404, + "learning_rate": 0.0001, + "loss": 1.3208, + "step": 13325 + }, + { + "epoch": 1.5307564183562117, + "grad_norm": 0.6026033759117126, + "learning_rate": 0.0001, + "loss": 1.3873, + "step": 13326 + }, + { + "epoch": 1.5308712882660387, + "grad_norm": 0.6555309295654297, + "learning_rate": 0.0001, + "loss": 1.4002, + "step": 13327 + }, + { + "epoch": 1.5309861581758657, + "grad_norm": 0.5827085375785828, + "learning_rate": 0.0001, + "loss": 1.3754, + "step": 13328 + }, + { + "epoch": 1.531101028085693, + "grad_norm": 0.6004881262779236, + "learning_rate": 0.0001, + "loss": 1.2212, + "step": 13329 + }, + { + "epoch": 1.5312158979955202, + "grad_norm": 0.7817880511283875, + "learning_rate": 0.0001, + "loss": 1.5378, + "step": 13330 + }, + { + "epoch": 1.5313307679053472, + "grad_norm": 0.6626024842262268, + "learning_rate": 0.0001, + "loss": 1.4952, + "step": 13331 + }, + { + "epoch": 1.5314456378151742, + "grad_norm": 0.6087819337844849, + "learning_rate": 0.0001, + "loss": 1.4887, + "step": 13332 + }, + { + "epoch": 1.5315605077250014, + "grad_norm": 0.7612007260322571, + "learning_rate": 0.0001, + "loss": 1.4343, + "step": 13333 + }, + { + "epoch": 1.5316753776348286, + "grad_norm": 0.5998708009719849, + "learning_rate": 0.0001, + "loss": 1.3433, + "step": 13334 + }, + { + "epoch": 1.5317902475446556, + "grad_norm": 0.5892654061317444, + "learning_rate": 0.0001, + "loss": 1.5178, + "step": 13335 + }, + { + "epoch": 1.5319051174544827, + "grad_norm": 0.6381953954696655, + "learning_rate": 0.0001, + "loss": 1.5053, + "step": 13336 + }, + { + "epoch": 1.5320199873643099, + "grad_norm": 0.6319192051887512, + "learning_rate": 0.0001, + "loss": 1.2256, + "step": 13337 + }, + { + "epoch": 1.5321348572741371, + "grad_norm": 0.6061521172523499, + "learning_rate": 0.0001, + "loss": 1.3842, + "step": 13338 + }, + { + "epoch": 1.5322497271839641, + "grad_norm": 0.6164060235023499, + "learning_rate": 0.0001, + "loss": 1.5006, + "step": 13339 + }, + { + "epoch": 1.5323645970937911, + "grad_norm": 0.5808977484703064, + "learning_rate": 0.0001, + "loss": 1.3016, + "step": 13340 + }, + { + "epoch": 1.5324794670036184, + "grad_norm": 0.6272817254066467, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 13341 + }, + { + "epoch": 1.5325943369134456, + "grad_norm": 0.5705950260162354, + "learning_rate": 0.0001, + "loss": 1.4289, + "step": 13342 + }, + { + "epoch": 1.5327092068232726, + "grad_norm": 0.5681813359260559, + "learning_rate": 0.0001, + "loss": 1.3662, + "step": 13343 + }, + { + "epoch": 1.5328240767330996, + "grad_norm": 0.591742753982544, + "learning_rate": 0.0001, + "loss": 1.411, + "step": 13344 + }, + { + "epoch": 1.5329389466429268, + "grad_norm": 0.6885697841644287, + "learning_rate": 0.0001, + "loss": 1.4915, + "step": 13345 + }, + { + "epoch": 1.533053816552754, + "grad_norm": 0.6182805895805359, + "learning_rate": 0.0001, + "loss": 1.3183, + "step": 13346 + }, + { + "epoch": 1.533168686462581, + "grad_norm": 0.630598247051239, + "learning_rate": 0.0001, + "loss": 1.399, + "step": 13347 + }, + { + "epoch": 1.533283556372408, + "grad_norm": 0.5968937277793884, + "learning_rate": 0.0001, + "loss": 1.3648, + "step": 13348 + }, + { + "epoch": 1.5333984262822353, + "grad_norm": 0.6997851133346558, + "learning_rate": 0.0001, + "loss": 1.5873, + "step": 13349 + }, + { + "epoch": 1.5335132961920626, + "grad_norm": 0.615630030632019, + "learning_rate": 0.0001, + "loss": 1.3916, + "step": 13350 + }, + { + "epoch": 1.5336281661018896, + "grad_norm": 0.6529400944709778, + "learning_rate": 0.0001, + "loss": 1.4654, + "step": 13351 + }, + { + "epoch": 1.5337430360117166, + "grad_norm": 0.6269962191581726, + "learning_rate": 0.0001, + "loss": 1.3357, + "step": 13352 + }, + { + "epoch": 1.5338579059215438, + "grad_norm": 0.613075852394104, + "learning_rate": 0.0001, + "loss": 1.5735, + "step": 13353 + }, + { + "epoch": 1.533972775831371, + "grad_norm": 0.6532135605812073, + "learning_rate": 0.0001, + "loss": 1.5337, + "step": 13354 + }, + { + "epoch": 1.534087645741198, + "grad_norm": 0.6102808117866516, + "learning_rate": 0.0001, + "loss": 1.4645, + "step": 13355 + }, + { + "epoch": 1.534202515651025, + "grad_norm": 0.6019791960716248, + "learning_rate": 0.0001, + "loss": 1.3113, + "step": 13356 + }, + { + "epoch": 1.5343173855608523, + "grad_norm": 0.6130610108375549, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 13357 + }, + { + "epoch": 1.5344322554706795, + "grad_norm": 0.5954639911651611, + "learning_rate": 0.0001, + "loss": 1.5233, + "step": 13358 + }, + { + "epoch": 1.5345471253805065, + "grad_norm": 0.6210713982582092, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 13359 + }, + { + "epoch": 1.5346619952903335, + "grad_norm": 0.5767752528190613, + "learning_rate": 0.0001, + "loss": 1.2935, + "step": 13360 + }, + { + "epoch": 1.5347768652001608, + "grad_norm": 0.5763528943061829, + "learning_rate": 0.0001, + "loss": 1.3317, + "step": 13361 + }, + { + "epoch": 1.534891735109988, + "grad_norm": 0.6128670573234558, + "learning_rate": 0.0001, + "loss": 1.4817, + "step": 13362 + }, + { + "epoch": 1.535006605019815, + "grad_norm": 0.6389924883842468, + "learning_rate": 0.0001, + "loss": 1.4359, + "step": 13363 + }, + { + "epoch": 1.535121474929642, + "grad_norm": 0.6564880013465881, + "learning_rate": 0.0001, + "loss": 1.4894, + "step": 13364 + }, + { + "epoch": 1.5352363448394692, + "grad_norm": 0.6105384826660156, + "learning_rate": 0.0001, + "loss": 1.3985, + "step": 13365 + }, + { + "epoch": 1.5353512147492965, + "grad_norm": 0.7015275359153748, + "learning_rate": 0.0001, + "loss": 1.5958, + "step": 13366 + }, + { + "epoch": 1.5354660846591237, + "grad_norm": 0.6538365483283997, + "learning_rate": 0.0001, + "loss": 1.5722, + "step": 13367 + }, + { + "epoch": 1.5355809545689507, + "grad_norm": 0.5591805577278137, + "learning_rate": 0.0001, + "loss": 1.306, + "step": 13368 + }, + { + "epoch": 1.5356958244787777, + "grad_norm": 0.5733972787857056, + "learning_rate": 0.0001, + "loss": 1.2758, + "step": 13369 + }, + { + "epoch": 1.535810694388605, + "grad_norm": 0.6370871067047119, + "learning_rate": 0.0001, + "loss": 1.3703, + "step": 13370 + }, + { + "epoch": 1.5359255642984322, + "grad_norm": 0.5944746136665344, + "learning_rate": 0.0001, + "loss": 1.461, + "step": 13371 + }, + { + "epoch": 1.5360404342082592, + "grad_norm": 0.5778318047523499, + "learning_rate": 0.0001, + "loss": 1.5607, + "step": 13372 + }, + { + "epoch": 1.5361553041180862, + "grad_norm": 0.6232753992080688, + "learning_rate": 0.0001, + "loss": 1.5324, + "step": 13373 + }, + { + "epoch": 1.5362701740279134, + "grad_norm": 0.6197500228881836, + "learning_rate": 0.0001, + "loss": 1.437, + "step": 13374 + }, + { + "epoch": 1.5363850439377407, + "grad_norm": 0.6072701811790466, + "learning_rate": 0.0001, + "loss": 1.4755, + "step": 13375 + }, + { + "epoch": 1.5364999138475677, + "grad_norm": 0.602379560470581, + "learning_rate": 0.0001, + "loss": 1.4888, + "step": 13376 + }, + { + "epoch": 1.5366147837573947, + "grad_norm": 0.5745888352394104, + "learning_rate": 0.0001, + "loss": 1.4408, + "step": 13377 + }, + { + "epoch": 1.536729653667222, + "grad_norm": 0.615543782711029, + "learning_rate": 0.0001, + "loss": 1.2821, + "step": 13378 + }, + { + "epoch": 1.5368445235770491, + "grad_norm": 0.5586651563644409, + "learning_rate": 0.0001, + "loss": 1.29, + "step": 13379 + }, + { + "epoch": 1.5369593934868762, + "grad_norm": 0.5711967945098877, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 13380 + }, + { + "epoch": 1.5370742633967032, + "grad_norm": 0.5827999711036682, + "learning_rate": 0.0001, + "loss": 1.2618, + "step": 13381 + }, + { + "epoch": 1.5371891333065304, + "grad_norm": 0.5403852462768555, + "learning_rate": 0.0001, + "loss": 1.2958, + "step": 13382 + }, + { + "epoch": 1.5373040032163576, + "grad_norm": 0.6437862515449524, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 13383 + }, + { + "epoch": 1.5374188731261846, + "grad_norm": 0.5628765225410461, + "learning_rate": 0.0001, + "loss": 1.4437, + "step": 13384 + }, + { + "epoch": 1.5375337430360116, + "grad_norm": 0.6011320948600769, + "learning_rate": 0.0001, + "loss": 1.4629, + "step": 13385 + }, + { + "epoch": 1.5376486129458389, + "grad_norm": 0.5917825102806091, + "learning_rate": 0.0001, + "loss": 1.4444, + "step": 13386 + }, + { + "epoch": 1.537763482855666, + "grad_norm": 0.5793378353118896, + "learning_rate": 0.0001, + "loss": 1.3797, + "step": 13387 + }, + { + "epoch": 1.5378783527654931, + "grad_norm": 0.6069167852401733, + "learning_rate": 0.0001, + "loss": 1.4303, + "step": 13388 + }, + { + "epoch": 1.5379932226753201, + "grad_norm": 0.6057384014129639, + "learning_rate": 0.0001, + "loss": 1.5104, + "step": 13389 + }, + { + "epoch": 1.5381080925851474, + "grad_norm": 0.5952368974685669, + "learning_rate": 0.0001, + "loss": 1.4508, + "step": 13390 + }, + { + "epoch": 1.5382229624949746, + "grad_norm": 0.58933025598526, + "learning_rate": 0.0001, + "loss": 1.4134, + "step": 13391 + }, + { + "epoch": 1.5383378324048016, + "grad_norm": 0.5574020743370056, + "learning_rate": 0.0001, + "loss": 1.2945, + "step": 13392 + }, + { + "epoch": 1.5384527023146286, + "grad_norm": 0.5393736958503723, + "learning_rate": 0.0001, + "loss": 1.3008, + "step": 13393 + }, + { + "epoch": 1.5385675722244558, + "grad_norm": 0.5925777554512024, + "learning_rate": 0.0001, + "loss": 1.4943, + "step": 13394 + }, + { + "epoch": 1.538682442134283, + "grad_norm": 0.6799826622009277, + "learning_rate": 0.0001, + "loss": 1.3157, + "step": 13395 + }, + { + "epoch": 1.53879731204411, + "grad_norm": 0.6065730452537537, + "learning_rate": 0.0001, + "loss": 1.3707, + "step": 13396 + }, + { + "epoch": 1.538912181953937, + "grad_norm": 0.6125333905220032, + "learning_rate": 0.0001, + "loss": 1.5379, + "step": 13397 + }, + { + "epoch": 1.5390270518637643, + "grad_norm": 0.5929028391838074, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 13398 + }, + { + "epoch": 1.5391419217735915, + "grad_norm": 0.6272832751274109, + "learning_rate": 0.0001, + "loss": 1.3904, + "step": 13399 + }, + { + "epoch": 1.5392567916834186, + "grad_norm": 0.5839588642120361, + "learning_rate": 0.0001, + "loss": 1.452, + "step": 13400 + }, + { + "epoch": 1.5393716615932456, + "grad_norm": 0.6149510145187378, + "learning_rate": 0.0001, + "loss": 1.4483, + "step": 13401 + }, + { + "epoch": 1.5394865315030728, + "grad_norm": 0.5844335556030273, + "learning_rate": 0.0001, + "loss": 1.4239, + "step": 13402 + }, + { + "epoch": 1.5396014014129, + "grad_norm": 0.5901253819465637, + "learning_rate": 0.0001, + "loss": 1.3002, + "step": 13403 + }, + { + "epoch": 1.539716271322727, + "grad_norm": 0.6704103946685791, + "learning_rate": 0.0001, + "loss": 1.5101, + "step": 13404 + }, + { + "epoch": 1.539831141232554, + "grad_norm": 0.5861269235610962, + "learning_rate": 0.0001, + "loss": 1.5826, + "step": 13405 + }, + { + "epoch": 1.5399460111423813, + "grad_norm": 0.6078442335128784, + "learning_rate": 0.0001, + "loss": 1.5645, + "step": 13406 + }, + { + "epoch": 1.5400608810522085, + "grad_norm": 0.5604826211929321, + "learning_rate": 0.0001, + "loss": 1.4701, + "step": 13407 + }, + { + "epoch": 1.5401757509620355, + "grad_norm": 0.5622230172157288, + "learning_rate": 0.0001, + "loss": 1.5088, + "step": 13408 + }, + { + "epoch": 1.5402906208718625, + "grad_norm": 0.6920380592346191, + "learning_rate": 0.0001, + "loss": 1.3965, + "step": 13409 + }, + { + "epoch": 1.5404054907816898, + "grad_norm": 0.5716781616210938, + "learning_rate": 0.0001, + "loss": 1.6094, + "step": 13410 + }, + { + "epoch": 1.540520360691517, + "grad_norm": 0.5778867602348328, + "learning_rate": 0.0001, + "loss": 1.4645, + "step": 13411 + }, + { + "epoch": 1.540635230601344, + "grad_norm": 0.564794659614563, + "learning_rate": 0.0001, + "loss": 1.4669, + "step": 13412 + }, + { + "epoch": 1.540750100511171, + "grad_norm": 0.5998444557189941, + "learning_rate": 0.0001, + "loss": 1.3983, + "step": 13413 + }, + { + "epoch": 1.5408649704209982, + "grad_norm": 0.6303174495697021, + "learning_rate": 0.0001, + "loss": 1.6117, + "step": 13414 + }, + { + "epoch": 1.5409798403308255, + "grad_norm": 0.6035495400428772, + "learning_rate": 0.0001, + "loss": 1.4046, + "step": 13415 + }, + { + "epoch": 1.5410947102406525, + "grad_norm": 0.6213974952697754, + "learning_rate": 0.0001, + "loss": 1.6235, + "step": 13416 + }, + { + "epoch": 1.5412095801504795, + "grad_norm": 0.6108240485191345, + "learning_rate": 0.0001, + "loss": 1.5057, + "step": 13417 + }, + { + "epoch": 1.5413244500603067, + "grad_norm": 0.5867082476615906, + "learning_rate": 0.0001, + "loss": 1.4281, + "step": 13418 + }, + { + "epoch": 1.541439319970134, + "grad_norm": 0.6038832664489746, + "learning_rate": 0.0001, + "loss": 1.586, + "step": 13419 + }, + { + "epoch": 1.541554189879961, + "grad_norm": 0.6144563555717468, + "learning_rate": 0.0001, + "loss": 1.68, + "step": 13420 + }, + { + "epoch": 1.541669059789788, + "grad_norm": 0.568112850189209, + "learning_rate": 0.0001, + "loss": 1.4584, + "step": 13421 + }, + { + "epoch": 1.5417839296996152, + "grad_norm": 0.6124117374420166, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 13422 + }, + { + "epoch": 1.5418987996094424, + "grad_norm": 0.5999955534934998, + "learning_rate": 0.0001, + "loss": 1.4416, + "step": 13423 + }, + { + "epoch": 1.5420136695192694, + "grad_norm": 0.6205251812934875, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 13424 + }, + { + "epoch": 1.5421285394290964, + "grad_norm": 0.5759456753730774, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 13425 + }, + { + "epoch": 1.5422434093389237, + "grad_norm": 0.6211003661155701, + "learning_rate": 0.0001, + "loss": 1.5235, + "step": 13426 + }, + { + "epoch": 1.542358279248751, + "grad_norm": 0.5795714259147644, + "learning_rate": 0.0001, + "loss": 1.44, + "step": 13427 + }, + { + "epoch": 1.542473149158578, + "grad_norm": 0.5453798770904541, + "learning_rate": 0.0001, + "loss": 1.3127, + "step": 13428 + }, + { + "epoch": 1.542588019068405, + "grad_norm": 0.6491063833236694, + "learning_rate": 0.0001, + "loss": 1.4514, + "step": 13429 + }, + { + "epoch": 1.5427028889782322, + "grad_norm": 0.5597581267356873, + "learning_rate": 0.0001, + "loss": 1.4427, + "step": 13430 + }, + { + "epoch": 1.5428177588880594, + "grad_norm": 0.6014596819877625, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 13431 + }, + { + "epoch": 1.5429326287978864, + "grad_norm": 0.5784545540809631, + "learning_rate": 0.0001, + "loss": 1.4709, + "step": 13432 + }, + { + "epoch": 1.5430474987077134, + "grad_norm": 0.5702968835830688, + "learning_rate": 0.0001, + "loss": 1.4526, + "step": 13433 + }, + { + "epoch": 1.5431623686175406, + "grad_norm": 0.5658290386199951, + "learning_rate": 0.0001, + "loss": 1.5356, + "step": 13434 + }, + { + "epoch": 1.5432772385273679, + "grad_norm": 0.598718523979187, + "learning_rate": 0.0001, + "loss": 1.3576, + "step": 13435 + }, + { + "epoch": 1.5433921084371949, + "grad_norm": 0.6943349242210388, + "learning_rate": 0.0001, + "loss": 1.7476, + "step": 13436 + }, + { + "epoch": 1.5435069783470219, + "grad_norm": 0.5755090117454529, + "learning_rate": 0.0001, + "loss": 1.3345, + "step": 13437 + }, + { + "epoch": 1.5436218482568491, + "grad_norm": 0.6275411248207092, + "learning_rate": 0.0001, + "loss": 1.497, + "step": 13438 + }, + { + "epoch": 1.5437367181666763, + "grad_norm": 0.5974034667015076, + "learning_rate": 0.0001, + "loss": 1.5015, + "step": 13439 + }, + { + "epoch": 1.5438515880765034, + "grad_norm": 0.5933687090873718, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 13440 + }, + { + "epoch": 1.5439664579863304, + "grad_norm": 0.7212704420089722, + "learning_rate": 0.0001, + "loss": 1.562, + "step": 13441 + }, + { + "epoch": 1.5440813278961576, + "grad_norm": 0.5720301866531372, + "learning_rate": 0.0001, + "loss": 1.3696, + "step": 13442 + }, + { + "epoch": 1.5441961978059848, + "grad_norm": 0.6417856812477112, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 13443 + }, + { + "epoch": 1.5443110677158118, + "grad_norm": 0.6052496433258057, + "learning_rate": 0.0001, + "loss": 1.2494, + "step": 13444 + }, + { + "epoch": 1.5444259376256388, + "grad_norm": 0.6466429829597473, + "learning_rate": 0.0001, + "loss": 1.483, + "step": 13445 + }, + { + "epoch": 1.544540807535466, + "grad_norm": 0.6234352588653564, + "learning_rate": 0.0001, + "loss": 1.5925, + "step": 13446 + }, + { + "epoch": 1.5446556774452933, + "grad_norm": 0.6527000069618225, + "learning_rate": 0.0001, + "loss": 1.5712, + "step": 13447 + }, + { + "epoch": 1.5447705473551203, + "grad_norm": 0.5902075171470642, + "learning_rate": 0.0001, + "loss": 1.5089, + "step": 13448 + }, + { + "epoch": 1.5448854172649473, + "grad_norm": 0.5839083790779114, + "learning_rate": 0.0001, + "loss": 1.3611, + "step": 13449 + }, + { + "epoch": 1.5450002871747746, + "grad_norm": 0.6253820061683655, + "learning_rate": 0.0001, + "loss": 1.6101, + "step": 13450 + }, + { + "epoch": 1.5451151570846018, + "grad_norm": 0.5373639464378357, + "learning_rate": 0.0001, + "loss": 1.3338, + "step": 13451 + }, + { + "epoch": 1.5452300269944288, + "grad_norm": 0.5663428902626038, + "learning_rate": 0.0001, + "loss": 1.3177, + "step": 13452 + }, + { + "epoch": 1.5453448969042558, + "grad_norm": 0.6304009556770325, + "learning_rate": 0.0001, + "loss": 1.5829, + "step": 13453 + }, + { + "epoch": 1.545459766814083, + "grad_norm": 0.6668345332145691, + "learning_rate": 0.0001, + "loss": 1.4433, + "step": 13454 + }, + { + "epoch": 1.5455746367239103, + "grad_norm": 0.5478126406669617, + "learning_rate": 0.0001, + "loss": 1.4109, + "step": 13455 + }, + { + "epoch": 1.5456895066337373, + "grad_norm": 0.6082738637924194, + "learning_rate": 0.0001, + "loss": 1.4676, + "step": 13456 + }, + { + "epoch": 1.5458043765435643, + "grad_norm": 0.6476391553878784, + "learning_rate": 0.0001, + "loss": 1.554, + "step": 13457 + }, + { + "epoch": 1.5459192464533915, + "grad_norm": 0.6350123286247253, + "learning_rate": 0.0001, + "loss": 1.4257, + "step": 13458 + }, + { + "epoch": 1.5460341163632187, + "grad_norm": 0.621774435043335, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 13459 + }, + { + "epoch": 1.5461489862730458, + "grad_norm": 0.6082205176353455, + "learning_rate": 0.0001, + "loss": 1.5612, + "step": 13460 + }, + { + "epoch": 1.5462638561828728, + "grad_norm": 0.5338382124900818, + "learning_rate": 0.0001, + "loss": 1.2507, + "step": 13461 + }, + { + "epoch": 1.5463787260927, + "grad_norm": 0.6611436605453491, + "learning_rate": 0.0001, + "loss": 1.5725, + "step": 13462 + }, + { + "epoch": 1.5464935960025272, + "grad_norm": 0.6387352347373962, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 13463 + }, + { + "epoch": 1.5466084659123542, + "grad_norm": 0.6265326142311096, + "learning_rate": 0.0001, + "loss": 1.5716, + "step": 13464 + }, + { + "epoch": 1.5467233358221812, + "grad_norm": 0.5728742480278015, + "learning_rate": 0.0001, + "loss": 1.3898, + "step": 13465 + }, + { + "epoch": 1.5468382057320085, + "grad_norm": 0.5909122228622437, + "learning_rate": 0.0001, + "loss": 1.5714, + "step": 13466 + }, + { + "epoch": 1.5469530756418357, + "grad_norm": 0.5926406979560852, + "learning_rate": 0.0001, + "loss": 1.3875, + "step": 13467 + }, + { + "epoch": 1.5470679455516627, + "grad_norm": 0.6227415204048157, + "learning_rate": 0.0001, + "loss": 1.3836, + "step": 13468 + }, + { + "epoch": 1.5471828154614897, + "grad_norm": 0.6383469104766846, + "learning_rate": 0.0001, + "loss": 1.4241, + "step": 13469 + }, + { + "epoch": 1.547297685371317, + "grad_norm": 0.606417715549469, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 13470 + }, + { + "epoch": 1.5474125552811442, + "grad_norm": 0.5884244441986084, + "learning_rate": 0.0001, + "loss": 1.2275, + "step": 13471 + }, + { + "epoch": 1.5475274251909712, + "grad_norm": 0.6884910464286804, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 13472 + }, + { + "epoch": 1.5476422951007982, + "grad_norm": 0.6169816851615906, + "learning_rate": 0.0001, + "loss": 1.5857, + "step": 13473 + }, + { + "epoch": 1.5477571650106254, + "grad_norm": 0.5996578335762024, + "learning_rate": 0.0001, + "loss": 1.3095, + "step": 13474 + }, + { + "epoch": 1.5478720349204527, + "grad_norm": 0.6751422882080078, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 13475 + }, + { + "epoch": 1.5479869048302797, + "grad_norm": 0.6498405337333679, + "learning_rate": 0.0001, + "loss": 1.4262, + "step": 13476 + }, + { + "epoch": 1.5481017747401067, + "grad_norm": 0.6042332053184509, + "learning_rate": 0.0001, + "loss": 1.5048, + "step": 13477 + }, + { + "epoch": 1.548216644649934, + "grad_norm": 0.6130797863006592, + "learning_rate": 0.0001, + "loss": 1.5824, + "step": 13478 + }, + { + "epoch": 1.5483315145597611, + "grad_norm": 0.6066043972969055, + "learning_rate": 0.0001, + "loss": 1.4461, + "step": 13479 + }, + { + "epoch": 1.5484463844695882, + "grad_norm": 0.5683161616325378, + "learning_rate": 0.0001, + "loss": 1.3388, + "step": 13480 + }, + { + "epoch": 1.5485612543794152, + "grad_norm": 0.5794828534126282, + "learning_rate": 0.0001, + "loss": 1.5624, + "step": 13481 + }, + { + "epoch": 1.5486761242892424, + "grad_norm": 0.5909842848777771, + "learning_rate": 0.0001, + "loss": 1.3772, + "step": 13482 + }, + { + "epoch": 1.5487909941990696, + "grad_norm": 0.6028822660446167, + "learning_rate": 0.0001, + "loss": 1.461, + "step": 13483 + }, + { + "epoch": 1.5489058641088966, + "grad_norm": 0.6567312479019165, + "learning_rate": 0.0001, + "loss": 1.4616, + "step": 13484 + }, + { + "epoch": 1.5490207340187236, + "grad_norm": 0.6116561889648438, + "learning_rate": 0.0001, + "loss": 1.5806, + "step": 13485 + }, + { + "epoch": 1.5491356039285509, + "grad_norm": 0.638192892074585, + "learning_rate": 0.0001, + "loss": 1.4588, + "step": 13486 + }, + { + "epoch": 1.549250473838378, + "grad_norm": 0.6210533380508423, + "learning_rate": 0.0001, + "loss": 1.4859, + "step": 13487 + }, + { + "epoch": 1.5493653437482051, + "grad_norm": 0.6891507506370544, + "learning_rate": 0.0001, + "loss": 1.332, + "step": 13488 + }, + { + "epoch": 1.5494802136580321, + "grad_norm": 0.5958152413368225, + "learning_rate": 0.0001, + "loss": 1.3574, + "step": 13489 + }, + { + "epoch": 1.5495950835678594, + "grad_norm": 0.5857873558998108, + "learning_rate": 0.0001, + "loss": 1.3016, + "step": 13490 + }, + { + "epoch": 1.5497099534776866, + "grad_norm": 0.5769304037094116, + "learning_rate": 0.0001, + "loss": 1.3742, + "step": 13491 + }, + { + "epoch": 1.5498248233875136, + "grad_norm": 0.5681525468826294, + "learning_rate": 0.0001, + "loss": 1.2591, + "step": 13492 + }, + { + "epoch": 1.5499396932973406, + "grad_norm": 0.6011735796928406, + "learning_rate": 0.0001, + "loss": 1.3664, + "step": 13493 + }, + { + "epoch": 1.5500545632071678, + "grad_norm": 0.5927474498748779, + "learning_rate": 0.0001, + "loss": 1.4265, + "step": 13494 + }, + { + "epoch": 1.550169433116995, + "grad_norm": 0.6287341713905334, + "learning_rate": 0.0001, + "loss": 1.4407, + "step": 13495 + }, + { + "epoch": 1.550284303026822, + "grad_norm": 0.6226458549499512, + "learning_rate": 0.0001, + "loss": 1.3742, + "step": 13496 + }, + { + "epoch": 1.550399172936649, + "grad_norm": 0.6283382773399353, + "learning_rate": 0.0001, + "loss": 1.576, + "step": 13497 + }, + { + "epoch": 1.5505140428464763, + "grad_norm": 0.6250648498535156, + "learning_rate": 0.0001, + "loss": 1.4687, + "step": 13498 + }, + { + "epoch": 1.5506289127563035, + "grad_norm": 0.6573746204376221, + "learning_rate": 0.0001, + "loss": 1.4691, + "step": 13499 + }, + { + "epoch": 1.5507437826661306, + "grad_norm": 0.6034247279167175, + "learning_rate": 0.0001, + "loss": 1.4162, + "step": 13500 + }, + { + "epoch": 1.5508586525759576, + "grad_norm": 0.5965235829353333, + "learning_rate": 0.0001, + "loss": 1.3892, + "step": 13501 + }, + { + "epoch": 1.5509735224857848, + "grad_norm": 0.6325914263725281, + "learning_rate": 0.0001, + "loss": 1.3612, + "step": 13502 + }, + { + "epoch": 1.551088392395612, + "grad_norm": 0.6564382910728455, + "learning_rate": 0.0001, + "loss": 1.5547, + "step": 13503 + }, + { + "epoch": 1.5512032623054393, + "grad_norm": 0.6648733019828796, + "learning_rate": 0.0001, + "loss": 1.5704, + "step": 13504 + }, + { + "epoch": 1.5513181322152663, + "grad_norm": 0.6087038516998291, + "learning_rate": 0.0001, + "loss": 1.2969, + "step": 13505 + }, + { + "epoch": 1.5514330021250933, + "grad_norm": 0.5483207106590271, + "learning_rate": 0.0001, + "loss": 1.2729, + "step": 13506 + }, + { + "epoch": 1.5515478720349205, + "grad_norm": 0.5833482146263123, + "learning_rate": 0.0001, + "loss": 1.2962, + "step": 13507 + }, + { + "epoch": 1.5516627419447477, + "grad_norm": 0.6498992443084717, + "learning_rate": 0.0001, + "loss": 1.4952, + "step": 13508 + }, + { + "epoch": 1.5517776118545747, + "grad_norm": 0.5728626847267151, + "learning_rate": 0.0001, + "loss": 1.3803, + "step": 13509 + }, + { + "epoch": 1.5518924817644018, + "grad_norm": 0.5859959125518799, + "learning_rate": 0.0001, + "loss": 1.4502, + "step": 13510 + }, + { + "epoch": 1.552007351674229, + "grad_norm": 0.6737546324729919, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 13511 + }, + { + "epoch": 1.5521222215840562, + "grad_norm": 0.6166685819625854, + "learning_rate": 0.0001, + "loss": 1.4754, + "step": 13512 + }, + { + "epoch": 1.5522370914938832, + "grad_norm": 0.7040491104125977, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 13513 + }, + { + "epoch": 1.5523519614037102, + "grad_norm": 0.6027595400810242, + "learning_rate": 0.0001, + "loss": 1.4652, + "step": 13514 + }, + { + "epoch": 1.5524668313135375, + "grad_norm": 0.5595097541809082, + "learning_rate": 0.0001, + "loss": 1.3197, + "step": 13515 + }, + { + "epoch": 1.5525817012233647, + "grad_norm": 0.5787383317947388, + "learning_rate": 0.0001, + "loss": 1.3246, + "step": 13516 + }, + { + "epoch": 1.5526965711331917, + "grad_norm": 0.64249187707901, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 13517 + }, + { + "epoch": 1.5528114410430187, + "grad_norm": 0.5845315456390381, + "learning_rate": 0.0001, + "loss": 1.2307, + "step": 13518 + }, + { + "epoch": 1.552926310952846, + "grad_norm": 0.6052901744842529, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 13519 + }, + { + "epoch": 1.5530411808626732, + "grad_norm": 0.5542384386062622, + "learning_rate": 0.0001, + "loss": 1.3201, + "step": 13520 + }, + { + "epoch": 1.5531560507725002, + "grad_norm": 0.5924617648124695, + "learning_rate": 0.0001, + "loss": 1.4083, + "step": 13521 + }, + { + "epoch": 1.5532709206823272, + "grad_norm": 0.5823348760604858, + "learning_rate": 0.0001, + "loss": 1.433, + "step": 13522 + }, + { + "epoch": 1.5533857905921544, + "grad_norm": 0.5733996033668518, + "learning_rate": 0.0001, + "loss": 1.3618, + "step": 13523 + }, + { + "epoch": 1.5535006605019817, + "grad_norm": 0.5738714933395386, + "learning_rate": 0.0001, + "loss": 1.3387, + "step": 13524 + }, + { + "epoch": 1.5536155304118087, + "grad_norm": 0.5945634245872498, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 13525 + }, + { + "epoch": 1.5537304003216357, + "grad_norm": 0.6478724479675293, + "learning_rate": 0.0001, + "loss": 1.5344, + "step": 13526 + }, + { + "epoch": 1.553845270231463, + "grad_norm": 0.6466807126998901, + "learning_rate": 0.0001, + "loss": 1.3919, + "step": 13527 + }, + { + "epoch": 1.5539601401412901, + "grad_norm": 0.5651605725288391, + "learning_rate": 0.0001, + "loss": 1.3325, + "step": 13528 + }, + { + "epoch": 1.5540750100511171, + "grad_norm": 0.6229522228240967, + "learning_rate": 0.0001, + "loss": 1.4839, + "step": 13529 + }, + { + "epoch": 1.5541898799609442, + "grad_norm": 0.6317934393882751, + "learning_rate": 0.0001, + "loss": 1.524, + "step": 13530 + }, + { + "epoch": 1.5543047498707714, + "grad_norm": 0.5871884822845459, + "learning_rate": 0.0001, + "loss": 1.4019, + "step": 13531 + }, + { + "epoch": 1.5544196197805986, + "grad_norm": 0.5875590443611145, + "learning_rate": 0.0001, + "loss": 1.3716, + "step": 13532 + }, + { + "epoch": 1.5545344896904256, + "grad_norm": 0.5747091174125671, + "learning_rate": 0.0001, + "loss": 1.2858, + "step": 13533 + }, + { + "epoch": 1.5546493596002526, + "grad_norm": 0.5531101226806641, + "learning_rate": 0.0001, + "loss": 1.394, + "step": 13534 + }, + { + "epoch": 1.5547642295100799, + "grad_norm": 0.7090408205986023, + "learning_rate": 0.0001, + "loss": 1.4974, + "step": 13535 + }, + { + "epoch": 1.554879099419907, + "grad_norm": 0.6552852392196655, + "learning_rate": 0.0001, + "loss": 1.6069, + "step": 13536 + }, + { + "epoch": 1.554993969329734, + "grad_norm": 0.6876298189163208, + "learning_rate": 0.0001, + "loss": 1.6896, + "step": 13537 + }, + { + "epoch": 1.5551088392395611, + "grad_norm": 0.5838922262191772, + "learning_rate": 0.0001, + "loss": 1.4006, + "step": 13538 + }, + { + "epoch": 1.5552237091493883, + "grad_norm": 0.6967250108718872, + "learning_rate": 0.0001, + "loss": 1.3594, + "step": 13539 + }, + { + "epoch": 1.5553385790592156, + "grad_norm": 0.6267087459564209, + "learning_rate": 0.0001, + "loss": 1.1573, + "step": 13540 + }, + { + "epoch": 1.5554534489690426, + "grad_norm": 0.5778437852859497, + "learning_rate": 0.0001, + "loss": 1.4125, + "step": 13541 + }, + { + "epoch": 1.5555683188788696, + "grad_norm": 0.5467836260795593, + "learning_rate": 0.0001, + "loss": 1.1714, + "step": 13542 + }, + { + "epoch": 1.5556831887886968, + "grad_norm": 0.5664315819740295, + "learning_rate": 0.0001, + "loss": 1.3083, + "step": 13543 + }, + { + "epoch": 1.555798058698524, + "grad_norm": 0.6014490127563477, + "learning_rate": 0.0001, + "loss": 1.5584, + "step": 13544 + }, + { + "epoch": 1.555912928608351, + "grad_norm": 0.6335508823394775, + "learning_rate": 0.0001, + "loss": 1.236, + "step": 13545 + }, + { + "epoch": 1.556027798518178, + "grad_norm": 0.6208090782165527, + "learning_rate": 0.0001, + "loss": 1.3677, + "step": 13546 + }, + { + "epoch": 1.5561426684280053, + "grad_norm": 0.61933833360672, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 13547 + }, + { + "epoch": 1.5562575383378325, + "grad_norm": 0.6805666089057922, + "learning_rate": 0.0001, + "loss": 1.5495, + "step": 13548 + }, + { + "epoch": 1.5563724082476595, + "grad_norm": 0.7156261205673218, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 13549 + }, + { + "epoch": 1.5564872781574866, + "grad_norm": 0.6893269419670105, + "learning_rate": 0.0001, + "loss": 1.4658, + "step": 13550 + }, + { + "epoch": 1.5566021480673138, + "grad_norm": 0.6271970868110657, + "learning_rate": 0.0001, + "loss": 1.5036, + "step": 13551 + }, + { + "epoch": 1.556717017977141, + "grad_norm": 0.6317713856697083, + "learning_rate": 0.0001, + "loss": 1.5933, + "step": 13552 + }, + { + "epoch": 1.556831887886968, + "grad_norm": 0.6471002697944641, + "learning_rate": 0.0001, + "loss": 1.4808, + "step": 13553 + }, + { + "epoch": 1.556946757796795, + "grad_norm": 0.6045787930488586, + "learning_rate": 0.0001, + "loss": 1.411, + "step": 13554 + }, + { + "epoch": 1.5570616277066223, + "grad_norm": 0.5843409299850464, + "learning_rate": 0.0001, + "loss": 1.3993, + "step": 13555 + }, + { + "epoch": 1.5571764976164495, + "grad_norm": 0.5926359295845032, + "learning_rate": 0.0001, + "loss": 1.2548, + "step": 13556 + }, + { + "epoch": 1.5572913675262765, + "grad_norm": 0.5961912274360657, + "learning_rate": 0.0001, + "loss": 1.677, + "step": 13557 + }, + { + "epoch": 1.5574062374361035, + "grad_norm": 0.7315113544464111, + "learning_rate": 0.0001, + "loss": 1.5993, + "step": 13558 + }, + { + "epoch": 1.5575211073459307, + "grad_norm": 0.576897919178009, + "learning_rate": 0.0001, + "loss": 1.4251, + "step": 13559 + }, + { + "epoch": 1.557635977255758, + "grad_norm": 0.5911585092544556, + "learning_rate": 0.0001, + "loss": 1.2031, + "step": 13560 + }, + { + "epoch": 1.557750847165585, + "grad_norm": 0.6434839963912964, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 13561 + }, + { + "epoch": 1.557865717075412, + "grad_norm": 0.5712766647338867, + "learning_rate": 0.0001, + "loss": 1.1872, + "step": 13562 + }, + { + "epoch": 1.5579805869852392, + "grad_norm": 0.6965934634208679, + "learning_rate": 0.0001, + "loss": 1.3617, + "step": 13563 + }, + { + "epoch": 1.5580954568950665, + "grad_norm": 0.5756476521492004, + "learning_rate": 0.0001, + "loss": 1.424, + "step": 13564 + }, + { + "epoch": 1.5582103268048935, + "grad_norm": 0.6307820081710815, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 13565 + }, + { + "epoch": 1.5583251967147205, + "grad_norm": 0.5934085845947266, + "learning_rate": 0.0001, + "loss": 1.5171, + "step": 13566 + }, + { + "epoch": 1.5584400666245477, + "grad_norm": 0.6400713324546814, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 13567 + }, + { + "epoch": 1.558554936534375, + "grad_norm": 0.6499615907669067, + "learning_rate": 0.0001, + "loss": 1.5194, + "step": 13568 + }, + { + "epoch": 1.558669806444202, + "grad_norm": 0.6657858490943909, + "learning_rate": 0.0001, + "loss": 1.5722, + "step": 13569 + }, + { + "epoch": 1.558784676354029, + "grad_norm": 0.5988503098487854, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 13570 + }, + { + "epoch": 1.5588995462638562, + "grad_norm": 0.6103773713111877, + "learning_rate": 0.0001, + "loss": 1.5287, + "step": 13571 + }, + { + "epoch": 1.5590144161736834, + "grad_norm": 0.5706753134727478, + "learning_rate": 0.0001, + "loss": 1.4824, + "step": 13572 + }, + { + "epoch": 1.5591292860835104, + "grad_norm": 0.5638065338134766, + "learning_rate": 0.0001, + "loss": 1.3968, + "step": 13573 + }, + { + "epoch": 1.5592441559933374, + "grad_norm": 0.5787845849990845, + "learning_rate": 0.0001, + "loss": 1.455, + "step": 13574 + }, + { + "epoch": 1.5593590259031647, + "grad_norm": 0.5700173377990723, + "learning_rate": 0.0001, + "loss": 1.3468, + "step": 13575 + }, + { + "epoch": 1.559473895812992, + "grad_norm": 0.5860016345977783, + "learning_rate": 0.0001, + "loss": 1.4658, + "step": 13576 + }, + { + "epoch": 1.559588765722819, + "grad_norm": 0.604753851890564, + "learning_rate": 0.0001, + "loss": 1.4624, + "step": 13577 + }, + { + "epoch": 1.559703635632646, + "grad_norm": 0.600627601146698, + "learning_rate": 0.0001, + "loss": 1.5398, + "step": 13578 + }, + { + "epoch": 1.5598185055424731, + "grad_norm": 0.6065698266029358, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 13579 + }, + { + "epoch": 1.5599333754523004, + "grad_norm": 0.5824136137962341, + "learning_rate": 0.0001, + "loss": 1.4065, + "step": 13580 + }, + { + "epoch": 1.5600482453621274, + "grad_norm": 0.6180105209350586, + "learning_rate": 0.0001, + "loss": 1.4328, + "step": 13581 + }, + { + "epoch": 1.5601631152719544, + "grad_norm": 0.6440869569778442, + "learning_rate": 0.0001, + "loss": 1.592, + "step": 13582 + }, + { + "epoch": 1.5602779851817816, + "grad_norm": 0.6405513286590576, + "learning_rate": 0.0001, + "loss": 1.3444, + "step": 13583 + }, + { + "epoch": 1.5603928550916089, + "grad_norm": 0.6168532371520996, + "learning_rate": 0.0001, + "loss": 1.2609, + "step": 13584 + }, + { + "epoch": 1.5605077250014359, + "grad_norm": 0.5789191722869873, + "learning_rate": 0.0001, + "loss": 1.2043, + "step": 13585 + }, + { + "epoch": 1.5606225949112629, + "grad_norm": 0.6007379293441772, + "learning_rate": 0.0001, + "loss": 1.5273, + "step": 13586 + }, + { + "epoch": 1.56073746482109, + "grad_norm": 0.5817810297012329, + "learning_rate": 0.0001, + "loss": 1.3975, + "step": 13587 + }, + { + "epoch": 1.5608523347309173, + "grad_norm": 0.5913118720054626, + "learning_rate": 0.0001, + "loss": 1.3675, + "step": 13588 + }, + { + "epoch": 1.5609672046407443, + "grad_norm": 0.5883614420890808, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 13589 + }, + { + "epoch": 1.5610820745505714, + "grad_norm": 0.6002376079559326, + "learning_rate": 0.0001, + "loss": 1.5627, + "step": 13590 + }, + { + "epoch": 1.5611969444603986, + "grad_norm": 0.6250750422477722, + "learning_rate": 0.0001, + "loss": 1.5516, + "step": 13591 + }, + { + "epoch": 1.5613118143702258, + "grad_norm": 0.611410915851593, + "learning_rate": 0.0001, + "loss": 1.5293, + "step": 13592 + }, + { + "epoch": 1.5614266842800528, + "grad_norm": 0.5881221890449524, + "learning_rate": 0.0001, + "loss": 1.5605, + "step": 13593 + }, + { + "epoch": 1.5615415541898798, + "grad_norm": 0.5879666805267334, + "learning_rate": 0.0001, + "loss": 1.4588, + "step": 13594 + }, + { + "epoch": 1.561656424099707, + "grad_norm": 0.5731499791145325, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 13595 + }, + { + "epoch": 1.5617712940095343, + "grad_norm": 0.6417600512504578, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 13596 + }, + { + "epoch": 1.5618861639193613, + "grad_norm": 0.5718529224395752, + "learning_rate": 0.0001, + "loss": 1.3185, + "step": 13597 + }, + { + "epoch": 1.5620010338291883, + "grad_norm": 0.6178721189498901, + "learning_rate": 0.0001, + "loss": 1.311, + "step": 13598 + }, + { + "epoch": 1.5621159037390155, + "grad_norm": 0.5907917618751526, + "learning_rate": 0.0001, + "loss": 1.4209, + "step": 13599 + }, + { + "epoch": 1.5622307736488428, + "grad_norm": 0.6152604818344116, + "learning_rate": 0.0001, + "loss": 1.4549, + "step": 13600 + }, + { + "epoch": 1.5623456435586698, + "grad_norm": 0.6977373361587524, + "learning_rate": 0.0001, + "loss": 1.7487, + "step": 13601 + }, + { + "epoch": 1.5624605134684968, + "grad_norm": 0.6169999241828918, + "learning_rate": 0.0001, + "loss": 1.5021, + "step": 13602 + }, + { + "epoch": 1.562575383378324, + "grad_norm": 0.5735710859298706, + "learning_rate": 0.0001, + "loss": 1.3729, + "step": 13603 + }, + { + "epoch": 1.5626902532881513, + "grad_norm": 0.5788447856903076, + "learning_rate": 0.0001, + "loss": 1.2963, + "step": 13604 + }, + { + "epoch": 1.5628051231979783, + "grad_norm": 0.6432527303695679, + "learning_rate": 0.0001, + "loss": 1.4393, + "step": 13605 + }, + { + "epoch": 1.5629199931078053, + "grad_norm": 0.6032426357269287, + "learning_rate": 0.0001, + "loss": 1.5303, + "step": 13606 + }, + { + "epoch": 1.5630348630176325, + "grad_norm": 0.6332036852836609, + "learning_rate": 0.0001, + "loss": 1.5395, + "step": 13607 + }, + { + "epoch": 1.5631497329274597, + "grad_norm": 0.5610132217407227, + "learning_rate": 0.0001, + "loss": 1.4589, + "step": 13608 + }, + { + "epoch": 1.5632646028372867, + "grad_norm": 0.5653334259986877, + "learning_rate": 0.0001, + "loss": 1.2573, + "step": 13609 + }, + { + "epoch": 1.5633794727471138, + "grad_norm": 0.5841345191001892, + "learning_rate": 0.0001, + "loss": 1.4447, + "step": 13610 + }, + { + "epoch": 1.563494342656941, + "grad_norm": 0.6390504240989685, + "learning_rate": 0.0001, + "loss": 1.5338, + "step": 13611 + }, + { + "epoch": 1.5636092125667682, + "grad_norm": 0.6101760268211365, + "learning_rate": 0.0001, + "loss": 1.5201, + "step": 13612 + }, + { + "epoch": 1.5637240824765952, + "grad_norm": 0.6613549590110779, + "learning_rate": 0.0001, + "loss": 1.4788, + "step": 13613 + }, + { + "epoch": 1.5638389523864222, + "grad_norm": 0.6057902574539185, + "learning_rate": 0.0001, + "loss": 1.3687, + "step": 13614 + }, + { + "epoch": 1.5639538222962495, + "grad_norm": 0.6379287838935852, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 13615 + }, + { + "epoch": 1.5640686922060767, + "grad_norm": 0.597686231136322, + "learning_rate": 0.0001, + "loss": 1.3398, + "step": 13616 + }, + { + "epoch": 1.5641835621159037, + "grad_norm": 0.5676443576812744, + "learning_rate": 0.0001, + "loss": 1.5195, + "step": 13617 + }, + { + "epoch": 1.5642984320257307, + "grad_norm": 0.6300573348999023, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 13618 + }, + { + "epoch": 1.564413301935558, + "grad_norm": 0.5722482204437256, + "learning_rate": 0.0001, + "loss": 1.4044, + "step": 13619 + }, + { + "epoch": 1.5645281718453852, + "grad_norm": 0.6020122170448303, + "learning_rate": 0.0001, + "loss": 1.4094, + "step": 13620 + }, + { + "epoch": 1.5646430417552122, + "grad_norm": 0.6561688780784607, + "learning_rate": 0.0001, + "loss": 1.5187, + "step": 13621 + }, + { + "epoch": 1.5647579116650392, + "grad_norm": 0.589190661907196, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 13622 + }, + { + "epoch": 1.5648727815748664, + "grad_norm": 0.593706488609314, + "learning_rate": 0.0001, + "loss": 1.5398, + "step": 13623 + }, + { + "epoch": 1.5649876514846937, + "grad_norm": 0.6446245312690735, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 13624 + }, + { + "epoch": 1.5651025213945207, + "grad_norm": 0.6118418574333191, + "learning_rate": 0.0001, + "loss": 1.699, + "step": 13625 + }, + { + "epoch": 1.5652173913043477, + "grad_norm": 0.6244831681251526, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 13626 + }, + { + "epoch": 1.565332261214175, + "grad_norm": 0.6229195594787598, + "learning_rate": 0.0001, + "loss": 1.4809, + "step": 13627 + }, + { + "epoch": 1.5654471311240021, + "grad_norm": 0.5555204153060913, + "learning_rate": 0.0001, + "loss": 1.2535, + "step": 13628 + }, + { + "epoch": 1.5655620010338291, + "grad_norm": 0.6049181222915649, + "learning_rate": 0.0001, + "loss": 1.4407, + "step": 13629 + }, + { + "epoch": 1.5656768709436562, + "grad_norm": 0.5908517837524414, + "learning_rate": 0.0001, + "loss": 1.3301, + "step": 13630 + }, + { + "epoch": 1.5657917408534834, + "grad_norm": 0.6150537133216858, + "learning_rate": 0.0001, + "loss": 1.5415, + "step": 13631 + }, + { + "epoch": 1.5659066107633106, + "grad_norm": 0.5594236850738525, + "learning_rate": 0.0001, + "loss": 1.464, + "step": 13632 + }, + { + "epoch": 1.5660214806731376, + "grad_norm": 0.6012204885482788, + "learning_rate": 0.0001, + "loss": 1.4467, + "step": 13633 + }, + { + "epoch": 1.5661363505829646, + "grad_norm": 0.6006706357002258, + "learning_rate": 0.0001, + "loss": 1.4424, + "step": 13634 + }, + { + "epoch": 1.5662512204927919, + "grad_norm": 0.5952029824256897, + "learning_rate": 0.0001, + "loss": 1.3238, + "step": 13635 + }, + { + "epoch": 1.566366090402619, + "grad_norm": 0.5738430619239807, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 13636 + }, + { + "epoch": 1.566480960312446, + "grad_norm": 0.6680176258087158, + "learning_rate": 0.0001, + "loss": 1.453, + "step": 13637 + }, + { + "epoch": 1.5665958302222731, + "grad_norm": 0.6369667649269104, + "learning_rate": 0.0001, + "loss": 1.476, + "step": 13638 + }, + { + "epoch": 1.5667107001321003, + "grad_norm": 0.6867667436599731, + "learning_rate": 0.0001, + "loss": 1.491, + "step": 13639 + }, + { + "epoch": 1.5668255700419276, + "grad_norm": 0.5914698243141174, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 13640 + }, + { + "epoch": 1.5669404399517548, + "grad_norm": 0.5792491436004639, + "learning_rate": 0.0001, + "loss": 1.4495, + "step": 13641 + }, + { + "epoch": 1.5670553098615818, + "grad_norm": 0.6214803457260132, + "learning_rate": 0.0001, + "loss": 1.4099, + "step": 13642 + }, + { + "epoch": 1.5671701797714088, + "grad_norm": 0.5663346648216248, + "learning_rate": 0.0001, + "loss": 1.3742, + "step": 13643 + }, + { + "epoch": 1.567285049681236, + "grad_norm": 0.5920063257217407, + "learning_rate": 0.0001, + "loss": 1.4131, + "step": 13644 + }, + { + "epoch": 1.5673999195910633, + "grad_norm": 0.6462366580963135, + "learning_rate": 0.0001, + "loss": 1.422, + "step": 13645 + }, + { + "epoch": 1.5675147895008903, + "grad_norm": 0.6337886452674866, + "learning_rate": 0.0001, + "loss": 1.475, + "step": 13646 + }, + { + "epoch": 1.5676296594107173, + "grad_norm": 0.6497291326522827, + "learning_rate": 0.0001, + "loss": 1.3826, + "step": 13647 + }, + { + "epoch": 1.5677445293205445, + "grad_norm": 0.6257938146591187, + "learning_rate": 0.0001, + "loss": 1.4504, + "step": 13648 + }, + { + "epoch": 1.5678593992303718, + "grad_norm": 0.6551761031150818, + "learning_rate": 0.0001, + "loss": 1.5617, + "step": 13649 + }, + { + "epoch": 1.5679742691401988, + "grad_norm": 0.6262896656990051, + "learning_rate": 0.0001, + "loss": 1.6237, + "step": 13650 + }, + { + "epoch": 1.5680891390500258, + "grad_norm": 0.5628766417503357, + "learning_rate": 0.0001, + "loss": 1.3154, + "step": 13651 + }, + { + "epoch": 1.568204008959853, + "grad_norm": 0.6413166522979736, + "learning_rate": 0.0001, + "loss": 1.5788, + "step": 13652 + }, + { + "epoch": 1.5683188788696802, + "grad_norm": 0.568213164806366, + "learning_rate": 0.0001, + "loss": 1.3689, + "step": 13653 + }, + { + "epoch": 1.5684337487795073, + "grad_norm": 0.6713234186172485, + "learning_rate": 0.0001, + "loss": 1.4422, + "step": 13654 + }, + { + "epoch": 1.5685486186893343, + "grad_norm": 0.5839575529098511, + "learning_rate": 0.0001, + "loss": 1.3933, + "step": 13655 + }, + { + "epoch": 1.5686634885991615, + "grad_norm": 0.596601128578186, + "learning_rate": 0.0001, + "loss": 1.61, + "step": 13656 + }, + { + "epoch": 1.5687783585089887, + "grad_norm": 0.5839077830314636, + "learning_rate": 0.0001, + "loss": 1.3594, + "step": 13657 + }, + { + "epoch": 1.5688932284188157, + "grad_norm": 0.6276641488075256, + "learning_rate": 0.0001, + "loss": 1.4735, + "step": 13658 + }, + { + "epoch": 1.5690080983286427, + "grad_norm": 0.6786941289901733, + "learning_rate": 0.0001, + "loss": 1.4608, + "step": 13659 + }, + { + "epoch": 1.56912296823847, + "grad_norm": 0.7449260354042053, + "learning_rate": 0.0001, + "loss": 1.6882, + "step": 13660 + }, + { + "epoch": 1.5692378381482972, + "grad_norm": 0.6506140828132629, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 13661 + }, + { + "epoch": 1.5693527080581242, + "grad_norm": 0.5764363408088684, + "learning_rate": 0.0001, + "loss": 1.396, + "step": 13662 + }, + { + "epoch": 1.5694675779679512, + "grad_norm": 0.6293970346450806, + "learning_rate": 0.0001, + "loss": 1.3189, + "step": 13663 + }, + { + "epoch": 1.5695824478777785, + "grad_norm": 0.5982877016067505, + "learning_rate": 0.0001, + "loss": 1.3611, + "step": 13664 + }, + { + "epoch": 1.5696973177876057, + "grad_norm": 0.7153692841529846, + "learning_rate": 0.0001, + "loss": 1.5204, + "step": 13665 + }, + { + "epoch": 1.5698121876974327, + "grad_norm": 0.556322455406189, + "learning_rate": 0.0001, + "loss": 1.1547, + "step": 13666 + }, + { + "epoch": 1.5699270576072597, + "grad_norm": 0.5707471370697021, + "learning_rate": 0.0001, + "loss": 1.4342, + "step": 13667 + }, + { + "epoch": 1.570041927517087, + "grad_norm": 0.6068229079246521, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 13668 + }, + { + "epoch": 1.5701567974269142, + "grad_norm": 0.5805314183235168, + "learning_rate": 0.0001, + "loss": 1.3441, + "step": 13669 + }, + { + "epoch": 1.5702716673367412, + "grad_norm": 0.5970333814620972, + "learning_rate": 0.0001, + "loss": 1.4048, + "step": 13670 + }, + { + "epoch": 1.5703865372465682, + "grad_norm": 0.6311222314834595, + "learning_rate": 0.0001, + "loss": 1.5804, + "step": 13671 + }, + { + "epoch": 1.5705014071563954, + "grad_norm": 0.6435375213623047, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 13672 + }, + { + "epoch": 1.5706162770662226, + "grad_norm": 0.62371826171875, + "learning_rate": 0.0001, + "loss": 1.4818, + "step": 13673 + }, + { + "epoch": 1.5707311469760497, + "grad_norm": 0.6377353072166443, + "learning_rate": 0.0001, + "loss": 1.5304, + "step": 13674 + }, + { + "epoch": 1.5708460168858767, + "grad_norm": 0.630799412727356, + "learning_rate": 0.0001, + "loss": 1.4142, + "step": 13675 + }, + { + "epoch": 1.570960886795704, + "grad_norm": 0.6089138388633728, + "learning_rate": 0.0001, + "loss": 1.344, + "step": 13676 + }, + { + "epoch": 1.5710757567055311, + "grad_norm": 0.6124714612960815, + "learning_rate": 0.0001, + "loss": 1.3856, + "step": 13677 + }, + { + "epoch": 1.5711906266153581, + "grad_norm": 0.592974066734314, + "learning_rate": 0.0001, + "loss": 1.3503, + "step": 13678 + }, + { + "epoch": 1.5713054965251851, + "grad_norm": 0.662214994430542, + "learning_rate": 0.0001, + "loss": 1.4621, + "step": 13679 + }, + { + "epoch": 1.5714203664350124, + "grad_norm": 0.5967538952827454, + "learning_rate": 0.0001, + "loss": 1.4994, + "step": 13680 + }, + { + "epoch": 1.5715352363448396, + "grad_norm": 0.6241264939308167, + "learning_rate": 0.0001, + "loss": 1.4568, + "step": 13681 + }, + { + "epoch": 1.5716501062546666, + "grad_norm": 0.6036410927772522, + "learning_rate": 0.0001, + "loss": 1.5817, + "step": 13682 + }, + { + "epoch": 1.5717649761644936, + "grad_norm": 0.5998589992523193, + "learning_rate": 0.0001, + "loss": 1.2894, + "step": 13683 + }, + { + "epoch": 1.5718798460743209, + "grad_norm": 0.6088207960128784, + "learning_rate": 0.0001, + "loss": 1.5821, + "step": 13684 + }, + { + "epoch": 1.571994715984148, + "grad_norm": 0.6031522750854492, + "learning_rate": 0.0001, + "loss": 1.4404, + "step": 13685 + }, + { + "epoch": 1.572109585893975, + "grad_norm": 0.5819301605224609, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 13686 + }, + { + "epoch": 1.572224455803802, + "grad_norm": 0.5897693634033203, + "learning_rate": 0.0001, + "loss": 1.306, + "step": 13687 + }, + { + "epoch": 1.5723393257136293, + "grad_norm": 0.6375783681869507, + "learning_rate": 0.0001, + "loss": 1.5457, + "step": 13688 + }, + { + "epoch": 1.5724541956234566, + "grad_norm": 0.6380894184112549, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 13689 + }, + { + "epoch": 1.5725690655332836, + "grad_norm": 0.6246376633644104, + "learning_rate": 0.0001, + "loss": 1.3728, + "step": 13690 + }, + { + "epoch": 1.5726839354431106, + "grad_norm": 0.6492366790771484, + "learning_rate": 0.0001, + "loss": 1.3662, + "step": 13691 + }, + { + "epoch": 1.5727988053529378, + "grad_norm": 0.6497386693954468, + "learning_rate": 0.0001, + "loss": 1.4163, + "step": 13692 + }, + { + "epoch": 1.572913675262765, + "grad_norm": 0.5843161940574646, + "learning_rate": 0.0001, + "loss": 1.3522, + "step": 13693 + }, + { + "epoch": 1.573028545172592, + "grad_norm": 0.6169954538345337, + "learning_rate": 0.0001, + "loss": 1.4708, + "step": 13694 + }, + { + "epoch": 1.573143415082419, + "grad_norm": 0.5712270140647888, + "learning_rate": 0.0001, + "loss": 1.2669, + "step": 13695 + }, + { + "epoch": 1.5732582849922463, + "grad_norm": 0.6111646294593811, + "learning_rate": 0.0001, + "loss": 1.5445, + "step": 13696 + }, + { + "epoch": 1.5733731549020735, + "grad_norm": 0.5616782307624817, + "learning_rate": 0.0001, + "loss": 1.3518, + "step": 13697 + }, + { + "epoch": 1.5734880248119005, + "grad_norm": 0.6015630960464478, + "learning_rate": 0.0001, + "loss": 1.2921, + "step": 13698 + }, + { + "epoch": 1.5736028947217275, + "grad_norm": 0.5990334749221802, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 13699 + }, + { + "epoch": 1.5737177646315548, + "grad_norm": 0.5758739709854126, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 13700 + }, + { + "epoch": 1.573832634541382, + "grad_norm": 0.5954878330230713, + "learning_rate": 0.0001, + "loss": 1.3584, + "step": 13701 + }, + { + "epoch": 1.573947504451209, + "grad_norm": 0.6391401290893555, + "learning_rate": 0.0001, + "loss": 1.4215, + "step": 13702 + }, + { + "epoch": 1.574062374361036, + "grad_norm": 0.7178584337234497, + "learning_rate": 0.0001, + "loss": 1.3619, + "step": 13703 + }, + { + "epoch": 1.5741772442708633, + "grad_norm": 0.6165657639503479, + "learning_rate": 0.0001, + "loss": 1.2028, + "step": 13704 + }, + { + "epoch": 1.5742921141806905, + "grad_norm": 0.5853223204612732, + "learning_rate": 0.0001, + "loss": 1.4566, + "step": 13705 + }, + { + "epoch": 1.5744069840905175, + "grad_norm": 0.6249833703041077, + "learning_rate": 0.0001, + "loss": 1.1105, + "step": 13706 + }, + { + "epoch": 1.5745218540003445, + "grad_norm": 0.5916066765785217, + "learning_rate": 0.0001, + "loss": 1.4113, + "step": 13707 + }, + { + "epoch": 1.5746367239101717, + "grad_norm": 0.5843729972839355, + "learning_rate": 0.0001, + "loss": 1.3678, + "step": 13708 + }, + { + "epoch": 1.574751593819999, + "grad_norm": 0.5756968259811401, + "learning_rate": 0.0001, + "loss": 1.3407, + "step": 13709 + }, + { + "epoch": 1.574866463729826, + "grad_norm": 0.6385331749916077, + "learning_rate": 0.0001, + "loss": 1.4298, + "step": 13710 + }, + { + "epoch": 1.574981333639653, + "grad_norm": 0.6959099769592285, + "learning_rate": 0.0001, + "loss": 1.6224, + "step": 13711 + }, + { + "epoch": 1.5750962035494802, + "grad_norm": 0.5969239473342896, + "learning_rate": 0.0001, + "loss": 1.4625, + "step": 13712 + }, + { + "epoch": 1.5752110734593074, + "grad_norm": 0.5907700061798096, + "learning_rate": 0.0001, + "loss": 1.4731, + "step": 13713 + }, + { + "epoch": 1.5753259433691345, + "grad_norm": 0.5716517567634583, + "learning_rate": 0.0001, + "loss": 1.0672, + "step": 13714 + }, + { + "epoch": 1.5754408132789615, + "grad_norm": 0.6416873931884766, + "learning_rate": 0.0001, + "loss": 1.5179, + "step": 13715 + }, + { + "epoch": 1.5755556831887887, + "grad_norm": 0.6345185041427612, + "learning_rate": 0.0001, + "loss": 1.5173, + "step": 13716 + }, + { + "epoch": 1.575670553098616, + "grad_norm": 0.63191157579422, + "learning_rate": 0.0001, + "loss": 1.6803, + "step": 13717 + }, + { + "epoch": 1.575785423008443, + "grad_norm": 0.5994105339050293, + "learning_rate": 0.0001, + "loss": 1.4558, + "step": 13718 + }, + { + "epoch": 1.57590029291827, + "grad_norm": 0.657799243927002, + "learning_rate": 0.0001, + "loss": 1.4011, + "step": 13719 + }, + { + "epoch": 1.5760151628280972, + "grad_norm": 0.6098163723945618, + "learning_rate": 0.0001, + "loss": 1.4295, + "step": 13720 + }, + { + "epoch": 1.5761300327379244, + "grad_norm": 0.5927136540412903, + "learning_rate": 0.0001, + "loss": 1.4307, + "step": 13721 + }, + { + "epoch": 1.5762449026477514, + "grad_norm": 0.5886332392692566, + "learning_rate": 0.0001, + "loss": 1.574, + "step": 13722 + }, + { + "epoch": 1.5763597725575784, + "grad_norm": 0.589408814907074, + "learning_rate": 0.0001, + "loss": 1.3345, + "step": 13723 + }, + { + "epoch": 1.5764746424674057, + "grad_norm": 0.6005495190620422, + "learning_rate": 0.0001, + "loss": 1.5995, + "step": 13724 + }, + { + "epoch": 1.5765895123772329, + "grad_norm": 0.5907677412033081, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 13725 + }, + { + "epoch": 1.57670438228706, + "grad_norm": 0.6294686794281006, + "learning_rate": 0.0001, + "loss": 1.4602, + "step": 13726 + }, + { + "epoch": 1.576819252196887, + "grad_norm": 0.5746269822120667, + "learning_rate": 0.0001, + "loss": 1.4667, + "step": 13727 + }, + { + "epoch": 1.5769341221067141, + "grad_norm": 0.6354400515556335, + "learning_rate": 0.0001, + "loss": 1.3841, + "step": 13728 + }, + { + "epoch": 1.5770489920165414, + "grad_norm": 0.5498471856117249, + "learning_rate": 0.0001, + "loss": 1.4757, + "step": 13729 + }, + { + "epoch": 1.5771638619263684, + "grad_norm": 0.6611149907112122, + "learning_rate": 0.0001, + "loss": 1.4133, + "step": 13730 + }, + { + "epoch": 1.5772787318361954, + "grad_norm": 0.5527372360229492, + "learning_rate": 0.0001, + "loss": 1.4975, + "step": 13731 + }, + { + "epoch": 1.5773936017460226, + "grad_norm": 0.6152689456939697, + "learning_rate": 0.0001, + "loss": 1.5015, + "step": 13732 + }, + { + "epoch": 1.5775084716558498, + "grad_norm": 0.6419612169265747, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 13733 + }, + { + "epoch": 1.5776233415656769, + "grad_norm": 0.6237484216690063, + "learning_rate": 0.0001, + "loss": 1.3205, + "step": 13734 + }, + { + "epoch": 1.5777382114755039, + "grad_norm": 0.6146785616874695, + "learning_rate": 0.0001, + "loss": 1.3854, + "step": 13735 + }, + { + "epoch": 1.577853081385331, + "grad_norm": 0.5459988117218018, + "learning_rate": 0.0001, + "loss": 1.2364, + "step": 13736 + }, + { + "epoch": 1.5779679512951583, + "grad_norm": 0.5892395377159119, + "learning_rate": 0.0001, + "loss": 1.4168, + "step": 13737 + }, + { + "epoch": 1.5780828212049853, + "grad_norm": 0.6266677975654602, + "learning_rate": 0.0001, + "loss": 1.4853, + "step": 13738 + }, + { + "epoch": 1.5781976911148123, + "grad_norm": 0.6316624283790588, + "learning_rate": 0.0001, + "loss": 1.3561, + "step": 13739 + }, + { + "epoch": 1.5783125610246396, + "grad_norm": 0.5511997938156128, + "learning_rate": 0.0001, + "loss": 1.4032, + "step": 13740 + }, + { + "epoch": 1.5784274309344668, + "grad_norm": 0.6501776576042175, + "learning_rate": 0.0001, + "loss": 1.4935, + "step": 13741 + }, + { + "epoch": 1.5785423008442938, + "grad_norm": 0.6260613203048706, + "learning_rate": 0.0001, + "loss": 1.4332, + "step": 13742 + }, + { + "epoch": 1.5786571707541208, + "grad_norm": 0.6105442047119141, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 13743 + }, + { + "epoch": 1.578772040663948, + "grad_norm": 0.5777299404144287, + "learning_rate": 0.0001, + "loss": 1.4765, + "step": 13744 + }, + { + "epoch": 1.5788869105737753, + "grad_norm": 0.6278029680252075, + "learning_rate": 0.0001, + "loss": 1.4338, + "step": 13745 + }, + { + "epoch": 1.5790017804836023, + "grad_norm": 0.6350043416023254, + "learning_rate": 0.0001, + "loss": 1.2099, + "step": 13746 + }, + { + "epoch": 1.5791166503934293, + "grad_norm": 0.7006800770759583, + "learning_rate": 0.0001, + "loss": 1.6148, + "step": 13747 + }, + { + "epoch": 1.5792315203032565, + "grad_norm": 0.7125732898712158, + "learning_rate": 0.0001, + "loss": 1.5515, + "step": 13748 + }, + { + "epoch": 1.5793463902130838, + "grad_norm": 0.595615565776825, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 13749 + }, + { + "epoch": 1.5794612601229108, + "grad_norm": 0.5776601433753967, + "learning_rate": 0.0001, + "loss": 1.2368, + "step": 13750 + }, + { + "epoch": 1.5795761300327378, + "grad_norm": 0.6074332594871521, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 13751 + }, + { + "epoch": 1.579690999942565, + "grad_norm": 0.5884713530540466, + "learning_rate": 0.0001, + "loss": 1.2479, + "step": 13752 + }, + { + "epoch": 1.5798058698523922, + "grad_norm": 0.5512914657592773, + "learning_rate": 0.0001, + "loss": 1.3712, + "step": 13753 + }, + { + "epoch": 1.5799207397622193, + "grad_norm": 0.5844442844390869, + "learning_rate": 0.0001, + "loss": 1.5999, + "step": 13754 + }, + { + "epoch": 1.5800356096720463, + "grad_norm": 0.6031823754310608, + "learning_rate": 0.0001, + "loss": 1.295, + "step": 13755 + }, + { + "epoch": 1.5801504795818735, + "grad_norm": 0.6166372895240784, + "learning_rate": 0.0001, + "loss": 1.3007, + "step": 13756 + }, + { + "epoch": 1.5802653494917007, + "grad_norm": 0.5954715013504028, + "learning_rate": 0.0001, + "loss": 1.4218, + "step": 13757 + }, + { + "epoch": 1.5803802194015277, + "grad_norm": 0.6521285176277161, + "learning_rate": 0.0001, + "loss": 1.4628, + "step": 13758 + }, + { + "epoch": 1.5804950893113547, + "grad_norm": 0.6663768887519836, + "learning_rate": 0.0001, + "loss": 1.6063, + "step": 13759 + }, + { + "epoch": 1.580609959221182, + "grad_norm": 0.577915370464325, + "learning_rate": 0.0001, + "loss": 1.4309, + "step": 13760 + }, + { + "epoch": 1.5807248291310092, + "grad_norm": 0.5663382411003113, + "learning_rate": 0.0001, + "loss": 1.4019, + "step": 13761 + }, + { + "epoch": 1.5808396990408362, + "grad_norm": 0.5602874159812927, + "learning_rate": 0.0001, + "loss": 1.4436, + "step": 13762 + }, + { + "epoch": 1.5809545689506632, + "grad_norm": 0.6626793146133423, + "learning_rate": 0.0001, + "loss": 1.4416, + "step": 13763 + }, + { + "epoch": 1.5810694388604905, + "grad_norm": 0.6153534650802612, + "learning_rate": 0.0001, + "loss": 1.3675, + "step": 13764 + }, + { + "epoch": 1.5811843087703177, + "grad_norm": 0.5600067377090454, + "learning_rate": 0.0001, + "loss": 1.2818, + "step": 13765 + }, + { + "epoch": 1.5812991786801447, + "grad_norm": 0.6040723919868469, + "learning_rate": 0.0001, + "loss": 1.4141, + "step": 13766 + }, + { + "epoch": 1.5814140485899717, + "grad_norm": 0.5831596255302429, + "learning_rate": 0.0001, + "loss": 1.412, + "step": 13767 + }, + { + "epoch": 1.581528918499799, + "grad_norm": 0.5813616514205933, + "learning_rate": 0.0001, + "loss": 1.3028, + "step": 13768 + }, + { + "epoch": 1.5816437884096262, + "grad_norm": 0.5949644446372986, + "learning_rate": 0.0001, + "loss": 1.4923, + "step": 13769 + }, + { + "epoch": 1.5817586583194532, + "grad_norm": 0.6183162927627563, + "learning_rate": 0.0001, + "loss": 1.4916, + "step": 13770 + }, + { + "epoch": 1.5818735282292802, + "grad_norm": 0.5976161360740662, + "learning_rate": 0.0001, + "loss": 1.5491, + "step": 13771 + }, + { + "epoch": 1.5819883981391074, + "grad_norm": 0.6509799957275391, + "learning_rate": 0.0001, + "loss": 1.5773, + "step": 13772 + }, + { + "epoch": 1.5821032680489346, + "grad_norm": 0.6055750250816345, + "learning_rate": 0.0001, + "loss": 1.4911, + "step": 13773 + }, + { + "epoch": 1.5822181379587617, + "grad_norm": 0.6353389024734497, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 13774 + }, + { + "epoch": 1.5823330078685887, + "grad_norm": 0.645206093788147, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 13775 + }, + { + "epoch": 1.582447877778416, + "grad_norm": 0.6120302081108093, + "learning_rate": 0.0001, + "loss": 1.5445, + "step": 13776 + }, + { + "epoch": 1.5825627476882431, + "grad_norm": 0.5803287625312805, + "learning_rate": 0.0001, + "loss": 1.2655, + "step": 13777 + }, + { + "epoch": 1.5826776175980704, + "grad_norm": 0.5672709345817566, + "learning_rate": 0.0001, + "loss": 1.5853, + "step": 13778 + }, + { + "epoch": 1.5827924875078974, + "grad_norm": 0.593664288520813, + "learning_rate": 0.0001, + "loss": 1.4554, + "step": 13779 + }, + { + "epoch": 1.5829073574177244, + "grad_norm": 0.6163424849510193, + "learning_rate": 0.0001, + "loss": 1.4778, + "step": 13780 + }, + { + "epoch": 1.5830222273275516, + "grad_norm": 0.6263569593429565, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 13781 + }, + { + "epoch": 1.5831370972373788, + "grad_norm": 0.6205965876579285, + "learning_rate": 0.0001, + "loss": 1.6084, + "step": 13782 + }, + { + "epoch": 1.5832519671472058, + "grad_norm": 0.6791396141052246, + "learning_rate": 0.0001, + "loss": 1.7392, + "step": 13783 + }, + { + "epoch": 1.5833668370570328, + "grad_norm": 0.6714447736740112, + "learning_rate": 0.0001, + "loss": 1.4619, + "step": 13784 + }, + { + "epoch": 1.58348170696686, + "grad_norm": 0.5691717267036438, + "learning_rate": 0.0001, + "loss": 1.4059, + "step": 13785 + }, + { + "epoch": 1.5835965768766873, + "grad_norm": 0.636568009853363, + "learning_rate": 0.0001, + "loss": 1.4756, + "step": 13786 + }, + { + "epoch": 1.5837114467865143, + "grad_norm": 0.6351962685585022, + "learning_rate": 0.0001, + "loss": 1.4316, + "step": 13787 + }, + { + "epoch": 1.5838263166963413, + "grad_norm": 0.6552459001541138, + "learning_rate": 0.0001, + "loss": 1.7072, + "step": 13788 + }, + { + "epoch": 1.5839411866061686, + "grad_norm": 0.6077340841293335, + "learning_rate": 0.0001, + "loss": 1.585, + "step": 13789 + }, + { + "epoch": 1.5840560565159958, + "grad_norm": 0.5727121829986572, + "learning_rate": 0.0001, + "loss": 1.4895, + "step": 13790 + }, + { + "epoch": 1.5841709264258228, + "grad_norm": 0.6179769039154053, + "learning_rate": 0.0001, + "loss": 1.3921, + "step": 13791 + }, + { + "epoch": 1.5842857963356498, + "grad_norm": 0.5873903036117554, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 13792 + }, + { + "epoch": 1.584400666245477, + "grad_norm": 0.6171455979347229, + "learning_rate": 0.0001, + "loss": 1.3184, + "step": 13793 + }, + { + "epoch": 1.5845155361553043, + "grad_norm": 0.7028810977935791, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 13794 + }, + { + "epoch": 1.5846304060651313, + "grad_norm": 0.5513997077941895, + "learning_rate": 0.0001, + "loss": 1.464, + "step": 13795 + }, + { + "epoch": 1.5847452759749583, + "grad_norm": 0.5477231740951538, + "learning_rate": 0.0001, + "loss": 1.3617, + "step": 13796 + }, + { + "epoch": 1.5848601458847855, + "grad_norm": 0.6108036041259766, + "learning_rate": 0.0001, + "loss": 1.3043, + "step": 13797 + }, + { + "epoch": 1.5849750157946128, + "grad_norm": 0.559474527835846, + "learning_rate": 0.0001, + "loss": 1.2964, + "step": 13798 + }, + { + "epoch": 1.5850898857044398, + "grad_norm": 0.6026455760002136, + "learning_rate": 0.0001, + "loss": 1.4424, + "step": 13799 + }, + { + "epoch": 1.5852047556142668, + "grad_norm": 0.7762643098831177, + "learning_rate": 0.0001, + "loss": 1.7879, + "step": 13800 + }, + { + "epoch": 1.585319625524094, + "grad_norm": 0.6184485554695129, + "learning_rate": 0.0001, + "loss": 1.4559, + "step": 13801 + }, + { + "epoch": 1.5854344954339212, + "grad_norm": 0.6571382880210876, + "learning_rate": 0.0001, + "loss": 1.6504, + "step": 13802 + }, + { + "epoch": 1.5855493653437482, + "grad_norm": 0.6091337203979492, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 13803 + }, + { + "epoch": 1.5856642352535752, + "grad_norm": 0.6771681308746338, + "learning_rate": 0.0001, + "loss": 1.3213, + "step": 13804 + }, + { + "epoch": 1.5857791051634025, + "grad_norm": 0.6501324772834778, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 13805 + }, + { + "epoch": 1.5858939750732297, + "grad_norm": 0.5912746787071228, + "learning_rate": 0.0001, + "loss": 1.3764, + "step": 13806 + }, + { + "epoch": 1.5860088449830567, + "grad_norm": 0.6037330627441406, + "learning_rate": 0.0001, + "loss": 1.4944, + "step": 13807 + }, + { + "epoch": 1.5861237148928837, + "grad_norm": 0.5597751140594482, + "learning_rate": 0.0001, + "loss": 1.3488, + "step": 13808 + }, + { + "epoch": 1.586238584802711, + "grad_norm": 0.5525902509689331, + "learning_rate": 0.0001, + "loss": 1.4154, + "step": 13809 + }, + { + "epoch": 1.5863534547125382, + "grad_norm": 0.6479353308677673, + "learning_rate": 0.0001, + "loss": 1.4451, + "step": 13810 + }, + { + "epoch": 1.5864683246223652, + "grad_norm": 0.6290416121482849, + "learning_rate": 0.0001, + "loss": 1.4498, + "step": 13811 + }, + { + "epoch": 1.5865831945321922, + "grad_norm": 0.6369213461875916, + "learning_rate": 0.0001, + "loss": 1.3789, + "step": 13812 + }, + { + "epoch": 1.5866980644420194, + "grad_norm": 0.6600077152252197, + "learning_rate": 0.0001, + "loss": 1.3561, + "step": 13813 + }, + { + "epoch": 1.5868129343518467, + "grad_norm": 0.6449211239814758, + "learning_rate": 0.0001, + "loss": 1.5132, + "step": 13814 + }, + { + "epoch": 1.5869278042616737, + "grad_norm": 0.596219003200531, + "learning_rate": 0.0001, + "loss": 1.4306, + "step": 13815 + }, + { + "epoch": 1.5870426741715007, + "grad_norm": 0.6226633191108704, + "learning_rate": 0.0001, + "loss": 1.4792, + "step": 13816 + }, + { + "epoch": 1.587157544081328, + "grad_norm": 0.614152193069458, + "learning_rate": 0.0001, + "loss": 1.5682, + "step": 13817 + }, + { + "epoch": 1.5872724139911552, + "grad_norm": 0.6192676424980164, + "learning_rate": 0.0001, + "loss": 1.3306, + "step": 13818 + }, + { + "epoch": 1.5873872839009822, + "grad_norm": 0.6133370995521545, + "learning_rate": 0.0001, + "loss": 1.2503, + "step": 13819 + }, + { + "epoch": 1.5875021538108092, + "grad_norm": 0.6042430996894836, + "learning_rate": 0.0001, + "loss": 1.4175, + "step": 13820 + }, + { + "epoch": 1.5876170237206364, + "grad_norm": 0.7672133445739746, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 13821 + }, + { + "epoch": 1.5877318936304636, + "grad_norm": 0.662254273891449, + "learning_rate": 0.0001, + "loss": 1.4117, + "step": 13822 + }, + { + "epoch": 1.5878467635402906, + "grad_norm": 0.6693131327629089, + "learning_rate": 0.0001, + "loss": 1.5383, + "step": 13823 + }, + { + "epoch": 1.5879616334501176, + "grad_norm": 0.636415421962738, + "learning_rate": 0.0001, + "loss": 1.5238, + "step": 13824 + }, + { + "epoch": 1.5880765033599449, + "grad_norm": 0.579950213432312, + "learning_rate": 0.0001, + "loss": 1.4417, + "step": 13825 + }, + { + "epoch": 1.588191373269772, + "grad_norm": 0.6041045188903809, + "learning_rate": 0.0001, + "loss": 1.3729, + "step": 13826 + }, + { + "epoch": 1.5883062431795991, + "grad_norm": 0.638687252998352, + "learning_rate": 0.0001, + "loss": 1.4003, + "step": 13827 + }, + { + "epoch": 1.5884211130894261, + "grad_norm": 0.5657978057861328, + "learning_rate": 0.0001, + "loss": 1.3076, + "step": 13828 + }, + { + "epoch": 1.5885359829992534, + "grad_norm": 0.5743250250816345, + "learning_rate": 0.0001, + "loss": 1.5329, + "step": 13829 + }, + { + "epoch": 1.5886508529090806, + "grad_norm": 0.6413049101829529, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 13830 + }, + { + "epoch": 1.5887657228189076, + "grad_norm": 0.5768105983734131, + "learning_rate": 0.0001, + "loss": 1.4486, + "step": 13831 + }, + { + "epoch": 1.5888805927287346, + "grad_norm": 0.6100025773048401, + "learning_rate": 0.0001, + "loss": 1.5592, + "step": 13832 + }, + { + "epoch": 1.5889954626385618, + "grad_norm": 0.6086409091949463, + "learning_rate": 0.0001, + "loss": 1.3241, + "step": 13833 + }, + { + "epoch": 1.589110332548389, + "grad_norm": 0.6724202632904053, + "learning_rate": 0.0001, + "loss": 1.4075, + "step": 13834 + }, + { + "epoch": 1.589225202458216, + "grad_norm": 0.6264391541481018, + "learning_rate": 0.0001, + "loss": 1.6691, + "step": 13835 + }, + { + "epoch": 1.589340072368043, + "grad_norm": 0.6160328388214111, + "learning_rate": 0.0001, + "loss": 1.4792, + "step": 13836 + }, + { + "epoch": 1.5894549422778703, + "grad_norm": 0.6358749270439148, + "learning_rate": 0.0001, + "loss": 1.465, + "step": 13837 + }, + { + "epoch": 1.5895698121876976, + "grad_norm": 0.6735308766365051, + "learning_rate": 0.0001, + "loss": 1.4201, + "step": 13838 + }, + { + "epoch": 1.5896846820975246, + "grad_norm": 0.6253379583358765, + "learning_rate": 0.0001, + "loss": 1.5515, + "step": 13839 + }, + { + "epoch": 1.5897995520073516, + "grad_norm": 0.6210620999336243, + "learning_rate": 0.0001, + "loss": 1.62, + "step": 13840 + }, + { + "epoch": 1.5899144219171788, + "grad_norm": 0.6258774995803833, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 13841 + }, + { + "epoch": 1.590029291827006, + "grad_norm": 0.6079967021942139, + "learning_rate": 0.0001, + "loss": 1.4229, + "step": 13842 + }, + { + "epoch": 1.590144161736833, + "grad_norm": 0.5905248522758484, + "learning_rate": 0.0001, + "loss": 1.436, + "step": 13843 + }, + { + "epoch": 1.59025903164666, + "grad_norm": 0.623208224773407, + "learning_rate": 0.0001, + "loss": 1.5429, + "step": 13844 + }, + { + "epoch": 1.5903739015564873, + "grad_norm": 0.6524278521537781, + "learning_rate": 0.0001, + "loss": 1.526, + "step": 13845 + }, + { + "epoch": 1.5904887714663145, + "grad_norm": 0.6335569024085999, + "learning_rate": 0.0001, + "loss": 1.4586, + "step": 13846 + }, + { + "epoch": 1.5906036413761415, + "grad_norm": 0.5839788317680359, + "learning_rate": 0.0001, + "loss": 1.4472, + "step": 13847 + }, + { + "epoch": 1.5907185112859685, + "grad_norm": 0.6016788482666016, + "learning_rate": 0.0001, + "loss": 1.6573, + "step": 13848 + }, + { + "epoch": 1.5908333811957958, + "grad_norm": 0.5814146399497986, + "learning_rate": 0.0001, + "loss": 1.5764, + "step": 13849 + }, + { + "epoch": 1.590948251105623, + "grad_norm": 0.6370062828063965, + "learning_rate": 0.0001, + "loss": 1.6166, + "step": 13850 + }, + { + "epoch": 1.59106312101545, + "grad_norm": 0.5697435736656189, + "learning_rate": 0.0001, + "loss": 1.5031, + "step": 13851 + }, + { + "epoch": 1.591177990925277, + "grad_norm": 0.5778668522834778, + "learning_rate": 0.0001, + "loss": 1.2866, + "step": 13852 + }, + { + "epoch": 1.5912928608351042, + "grad_norm": 0.5944350957870483, + "learning_rate": 0.0001, + "loss": 1.4024, + "step": 13853 + }, + { + "epoch": 1.5914077307449315, + "grad_norm": 0.713706374168396, + "learning_rate": 0.0001, + "loss": 1.4837, + "step": 13854 + }, + { + "epoch": 1.5915226006547585, + "grad_norm": 0.5639752149581909, + "learning_rate": 0.0001, + "loss": 1.4652, + "step": 13855 + }, + { + "epoch": 1.5916374705645855, + "grad_norm": 0.6923316717147827, + "learning_rate": 0.0001, + "loss": 1.529, + "step": 13856 + }, + { + "epoch": 1.5917523404744127, + "grad_norm": 0.5926253795623779, + "learning_rate": 0.0001, + "loss": 1.4854, + "step": 13857 + }, + { + "epoch": 1.59186721038424, + "grad_norm": 0.6101776361465454, + "learning_rate": 0.0001, + "loss": 1.4077, + "step": 13858 + }, + { + "epoch": 1.591982080294067, + "grad_norm": 0.6200652122497559, + "learning_rate": 0.0001, + "loss": 1.5716, + "step": 13859 + }, + { + "epoch": 1.592096950203894, + "grad_norm": 0.7140838503837585, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 13860 + }, + { + "epoch": 1.5922118201137212, + "grad_norm": 0.6783391237258911, + "learning_rate": 0.0001, + "loss": 1.6397, + "step": 13861 + }, + { + "epoch": 1.5923266900235484, + "grad_norm": 0.6628894805908203, + "learning_rate": 0.0001, + "loss": 1.4749, + "step": 13862 + }, + { + "epoch": 1.5924415599333754, + "grad_norm": 0.59294593334198, + "learning_rate": 0.0001, + "loss": 1.3666, + "step": 13863 + }, + { + "epoch": 1.5925564298432024, + "grad_norm": 0.5863275527954102, + "learning_rate": 0.0001, + "loss": 1.5978, + "step": 13864 + }, + { + "epoch": 1.5926712997530297, + "grad_norm": 0.5980870723724365, + "learning_rate": 0.0001, + "loss": 1.4679, + "step": 13865 + }, + { + "epoch": 1.592786169662857, + "grad_norm": 0.6048779487609863, + "learning_rate": 0.0001, + "loss": 1.4745, + "step": 13866 + }, + { + "epoch": 1.592901039572684, + "grad_norm": 0.5743735432624817, + "learning_rate": 0.0001, + "loss": 1.4831, + "step": 13867 + }, + { + "epoch": 1.593015909482511, + "grad_norm": 0.5809198021888733, + "learning_rate": 0.0001, + "loss": 1.4706, + "step": 13868 + }, + { + "epoch": 1.5931307793923382, + "grad_norm": 0.6342722773551941, + "learning_rate": 0.0001, + "loss": 1.5462, + "step": 13869 + }, + { + "epoch": 1.5932456493021654, + "grad_norm": 0.6230271458625793, + "learning_rate": 0.0001, + "loss": 1.3428, + "step": 13870 + }, + { + "epoch": 1.5933605192119924, + "grad_norm": 0.6740293502807617, + "learning_rate": 0.0001, + "loss": 1.6158, + "step": 13871 + }, + { + "epoch": 1.5934753891218194, + "grad_norm": 0.7214620113372803, + "learning_rate": 0.0001, + "loss": 1.4837, + "step": 13872 + }, + { + "epoch": 1.5935902590316466, + "grad_norm": 0.6256888508796692, + "learning_rate": 0.0001, + "loss": 1.7852, + "step": 13873 + }, + { + "epoch": 1.5937051289414739, + "grad_norm": 0.5903255939483643, + "learning_rate": 0.0001, + "loss": 1.4396, + "step": 13874 + }, + { + "epoch": 1.5938199988513009, + "grad_norm": 0.572929859161377, + "learning_rate": 0.0001, + "loss": 1.4534, + "step": 13875 + }, + { + "epoch": 1.5939348687611279, + "grad_norm": 0.6389855146408081, + "learning_rate": 0.0001, + "loss": 1.4959, + "step": 13876 + }, + { + "epoch": 1.5940497386709551, + "grad_norm": 0.614324688911438, + "learning_rate": 0.0001, + "loss": 1.5737, + "step": 13877 + }, + { + "epoch": 1.5941646085807823, + "grad_norm": 0.6262052655220032, + "learning_rate": 0.0001, + "loss": 1.48, + "step": 13878 + }, + { + "epoch": 1.5942794784906094, + "grad_norm": 0.6147677898406982, + "learning_rate": 0.0001, + "loss": 1.4761, + "step": 13879 + }, + { + "epoch": 1.5943943484004364, + "grad_norm": 0.635215699672699, + "learning_rate": 0.0001, + "loss": 1.498, + "step": 13880 + }, + { + "epoch": 1.5945092183102636, + "grad_norm": 0.5967612266540527, + "learning_rate": 0.0001, + "loss": 1.4983, + "step": 13881 + }, + { + "epoch": 1.5946240882200908, + "grad_norm": 0.5582137703895569, + "learning_rate": 0.0001, + "loss": 1.374, + "step": 13882 + }, + { + "epoch": 1.5947389581299178, + "grad_norm": 0.6301720142364502, + "learning_rate": 0.0001, + "loss": 1.6609, + "step": 13883 + }, + { + "epoch": 1.5948538280397448, + "grad_norm": 0.5917264819145203, + "learning_rate": 0.0001, + "loss": 1.5236, + "step": 13884 + }, + { + "epoch": 1.594968697949572, + "grad_norm": 0.6191073656082153, + "learning_rate": 0.0001, + "loss": 1.4597, + "step": 13885 + }, + { + "epoch": 1.5950835678593993, + "grad_norm": 0.5992122292518616, + "learning_rate": 0.0001, + "loss": 1.3994, + "step": 13886 + }, + { + "epoch": 1.5951984377692263, + "grad_norm": 0.6241499781608582, + "learning_rate": 0.0001, + "loss": 1.5801, + "step": 13887 + }, + { + "epoch": 1.5953133076790533, + "grad_norm": 0.5631673336029053, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 13888 + }, + { + "epoch": 1.5954281775888806, + "grad_norm": 0.6660466194152832, + "learning_rate": 0.0001, + "loss": 1.7669, + "step": 13889 + }, + { + "epoch": 1.5955430474987078, + "grad_norm": 0.676660418510437, + "learning_rate": 0.0001, + "loss": 1.1202, + "step": 13890 + }, + { + "epoch": 1.5956579174085348, + "grad_norm": 0.6641074419021606, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 13891 + }, + { + "epoch": 1.5957727873183618, + "grad_norm": 0.6057578325271606, + "learning_rate": 0.0001, + "loss": 1.4497, + "step": 13892 + }, + { + "epoch": 1.595887657228189, + "grad_norm": 0.5668378472328186, + "learning_rate": 0.0001, + "loss": 1.3555, + "step": 13893 + }, + { + "epoch": 1.5960025271380163, + "grad_norm": 0.6328227519989014, + "learning_rate": 0.0001, + "loss": 1.5493, + "step": 13894 + }, + { + "epoch": 1.5961173970478433, + "grad_norm": 0.605475664138794, + "learning_rate": 0.0001, + "loss": 1.4586, + "step": 13895 + }, + { + "epoch": 1.5962322669576703, + "grad_norm": 0.5906625986099243, + "learning_rate": 0.0001, + "loss": 1.4653, + "step": 13896 + }, + { + "epoch": 1.5963471368674975, + "grad_norm": 0.613008439540863, + "learning_rate": 0.0001, + "loss": 1.5165, + "step": 13897 + }, + { + "epoch": 1.5964620067773247, + "grad_norm": 0.6402007937431335, + "learning_rate": 0.0001, + "loss": 1.3678, + "step": 13898 + }, + { + "epoch": 1.5965768766871518, + "grad_norm": 0.6814921498298645, + "learning_rate": 0.0001, + "loss": 1.6187, + "step": 13899 + }, + { + "epoch": 1.5966917465969788, + "grad_norm": 0.5518124103546143, + "learning_rate": 0.0001, + "loss": 1.3886, + "step": 13900 + }, + { + "epoch": 1.596806616506806, + "grad_norm": 0.5962539315223694, + "learning_rate": 0.0001, + "loss": 1.412, + "step": 13901 + }, + { + "epoch": 1.5969214864166332, + "grad_norm": 0.5778492093086243, + "learning_rate": 0.0001, + "loss": 1.3721, + "step": 13902 + }, + { + "epoch": 1.5970363563264602, + "grad_norm": 0.6201198101043701, + "learning_rate": 0.0001, + "loss": 1.3933, + "step": 13903 + }, + { + "epoch": 1.5971512262362872, + "grad_norm": 0.6029123663902283, + "learning_rate": 0.0001, + "loss": 1.3812, + "step": 13904 + }, + { + "epoch": 1.5972660961461145, + "grad_norm": 0.6379988789558411, + "learning_rate": 0.0001, + "loss": 1.2687, + "step": 13905 + }, + { + "epoch": 1.5973809660559417, + "grad_norm": 0.5439411997795105, + "learning_rate": 0.0001, + "loss": 1.3471, + "step": 13906 + }, + { + "epoch": 1.5974958359657687, + "grad_norm": 0.6145894527435303, + "learning_rate": 0.0001, + "loss": 1.3119, + "step": 13907 + }, + { + "epoch": 1.5976107058755957, + "grad_norm": 0.6268411874771118, + "learning_rate": 0.0001, + "loss": 1.4461, + "step": 13908 + }, + { + "epoch": 1.597725575785423, + "grad_norm": 0.5672012567520142, + "learning_rate": 0.0001, + "loss": 1.4152, + "step": 13909 + }, + { + "epoch": 1.5978404456952502, + "grad_norm": 0.5719462633132935, + "learning_rate": 0.0001, + "loss": 1.4458, + "step": 13910 + }, + { + "epoch": 1.5979553156050772, + "grad_norm": 0.564241349697113, + "learning_rate": 0.0001, + "loss": 1.2821, + "step": 13911 + }, + { + "epoch": 1.5980701855149042, + "grad_norm": 0.6079102158546448, + "learning_rate": 0.0001, + "loss": 1.3764, + "step": 13912 + }, + { + "epoch": 1.5981850554247314, + "grad_norm": 0.605634868144989, + "learning_rate": 0.0001, + "loss": 1.4621, + "step": 13913 + }, + { + "epoch": 1.5982999253345587, + "grad_norm": 0.599432110786438, + "learning_rate": 0.0001, + "loss": 1.4588, + "step": 13914 + }, + { + "epoch": 1.5984147952443857, + "grad_norm": 0.6091422438621521, + "learning_rate": 0.0001, + "loss": 1.533, + "step": 13915 + }, + { + "epoch": 1.598529665154213, + "grad_norm": 0.6520305275917053, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 13916 + }, + { + "epoch": 1.59864453506404, + "grad_norm": 0.6225545406341553, + "learning_rate": 0.0001, + "loss": 1.4667, + "step": 13917 + }, + { + "epoch": 1.5987594049738671, + "grad_norm": 0.5742337107658386, + "learning_rate": 0.0001, + "loss": 1.3543, + "step": 13918 + }, + { + "epoch": 1.5988742748836944, + "grad_norm": 0.6456047892570496, + "learning_rate": 0.0001, + "loss": 1.452, + "step": 13919 + }, + { + "epoch": 1.5989891447935214, + "grad_norm": 0.637758195400238, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 13920 + }, + { + "epoch": 1.5991040147033484, + "grad_norm": 0.6224943399429321, + "learning_rate": 0.0001, + "loss": 1.2575, + "step": 13921 + }, + { + "epoch": 1.5992188846131756, + "grad_norm": 0.6721301078796387, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 13922 + }, + { + "epoch": 1.5993337545230029, + "grad_norm": 0.5861270427703857, + "learning_rate": 0.0001, + "loss": 1.2367, + "step": 13923 + }, + { + "epoch": 1.5994486244328299, + "grad_norm": 0.6392655968666077, + "learning_rate": 0.0001, + "loss": 1.4703, + "step": 13924 + }, + { + "epoch": 1.5995634943426569, + "grad_norm": 0.6457921862602234, + "learning_rate": 0.0001, + "loss": 1.5829, + "step": 13925 + }, + { + "epoch": 1.599678364252484, + "grad_norm": 0.6379731297492981, + "learning_rate": 0.0001, + "loss": 1.3977, + "step": 13926 + }, + { + "epoch": 1.5997932341623113, + "grad_norm": 0.5802931785583496, + "learning_rate": 0.0001, + "loss": 1.3204, + "step": 13927 + }, + { + "epoch": 1.5999081040721383, + "grad_norm": 0.6664165258407593, + "learning_rate": 0.0001, + "loss": 1.5346, + "step": 13928 + }, + { + "epoch": 1.6000229739819654, + "grad_norm": 0.6241664290428162, + "learning_rate": 0.0001, + "loss": 1.6584, + "step": 13929 + }, + { + "epoch": 1.6001378438917926, + "grad_norm": 0.6621495485305786, + "learning_rate": 0.0001, + "loss": 1.3809, + "step": 13930 + }, + { + "epoch": 1.6002527138016198, + "grad_norm": 0.6004970073699951, + "learning_rate": 0.0001, + "loss": 1.4092, + "step": 13931 + }, + { + "epoch": 1.6003675837114468, + "grad_norm": 0.6193183064460754, + "learning_rate": 0.0001, + "loss": 1.4335, + "step": 13932 + }, + { + "epoch": 1.6004824536212738, + "grad_norm": 0.6352545619010925, + "learning_rate": 0.0001, + "loss": 1.5641, + "step": 13933 + }, + { + "epoch": 1.600597323531101, + "grad_norm": 0.6299646496772766, + "learning_rate": 0.0001, + "loss": 1.4958, + "step": 13934 + }, + { + "epoch": 1.6007121934409283, + "grad_norm": 0.5891308784484863, + "learning_rate": 0.0001, + "loss": 1.3664, + "step": 13935 + }, + { + "epoch": 1.6008270633507553, + "grad_norm": 0.655714750289917, + "learning_rate": 0.0001, + "loss": 1.3667, + "step": 13936 + }, + { + "epoch": 1.6009419332605823, + "grad_norm": 0.6111890077590942, + "learning_rate": 0.0001, + "loss": 1.4353, + "step": 13937 + }, + { + "epoch": 1.6010568031704095, + "grad_norm": 0.5762326717376709, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 13938 + }, + { + "epoch": 1.6011716730802368, + "grad_norm": 0.5921722650527954, + "learning_rate": 0.0001, + "loss": 1.5816, + "step": 13939 + }, + { + "epoch": 1.6012865429900638, + "grad_norm": 0.6111255884170532, + "learning_rate": 0.0001, + "loss": 1.722, + "step": 13940 + }, + { + "epoch": 1.6014014128998908, + "grad_norm": 0.5806509256362915, + "learning_rate": 0.0001, + "loss": 1.4408, + "step": 13941 + }, + { + "epoch": 1.601516282809718, + "grad_norm": 0.5703766942024231, + "learning_rate": 0.0001, + "loss": 1.303, + "step": 13942 + }, + { + "epoch": 1.6016311527195453, + "grad_norm": 0.5971461534500122, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 13943 + }, + { + "epoch": 1.6017460226293723, + "grad_norm": 0.6908469796180725, + "learning_rate": 0.0001, + "loss": 1.4302, + "step": 13944 + }, + { + "epoch": 1.6018608925391993, + "grad_norm": 0.5854578614234924, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 13945 + }, + { + "epoch": 1.6019757624490265, + "grad_norm": 0.6084689497947693, + "learning_rate": 0.0001, + "loss": 1.5165, + "step": 13946 + }, + { + "epoch": 1.6020906323588537, + "grad_norm": 0.6091856956481934, + "learning_rate": 0.0001, + "loss": 1.3299, + "step": 13947 + }, + { + "epoch": 1.6022055022686807, + "grad_norm": 0.6106900572776794, + "learning_rate": 0.0001, + "loss": 1.4301, + "step": 13948 + }, + { + "epoch": 1.6023203721785078, + "grad_norm": 0.6288970708847046, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 13949 + }, + { + "epoch": 1.602435242088335, + "grad_norm": 0.6874493956565857, + "learning_rate": 0.0001, + "loss": 1.4993, + "step": 13950 + }, + { + "epoch": 1.6025501119981622, + "grad_norm": 0.5642193555831909, + "learning_rate": 0.0001, + "loss": 1.2891, + "step": 13951 + }, + { + "epoch": 1.6026649819079892, + "grad_norm": 0.6149432063102722, + "learning_rate": 0.0001, + "loss": 1.346, + "step": 13952 + }, + { + "epoch": 1.6027798518178162, + "grad_norm": 0.6034269332885742, + "learning_rate": 0.0001, + "loss": 1.5462, + "step": 13953 + }, + { + "epoch": 1.6028947217276435, + "grad_norm": 0.5566948056221008, + "learning_rate": 0.0001, + "loss": 1.3367, + "step": 13954 + }, + { + "epoch": 1.6030095916374707, + "grad_norm": 0.6086257100105286, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 13955 + }, + { + "epoch": 1.6031244615472977, + "grad_norm": 0.6284582614898682, + "learning_rate": 0.0001, + "loss": 1.4821, + "step": 13956 + }, + { + "epoch": 1.6032393314571247, + "grad_norm": 0.6378557085990906, + "learning_rate": 0.0001, + "loss": 1.4118, + "step": 13957 + }, + { + "epoch": 1.603354201366952, + "grad_norm": 0.6031190156936646, + "learning_rate": 0.0001, + "loss": 1.5337, + "step": 13958 + }, + { + "epoch": 1.6034690712767792, + "grad_norm": 0.6202352046966553, + "learning_rate": 0.0001, + "loss": 1.3672, + "step": 13959 + }, + { + "epoch": 1.6035839411866062, + "grad_norm": 0.6928132772445679, + "learning_rate": 0.0001, + "loss": 1.2899, + "step": 13960 + }, + { + "epoch": 1.6036988110964332, + "grad_norm": 0.6329846978187561, + "learning_rate": 0.0001, + "loss": 1.533, + "step": 13961 + }, + { + "epoch": 1.6038136810062604, + "grad_norm": 0.5797714591026306, + "learning_rate": 0.0001, + "loss": 1.3294, + "step": 13962 + }, + { + "epoch": 1.6039285509160877, + "grad_norm": 0.6104891896247864, + "learning_rate": 0.0001, + "loss": 1.3985, + "step": 13963 + }, + { + "epoch": 1.6040434208259147, + "grad_norm": 0.6021250486373901, + "learning_rate": 0.0001, + "loss": 1.4517, + "step": 13964 + }, + { + "epoch": 1.6041582907357417, + "grad_norm": 0.6344681978225708, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 13965 + }, + { + "epoch": 1.604273160645569, + "grad_norm": 0.6106261014938354, + "learning_rate": 0.0001, + "loss": 1.4263, + "step": 13966 + }, + { + "epoch": 1.6043880305553961, + "grad_norm": 0.598883330821991, + "learning_rate": 0.0001, + "loss": 1.4839, + "step": 13967 + }, + { + "epoch": 1.6045029004652231, + "grad_norm": 0.6441910862922668, + "learning_rate": 0.0001, + "loss": 1.3875, + "step": 13968 + }, + { + "epoch": 1.6046177703750502, + "grad_norm": 0.6011143326759338, + "learning_rate": 0.0001, + "loss": 1.4894, + "step": 13969 + }, + { + "epoch": 1.6047326402848774, + "grad_norm": 0.6068170070648193, + "learning_rate": 0.0001, + "loss": 1.2947, + "step": 13970 + }, + { + "epoch": 1.6048475101947046, + "grad_norm": 0.6693038940429688, + "learning_rate": 0.0001, + "loss": 1.4515, + "step": 13971 + }, + { + "epoch": 1.6049623801045316, + "grad_norm": 0.6221956610679626, + "learning_rate": 0.0001, + "loss": 1.462, + "step": 13972 + }, + { + "epoch": 1.6050772500143586, + "grad_norm": 0.6345918774604797, + "learning_rate": 0.0001, + "loss": 1.5339, + "step": 13973 + }, + { + "epoch": 1.6051921199241859, + "grad_norm": 0.5779280066490173, + "learning_rate": 0.0001, + "loss": 1.3821, + "step": 13974 + }, + { + "epoch": 1.605306989834013, + "grad_norm": 0.6681035757064819, + "learning_rate": 0.0001, + "loss": 1.5948, + "step": 13975 + }, + { + "epoch": 1.60542185974384, + "grad_norm": 0.6619514226913452, + "learning_rate": 0.0001, + "loss": 1.4559, + "step": 13976 + }, + { + "epoch": 1.6055367296536671, + "grad_norm": 0.5762709379196167, + "learning_rate": 0.0001, + "loss": 1.4743, + "step": 13977 + }, + { + "epoch": 1.6056515995634943, + "grad_norm": 0.597047746181488, + "learning_rate": 0.0001, + "loss": 1.3797, + "step": 13978 + }, + { + "epoch": 1.6057664694733216, + "grad_norm": 0.6428928375244141, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 13979 + }, + { + "epoch": 1.6058813393831486, + "grad_norm": 0.6371673941612244, + "learning_rate": 0.0001, + "loss": 1.5729, + "step": 13980 + }, + { + "epoch": 1.6059962092929756, + "grad_norm": 0.69063401222229, + "learning_rate": 0.0001, + "loss": 1.3198, + "step": 13981 + }, + { + "epoch": 1.6061110792028028, + "grad_norm": 0.6148319244384766, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 13982 + }, + { + "epoch": 1.60622594911263, + "grad_norm": 0.6522954106330872, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 13983 + }, + { + "epoch": 1.606340819022457, + "grad_norm": 0.6170140504837036, + "learning_rate": 0.0001, + "loss": 1.561, + "step": 13984 + }, + { + "epoch": 1.606455688932284, + "grad_norm": 0.5978295803070068, + "learning_rate": 0.0001, + "loss": 1.4698, + "step": 13985 + }, + { + "epoch": 1.6065705588421113, + "grad_norm": 0.6230267882347107, + "learning_rate": 0.0001, + "loss": 1.6488, + "step": 13986 + }, + { + "epoch": 1.6066854287519385, + "grad_norm": 0.6035535335540771, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 13987 + }, + { + "epoch": 1.6068002986617655, + "grad_norm": 0.6185717582702637, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 13988 + }, + { + "epoch": 1.6069151685715926, + "grad_norm": 0.5879165530204773, + "learning_rate": 0.0001, + "loss": 1.1743, + "step": 13989 + }, + { + "epoch": 1.6070300384814198, + "grad_norm": 0.6925861239433289, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 13990 + }, + { + "epoch": 1.607144908391247, + "grad_norm": 0.603412926197052, + "learning_rate": 0.0001, + "loss": 1.3555, + "step": 13991 + }, + { + "epoch": 1.607259778301074, + "grad_norm": 0.6258808374404907, + "learning_rate": 0.0001, + "loss": 1.692, + "step": 13992 + }, + { + "epoch": 1.607374648210901, + "grad_norm": 0.6243830919265747, + "learning_rate": 0.0001, + "loss": 1.4886, + "step": 13993 + }, + { + "epoch": 1.6074895181207283, + "grad_norm": 0.5801231861114502, + "learning_rate": 0.0001, + "loss": 1.3985, + "step": 13994 + }, + { + "epoch": 1.6076043880305555, + "grad_norm": 0.5816532373428345, + "learning_rate": 0.0001, + "loss": 1.274, + "step": 13995 + }, + { + "epoch": 1.6077192579403825, + "grad_norm": 0.6143868565559387, + "learning_rate": 0.0001, + "loss": 1.4923, + "step": 13996 + }, + { + "epoch": 1.6078341278502095, + "grad_norm": 0.5322367548942566, + "learning_rate": 0.0001, + "loss": 1.4064, + "step": 13997 + }, + { + "epoch": 1.6079489977600367, + "grad_norm": 0.6119695901870728, + "learning_rate": 0.0001, + "loss": 1.3524, + "step": 13998 + }, + { + "epoch": 1.608063867669864, + "grad_norm": 0.6040320992469788, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 13999 + }, + { + "epoch": 1.608178737579691, + "grad_norm": 0.5936914682388306, + "learning_rate": 0.0001, + "loss": 1.3736, + "step": 14000 + }, + { + "epoch": 1.608293607489518, + "grad_norm": 0.6493724584579468, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 14001 + }, + { + "epoch": 1.6084084773993452, + "grad_norm": 0.5766366720199585, + "learning_rate": 0.0001, + "loss": 1.4142, + "step": 14002 + }, + { + "epoch": 1.6085233473091725, + "grad_norm": 0.6150953769683838, + "learning_rate": 0.0001, + "loss": 1.3288, + "step": 14003 + }, + { + "epoch": 1.6086382172189995, + "grad_norm": 0.5571459531784058, + "learning_rate": 0.0001, + "loss": 1.5631, + "step": 14004 + }, + { + "epoch": 1.6087530871288265, + "grad_norm": 0.6243135333061218, + "learning_rate": 0.0001, + "loss": 1.3522, + "step": 14005 + }, + { + "epoch": 1.6088679570386537, + "grad_norm": 0.5796478390693665, + "learning_rate": 0.0001, + "loss": 1.3919, + "step": 14006 + }, + { + "epoch": 1.608982826948481, + "grad_norm": 0.6436089873313904, + "learning_rate": 0.0001, + "loss": 1.4988, + "step": 14007 + }, + { + "epoch": 1.609097696858308, + "grad_norm": 0.6261184811592102, + "learning_rate": 0.0001, + "loss": 1.5976, + "step": 14008 + }, + { + "epoch": 1.609212566768135, + "grad_norm": 0.5776735544204712, + "learning_rate": 0.0001, + "loss": 1.3081, + "step": 14009 + }, + { + "epoch": 1.6093274366779622, + "grad_norm": 0.5873029828071594, + "learning_rate": 0.0001, + "loss": 1.452, + "step": 14010 + }, + { + "epoch": 1.6094423065877894, + "grad_norm": 0.642424464225769, + "learning_rate": 0.0001, + "loss": 1.5855, + "step": 14011 + }, + { + "epoch": 1.6095571764976164, + "grad_norm": 0.6408437490463257, + "learning_rate": 0.0001, + "loss": 1.4495, + "step": 14012 + }, + { + "epoch": 1.6096720464074434, + "grad_norm": 0.6098792552947998, + "learning_rate": 0.0001, + "loss": 1.515, + "step": 14013 + }, + { + "epoch": 1.6097869163172707, + "grad_norm": 0.5650305151939392, + "learning_rate": 0.0001, + "loss": 1.5223, + "step": 14014 + }, + { + "epoch": 1.609901786227098, + "grad_norm": 0.6191934943199158, + "learning_rate": 0.0001, + "loss": 1.3989, + "step": 14015 + }, + { + "epoch": 1.610016656136925, + "grad_norm": 0.5842486619949341, + "learning_rate": 0.0001, + "loss": 1.5828, + "step": 14016 + }, + { + "epoch": 1.610131526046752, + "grad_norm": 0.6919946670532227, + "learning_rate": 0.0001, + "loss": 1.4848, + "step": 14017 + }, + { + "epoch": 1.6102463959565791, + "grad_norm": 0.5900247693061829, + "learning_rate": 0.0001, + "loss": 1.5379, + "step": 14018 + }, + { + "epoch": 1.6103612658664064, + "grad_norm": 0.5633094906806946, + "learning_rate": 0.0001, + "loss": 1.2923, + "step": 14019 + }, + { + "epoch": 1.6104761357762334, + "grad_norm": 0.6042163968086243, + "learning_rate": 0.0001, + "loss": 1.5307, + "step": 14020 + }, + { + "epoch": 1.6105910056860604, + "grad_norm": 0.6642841100692749, + "learning_rate": 0.0001, + "loss": 1.4918, + "step": 14021 + }, + { + "epoch": 1.6107058755958876, + "grad_norm": 0.7833337783813477, + "learning_rate": 0.0001, + "loss": 1.4682, + "step": 14022 + }, + { + "epoch": 1.6108207455057149, + "grad_norm": 0.5767188668251038, + "learning_rate": 0.0001, + "loss": 1.4422, + "step": 14023 + }, + { + "epoch": 1.6109356154155419, + "grad_norm": 0.5754833817481995, + "learning_rate": 0.0001, + "loss": 1.4488, + "step": 14024 + }, + { + "epoch": 1.6110504853253689, + "grad_norm": 0.6136239171028137, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 14025 + }, + { + "epoch": 1.611165355235196, + "grad_norm": 0.6007016897201538, + "learning_rate": 0.0001, + "loss": 1.5565, + "step": 14026 + }, + { + "epoch": 1.6112802251450233, + "grad_norm": 0.5736799836158752, + "learning_rate": 0.0001, + "loss": 1.4608, + "step": 14027 + }, + { + "epoch": 1.6113950950548503, + "grad_norm": 0.5962418913841248, + "learning_rate": 0.0001, + "loss": 1.4597, + "step": 14028 + }, + { + "epoch": 1.6115099649646774, + "grad_norm": 0.6547172665596008, + "learning_rate": 0.0001, + "loss": 1.4047, + "step": 14029 + }, + { + "epoch": 1.6116248348745046, + "grad_norm": 0.6729933023452759, + "learning_rate": 0.0001, + "loss": 1.57, + "step": 14030 + }, + { + "epoch": 1.6117397047843318, + "grad_norm": 0.5763393640518188, + "learning_rate": 0.0001, + "loss": 1.6013, + "step": 14031 + }, + { + "epoch": 1.6118545746941588, + "grad_norm": 0.6308915019035339, + "learning_rate": 0.0001, + "loss": 1.5667, + "step": 14032 + }, + { + "epoch": 1.6119694446039858, + "grad_norm": 0.5580734610557556, + "learning_rate": 0.0001, + "loss": 1.4442, + "step": 14033 + }, + { + "epoch": 1.612084314513813, + "grad_norm": 0.5644499063491821, + "learning_rate": 0.0001, + "loss": 1.3308, + "step": 14034 + }, + { + "epoch": 1.6121991844236403, + "grad_norm": 0.6455258131027222, + "learning_rate": 0.0001, + "loss": 1.547, + "step": 14035 + }, + { + "epoch": 1.6123140543334673, + "grad_norm": 0.596584677696228, + "learning_rate": 0.0001, + "loss": 1.3524, + "step": 14036 + }, + { + "epoch": 1.6124289242432943, + "grad_norm": 0.7680908441543579, + "learning_rate": 0.0001, + "loss": 1.5657, + "step": 14037 + }, + { + "epoch": 1.6125437941531215, + "grad_norm": 0.661892294883728, + "learning_rate": 0.0001, + "loss": 1.5992, + "step": 14038 + }, + { + "epoch": 1.6126586640629488, + "grad_norm": 0.5909493565559387, + "learning_rate": 0.0001, + "loss": 1.3543, + "step": 14039 + }, + { + "epoch": 1.6127735339727758, + "grad_norm": 0.5718796849250793, + "learning_rate": 0.0001, + "loss": 1.191, + "step": 14040 + }, + { + "epoch": 1.6128884038826028, + "grad_norm": 0.6069730520248413, + "learning_rate": 0.0001, + "loss": 1.5217, + "step": 14041 + }, + { + "epoch": 1.61300327379243, + "grad_norm": 0.6135676503181458, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 14042 + }, + { + "epoch": 1.6131181437022573, + "grad_norm": 0.5874907970428467, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 14043 + }, + { + "epoch": 1.6132330136120843, + "grad_norm": 0.6142223477363586, + "learning_rate": 0.0001, + "loss": 1.3995, + "step": 14044 + }, + { + "epoch": 1.6133478835219113, + "grad_norm": 0.6572316884994507, + "learning_rate": 0.0001, + "loss": 1.6148, + "step": 14045 + }, + { + "epoch": 1.6134627534317385, + "grad_norm": 0.6865919828414917, + "learning_rate": 0.0001, + "loss": 1.4198, + "step": 14046 + }, + { + "epoch": 1.6135776233415657, + "grad_norm": 0.57161945104599, + "learning_rate": 0.0001, + "loss": 1.3409, + "step": 14047 + }, + { + "epoch": 1.6136924932513927, + "grad_norm": 0.672649085521698, + "learning_rate": 0.0001, + "loss": 1.6139, + "step": 14048 + }, + { + "epoch": 1.6138073631612198, + "grad_norm": 0.5908873677253723, + "learning_rate": 0.0001, + "loss": 1.3198, + "step": 14049 + }, + { + "epoch": 1.613922233071047, + "grad_norm": 0.7088580131530762, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 14050 + }, + { + "epoch": 1.6140371029808742, + "grad_norm": 0.6201817393302917, + "learning_rate": 0.0001, + "loss": 1.3589, + "step": 14051 + }, + { + "epoch": 1.6141519728907012, + "grad_norm": 0.584344208240509, + "learning_rate": 0.0001, + "loss": 1.4533, + "step": 14052 + }, + { + "epoch": 1.6142668428005285, + "grad_norm": 0.5732455849647522, + "learning_rate": 0.0001, + "loss": 1.4879, + "step": 14053 + }, + { + "epoch": 1.6143817127103555, + "grad_norm": 0.5996015071868896, + "learning_rate": 0.0001, + "loss": 1.3556, + "step": 14054 + }, + { + "epoch": 1.6144965826201827, + "grad_norm": 0.633338987827301, + "learning_rate": 0.0001, + "loss": 1.3706, + "step": 14055 + }, + { + "epoch": 1.61461145253001, + "grad_norm": 0.6130332350730896, + "learning_rate": 0.0001, + "loss": 1.5321, + "step": 14056 + }, + { + "epoch": 1.614726322439837, + "grad_norm": 0.5730156898498535, + "learning_rate": 0.0001, + "loss": 1.2704, + "step": 14057 + }, + { + "epoch": 1.614841192349664, + "grad_norm": 0.5948575139045715, + "learning_rate": 0.0001, + "loss": 1.5801, + "step": 14058 + }, + { + "epoch": 1.6149560622594912, + "grad_norm": 0.6097572445869446, + "learning_rate": 0.0001, + "loss": 1.3788, + "step": 14059 + }, + { + "epoch": 1.6150709321693184, + "grad_norm": 0.5689347386360168, + "learning_rate": 0.0001, + "loss": 1.3254, + "step": 14060 + }, + { + "epoch": 1.6151858020791454, + "grad_norm": 0.6363088488578796, + "learning_rate": 0.0001, + "loss": 1.5221, + "step": 14061 + }, + { + "epoch": 1.6153006719889724, + "grad_norm": 0.6014930605888367, + "learning_rate": 0.0001, + "loss": 1.5373, + "step": 14062 + }, + { + "epoch": 1.6154155418987997, + "grad_norm": 0.5899112224578857, + "learning_rate": 0.0001, + "loss": 1.4386, + "step": 14063 + }, + { + "epoch": 1.6155304118086269, + "grad_norm": 0.7439025044441223, + "learning_rate": 0.0001, + "loss": 1.7865, + "step": 14064 + }, + { + "epoch": 1.615645281718454, + "grad_norm": 0.6421499252319336, + "learning_rate": 0.0001, + "loss": 1.5911, + "step": 14065 + }, + { + "epoch": 1.615760151628281, + "grad_norm": 0.6733357310295105, + "learning_rate": 0.0001, + "loss": 1.506, + "step": 14066 + }, + { + "epoch": 1.6158750215381081, + "grad_norm": 0.5812866687774658, + "learning_rate": 0.0001, + "loss": 1.3306, + "step": 14067 + }, + { + "epoch": 1.6159898914479354, + "grad_norm": 0.6346286535263062, + "learning_rate": 0.0001, + "loss": 1.5606, + "step": 14068 + }, + { + "epoch": 1.6161047613577624, + "grad_norm": 0.6660323739051819, + "learning_rate": 0.0001, + "loss": 1.5526, + "step": 14069 + }, + { + "epoch": 1.6162196312675894, + "grad_norm": 0.6062818169593811, + "learning_rate": 0.0001, + "loss": 1.4311, + "step": 14070 + }, + { + "epoch": 1.6163345011774166, + "grad_norm": 0.6203289031982422, + "learning_rate": 0.0001, + "loss": 1.5723, + "step": 14071 + }, + { + "epoch": 1.6164493710872438, + "grad_norm": 0.6779844760894775, + "learning_rate": 0.0001, + "loss": 1.3864, + "step": 14072 + }, + { + "epoch": 1.6165642409970709, + "grad_norm": 0.6042460203170776, + "learning_rate": 0.0001, + "loss": 1.5513, + "step": 14073 + }, + { + "epoch": 1.6166791109068979, + "grad_norm": 0.6318661570549011, + "learning_rate": 0.0001, + "loss": 1.3761, + "step": 14074 + }, + { + "epoch": 1.616793980816725, + "grad_norm": 0.579138457775116, + "learning_rate": 0.0001, + "loss": 1.3566, + "step": 14075 + }, + { + "epoch": 1.6169088507265523, + "grad_norm": 0.6678400635719299, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 14076 + }, + { + "epoch": 1.6170237206363793, + "grad_norm": 0.6432377099990845, + "learning_rate": 0.0001, + "loss": 1.3636, + "step": 14077 + }, + { + "epoch": 1.6171385905462063, + "grad_norm": 0.6431517004966736, + "learning_rate": 0.0001, + "loss": 1.4582, + "step": 14078 + }, + { + "epoch": 1.6172534604560336, + "grad_norm": 0.6440772414207458, + "learning_rate": 0.0001, + "loss": 1.3715, + "step": 14079 + }, + { + "epoch": 1.6173683303658608, + "grad_norm": 0.5892122387886047, + "learning_rate": 0.0001, + "loss": 1.4429, + "step": 14080 + }, + { + "epoch": 1.6174832002756878, + "grad_norm": 0.5709667801856995, + "learning_rate": 0.0001, + "loss": 1.4392, + "step": 14081 + }, + { + "epoch": 1.6175980701855148, + "grad_norm": 0.5939996242523193, + "learning_rate": 0.0001, + "loss": 1.4266, + "step": 14082 + }, + { + "epoch": 1.617712940095342, + "grad_norm": 0.6955810785293579, + "learning_rate": 0.0001, + "loss": 1.453, + "step": 14083 + }, + { + "epoch": 1.6178278100051693, + "grad_norm": 0.6132066249847412, + "learning_rate": 0.0001, + "loss": 1.5033, + "step": 14084 + }, + { + "epoch": 1.6179426799149963, + "grad_norm": 0.5954875946044922, + "learning_rate": 0.0001, + "loss": 1.4841, + "step": 14085 + }, + { + "epoch": 1.6180575498248233, + "grad_norm": 0.6023247838020325, + "learning_rate": 0.0001, + "loss": 1.3852, + "step": 14086 + }, + { + "epoch": 1.6181724197346505, + "grad_norm": 0.7028385400772095, + "learning_rate": 0.0001, + "loss": 1.5218, + "step": 14087 + }, + { + "epoch": 1.6182872896444778, + "grad_norm": 0.6407709121704102, + "learning_rate": 0.0001, + "loss": 1.6185, + "step": 14088 + }, + { + "epoch": 1.6184021595543048, + "grad_norm": 0.5889942049980164, + "learning_rate": 0.0001, + "loss": 1.5281, + "step": 14089 + }, + { + "epoch": 1.6185170294641318, + "grad_norm": 0.6771349906921387, + "learning_rate": 0.0001, + "loss": 1.4593, + "step": 14090 + }, + { + "epoch": 1.618631899373959, + "grad_norm": 0.6546837687492371, + "learning_rate": 0.0001, + "loss": 1.4992, + "step": 14091 + }, + { + "epoch": 1.6187467692837862, + "grad_norm": 0.5943772196769714, + "learning_rate": 0.0001, + "loss": 1.5061, + "step": 14092 + }, + { + "epoch": 1.6188616391936133, + "grad_norm": 0.5858648419380188, + "learning_rate": 0.0001, + "loss": 1.3868, + "step": 14093 + }, + { + "epoch": 1.6189765091034403, + "grad_norm": 0.6995881795883179, + "learning_rate": 0.0001, + "loss": 1.7393, + "step": 14094 + }, + { + "epoch": 1.6190913790132675, + "grad_norm": 0.6692748069763184, + "learning_rate": 0.0001, + "loss": 1.4476, + "step": 14095 + }, + { + "epoch": 1.6192062489230947, + "grad_norm": 0.5856147408485413, + "learning_rate": 0.0001, + "loss": 1.3743, + "step": 14096 + }, + { + "epoch": 1.6193211188329217, + "grad_norm": 0.5996282696723938, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 14097 + }, + { + "epoch": 1.6194359887427487, + "grad_norm": 0.6970905065536499, + "learning_rate": 0.0001, + "loss": 1.5738, + "step": 14098 + }, + { + "epoch": 1.619550858652576, + "grad_norm": 0.6515586972236633, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 14099 + }, + { + "epoch": 1.6196657285624032, + "grad_norm": 0.5993524789810181, + "learning_rate": 0.0001, + "loss": 1.4535, + "step": 14100 + }, + { + "epoch": 1.6197805984722302, + "grad_norm": 0.5940983295440674, + "learning_rate": 0.0001, + "loss": 1.4715, + "step": 14101 + }, + { + "epoch": 1.6198954683820572, + "grad_norm": 0.585989236831665, + "learning_rate": 0.0001, + "loss": 1.4361, + "step": 14102 + }, + { + "epoch": 1.6200103382918845, + "grad_norm": 0.5850023031234741, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 14103 + }, + { + "epoch": 1.6201252082017117, + "grad_norm": 0.5683290362358093, + "learning_rate": 0.0001, + "loss": 1.1528, + "step": 14104 + }, + { + "epoch": 1.6202400781115387, + "grad_norm": 0.5724591016769409, + "learning_rate": 0.0001, + "loss": 1.4089, + "step": 14105 + }, + { + "epoch": 1.6203549480213657, + "grad_norm": 0.7932894825935364, + "learning_rate": 0.0001, + "loss": 1.6699, + "step": 14106 + }, + { + "epoch": 1.620469817931193, + "grad_norm": 0.6048719882965088, + "learning_rate": 0.0001, + "loss": 1.4477, + "step": 14107 + }, + { + "epoch": 1.6205846878410202, + "grad_norm": 0.6853838562965393, + "learning_rate": 0.0001, + "loss": 1.5403, + "step": 14108 + }, + { + "epoch": 1.6206995577508472, + "grad_norm": 0.7041744589805603, + "learning_rate": 0.0001, + "loss": 1.5995, + "step": 14109 + }, + { + "epoch": 1.6208144276606742, + "grad_norm": 0.6290716528892517, + "learning_rate": 0.0001, + "loss": 1.5163, + "step": 14110 + }, + { + "epoch": 1.6209292975705014, + "grad_norm": 0.5670915842056274, + "learning_rate": 0.0001, + "loss": 1.3655, + "step": 14111 + }, + { + "epoch": 1.6210441674803286, + "grad_norm": 0.5751041769981384, + "learning_rate": 0.0001, + "loss": 1.2977, + "step": 14112 + }, + { + "epoch": 1.6211590373901557, + "grad_norm": 0.5855926275253296, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 14113 + }, + { + "epoch": 1.6212739072999827, + "grad_norm": 0.6567171812057495, + "learning_rate": 0.0001, + "loss": 1.3936, + "step": 14114 + }, + { + "epoch": 1.62138877720981, + "grad_norm": 0.5859187841415405, + "learning_rate": 0.0001, + "loss": 1.448, + "step": 14115 + }, + { + "epoch": 1.6215036471196371, + "grad_norm": 0.5759543180465698, + "learning_rate": 0.0001, + "loss": 1.4566, + "step": 14116 + }, + { + "epoch": 1.6216185170294641, + "grad_norm": 0.5698956847190857, + "learning_rate": 0.0001, + "loss": 1.3733, + "step": 14117 + }, + { + "epoch": 1.6217333869392911, + "grad_norm": 0.6352313160896301, + "learning_rate": 0.0001, + "loss": 1.3525, + "step": 14118 + }, + { + "epoch": 1.6218482568491184, + "grad_norm": 0.6209557056427002, + "learning_rate": 0.0001, + "loss": 1.2902, + "step": 14119 + }, + { + "epoch": 1.6219631267589456, + "grad_norm": 0.6741890907287598, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 14120 + }, + { + "epoch": 1.6220779966687726, + "grad_norm": 0.5992792248725891, + "learning_rate": 0.0001, + "loss": 1.3194, + "step": 14121 + }, + { + "epoch": 1.6221928665785996, + "grad_norm": 0.6535847187042236, + "learning_rate": 0.0001, + "loss": 1.285, + "step": 14122 + }, + { + "epoch": 1.6223077364884269, + "grad_norm": 0.5931274890899658, + "learning_rate": 0.0001, + "loss": 1.5578, + "step": 14123 + }, + { + "epoch": 1.622422606398254, + "grad_norm": 0.6604529023170471, + "learning_rate": 0.0001, + "loss": 1.4776, + "step": 14124 + }, + { + "epoch": 1.622537476308081, + "grad_norm": 0.6257154941558838, + "learning_rate": 0.0001, + "loss": 1.6617, + "step": 14125 + }, + { + "epoch": 1.622652346217908, + "grad_norm": 0.6580725908279419, + "learning_rate": 0.0001, + "loss": 1.1384, + "step": 14126 + }, + { + "epoch": 1.6227672161277353, + "grad_norm": 0.575198233127594, + "learning_rate": 0.0001, + "loss": 1.3248, + "step": 14127 + }, + { + "epoch": 1.6228820860375626, + "grad_norm": 0.591127872467041, + "learning_rate": 0.0001, + "loss": 1.3635, + "step": 14128 + }, + { + "epoch": 1.6229969559473896, + "grad_norm": 0.6647173166275024, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 14129 + }, + { + "epoch": 1.6231118258572166, + "grad_norm": 0.5722182393074036, + "learning_rate": 0.0001, + "loss": 1.4466, + "step": 14130 + }, + { + "epoch": 1.6232266957670438, + "grad_norm": 0.5584431886672974, + "learning_rate": 0.0001, + "loss": 1.5135, + "step": 14131 + }, + { + "epoch": 1.623341565676871, + "grad_norm": 0.6139621138572693, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 14132 + }, + { + "epoch": 1.623456435586698, + "grad_norm": 0.6344627737998962, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 14133 + }, + { + "epoch": 1.623571305496525, + "grad_norm": 0.6031376123428345, + "learning_rate": 0.0001, + "loss": 1.2777, + "step": 14134 + }, + { + "epoch": 1.6236861754063523, + "grad_norm": 0.6405968070030212, + "learning_rate": 0.0001, + "loss": 1.5615, + "step": 14135 + }, + { + "epoch": 1.6238010453161795, + "grad_norm": 0.7221595644950867, + "learning_rate": 0.0001, + "loss": 1.2492, + "step": 14136 + }, + { + "epoch": 1.6239159152260065, + "grad_norm": 0.5714577436447144, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 14137 + }, + { + "epoch": 1.6240307851358335, + "grad_norm": 0.607839047908783, + "learning_rate": 0.0001, + "loss": 1.5385, + "step": 14138 + }, + { + "epoch": 1.6241456550456608, + "grad_norm": 0.6695128679275513, + "learning_rate": 0.0001, + "loss": 1.5084, + "step": 14139 + }, + { + "epoch": 1.624260524955488, + "grad_norm": 0.6498796343803406, + "learning_rate": 0.0001, + "loss": 1.5016, + "step": 14140 + }, + { + "epoch": 1.624375394865315, + "grad_norm": 0.582234799861908, + "learning_rate": 0.0001, + "loss": 1.2691, + "step": 14141 + }, + { + "epoch": 1.624490264775142, + "grad_norm": 0.6602511405944824, + "learning_rate": 0.0001, + "loss": 1.445, + "step": 14142 + }, + { + "epoch": 1.6246051346849693, + "grad_norm": 0.6686058640480042, + "learning_rate": 0.0001, + "loss": 1.4448, + "step": 14143 + }, + { + "epoch": 1.6247200045947965, + "grad_norm": 0.65159010887146, + "learning_rate": 0.0001, + "loss": 1.428, + "step": 14144 + }, + { + "epoch": 1.6248348745046235, + "grad_norm": 0.6343734860420227, + "learning_rate": 0.0001, + "loss": 1.565, + "step": 14145 + }, + { + "epoch": 1.6249497444144505, + "grad_norm": 0.5925803184509277, + "learning_rate": 0.0001, + "loss": 1.4588, + "step": 14146 + }, + { + "epoch": 1.6250646143242777, + "grad_norm": 0.593427300453186, + "learning_rate": 0.0001, + "loss": 1.3529, + "step": 14147 + }, + { + "epoch": 1.625179484234105, + "grad_norm": 0.6615251302719116, + "learning_rate": 0.0001, + "loss": 1.466, + "step": 14148 + }, + { + "epoch": 1.625294354143932, + "grad_norm": 0.6435301899909973, + "learning_rate": 0.0001, + "loss": 1.4284, + "step": 14149 + }, + { + "epoch": 1.625409224053759, + "grad_norm": 0.5664091110229492, + "learning_rate": 0.0001, + "loss": 1.3037, + "step": 14150 + }, + { + "epoch": 1.6255240939635862, + "grad_norm": 0.5707288980484009, + "learning_rate": 0.0001, + "loss": 1.4152, + "step": 14151 + }, + { + "epoch": 1.6256389638734134, + "grad_norm": 0.6005744934082031, + "learning_rate": 0.0001, + "loss": 1.4547, + "step": 14152 + }, + { + "epoch": 1.6257538337832405, + "grad_norm": 0.6113733649253845, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 14153 + }, + { + "epoch": 1.6258687036930675, + "grad_norm": 0.6435218453407288, + "learning_rate": 0.0001, + "loss": 1.448, + "step": 14154 + }, + { + "epoch": 1.6259835736028947, + "grad_norm": 0.6765085458755493, + "learning_rate": 0.0001, + "loss": 1.3975, + "step": 14155 + }, + { + "epoch": 1.626098443512722, + "grad_norm": 0.6617031693458557, + "learning_rate": 0.0001, + "loss": 1.5578, + "step": 14156 + }, + { + "epoch": 1.626213313422549, + "grad_norm": 0.6027497053146362, + "learning_rate": 0.0001, + "loss": 1.3812, + "step": 14157 + }, + { + "epoch": 1.626328183332376, + "grad_norm": 0.6761353611946106, + "learning_rate": 0.0001, + "loss": 1.5668, + "step": 14158 + }, + { + "epoch": 1.6264430532422032, + "grad_norm": 0.6267996430397034, + "learning_rate": 0.0001, + "loss": 1.5065, + "step": 14159 + }, + { + "epoch": 1.6265579231520304, + "grad_norm": 0.6282928586006165, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 14160 + }, + { + "epoch": 1.6266727930618574, + "grad_norm": 0.5661084651947021, + "learning_rate": 0.0001, + "loss": 1.4419, + "step": 14161 + }, + { + "epoch": 1.6267876629716844, + "grad_norm": 0.6607252955436707, + "learning_rate": 0.0001, + "loss": 1.5683, + "step": 14162 + }, + { + "epoch": 1.6269025328815117, + "grad_norm": 0.6275922060012817, + "learning_rate": 0.0001, + "loss": 1.2246, + "step": 14163 + }, + { + "epoch": 1.6270174027913389, + "grad_norm": 0.5838649272918701, + "learning_rate": 0.0001, + "loss": 1.4886, + "step": 14164 + }, + { + "epoch": 1.627132272701166, + "grad_norm": 0.627335786819458, + "learning_rate": 0.0001, + "loss": 1.3975, + "step": 14165 + }, + { + "epoch": 1.627247142610993, + "grad_norm": 0.6400899291038513, + "learning_rate": 0.0001, + "loss": 1.3117, + "step": 14166 + }, + { + "epoch": 1.6273620125208201, + "grad_norm": 0.6840837597846985, + "learning_rate": 0.0001, + "loss": 1.6182, + "step": 14167 + }, + { + "epoch": 1.6274768824306474, + "grad_norm": 0.6731758713722229, + "learning_rate": 0.0001, + "loss": 1.3957, + "step": 14168 + }, + { + "epoch": 1.6275917523404744, + "grad_norm": 0.6186261177062988, + "learning_rate": 0.0001, + "loss": 1.3614, + "step": 14169 + }, + { + "epoch": 1.6277066222503014, + "grad_norm": 0.6093758344650269, + "learning_rate": 0.0001, + "loss": 1.4298, + "step": 14170 + }, + { + "epoch": 1.6278214921601286, + "grad_norm": 0.8518862128257751, + "learning_rate": 0.0001, + "loss": 1.1226, + "step": 14171 + }, + { + "epoch": 1.6279363620699558, + "grad_norm": 0.6340732574462891, + "learning_rate": 0.0001, + "loss": 1.55, + "step": 14172 + }, + { + "epoch": 1.6280512319797829, + "grad_norm": 0.5993664264678955, + "learning_rate": 0.0001, + "loss": 1.4323, + "step": 14173 + }, + { + "epoch": 1.6281661018896099, + "grad_norm": 0.6589725017547607, + "learning_rate": 0.0001, + "loss": 1.321, + "step": 14174 + }, + { + "epoch": 1.628280971799437, + "grad_norm": 0.668163001537323, + "learning_rate": 0.0001, + "loss": 1.3883, + "step": 14175 + }, + { + "epoch": 1.6283958417092643, + "grad_norm": 0.6377967000007629, + "learning_rate": 0.0001, + "loss": 1.3221, + "step": 14176 + }, + { + "epoch": 1.6285107116190913, + "grad_norm": 0.7198302149772644, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 14177 + }, + { + "epoch": 1.6286255815289183, + "grad_norm": 0.6226831078529358, + "learning_rate": 0.0001, + "loss": 1.533, + "step": 14178 + }, + { + "epoch": 1.6287404514387456, + "grad_norm": 0.6308751106262207, + "learning_rate": 0.0001, + "loss": 1.3146, + "step": 14179 + }, + { + "epoch": 1.6288553213485728, + "grad_norm": 0.6031596064567566, + "learning_rate": 0.0001, + "loss": 1.4562, + "step": 14180 + }, + { + "epoch": 1.6289701912583998, + "grad_norm": 0.6148450374603271, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 14181 + }, + { + "epoch": 1.6290850611682268, + "grad_norm": 0.6116688847541809, + "learning_rate": 0.0001, + "loss": 1.5069, + "step": 14182 + }, + { + "epoch": 1.629199931078054, + "grad_norm": 0.594800591468811, + "learning_rate": 0.0001, + "loss": 1.5319, + "step": 14183 + }, + { + "epoch": 1.6293148009878813, + "grad_norm": 0.5846831202507019, + "learning_rate": 0.0001, + "loss": 1.371, + "step": 14184 + }, + { + "epoch": 1.6294296708977083, + "grad_norm": 0.5682091116905212, + "learning_rate": 0.0001, + "loss": 1.2561, + "step": 14185 + }, + { + "epoch": 1.6295445408075353, + "grad_norm": 0.6536598801612854, + "learning_rate": 0.0001, + "loss": 1.5194, + "step": 14186 + }, + { + "epoch": 1.6296594107173625, + "grad_norm": 0.6191934943199158, + "learning_rate": 0.0001, + "loss": 1.5155, + "step": 14187 + }, + { + "epoch": 1.6297742806271898, + "grad_norm": 0.5823689699172974, + "learning_rate": 0.0001, + "loss": 1.4693, + "step": 14188 + }, + { + "epoch": 1.6298891505370168, + "grad_norm": 0.6082186698913574, + "learning_rate": 0.0001, + "loss": 1.4571, + "step": 14189 + }, + { + "epoch": 1.630004020446844, + "grad_norm": 0.6357173919677734, + "learning_rate": 0.0001, + "loss": 1.2842, + "step": 14190 + }, + { + "epoch": 1.630118890356671, + "grad_norm": 0.5339798331260681, + "learning_rate": 0.0001, + "loss": 1.271, + "step": 14191 + }, + { + "epoch": 1.6302337602664982, + "grad_norm": 0.6120511889457703, + "learning_rate": 0.0001, + "loss": 1.3902, + "step": 14192 + }, + { + "epoch": 1.6303486301763255, + "grad_norm": 0.6843846440315247, + "learning_rate": 0.0001, + "loss": 1.4445, + "step": 14193 + }, + { + "epoch": 1.6304635000861525, + "grad_norm": 0.5969381928443909, + "learning_rate": 0.0001, + "loss": 1.4672, + "step": 14194 + }, + { + "epoch": 1.6305783699959795, + "grad_norm": 0.621590793132782, + "learning_rate": 0.0001, + "loss": 1.4124, + "step": 14195 + }, + { + "epoch": 1.6306932399058067, + "grad_norm": 0.6120715737342834, + "learning_rate": 0.0001, + "loss": 1.5246, + "step": 14196 + }, + { + "epoch": 1.630808109815634, + "grad_norm": 0.6380988955497742, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 14197 + }, + { + "epoch": 1.630922979725461, + "grad_norm": 0.6133922338485718, + "learning_rate": 0.0001, + "loss": 1.4572, + "step": 14198 + }, + { + "epoch": 1.631037849635288, + "grad_norm": 0.6428544521331787, + "learning_rate": 0.0001, + "loss": 1.3983, + "step": 14199 + }, + { + "epoch": 1.6311527195451152, + "grad_norm": 0.5923104286193848, + "learning_rate": 0.0001, + "loss": 1.3451, + "step": 14200 + }, + { + "epoch": 1.6312675894549424, + "grad_norm": 0.6585755348205566, + "learning_rate": 0.0001, + "loss": 1.4829, + "step": 14201 + }, + { + "epoch": 1.6313824593647694, + "grad_norm": 0.5719717144966125, + "learning_rate": 0.0001, + "loss": 1.4134, + "step": 14202 + }, + { + "epoch": 1.6314973292745965, + "grad_norm": 0.6237934231758118, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 14203 + }, + { + "epoch": 1.6316121991844237, + "grad_norm": 0.6467723250389099, + "learning_rate": 0.0001, + "loss": 1.3684, + "step": 14204 + }, + { + "epoch": 1.631727069094251, + "grad_norm": 0.7436313033103943, + "learning_rate": 0.0001, + "loss": 1.6965, + "step": 14205 + }, + { + "epoch": 1.631841939004078, + "grad_norm": 0.6541860699653625, + "learning_rate": 0.0001, + "loss": 1.5145, + "step": 14206 + }, + { + "epoch": 1.631956808913905, + "grad_norm": 0.6902755498886108, + "learning_rate": 0.0001, + "loss": 1.7396, + "step": 14207 + }, + { + "epoch": 1.6320716788237322, + "grad_norm": 0.5739198923110962, + "learning_rate": 0.0001, + "loss": 1.4084, + "step": 14208 + }, + { + "epoch": 1.6321865487335594, + "grad_norm": 0.6371223330497742, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 14209 + }, + { + "epoch": 1.6323014186433864, + "grad_norm": 0.5753113031387329, + "learning_rate": 0.0001, + "loss": 1.3629, + "step": 14210 + }, + { + "epoch": 1.6324162885532134, + "grad_norm": 0.5805362462997437, + "learning_rate": 0.0001, + "loss": 1.5456, + "step": 14211 + }, + { + "epoch": 1.6325311584630406, + "grad_norm": 0.5889348983764648, + "learning_rate": 0.0001, + "loss": 1.331, + "step": 14212 + }, + { + "epoch": 1.6326460283728679, + "grad_norm": 0.6165045499801636, + "learning_rate": 0.0001, + "loss": 1.5128, + "step": 14213 + }, + { + "epoch": 1.6327608982826949, + "grad_norm": 0.6740553975105286, + "learning_rate": 0.0001, + "loss": 1.4898, + "step": 14214 + }, + { + "epoch": 1.632875768192522, + "grad_norm": 0.6100121140480042, + "learning_rate": 0.0001, + "loss": 1.3378, + "step": 14215 + }, + { + "epoch": 1.6329906381023491, + "grad_norm": 0.6665295958518982, + "learning_rate": 0.0001, + "loss": 1.5055, + "step": 14216 + }, + { + "epoch": 1.6331055080121764, + "grad_norm": 0.574766218662262, + "learning_rate": 0.0001, + "loss": 1.4792, + "step": 14217 + }, + { + "epoch": 1.6332203779220034, + "grad_norm": 0.6160637140274048, + "learning_rate": 0.0001, + "loss": 1.529, + "step": 14218 + }, + { + "epoch": 1.6333352478318304, + "grad_norm": 0.6245967745780945, + "learning_rate": 0.0001, + "loss": 1.5056, + "step": 14219 + }, + { + "epoch": 1.6334501177416576, + "grad_norm": 0.6003860235214233, + "learning_rate": 0.0001, + "loss": 1.4246, + "step": 14220 + }, + { + "epoch": 1.6335649876514848, + "grad_norm": 0.5847166776657104, + "learning_rate": 0.0001, + "loss": 1.3012, + "step": 14221 + }, + { + "epoch": 1.6336798575613118, + "grad_norm": 0.567745566368103, + "learning_rate": 0.0001, + "loss": 1.3486, + "step": 14222 + }, + { + "epoch": 1.6337947274711389, + "grad_norm": 0.7059317231178284, + "learning_rate": 0.0001, + "loss": 1.4865, + "step": 14223 + }, + { + "epoch": 1.633909597380966, + "grad_norm": 0.6010088920593262, + "learning_rate": 0.0001, + "loss": 1.4275, + "step": 14224 + }, + { + "epoch": 1.6340244672907933, + "grad_norm": 0.7067935466766357, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 14225 + }, + { + "epoch": 1.6341393372006203, + "grad_norm": 0.6423289775848389, + "learning_rate": 0.0001, + "loss": 1.4735, + "step": 14226 + }, + { + "epoch": 1.6342542071104473, + "grad_norm": 0.6251084804534912, + "learning_rate": 0.0001, + "loss": 1.3356, + "step": 14227 + }, + { + "epoch": 1.6343690770202746, + "grad_norm": 0.6532206535339355, + "learning_rate": 0.0001, + "loss": 1.3189, + "step": 14228 + }, + { + "epoch": 1.6344839469301018, + "grad_norm": 0.5804827213287354, + "learning_rate": 0.0001, + "loss": 1.527, + "step": 14229 + }, + { + "epoch": 1.6345988168399288, + "grad_norm": 0.6210793256759644, + "learning_rate": 0.0001, + "loss": 1.4906, + "step": 14230 + }, + { + "epoch": 1.6347136867497558, + "grad_norm": 0.5873556137084961, + "learning_rate": 0.0001, + "loss": 1.2527, + "step": 14231 + }, + { + "epoch": 1.634828556659583, + "grad_norm": 0.5766886472702026, + "learning_rate": 0.0001, + "loss": 1.4624, + "step": 14232 + }, + { + "epoch": 1.6349434265694103, + "grad_norm": 0.6255853772163391, + "learning_rate": 0.0001, + "loss": 1.6646, + "step": 14233 + }, + { + "epoch": 1.6350582964792373, + "grad_norm": 0.6151155829429626, + "learning_rate": 0.0001, + "loss": 1.4379, + "step": 14234 + }, + { + "epoch": 1.6351731663890643, + "grad_norm": 0.602134644985199, + "learning_rate": 0.0001, + "loss": 1.4381, + "step": 14235 + }, + { + "epoch": 1.6352880362988915, + "grad_norm": 0.5812894105911255, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 14236 + }, + { + "epoch": 1.6354029062087188, + "grad_norm": 0.5972526669502258, + "learning_rate": 0.0001, + "loss": 1.5256, + "step": 14237 + }, + { + "epoch": 1.6355177761185458, + "grad_norm": 0.6522207260131836, + "learning_rate": 0.0001, + "loss": 1.4439, + "step": 14238 + }, + { + "epoch": 1.6356326460283728, + "grad_norm": 0.5848917961120605, + "learning_rate": 0.0001, + "loss": 1.114, + "step": 14239 + }, + { + "epoch": 1.6357475159382, + "grad_norm": 0.6326489448547363, + "learning_rate": 0.0001, + "loss": 1.3317, + "step": 14240 + }, + { + "epoch": 1.6358623858480272, + "grad_norm": 0.6178004741668701, + "learning_rate": 0.0001, + "loss": 1.3889, + "step": 14241 + }, + { + "epoch": 1.6359772557578542, + "grad_norm": 0.6275560259819031, + "learning_rate": 0.0001, + "loss": 1.3722, + "step": 14242 + }, + { + "epoch": 1.6360921256676813, + "grad_norm": 0.5886978507041931, + "learning_rate": 0.0001, + "loss": 1.505, + "step": 14243 + }, + { + "epoch": 1.6362069955775085, + "grad_norm": 0.6700698137283325, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 14244 + }, + { + "epoch": 1.6363218654873357, + "grad_norm": 0.6027423143386841, + "learning_rate": 0.0001, + "loss": 1.5032, + "step": 14245 + }, + { + "epoch": 1.6364367353971627, + "grad_norm": 0.6072428226470947, + "learning_rate": 0.0001, + "loss": 1.4246, + "step": 14246 + }, + { + "epoch": 1.6365516053069897, + "grad_norm": 0.6349037289619446, + "learning_rate": 0.0001, + "loss": 1.0018, + "step": 14247 + }, + { + "epoch": 1.636666475216817, + "grad_norm": 0.6661435961723328, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 14248 + }, + { + "epoch": 1.6367813451266442, + "grad_norm": 0.6801328063011169, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 14249 + }, + { + "epoch": 1.6368962150364712, + "grad_norm": 0.6124873161315918, + "learning_rate": 0.0001, + "loss": 1.3097, + "step": 14250 + }, + { + "epoch": 1.6370110849462982, + "grad_norm": 0.6170512437820435, + "learning_rate": 0.0001, + "loss": 1.3019, + "step": 14251 + }, + { + "epoch": 1.6371259548561254, + "grad_norm": 0.6388773918151855, + "learning_rate": 0.0001, + "loss": 1.6134, + "step": 14252 + }, + { + "epoch": 1.6372408247659527, + "grad_norm": 0.6021920442581177, + "learning_rate": 0.0001, + "loss": 1.483, + "step": 14253 + }, + { + "epoch": 1.6373556946757797, + "grad_norm": 0.6165171265602112, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 14254 + }, + { + "epoch": 1.6374705645856067, + "grad_norm": 0.5764616131782532, + "learning_rate": 0.0001, + "loss": 1.2656, + "step": 14255 + }, + { + "epoch": 1.637585434495434, + "grad_norm": 0.6353535652160645, + "learning_rate": 0.0001, + "loss": 1.4185, + "step": 14256 + }, + { + "epoch": 1.6377003044052612, + "grad_norm": 0.5799005627632141, + "learning_rate": 0.0001, + "loss": 1.6052, + "step": 14257 + }, + { + "epoch": 1.6378151743150882, + "grad_norm": 0.6138670444488525, + "learning_rate": 0.0001, + "loss": 1.4453, + "step": 14258 + }, + { + "epoch": 1.6379300442249152, + "grad_norm": 0.5995337963104248, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 14259 + }, + { + "epoch": 1.6380449141347424, + "grad_norm": 0.5708662271499634, + "learning_rate": 0.0001, + "loss": 1.3962, + "step": 14260 + }, + { + "epoch": 1.6381597840445696, + "grad_norm": 0.6403148174285889, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 14261 + }, + { + "epoch": 1.6382746539543966, + "grad_norm": 0.6167385578155518, + "learning_rate": 0.0001, + "loss": 1.4572, + "step": 14262 + }, + { + "epoch": 1.6383895238642237, + "grad_norm": 0.5929616689682007, + "learning_rate": 0.0001, + "loss": 1.4688, + "step": 14263 + }, + { + "epoch": 1.6385043937740509, + "grad_norm": 0.5963886380195618, + "learning_rate": 0.0001, + "loss": 1.304, + "step": 14264 + }, + { + "epoch": 1.6386192636838781, + "grad_norm": 0.596179723739624, + "learning_rate": 0.0001, + "loss": 1.3922, + "step": 14265 + }, + { + "epoch": 1.6387341335937051, + "grad_norm": 0.6750287413597107, + "learning_rate": 0.0001, + "loss": 1.4107, + "step": 14266 + }, + { + "epoch": 1.6388490035035321, + "grad_norm": 0.6416113376617432, + "learning_rate": 0.0001, + "loss": 1.4031, + "step": 14267 + }, + { + "epoch": 1.6389638734133594, + "grad_norm": 0.547660231590271, + "learning_rate": 0.0001, + "loss": 1.2633, + "step": 14268 + }, + { + "epoch": 1.6390787433231866, + "grad_norm": 0.6302758455276489, + "learning_rate": 0.0001, + "loss": 1.526, + "step": 14269 + }, + { + "epoch": 1.6391936132330136, + "grad_norm": 0.6041958928108215, + "learning_rate": 0.0001, + "loss": 1.5458, + "step": 14270 + }, + { + "epoch": 1.6393084831428406, + "grad_norm": 0.5641599893569946, + "learning_rate": 0.0001, + "loss": 1.2687, + "step": 14271 + }, + { + "epoch": 1.6394233530526678, + "grad_norm": 0.6017992496490479, + "learning_rate": 0.0001, + "loss": 1.4718, + "step": 14272 + }, + { + "epoch": 1.639538222962495, + "grad_norm": 0.5696545243263245, + "learning_rate": 0.0001, + "loss": 1.5084, + "step": 14273 + }, + { + "epoch": 1.639653092872322, + "grad_norm": 0.6044654250144958, + "learning_rate": 0.0001, + "loss": 1.4717, + "step": 14274 + }, + { + "epoch": 1.639767962782149, + "grad_norm": 0.5639116168022156, + "learning_rate": 0.0001, + "loss": 1.4058, + "step": 14275 + }, + { + "epoch": 1.6398828326919763, + "grad_norm": 0.6422322988510132, + "learning_rate": 0.0001, + "loss": 1.5171, + "step": 14276 + }, + { + "epoch": 1.6399977026018036, + "grad_norm": 0.651578426361084, + "learning_rate": 0.0001, + "loss": 1.3275, + "step": 14277 + }, + { + "epoch": 1.6401125725116306, + "grad_norm": 0.6262843012809753, + "learning_rate": 0.0001, + "loss": 1.4298, + "step": 14278 + }, + { + "epoch": 1.6402274424214576, + "grad_norm": 0.6277267336845398, + "learning_rate": 0.0001, + "loss": 1.5122, + "step": 14279 + }, + { + "epoch": 1.6403423123312848, + "grad_norm": 0.6524622440338135, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 14280 + }, + { + "epoch": 1.640457182241112, + "grad_norm": 0.6294464468955994, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 14281 + }, + { + "epoch": 1.640572052150939, + "grad_norm": 0.609076201915741, + "learning_rate": 0.0001, + "loss": 1.4457, + "step": 14282 + }, + { + "epoch": 1.640686922060766, + "grad_norm": 0.5797666907310486, + "learning_rate": 0.0001, + "loss": 1.5167, + "step": 14283 + }, + { + "epoch": 1.6408017919705933, + "grad_norm": 0.593920111656189, + "learning_rate": 0.0001, + "loss": 1.3679, + "step": 14284 + }, + { + "epoch": 1.6409166618804205, + "grad_norm": 0.6230970025062561, + "learning_rate": 0.0001, + "loss": 1.4868, + "step": 14285 + }, + { + "epoch": 1.6410315317902475, + "grad_norm": 0.6537438631057739, + "learning_rate": 0.0001, + "loss": 1.5031, + "step": 14286 + }, + { + "epoch": 1.6411464017000745, + "grad_norm": 0.5811769962310791, + "learning_rate": 0.0001, + "loss": 1.3242, + "step": 14287 + }, + { + "epoch": 1.6412612716099018, + "grad_norm": 0.6686524748802185, + "learning_rate": 0.0001, + "loss": 1.6498, + "step": 14288 + }, + { + "epoch": 1.641376141519729, + "grad_norm": 0.6294060349464417, + "learning_rate": 0.0001, + "loss": 1.4349, + "step": 14289 + }, + { + "epoch": 1.641491011429556, + "grad_norm": 0.6076821684837341, + "learning_rate": 0.0001, + "loss": 1.4389, + "step": 14290 + }, + { + "epoch": 1.641605881339383, + "grad_norm": 0.6137123107910156, + "learning_rate": 0.0001, + "loss": 1.4509, + "step": 14291 + }, + { + "epoch": 1.6417207512492102, + "grad_norm": 0.5927547812461853, + "learning_rate": 0.0001, + "loss": 1.3281, + "step": 14292 + }, + { + "epoch": 1.6418356211590375, + "grad_norm": 0.6271234750747681, + "learning_rate": 0.0001, + "loss": 1.4946, + "step": 14293 + }, + { + "epoch": 1.6419504910688645, + "grad_norm": 0.6225645542144775, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 14294 + }, + { + "epoch": 1.6420653609786915, + "grad_norm": 0.6010716557502747, + "learning_rate": 0.0001, + "loss": 1.4282, + "step": 14295 + }, + { + "epoch": 1.6421802308885187, + "grad_norm": 0.5875716805458069, + "learning_rate": 0.0001, + "loss": 1.4042, + "step": 14296 + }, + { + "epoch": 1.642295100798346, + "grad_norm": 0.5978166460990906, + "learning_rate": 0.0001, + "loss": 1.3037, + "step": 14297 + }, + { + "epoch": 1.642409970708173, + "grad_norm": 0.6238410472869873, + "learning_rate": 0.0001, + "loss": 1.5442, + "step": 14298 + }, + { + "epoch": 1.642524840618, + "grad_norm": 0.6206856966018677, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 14299 + }, + { + "epoch": 1.6426397105278272, + "grad_norm": 0.6737737655639648, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 14300 + }, + { + "epoch": 1.6427545804376544, + "grad_norm": 0.6297151446342468, + "learning_rate": 0.0001, + "loss": 1.5897, + "step": 14301 + }, + { + "epoch": 1.6428694503474814, + "grad_norm": 0.6496241092681885, + "learning_rate": 0.0001, + "loss": 1.5686, + "step": 14302 + }, + { + "epoch": 1.6429843202573085, + "grad_norm": 0.6007269620895386, + "learning_rate": 0.0001, + "loss": 1.4899, + "step": 14303 + }, + { + "epoch": 1.6430991901671357, + "grad_norm": 0.7573440670967102, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 14304 + }, + { + "epoch": 1.643214060076963, + "grad_norm": 0.6531268954277039, + "learning_rate": 0.0001, + "loss": 1.44, + "step": 14305 + }, + { + "epoch": 1.64332892998679, + "grad_norm": 0.6082087159156799, + "learning_rate": 0.0001, + "loss": 1.5434, + "step": 14306 + }, + { + "epoch": 1.643443799896617, + "grad_norm": 0.641038179397583, + "learning_rate": 0.0001, + "loss": 1.4916, + "step": 14307 + }, + { + "epoch": 1.6435586698064442, + "grad_norm": 0.6176828145980835, + "learning_rate": 0.0001, + "loss": 1.3881, + "step": 14308 + }, + { + "epoch": 1.6436735397162714, + "grad_norm": 0.6139614582061768, + "learning_rate": 0.0001, + "loss": 1.4972, + "step": 14309 + }, + { + "epoch": 1.6437884096260984, + "grad_norm": 0.6146692037582397, + "learning_rate": 0.0001, + "loss": 1.3981, + "step": 14310 + }, + { + "epoch": 1.6439032795359254, + "grad_norm": 0.6403799057006836, + "learning_rate": 0.0001, + "loss": 1.5217, + "step": 14311 + }, + { + "epoch": 1.6440181494457526, + "grad_norm": 0.6941229104995728, + "learning_rate": 0.0001, + "loss": 1.3688, + "step": 14312 + }, + { + "epoch": 1.6441330193555799, + "grad_norm": 0.6253212094306946, + "learning_rate": 0.0001, + "loss": 1.4806, + "step": 14313 + }, + { + "epoch": 1.6442478892654069, + "grad_norm": 0.6304717063903809, + "learning_rate": 0.0001, + "loss": 1.3255, + "step": 14314 + }, + { + "epoch": 1.644362759175234, + "grad_norm": 0.6108677387237549, + "learning_rate": 0.0001, + "loss": 1.549, + "step": 14315 + }, + { + "epoch": 1.6444776290850611, + "grad_norm": 0.7766926288604736, + "learning_rate": 0.0001, + "loss": 1.6879, + "step": 14316 + }, + { + "epoch": 1.6445924989948884, + "grad_norm": 0.6032610535621643, + "learning_rate": 0.0001, + "loss": 1.3837, + "step": 14317 + }, + { + "epoch": 1.6447073689047154, + "grad_norm": 0.6643081307411194, + "learning_rate": 0.0001, + "loss": 1.5128, + "step": 14318 + }, + { + "epoch": 1.6448222388145424, + "grad_norm": 0.6397324800491333, + "learning_rate": 0.0001, + "loss": 1.3905, + "step": 14319 + }, + { + "epoch": 1.6449371087243696, + "grad_norm": 0.5864359736442566, + "learning_rate": 0.0001, + "loss": 1.2971, + "step": 14320 + }, + { + "epoch": 1.6450519786341968, + "grad_norm": 0.6553377509117126, + "learning_rate": 0.0001, + "loss": 1.3454, + "step": 14321 + }, + { + "epoch": 1.6451668485440238, + "grad_norm": 0.7063130736351013, + "learning_rate": 0.0001, + "loss": 1.2324, + "step": 14322 + }, + { + "epoch": 1.6452817184538508, + "grad_norm": 0.6475198864936829, + "learning_rate": 0.0001, + "loss": 1.4901, + "step": 14323 + }, + { + "epoch": 1.645396588363678, + "grad_norm": 0.6414164304733276, + "learning_rate": 0.0001, + "loss": 1.4304, + "step": 14324 + }, + { + "epoch": 1.6455114582735053, + "grad_norm": 0.6583581566810608, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 14325 + }, + { + "epoch": 1.6456263281833323, + "grad_norm": 0.6059905290603638, + "learning_rate": 0.0001, + "loss": 1.5552, + "step": 14326 + }, + { + "epoch": 1.6457411980931596, + "grad_norm": 0.6345920562744141, + "learning_rate": 0.0001, + "loss": 1.639, + "step": 14327 + }, + { + "epoch": 1.6458560680029866, + "grad_norm": 0.620621919631958, + "learning_rate": 0.0001, + "loss": 1.5369, + "step": 14328 + }, + { + "epoch": 1.6459709379128138, + "grad_norm": 0.6474280953407288, + "learning_rate": 0.0001, + "loss": 1.4068, + "step": 14329 + }, + { + "epoch": 1.646085807822641, + "grad_norm": 0.5968835949897766, + "learning_rate": 0.0001, + "loss": 1.4271, + "step": 14330 + }, + { + "epoch": 1.646200677732468, + "grad_norm": 0.6671695113182068, + "learning_rate": 0.0001, + "loss": 1.1918, + "step": 14331 + }, + { + "epoch": 1.646315547642295, + "grad_norm": 0.622138500213623, + "learning_rate": 0.0001, + "loss": 1.2681, + "step": 14332 + }, + { + "epoch": 1.6464304175521223, + "grad_norm": 0.587945818901062, + "learning_rate": 0.0001, + "loss": 1.391, + "step": 14333 + }, + { + "epoch": 1.6465452874619495, + "grad_norm": 0.6441559791564941, + "learning_rate": 0.0001, + "loss": 1.5257, + "step": 14334 + }, + { + "epoch": 1.6466601573717765, + "grad_norm": 0.6618092656135559, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 14335 + }, + { + "epoch": 1.6467750272816035, + "grad_norm": 0.5723019242286682, + "learning_rate": 0.0001, + "loss": 1.5012, + "step": 14336 + }, + { + "epoch": 1.6468898971914308, + "grad_norm": 0.6029953956604004, + "learning_rate": 0.0001, + "loss": 1.3561, + "step": 14337 + }, + { + "epoch": 1.647004767101258, + "grad_norm": 0.646026074886322, + "learning_rate": 0.0001, + "loss": 1.5387, + "step": 14338 + }, + { + "epoch": 1.647119637011085, + "grad_norm": 0.6190069913864136, + "learning_rate": 0.0001, + "loss": 1.5599, + "step": 14339 + }, + { + "epoch": 1.647234506920912, + "grad_norm": 0.6896335482597351, + "learning_rate": 0.0001, + "loss": 1.6861, + "step": 14340 + }, + { + "epoch": 1.6473493768307392, + "grad_norm": 0.611846387386322, + "learning_rate": 0.0001, + "loss": 1.4243, + "step": 14341 + }, + { + "epoch": 1.6474642467405665, + "grad_norm": 0.5718992948532104, + "learning_rate": 0.0001, + "loss": 1.2889, + "step": 14342 + }, + { + "epoch": 1.6475791166503935, + "grad_norm": 0.5699307918548584, + "learning_rate": 0.0001, + "loss": 1.4659, + "step": 14343 + }, + { + "epoch": 1.6476939865602205, + "grad_norm": 0.5619608163833618, + "learning_rate": 0.0001, + "loss": 1.2261, + "step": 14344 + }, + { + "epoch": 1.6478088564700477, + "grad_norm": 0.6159090399742126, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 14345 + }, + { + "epoch": 1.647923726379875, + "grad_norm": 0.5962672233581543, + "learning_rate": 0.0001, + "loss": 1.1608, + "step": 14346 + }, + { + "epoch": 1.648038596289702, + "grad_norm": 0.6491231918334961, + "learning_rate": 0.0001, + "loss": 1.4453, + "step": 14347 + }, + { + "epoch": 1.648153466199529, + "grad_norm": 0.6284507513046265, + "learning_rate": 0.0001, + "loss": 1.5089, + "step": 14348 + }, + { + "epoch": 1.6482683361093562, + "grad_norm": 0.5969189405441284, + "learning_rate": 0.0001, + "loss": 1.5054, + "step": 14349 + }, + { + "epoch": 1.6483832060191834, + "grad_norm": 0.5988194346427917, + "learning_rate": 0.0001, + "loss": 1.352, + "step": 14350 + }, + { + "epoch": 1.6484980759290104, + "grad_norm": 0.6822172999382019, + "learning_rate": 0.0001, + "loss": 1.5764, + "step": 14351 + }, + { + "epoch": 1.6486129458388374, + "grad_norm": 0.6363539695739746, + "learning_rate": 0.0001, + "loss": 1.6048, + "step": 14352 + }, + { + "epoch": 1.6487278157486647, + "grad_norm": 0.65143221616745, + "learning_rate": 0.0001, + "loss": 1.5366, + "step": 14353 + }, + { + "epoch": 1.648842685658492, + "grad_norm": 0.6166749000549316, + "learning_rate": 0.0001, + "loss": 1.4744, + "step": 14354 + }, + { + "epoch": 1.648957555568319, + "grad_norm": 0.6183647513389587, + "learning_rate": 0.0001, + "loss": 1.4902, + "step": 14355 + }, + { + "epoch": 1.649072425478146, + "grad_norm": 0.5944738984107971, + "learning_rate": 0.0001, + "loss": 1.4701, + "step": 14356 + }, + { + "epoch": 1.6491872953879732, + "grad_norm": 0.6394641995429993, + "learning_rate": 0.0001, + "loss": 1.3743, + "step": 14357 + }, + { + "epoch": 1.6493021652978004, + "grad_norm": 0.6612902283668518, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 14358 + }, + { + "epoch": 1.6494170352076274, + "grad_norm": 0.6034130454063416, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 14359 + }, + { + "epoch": 1.6495319051174544, + "grad_norm": 0.5944687128067017, + "learning_rate": 0.0001, + "loss": 1.4008, + "step": 14360 + }, + { + "epoch": 1.6496467750272816, + "grad_norm": 0.6049661040306091, + "learning_rate": 0.0001, + "loss": 1.5983, + "step": 14361 + }, + { + "epoch": 1.6497616449371089, + "grad_norm": 0.5754156112670898, + "learning_rate": 0.0001, + "loss": 1.4454, + "step": 14362 + }, + { + "epoch": 1.6498765148469359, + "grad_norm": 0.6179141998291016, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 14363 + }, + { + "epoch": 1.6499913847567629, + "grad_norm": 0.5943996906280518, + "learning_rate": 0.0001, + "loss": 1.4576, + "step": 14364 + }, + { + "epoch": 1.65010625466659, + "grad_norm": 0.6303023099899292, + "learning_rate": 0.0001, + "loss": 1.4891, + "step": 14365 + }, + { + "epoch": 1.6502211245764173, + "grad_norm": 0.5632081627845764, + "learning_rate": 0.0001, + "loss": 1.3568, + "step": 14366 + }, + { + "epoch": 1.6503359944862444, + "grad_norm": 0.574306070804596, + "learning_rate": 0.0001, + "loss": 1.2801, + "step": 14367 + }, + { + "epoch": 1.6504508643960714, + "grad_norm": 0.5834609270095825, + "learning_rate": 0.0001, + "loss": 1.3699, + "step": 14368 + }, + { + "epoch": 1.6505657343058986, + "grad_norm": 0.5941557288169861, + "learning_rate": 0.0001, + "loss": 1.398, + "step": 14369 + }, + { + "epoch": 1.6506806042157258, + "grad_norm": 0.6643593907356262, + "learning_rate": 0.0001, + "loss": 1.6187, + "step": 14370 + }, + { + "epoch": 1.6507954741255528, + "grad_norm": 0.5751389861106873, + "learning_rate": 0.0001, + "loss": 1.4632, + "step": 14371 + }, + { + "epoch": 1.6509103440353798, + "grad_norm": 0.6634930968284607, + "learning_rate": 0.0001, + "loss": 1.6016, + "step": 14372 + }, + { + "epoch": 1.651025213945207, + "grad_norm": 0.6502936482429504, + "learning_rate": 0.0001, + "loss": 1.4685, + "step": 14373 + }, + { + "epoch": 1.6511400838550343, + "grad_norm": 0.6059218049049377, + "learning_rate": 0.0001, + "loss": 1.4038, + "step": 14374 + }, + { + "epoch": 1.6512549537648613, + "grad_norm": 0.6395809650421143, + "learning_rate": 0.0001, + "loss": 1.5501, + "step": 14375 + }, + { + "epoch": 1.6513698236746883, + "grad_norm": 0.614824116230011, + "learning_rate": 0.0001, + "loss": 1.3675, + "step": 14376 + }, + { + "epoch": 1.6514846935845156, + "grad_norm": 0.6158643364906311, + "learning_rate": 0.0001, + "loss": 1.6117, + "step": 14377 + }, + { + "epoch": 1.6515995634943428, + "grad_norm": 0.5627233982086182, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 14378 + }, + { + "epoch": 1.6517144334041698, + "grad_norm": 0.7553406953811646, + "learning_rate": 0.0001, + "loss": 1.4885, + "step": 14379 + }, + { + "epoch": 1.6518293033139968, + "grad_norm": 0.5953923463821411, + "learning_rate": 0.0001, + "loss": 1.5845, + "step": 14380 + }, + { + "epoch": 1.651944173223824, + "grad_norm": 0.6409009695053101, + "learning_rate": 0.0001, + "loss": 1.4745, + "step": 14381 + }, + { + "epoch": 1.6520590431336513, + "grad_norm": 0.5870394110679626, + "learning_rate": 0.0001, + "loss": 1.4941, + "step": 14382 + }, + { + "epoch": 1.6521739130434783, + "grad_norm": 0.6005561947822571, + "learning_rate": 0.0001, + "loss": 1.5428, + "step": 14383 + }, + { + "epoch": 1.6522887829533053, + "grad_norm": 0.651766836643219, + "learning_rate": 0.0001, + "loss": 1.4715, + "step": 14384 + }, + { + "epoch": 1.6524036528631325, + "grad_norm": 0.5785371661186218, + "learning_rate": 0.0001, + "loss": 1.5496, + "step": 14385 + }, + { + "epoch": 1.6525185227729597, + "grad_norm": 0.6617969274520874, + "learning_rate": 0.0001, + "loss": 1.4035, + "step": 14386 + }, + { + "epoch": 1.6526333926827868, + "grad_norm": 0.6037518382072449, + "learning_rate": 0.0001, + "loss": 1.5064, + "step": 14387 + }, + { + "epoch": 1.6527482625926138, + "grad_norm": 0.6930570006370544, + "learning_rate": 0.0001, + "loss": 1.74, + "step": 14388 + }, + { + "epoch": 1.652863132502441, + "grad_norm": 0.6605214476585388, + "learning_rate": 0.0001, + "loss": 1.4112, + "step": 14389 + }, + { + "epoch": 1.6529780024122682, + "grad_norm": 0.6189714074134827, + "learning_rate": 0.0001, + "loss": 1.3354, + "step": 14390 + }, + { + "epoch": 1.6530928723220952, + "grad_norm": 0.6060426235198975, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 14391 + }, + { + "epoch": 1.6532077422319222, + "grad_norm": 0.5811141729354858, + "learning_rate": 0.0001, + "loss": 1.2768, + "step": 14392 + }, + { + "epoch": 1.6533226121417495, + "grad_norm": 0.608869731426239, + "learning_rate": 0.0001, + "loss": 1.4701, + "step": 14393 + }, + { + "epoch": 1.6534374820515767, + "grad_norm": 0.6065149307250977, + "learning_rate": 0.0001, + "loss": 1.4988, + "step": 14394 + }, + { + "epoch": 1.6535523519614037, + "grad_norm": 0.7323920726776123, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 14395 + }, + { + "epoch": 1.6536672218712307, + "grad_norm": 0.604194700717926, + "learning_rate": 0.0001, + "loss": 1.4543, + "step": 14396 + }, + { + "epoch": 1.653782091781058, + "grad_norm": 0.6321179866790771, + "learning_rate": 0.0001, + "loss": 1.4389, + "step": 14397 + }, + { + "epoch": 1.6538969616908852, + "grad_norm": 0.6846322417259216, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 14398 + }, + { + "epoch": 1.6540118316007122, + "grad_norm": 0.6531241536140442, + "learning_rate": 0.0001, + "loss": 1.5168, + "step": 14399 + }, + { + "epoch": 1.6541267015105392, + "grad_norm": 0.5736318826675415, + "learning_rate": 0.0001, + "loss": 1.359, + "step": 14400 + }, + { + "epoch": 1.6542415714203664, + "grad_norm": 0.5621281266212463, + "learning_rate": 0.0001, + "loss": 1.2091, + "step": 14401 + }, + { + "epoch": 1.6543564413301937, + "grad_norm": 0.6186012029647827, + "learning_rate": 0.0001, + "loss": 1.3328, + "step": 14402 + }, + { + "epoch": 1.6544713112400207, + "grad_norm": 0.6069703698158264, + "learning_rate": 0.0001, + "loss": 1.4517, + "step": 14403 + }, + { + "epoch": 1.6545861811498477, + "grad_norm": 0.6218054890632629, + "learning_rate": 0.0001, + "loss": 1.5537, + "step": 14404 + }, + { + "epoch": 1.654701051059675, + "grad_norm": 0.6225132346153259, + "learning_rate": 0.0001, + "loss": 1.4158, + "step": 14405 + }, + { + "epoch": 1.6548159209695021, + "grad_norm": 0.6091734170913696, + "learning_rate": 0.0001, + "loss": 1.3794, + "step": 14406 + }, + { + "epoch": 1.6549307908793291, + "grad_norm": 0.569509744644165, + "learning_rate": 0.0001, + "loss": 1.2694, + "step": 14407 + }, + { + "epoch": 1.6550456607891562, + "grad_norm": 0.690952718257904, + "learning_rate": 0.0001, + "loss": 1.5541, + "step": 14408 + }, + { + "epoch": 1.6551605306989834, + "grad_norm": 0.6232033371925354, + "learning_rate": 0.0001, + "loss": 1.3905, + "step": 14409 + }, + { + "epoch": 1.6552754006088106, + "grad_norm": 0.6081985235214233, + "learning_rate": 0.0001, + "loss": 1.2826, + "step": 14410 + }, + { + "epoch": 1.6553902705186376, + "grad_norm": 0.6122736930847168, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 14411 + }, + { + "epoch": 1.6555051404284646, + "grad_norm": 0.5784592628479004, + "learning_rate": 0.0001, + "loss": 1.6078, + "step": 14412 + }, + { + "epoch": 1.6556200103382919, + "grad_norm": 0.5675357580184937, + "learning_rate": 0.0001, + "loss": 1.4174, + "step": 14413 + }, + { + "epoch": 1.655734880248119, + "grad_norm": 0.5335760116577148, + "learning_rate": 0.0001, + "loss": 1.1964, + "step": 14414 + }, + { + "epoch": 1.655849750157946, + "grad_norm": 0.604422390460968, + "learning_rate": 0.0001, + "loss": 1.4652, + "step": 14415 + }, + { + "epoch": 1.6559646200677731, + "grad_norm": 0.6323688626289368, + "learning_rate": 0.0001, + "loss": 1.4988, + "step": 14416 + }, + { + "epoch": 1.6560794899776003, + "grad_norm": 0.5734161734580994, + "learning_rate": 0.0001, + "loss": 1.3093, + "step": 14417 + }, + { + "epoch": 1.6561943598874276, + "grad_norm": 0.6848605275154114, + "learning_rate": 0.0001, + "loss": 1.546, + "step": 14418 + }, + { + "epoch": 1.6563092297972546, + "grad_norm": 0.5923926830291748, + "learning_rate": 0.0001, + "loss": 1.3412, + "step": 14419 + }, + { + "epoch": 1.6564240997070816, + "grad_norm": 0.7171801328659058, + "learning_rate": 0.0001, + "loss": 1.4567, + "step": 14420 + }, + { + "epoch": 1.6565389696169088, + "grad_norm": 0.6611701846122742, + "learning_rate": 0.0001, + "loss": 1.4262, + "step": 14421 + }, + { + "epoch": 1.656653839526736, + "grad_norm": 0.6520163416862488, + "learning_rate": 0.0001, + "loss": 1.495, + "step": 14422 + }, + { + "epoch": 1.656768709436563, + "grad_norm": 0.6139830350875854, + "learning_rate": 0.0001, + "loss": 1.6852, + "step": 14423 + }, + { + "epoch": 1.65688357934639, + "grad_norm": 0.5797949433326721, + "learning_rate": 0.0001, + "loss": 1.3619, + "step": 14424 + }, + { + "epoch": 1.6569984492562173, + "grad_norm": 0.5873029828071594, + "learning_rate": 0.0001, + "loss": 1.3233, + "step": 14425 + }, + { + "epoch": 1.6571133191660445, + "grad_norm": 0.6242226362228394, + "learning_rate": 0.0001, + "loss": 1.2352, + "step": 14426 + }, + { + "epoch": 1.6572281890758715, + "grad_norm": 0.589346170425415, + "learning_rate": 0.0001, + "loss": 1.327, + "step": 14427 + }, + { + "epoch": 1.6573430589856986, + "grad_norm": 0.6410361528396606, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 14428 + }, + { + "epoch": 1.6574579288955258, + "grad_norm": 0.5922463536262512, + "learning_rate": 0.0001, + "loss": 1.5046, + "step": 14429 + }, + { + "epoch": 1.657572798805353, + "grad_norm": 0.5775658488273621, + "learning_rate": 0.0001, + "loss": 1.2298, + "step": 14430 + }, + { + "epoch": 1.65768766871518, + "grad_norm": 0.7526030540466309, + "learning_rate": 0.0001, + "loss": 1.5982, + "step": 14431 + }, + { + "epoch": 1.657802538625007, + "grad_norm": 0.5999011993408203, + "learning_rate": 0.0001, + "loss": 1.3619, + "step": 14432 + }, + { + "epoch": 1.6579174085348343, + "grad_norm": 0.6628460884094238, + "learning_rate": 0.0001, + "loss": 1.6252, + "step": 14433 + }, + { + "epoch": 1.6580322784446615, + "grad_norm": 0.6481192708015442, + "learning_rate": 0.0001, + "loss": 1.4441, + "step": 14434 + }, + { + "epoch": 1.6581471483544885, + "grad_norm": 0.6300088167190552, + "learning_rate": 0.0001, + "loss": 1.4529, + "step": 14435 + }, + { + "epoch": 1.6582620182643155, + "grad_norm": 0.5989202260971069, + "learning_rate": 0.0001, + "loss": 1.2056, + "step": 14436 + }, + { + "epoch": 1.6583768881741427, + "grad_norm": 0.7720966339111328, + "learning_rate": 0.0001, + "loss": 1.581, + "step": 14437 + }, + { + "epoch": 1.65849175808397, + "grad_norm": 0.6756711602210999, + "learning_rate": 0.0001, + "loss": 1.4533, + "step": 14438 + }, + { + "epoch": 1.658606627993797, + "grad_norm": 0.6091902852058411, + "learning_rate": 0.0001, + "loss": 1.4482, + "step": 14439 + }, + { + "epoch": 1.658721497903624, + "grad_norm": 0.6059051156044006, + "learning_rate": 0.0001, + "loss": 1.4326, + "step": 14440 + }, + { + "epoch": 1.6588363678134512, + "grad_norm": 0.6013321876525879, + "learning_rate": 0.0001, + "loss": 1.352, + "step": 14441 + }, + { + "epoch": 1.6589512377232785, + "grad_norm": 0.5931786298751831, + "learning_rate": 0.0001, + "loss": 1.4597, + "step": 14442 + }, + { + "epoch": 1.6590661076331055, + "grad_norm": 0.5912080407142639, + "learning_rate": 0.0001, + "loss": 1.4353, + "step": 14443 + }, + { + "epoch": 1.6591809775429325, + "grad_norm": 0.614547073841095, + "learning_rate": 0.0001, + "loss": 1.4778, + "step": 14444 + }, + { + "epoch": 1.6592958474527597, + "grad_norm": 0.6366251707077026, + "learning_rate": 0.0001, + "loss": 1.4466, + "step": 14445 + }, + { + "epoch": 1.659410717362587, + "grad_norm": 0.5798001289367676, + "learning_rate": 0.0001, + "loss": 1.3829, + "step": 14446 + }, + { + "epoch": 1.659525587272414, + "grad_norm": 0.5841000080108643, + "learning_rate": 0.0001, + "loss": 1.5427, + "step": 14447 + }, + { + "epoch": 1.659640457182241, + "grad_norm": 0.6051506400108337, + "learning_rate": 0.0001, + "loss": 1.4419, + "step": 14448 + }, + { + "epoch": 1.6597553270920682, + "grad_norm": 0.5879446864128113, + "learning_rate": 0.0001, + "loss": 1.4215, + "step": 14449 + }, + { + "epoch": 1.6598701970018954, + "grad_norm": 0.661629855632782, + "learning_rate": 0.0001, + "loss": 1.4418, + "step": 14450 + }, + { + "epoch": 1.6599850669117224, + "grad_norm": 0.6741747856140137, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 14451 + }, + { + "epoch": 1.6600999368215494, + "grad_norm": 0.5925803184509277, + "learning_rate": 0.0001, + "loss": 1.3766, + "step": 14452 + }, + { + "epoch": 1.6602148067313767, + "grad_norm": 0.6182143688201904, + "learning_rate": 0.0001, + "loss": 1.4005, + "step": 14453 + }, + { + "epoch": 1.660329676641204, + "grad_norm": 0.6143870949745178, + "learning_rate": 0.0001, + "loss": 1.4139, + "step": 14454 + }, + { + "epoch": 1.660444546551031, + "grad_norm": 0.6046828031539917, + "learning_rate": 0.0001, + "loss": 1.3055, + "step": 14455 + }, + { + "epoch": 1.660559416460858, + "grad_norm": 0.767876148223877, + "learning_rate": 0.0001, + "loss": 1.5525, + "step": 14456 + }, + { + "epoch": 1.6606742863706851, + "grad_norm": 0.6341820359230042, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 14457 + }, + { + "epoch": 1.6607891562805124, + "grad_norm": 0.6907703876495361, + "learning_rate": 0.0001, + "loss": 1.3277, + "step": 14458 + }, + { + "epoch": 1.6609040261903394, + "grad_norm": 0.6609397530555725, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 14459 + }, + { + "epoch": 1.6610188961001664, + "grad_norm": 0.6220353245735168, + "learning_rate": 0.0001, + "loss": 1.4967, + "step": 14460 + }, + { + "epoch": 1.6611337660099936, + "grad_norm": 0.5976139903068542, + "learning_rate": 0.0001, + "loss": 1.4258, + "step": 14461 + }, + { + "epoch": 1.6612486359198209, + "grad_norm": 0.654104471206665, + "learning_rate": 0.0001, + "loss": 1.53, + "step": 14462 + }, + { + "epoch": 1.6613635058296479, + "grad_norm": 0.6497766971588135, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 14463 + }, + { + "epoch": 1.661478375739475, + "grad_norm": 0.5906299948692322, + "learning_rate": 0.0001, + "loss": 1.5512, + "step": 14464 + }, + { + "epoch": 1.661593245649302, + "grad_norm": 0.6207028031349182, + "learning_rate": 0.0001, + "loss": 1.498, + "step": 14465 + }, + { + "epoch": 1.6617081155591293, + "grad_norm": 0.6227574348449707, + "learning_rate": 0.0001, + "loss": 1.4097, + "step": 14466 + }, + { + "epoch": 1.6618229854689566, + "grad_norm": 0.7923548817634583, + "learning_rate": 0.0001, + "loss": 1.0432, + "step": 14467 + }, + { + "epoch": 1.6619378553787836, + "grad_norm": 0.6193511486053467, + "learning_rate": 0.0001, + "loss": 1.5811, + "step": 14468 + }, + { + "epoch": 1.6620527252886106, + "grad_norm": 0.6481901407241821, + "learning_rate": 0.0001, + "loss": 1.2574, + "step": 14469 + }, + { + "epoch": 1.6621675951984378, + "grad_norm": 0.7560790181159973, + "learning_rate": 0.0001, + "loss": 1.7806, + "step": 14470 + }, + { + "epoch": 1.662282465108265, + "grad_norm": 0.6491854786872864, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 14471 + }, + { + "epoch": 1.662397335018092, + "grad_norm": 0.6301493048667908, + "learning_rate": 0.0001, + "loss": 1.4367, + "step": 14472 + }, + { + "epoch": 1.662512204927919, + "grad_norm": 0.6142545342445374, + "learning_rate": 0.0001, + "loss": 1.5011, + "step": 14473 + }, + { + "epoch": 1.6626270748377463, + "grad_norm": 0.6091681718826294, + "learning_rate": 0.0001, + "loss": 1.4234, + "step": 14474 + }, + { + "epoch": 1.6627419447475735, + "grad_norm": 0.5943846106529236, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 14475 + }, + { + "epoch": 1.6628568146574005, + "grad_norm": 0.7121033668518066, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 14476 + }, + { + "epoch": 1.6629716845672275, + "grad_norm": 0.575641930103302, + "learning_rate": 0.0001, + "loss": 1.283, + "step": 14477 + }, + { + "epoch": 1.6630865544770548, + "grad_norm": 0.62906414270401, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 14478 + }, + { + "epoch": 1.663201424386882, + "grad_norm": 0.5758494734764099, + "learning_rate": 0.0001, + "loss": 1.3509, + "step": 14479 + }, + { + "epoch": 1.663316294296709, + "grad_norm": 0.581095278263092, + "learning_rate": 0.0001, + "loss": 1.376, + "step": 14480 + }, + { + "epoch": 1.663431164206536, + "grad_norm": 0.6003084182739258, + "learning_rate": 0.0001, + "loss": 1.4864, + "step": 14481 + }, + { + "epoch": 1.6635460341163633, + "grad_norm": 0.6477168202400208, + "learning_rate": 0.0001, + "loss": 1.3539, + "step": 14482 + }, + { + "epoch": 1.6636609040261905, + "grad_norm": 0.6288648843765259, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 14483 + }, + { + "epoch": 1.6637757739360175, + "grad_norm": 0.6509358882904053, + "learning_rate": 0.0001, + "loss": 1.3245, + "step": 14484 + }, + { + "epoch": 1.6638906438458445, + "grad_norm": 0.6689729690551758, + "learning_rate": 0.0001, + "loss": 1.4742, + "step": 14485 + }, + { + "epoch": 1.6640055137556717, + "grad_norm": 0.6662051677703857, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 14486 + }, + { + "epoch": 1.664120383665499, + "grad_norm": 0.6444410681724548, + "learning_rate": 0.0001, + "loss": 1.5203, + "step": 14487 + }, + { + "epoch": 1.664235253575326, + "grad_norm": 0.5951128602027893, + "learning_rate": 0.0001, + "loss": 1.4918, + "step": 14488 + }, + { + "epoch": 1.664350123485153, + "grad_norm": 0.6148644089698792, + "learning_rate": 0.0001, + "loss": 1.2399, + "step": 14489 + }, + { + "epoch": 1.6644649933949802, + "grad_norm": 0.6238715052604675, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 14490 + }, + { + "epoch": 1.6645798633048074, + "grad_norm": 0.6118984818458557, + "learning_rate": 0.0001, + "loss": 1.5081, + "step": 14491 + }, + { + "epoch": 1.6646947332146345, + "grad_norm": 0.6418251991271973, + "learning_rate": 0.0001, + "loss": 1.4167, + "step": 14492 + }, + { + "epoch": 1.6648096031244615, + "grad_norm": 0.6296436786651611, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 14493 + }, + { + "epoch": 1.6649244730342887, + "grad_norm": 0.6078484654426575, + "learning_rate": 0.0001, + "loss": 1.4571, + "step": 14494 + }, + { + "epoch": 1.665039342944116, + "grad_norm": 0.5940951108932495, + "learning_rate": 0.0001, + "loss": 1.5002, + "step": 14495 + }, + { + "epoch": 1.665154212853943, + "grad_norm": 0.6265494227409363, + "learning_rate": 0.0001, + "loss": 1.4058, + "step": 14496 + }, + { + "epoch": 1.66526908276377, + "grad_norm": 0.551169753074646, + "learning_rate": 0.0001, + "loss": 1.4444, + "step": 14497 + }, + { + "epoch": 1.6653839526735972, + "grad_norm": 0.6252605319023132, + "learning_rate": 0.0001, + "loss": 1.5168, + "step": 14498 + }, + { + "epoch": 1.6654988225834244, + "grad_norm": 0.6075454950332642, + "learning_rate": 0.0001, + "loss": 1.4124, + "step": 14499 + }, + { + "epoch": 1.6656136924932514, + "grad_norm": 0.6732686161994934, + "learning_rate": 0.0001, + "loss": 1.6495, + "step": 14500 + }, + { + "epoch": 1.6657285624030784, + "grad_norm": 0.6578999161720276, + "learning_rate": 0.0001, + "loss": 1.535, + "step": 14501 + }, + { + "epoch": 1.6658434323129057, + "grad_norm": 0.6686480045318604, + "learning_rate": 0.0001, + "loss": 1.5928, + "step": 14502 + }, + { + "epoch": 1.665958302222733, + "grad_norm": 0.5849835276603699, + "learning_rate": 0.0001, + "loss": 1.3953, + "step": 14503 + }, + { + "epoch": 1.66607317213256, + "grad_norm": 0.6391957402229309, + "learning_rate": 0.0001, + "loss": 1.5159, + "step": 14504 + }, + { + "epoch": 1.666188042042387, + "grad_norm": 0.6156760454177856, + "learning_rate": 0.0001, + "loss": 1.2013, + "step": 14505 + }, + { + "epoch": 1.6663029119522141, + "grad_norm": 0.627495527267456, + "learning_rate": 0.0001, + "loss": 1.7254, + "step": 14506 + }, + { + "epoch": 1.6664177818620414, + "grad_norm": 0.6318630576133728, + "learning_rate": 0.0001, + "loss": 1.495, + "step": 14507 + }, + { + "epoch": 1.6665326517718684, + "grad_norm": 0.6174932718276978, + "learning_rate": 0.0001, + "loss": 1.508, + "step": 14508 + }, + { + "epoch": 1.6666475216816954, + "grad_norm": 0.6004678606987, + "learning_rate": 0.0001, + "loss": 1.3112, + "step": 14509 + }, + { + "epoch": 1.6667623915915226, + "grad_norm": 0.5961777567863464, + "learning_rate": 0.0001, + "loss": 1.3316, + "step": 14510 + }, + { + "epoch": 1.6668772615013498, + "grad_norm": 0.6679588556289673, + "learning_rate": 0.0001, + "loss": 1.4856, + "step": 14511 + }, + { + "epoch": 1.6669921314111769, + "grad_norm": 0.6514487862586975, + "learning_rate": 0.0001, + "loss": 1.5705, + "step": 14512 + }, + { + "epoch": 1.6671070013210039, + "grad_norm": 0.6601291298866272, + "learning_rate": 0.0001, + "loss": 1.4138, + "step": 14513 + }, + { + "epoch": 1.667221871230831, + "grad_norm": 0.6563011407852173, + "learning_rate": 0.0001, + "loss": 1.5599, + "step": 14514 + }, + { + "epoch": 1.6673367411406583, + "grad_norm": 0.6405079960823059, + "learning_rate": 0.0001, + "loss": 1.484, + "step": 14515 + }, + { + "epoch": 1.6674516110504853, + "grad_norm": 0.6541486978530884, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 14516 + }, + { + "epoch": 1.6675664809603123, + "grad_norm": 0.5867092609405518, + "learning_rate": 0.0001, + "loss": 1.4034, + "step": 14517 + }, + { + "epoch": 1.6676813508701396, + "grad_norm": 0.5766440033912659, + "learning_rate": 0.0001, + "loss": 1.4268, + "step": 14518 + }, + { + "epoch": 1.6677962207799668, + "grad_norm": 0.5653627514839172, + "learning_rate": 0.0001, + "loss": 1.3912, + "step": 14519 + }, + { + "epoch": 1.6679110906897938, + "grad_norm": 0.6476209759712219, + "learning_rate": 0.0001, + "loss": 1.5981, + "step": 14520 + }, + { + "epoch": 1.6680259605996208, + "grad_norm": 0.5878192186355591, + "learning_rate": 0.0001, + "loss": 1.3328, + "step": 14521 + }, + { + "epoch": 1.668140830509448, + "grad_norm": 0.5736682415008545, + "learning_rate": 0.0001, + "loss": 1.3202, + "step": 14522 + }, + { + "epoch": 1.6682557004192753, + "grad_norm": 0.6797529458999634, + "learning_rate": 0.0001, + "loss": 1.7396, + "step": 14523 + }, + { + "epoch": 1.6683705703291023, + "grad_norm": 0.6123679876327515, + "learning_rate": 0.0001, + "loss": 1.4517, + "step": 14524 + }, + { + "epoch": 1.6684854402389293, + "grad_norm": 0.567111074924469, + "learning_rate": 0.0001, + "loss": 1.4072, + "step": 14525 + }, + { + "epoch": 1.6686003101487565, + "grad_norm": 0.5874302983283997, + "learning_rate": 0.0001, + "loss": 1.3295, + "step": 14526 + }, + { + "epoch": 1.6687151800585838, + "grad_norm": 0.6208910346031189, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 14527 + }, + { + "epoch": 1.6688300499684108, + "grad_norm": 0.5815864205360413, + "learning_rate": 0.0001, + "loss": 1.2403, + "step": 14528 + }, + { + "epoch": 1.6689449198782378, + "grad_norm": 0.6493234634399414, + "learning_rate": 0.0001, + "loss": 1.4954, + "step": 14529 + }, + { + "epoch": 1.669059789788065, + "grad_norm": 0.556931734085083, + "learning_rate": 0.0001, + "loss": 1.3694, + "step": 14530 + }, + { + "epoch": 1.6691746596978922, + "grad_norm": 0.6887297034263611, + "learning_rate": 0.0001, + "loss": 1.6801, + "step": 14531 + }, + { + "epoch": 1.6692895296077193, + "grad_norm": 0.688178300857544, + "learning_rate": 0.0001, + "loss": 1.5668, + "step": 14532 + }, + { + "epoch": 1.6694043995175463, + "grad_norm": 0.6757493019104004, + "learning_rate": 0.0001, + "loss": 1.5948, + "step": 14533 + }, + { + "epoch": 1.6695192694273735, + "grad_norm": 0.5804457068443298, + "learning_rate": 0.0001, + "loss": 1.3788, + "step": 14534 + }, + { + "epoch": 1.6696341393372007, + "grad_norm": 0.5876379013061523, + "learning_rate": 0.0001, + "loss": 1.1737, + "step": 14535 + }, + { + "epoch": 1.6697490092470277, + "grad_norm": 0.6323837637901306, + "learning_rate": 0.0001, + "loss": 1.3878, + "step": 14536 + }, + { + "epoch": 1.6698638791568547, + "grad_norm": 0.6448050141334534, + "learning_rate": 0.0001, + "loss": 1.5295, + "step": 14537 + }, + { + "epoch": 1.669978749066682, + "grad_norm": 0.6202848553657532, + "learning_rate": 0.0001, + "loss": 1.5633, + "step": 14538 + }, + { + "epoch": 1.6700936189765092, + "grad_norm": 0.6006600856781006, + "learning_rate": 0.0001, + "loss": 1.5798, + "step": 14539 + }, + { + "epoch": 1.6702084888863362, + "grad_norm": 0.6253841519355774, + "learning_rate": 0.0001, + "loss": 1.361, + "step": 14540 + }, + { + "epoch": 1.6703233587961632, + "grad_norm": 0.6643701791763306, + "learning_rate": 0.0001, + "loss": 1.3992, + "step": 14541 + }, + { + "epoch": 1.6704382287059905, + "grad_norm": 0.6138646602630615, + "learning_rate": 0.0001, + "loss": 1.5829, + "step": 14542 + }, + { + "epoch": 1.6705530986158177, + "grad_norm": 0.614280641078949, + "learning_rate": 0.0001, + "loss": 1.4676, + "step": 14543 + }, + { + "epoch": 1.6706679685256447, + "grad_norm": 0.5600206851959229, + "learning_rate": 0.0001, + "loss": 1.3489, + "step": 14544 + }, + { + "epoch": 1.6707828384354717, + "grad_norm": 0.6660794615745544, + "learning_rate": 0.0001, + "loss": 1.6009, + "step": 14545 + }, + { + "epoch": 1.670897708345299, + "grad_norm": 0.6051227450370789, + "learning_rate": 0.0001, + "loss": 1.4396, + "step": 14546 + }, + { + "epoch": 1.6710125782551262, + "grad_norm": 0.5952056646347046, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 14547 + }, + { + "epoch": 1.6711274481649532, + "grad_norm": 0.8179489374160767, + "learning_rate": 0.0001, + "loss": 1.7063, + "step": 14548 + }, + { + "epoch": 1.6712423180747802, + "grad_norm": 0.5879932045936584, + "learning_rate": 0.0001, + "loss": 1.2387, + "step": 14549 + }, + { + "epoch": 1.6713571879846074, + "grad_norm": 0.6535231471061707, + "learning_rate": 0.0001, + "loss": 1.6113, + "step": 14550 + }, + { + "epoch": 1.6714720578944346, + "grad_norm": 0.6634051203727722, + "learning_rate": 0.0001, + "loss": 1.5402, + "step": 14551 + }, + { + "epoch": 1.6715869278042617, + "grad_norm": 0.6484226584434509, + "learning_rate": 0.0001, + "loss": 1.4585, + "step": 14552 + }, + { + "epoch": 1.6717017977140887, + "grad_norm": 0.6908429861068726, + "learning_rate": 0.0001, + "loss": 1.4969, + "step": 14553 + }, + { + "epoch": 1.671816667623916, + "grad_norm": 0.6413170695304871, + "learning_rate": 0.0001, + "loss": 1.3641, + "step": 14554 + }, + { + "epoch": 1.6719315375337431, + "grad_norm": 0.6127830147743225, + "learning_rate": 0.0001, + "loss": 1.4752, + "step": 14555 + }, + { + "epoch": 1.6720464074435701, + "grad_norm": 0.6318410038948059, + "learning_rate": 0.0001, + "loss": 1.5818, + "step": 14556 + }, + { + "epoch": 1.6721612773533971, + "grad_norm": 0.6313050389289856, + "learning_rate": 0.0001, + "loss": 1.4046, + "step": 14557 + }, + { + "epoch": 1.6722761472632244, + "grad_norm": 0.6314618587493896, + "learning_rate": 0.0001, + "loss": 1.4672, + "step": 14558 + }, + { + "epoch": 1.6723910171730516, + "grad_norm": 0.6210103034973145, + "learning_rate": 0.0001, + "loss": 1.2459, + "step": 14559 + }, + { + "epoch": 1.6725058870828786, + "grad_norm": 0.6324970126152039, + "learning_rate": 0.0001, + "loss": 1.3796, + "step": 14560 + }, + { + "epoch": 1.6726207569927056, + "grad_norm": 0.679050087928772, + "learning_rate": 0.0001, + "loss": 1.4594, + "step": 14561 + }, + { + "epoch": 1.6727356269025329, + "grad_norm": 0.5676151514053345, + "learning_rate": 0.0001, + "loss": 1.4308, + "step": 14562 + }, + { + "epoch": 1.67285049681236, + "grad_norm": 0.5721042156219482, + "learning_rate": 0.0001, + "loss": 1.3224, + "step": 14563 + }, + { + "epoch": 1.672965366722187, + "grad_norm": 0.5606781840324402, + "learning_rate": 0.0001, + "loss": 1.5974, + "step": 14564 + }, + { + "epoch": 1.673080236632014, + "grad_norm": 0.602632462978363, + "learning_rate": 0.0001, + "loss": 1.316, + "step": 14565 + }, + { + "epoch": 1.6731951065418413, + "grad_norm": 0.6165114641189575, + "learning_rate": 0.0001, + "loss": 1.485, + "step": 14566 + }, + { + "epoch": 1.6733099764516686, + "grad_norm": 0.6127558350563049, + "learning_rate": 0.0001, + "loss": 1.5381, + "step": 14567 + }, + { + "epoch": 1.6734248463614956, + "grad_norm": 0.6075685024261475, + "learning_rate": 0.0001, + "loss": 1.3019, + "step": 14568 + }, + { + "epoch": 1.6735397162713226, + "grad_norm": 0.6863022446632385, + "learning_rate": 0.0001, + "loss": 1.5015, + "step": 14569 + }, + { + "epoch": 1.6736545861811498, + "grad_norm": 0.6382628679275513, + "learning_rate": 0.0001, + "loss": 1.5063, + "step": 14570 + }, + { + "epoch": 1.673769456090977, + "grad_norm": 0.6291472911834717, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 14571 + }, + { + "epoch": 1.673884326000804, + "grad_norm": 0.6682292222976685, + "learning_rate": 0.0001, + "loss": 1.5275, + "step": 14572 + }, + { + "epoch": 1.673999195910631, + "grad_norm": 0.6253922581672668, + "learning_rate": 0.0001, + "loss": 1.4154, + "step": 14573 + }, + { + "epoch": 1.6741140658204583, + "grad_norm": 0.6436946988105774, + "learning_rate": 0.0001, + "loss": 1.246, + "step": 14574 + }, + { + "epoch": 1.6742289357302855, + "grad_norm": 0.6284177303314209, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 14575 + }, + { + "epoch": 1.6743438056401125, + "grad_norm": 0.6123202443122864, + "learning_rate": 0.0001, + "loss": 1.4549, + "step": 14576 + }, + { + "epoch": 1.6744586755499395, + "grad_norm": 0.635627269744873, + "learning_rate": 0.0001, + "loss": 1.4781, + "step": 14577 + }, + { + "epoch": 1.6745735454597668, + "grad_norm": 0.637518048286438, + "learning_rate": 0.0001, + "loss": 1.5056, + "step": 14578 + }, + { + "epoch": 1.674688415369594, + "grad_norm": 0.6325652003288269, + "learning_rate": 0.0001, + "loss": 1.3018, + "step": 14579 + }, + { + "epoch": 1.674803285279421, + "grad_norm": 0.6435227394104004, + "learning_rate": 0.0001, + "loss": 1.4763, + "step": 14580 + }, + { + "epoch": 1.674918155189248, + "grad_norm": 0.6192457675933838, + "learning_rate": 0.0001, + "loss": 1.3374, + "step": 14581 + }, + { + "epoch": 1.6750330250990753, + "grad_norm": 0.5760471820831299, + "learning_rate": 0.0001, + "loss": 1.2543, + "step": 14582 + }, + { + "epoch": 1.6751478950089025, + "grad_norm": 0.6090351343154907, + "learning_rate": 0.0001, + "loss": 1.245, + "step": 14583 + }, + { + "epoch": 1.6752627649187295, + "grad_norm": 0.5881224274635315, + "learning_rate": 0.0001, + "loss": 1.4009, + "step": 14584 + }, + { + "epoch": 1.6753776348285565, + "grad_norm": 0.7061769366264343, + "learning_rate": 0.0001, + "loss": 1.6304, + "step": 14585 + }, + { + "epoch": 1.6754925047383837, + "grad_norm": 0.6568925380706787, + "learning_rate": 0.0001, + "loss": 1.4963, + "step": 14586 + }, + { + "epoch": 1.675607374648211, + "grad_norm": 0.6131194233894348, + "learning_rate": 0.0001, + "loss": 1.3247, + "step": 14587 + }, + { + "epoch": 1.675722244558038, + "grad_norm": 0.6096778512001038, + "learning_rate": 0.0001, + "loss": 1.1763, + "step": 14588 + }, + { + "epoch": 1.675837114467865, + "grad_norm": 0.7438702583312988, + "learning_rate": 0.0001, + "loss": 1.4943, + "step": 14589 + }, + { + "epoch": 1.6759519843776922, + "grad_norm": 0.6072378754615784, + "learning_rate": 0.0001, + "loss": 1.4269, + "step": 14590 + }, + { + "epoch": 1.6760668542875194, + "grad_norm": 0.6651720404624939, + "learning_rate": 0.0001, + "loss": 1.3208, + "step": 14591 + }, + { + "epoch": 1.6761817241973465, + "grad_norm": 0.7027106881141663, + "learning_rate": 0.0001, + "loss": 1.5718, + "step": 14592 + }, + { + "epoch": 1.6762965941071735, + "grad_norm": 0.5823193788528442, + "learning_rate": 0.0001, + "loss": 1.5061, + "step": 14593 + }, + { + "epoch": 1.6764114640170007, + "grad_norm": 0.5353499054908752, + "learning_rate": 0.0001, + "loss": 1.0777, + "step": 14594 + }, + { + "epoch": 1.676526333926828, + "grad_norm": 0.7175213098526001, + "learning_rate": 0.0001, + "loss": 1.42, + "step": 14595 + }, + { + "epoch": 1.676641203836655, + "grad_norm": 0.5985221862792969, + "learning_rate": 0.0001, + "loss": 1.4292, + "step": 14596 + }, + { + "epoch": 1.676756073746482, + "grad_norm": 0.6454430818557739, + "learning_rate": 0.0001, + "loss": 1.4714, + "step": 14597 + }, + { + "epoch": 1.6768709436563092, + "grad_norm": 0.6255630850791931, + "learning_rate": 0.0001, + "loss": 1.521, + "step": 14598 + }, + { + "epoch": 1.6769858135661364, + "grad_norm": 0.6272461414337158, + "learning_rate": 0.0001, + "loss": 1.528, + "step": 14599 + }, + { + "epoch": 1.6771006834759634, + "grad_norm": 0.6686280369758606, + "learning_rate": 0.0001, + "loss": 1.711, + "step": 14600 + }, + { + "epoch": 1.6772155533857906, + "grad_norm": 0.5825721025466919, + "learning_rate": 0.0001, + "loss": 1.3257, + "step": 14601 + }, + { + "epoch": 1.6773304232956177, + "grad_norm": 0.6497721672058105, + "learning_rate": 0.0001, + "loss": 1.6273, + "step": 14602 + }, + { + "epoch": 1.6774452932054449, + "grad_norm": 0.6572535634040833, + "learning_rate": 0.0001, + "loss": 1.6098, + "step": 14603 + }, + { + "epoch": 1.6775601631152721, + "grad_norm": 0.590975284576416, + "learning_rate": 0.0001, + "loss": 1.476, + "step": 14604 + }, + { + "epoch": 1.6776750330250991, + "grad_norm": 0.6207257509231567, + "learning_rate": 0.0001, + "loss": 1.6519, + "step": 14605 + }, + { + "epoch": 1.6777899029349261, + "grad_norm": 0.6565500497817993, + "learning_rate": 0.0001, + "loss": 1.6584, + "step": 14606 + }, + { + "epoch": 1.6779047728447534, + "grad_norm": 0.6749123334884644, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 14607 + }, + { + "epoch": 1.6780196427545806, + "grad_norm": 0.6280671954154968, + "learning_rate": 0.0001, + "loss": 1.5966, + "step": 14608 + }, + { + "epoch": 1.6781345126644076, + "grad_norm": 0.6080400347709656, + "learning_rate": 0.0001, + "loss": 1.4813, + "step": 14609 + }, + { + "epoch": 1.6782493825742346, + "grad_norm": 0.6583701968193054, + "learning_rate": 0.0001, + "loss": 1.5606, + "step": 14610 + }, + { + "epoch": 1.6783642524840618, + "grad_norm": 0.7133932113647461, + "learning_rate": 0.0001, + "loss": 1.4113, + "step": 14611 + }, + { + "epoch": 1.678479122393889, + "grad_norm": 0.647867739200592, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 14612 + }, + { + "epoch": 1.678593992303716, + "grad_norm": 0.5785742402076721, + "learning_rate": 0.0001, + "loss": 1.2819, + "step": 14613 + }, + { + "epoch": 1.678708862213543, + "grad_norm": 0.6787799596786499, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 14614 + }, + { + "epoch": 1.6788237321233703, + "grad_norm": 0.581734299659729, + "learning_rate": 0.0001, + "loss": 1.2793, + "step": 14615 + }, + { + "epoch": 1.6789386020331976, + "grad_norm": 0.5504398345947266, + "learning_rate": 0.0001, + "loss": 1.3114, + "step": 14616 + }, + { + "epoch": 1.6790534719430246, + "grad_norm": 0.5877566337585449, + "learning_rate": 0.0001, + "loss": 1.3758, + "step": 14617 + }, + { + "epoch": 1.6791683418528516, + "grad_norm": 0.7053650617599487, + "learning_rate": 0.0001, + "loss": 1.5527, + "step": 14618 + }, + { + "epoch": 1.6792832117626788, + "grad_norm": 0.644305408000946, + "learning_rate": 0.0001, + "loss": 1.233, + "step": 14619 + }, + { + "epoch": 1.679398081672506, + "grad_norm": 0.6391613483428955, + "learning_rate": 0.0001, + "loss": 1.3591, + "step": 14620 + }, + { + "epoch": 1.679512951582333, + "grad_norm": 0.598298192024231, + "learning_rate": 0.0001, + "loss": 1.519, + "step": 14621 + }, + { + "epoch": 1.67962782149216, + "grad_norm": 0.60166996717453, + "learning_rate": 0.0001, + "loss": 1.5308, + "step": 14622 + }, + { + "epoch": 1.6797426914019873, + "grad_norm": 0.6578497290611267, + "learning_rate": 0.0001, + "loss": 1.3284, + "step": 14623 + }, + { + "epoch": 1.6798575613118145, + "grad_norm": 0.6441363096237183, + "learning_rate": 0.0001, + "loss": 1.3718, + "step": 14624 + }, + { + "epoch": 1.6799724312216415, + "grad_norm": 0.6314264535903931, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 14625 + }, + { + "epoch": 1.6800873011314685, + "grad_norm": 0.6196305155754089, + "learning_rate": 0.0001, + "loss": 1.503, + "step": 14626 + }, + { + "epoch": 1.6802021710412958, + "grad_norm": 0.6449001431465149, + "learning_rate": 0.0001, + "loss": 1.4048, + "step": 14627 + }, + { + "epoch": 1.680317040951123, + "grad_norm": 0.6065531969070435, + "learning_rate": 0.0001, + "loss": 1.396, + "step": 14628 + }, + { + "epoch": 1.68043191086095, + "grad_norm": 0.6479570269584656, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 14629 + }, + { + "epoch": 1.680546780770777, + "grad_norm": 0.6452540159225464, + "learning_rate": 0.0001, + "loss": 1.4435, + "step": 14630 + }, + { + "epoch": 1.6806616506806042, + "grad_norm": 0.6001893877983093, + "learning_rate": 0.0001, + "loss": 1.3286, + "step": 14631 + }, + { + "epoch": 1.6807765205904315, + "grad_norm": 0.6244105696678162, + "learning_rate": 0.0001, + "loss": 1.6007, + "step": 14632 + }, + { + "epoch": 1.6808913905002585, + "grad_norm": 0.6581660509109497, + "learning_rate": 0.0001, + "loss": 1.4778, + "step": 14633 + }, + { + "epoch": 1.6810062604100855, + "grad_norm": 0.7391045093536377, + "learning_rate": 0.0001, + "loss": 1.5467, + "step": 14634 + }, + { + "epoch": 1.6811211303199127, + "grad_norm": 0.6155508160591125, + "learning_rate": 0.0001, + "loss": 1.5132, + "step": 14635 + }, + { + "epoch": 1.68123600022974, + "grad_norm": 0.673112154006958, + "learning_rate": 0.0001, + "loss": 1.6676, + "step": 14636 + }, + { + "epoch": 1.681350870139567, + "grad_norm": 0.6411565542221069, + "learning_rate": 0.0001, + "loss": 1.4505, + "step": 14637 + }, + { + "epoch": 1.681465740049394, + "grad_norm": 0.7229892015457153, + "learning_rate": 0.0001, + "loss": 1.4545, + "step": 14638 + }, + { + "epoch": 1.6815806099592212, + "grad_norm": 0.5898306369781494, + "learning_rate": 0.0001, + "loss": 1.4702, + "step": 14639 + }, + { + "epoch": 1.6816954798690484, + "grad_norm": 0.6058882474899292, + "learning_rate": 0.0001, + "loss": 1.5154, + "step": 14640 + }, + { + "epoch": 1.6818103497788754, + "grad_norm": 0.7117508053779602, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 14641 + }, + { + "epoch": 1.6819252196887025, + "grad_norm": 0.7228211164474487, + "learning_rate": 0.0001, + "loss": 1.3576, + "step": 14642 + }, + { + "epoch": 1.6820400895985297, + "grad_norm": 0.5465759634971619, + "learning_rate": 0.0001, + "loss": 1.3254, + "step": 14643 + }, + { + "epoch": 1.682154959508357, + "grad_norm": 0.7400708198547363, + "learning_rate": 0.0001, + "loss": 1.7752, + "step": 14644 + }, + { + "epoch": 1.682269829418184, + "grad_norm": 0.5981911420822144, + "learning_rate": 0.0001, + "loss": 1.4287, + "step": 14645 + }, + { + "epoch": 1.682384699328011, + "grad_norm": 0.6448339223861694, + "learning_rate": 0.0001, + "loss": 1.6, + "step": 14646 + }, + { + "epoch": 1.6824995692378382, + "grad_norm": 0.560468316078186, + "learning_rate": 0.0001, + "loss": 1.453, + "step": 14647 + }, + { + "epoch": 1.6826144391476654, + "grad_norm": 0.6334208250045776, + "learning_rate": 0.0001, + "loss": 1.3525, + "step": 14648 + }, + { + "epoch": 1.6827293090574924, + "grad_norm": 0.6362658739089966, + "learning_rate": 0.0001, + "loss": 1.5981, + "step": 14649 + }, + { + "epoch": 1.6828441789673194, + "grad_norm": 0.644481360912323, + "learning_rate": 0.0001, + "loss": 1.6839, + "step": 14650 + }, + { + "epoch": 1.6829590488771466, + "grad_norm": 0.6140202879905701, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 14651 + }, + { + "epoch": 1.6830739187869739, + "grad_norm": 0.6383975744247437, + "learning_rate": 0.0001, + "loss": 1.4495, + "step": 14652 + }, + { + "epoch": 1.6831887886968009, + "grad_norm": 0.6611577868461609, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 14653 + }, + { + "epoch": 1.683303658606628, + "grad_norm": 0.6007152199745178, + "learning_rate": 0.0001, + "loss": 1.3869, + "step": 14654 + }, + { + "epoch": 1.6834185285164551, + "grad_norm": 0.6275546550750732, + "learning_rate": 0.0001, + "loss": 1.368, + "step": 14655 + }, + { + "epoch": 1.6835333984262824, + "grad_norm": 0.649321973323822, + "learning_rate": 0.0001, + "loss": 1.5191, + "step": 14656 + }, + { + "epoch": 1.6836482683361094, + "grad_norm": 0.5704925060272217, + "learning_rate": 0.0001, + "loss": 1.403, + "step": 14657 + }, + { + "epoch": 1.6837631382459364, + "grad_norm": 0.6267727017402649, + "learning_rate": 0.0001, + "loss": 1.4628, + "step": 14658 + }, + { + "epoch": 1.6838780081557636, + "grad_norm": 0.6294718980789185, + "learning_rate": 0.0001, + "loss": 1.3524, + "step": 14659 + }, + { + "epoch": 1.6839928780655908, + "grad_norm": 0.583004891872406, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 14660 + }, + { + "epoch": 1.6841077479754178, + "grad_norm": 0.616648256778717, + "learning_rate": 0.0001, + "loss": 1.4662, + "step": 14661 + }, + { + "epoch": 1.6842226178852449, + "grad_norm": 0.6571115255355835, + "learning_rate": 0.0001, + "loss": 1.4202, + "step": 14662 + }, + { + "epoch": 1.684337487795072, + "grad_norm": 0.7342918515205383, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 14663 + }, + { + "epoch": 1.6844523577048993, + "grad_norm": 0.6430667042732239, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 14664 + }, + { + "epoch": 1.6845672276147263, + "grad_norm": 0.6896945238113403, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 14665 + }, + { + "epoch": 1.6846820975245533, + "grad_norm": 0.6307993531227112, + "learning_rate": 0.0001, + "loss": 1.2423, + "step": 14666 + }, + { + "epoch": 1.6847969674343806, + "grad_norm": 0.6446820497512817, + "learning_rate": 0.0001, + "loss": 1.259, + "step": 14667 + }, + { + "epoch": 1.6849118373442078, + "grad_norm": 0.6242921948432922, + "learning_rate": 0.0001, + "loss": 1.4609, + "step": 14668 + }, + { + "epoch": 1.6850267072540348, + "grad_norm": 0.6997743248939514, + "learning_rate": 0.0001, + "loss": 1.6098, + "step": 14669 + }, + { + "epoch": 1.6851415771638618, + "grad_norm": 0.6224357485771179, + "learning_rate": 0.0001, + "loss": 1.5104, + "step": 14670 + }, + { + "epoch": 1.685256447073689, + "grad_norm": 0.5937080383300781, + "learning_rate": 0.0001, + "loss": 1.4058, + "step": 14671 + }, + { + "epoch": 1.6853713169835163, + "grad_norm": 0.6098648309707642, + "learning_rate": 0.0001, + "loss": 1.4876, + "step": 14672 + }, + { + "epoch": 1.6854861868933433, + "grad_norm": 0.6156720519065857, + "learning_rate": 0.0001, + "loss": 1.4901, + "step": 14673 + }, + { + "epoch": 1.6856010568031703, + "grad_norm": 0.6202700734138489, + "learning_rate": 0.0001, + "loss": 1.4744, + "step": 14674 + }, + { + "epoch": 1.6857159267129975, + "grad_norm": 0.6744952201843262, + "learning_rate": 0.0001, + "loss": 1.5085, + "step": 14675 + }, + { + "epoch": 1.6858307966228248, + "grad_norm": 0.6229870319366455, + "learning_rate": 0.0001, + "loss": 1.4296, + "step": 14676 + }, + { + "epoch": 1.6859456665326518, + "grad_norm": 0.6743441224098206, + "learning_rate": 0.0001, + "loss": 1.4914, + "step": 14677 + }, + { + "epoch": 1.6860605364424788, + "grad_norm": 0.6453379988670349, + "learning_rate": 0.0001, + "loss": 1.589, + "step": 14678 + }, + { + "epoch": 1.686175406352306, + "grad_norm": 0.6405113935470581, + "learning_rate": 0.0001, + "loss": 1.4714, + "step": 14679 + }, + { + "epoch": 1.6862902762621332, + "grad_norm": 0.610621988773346, + "learning_rate": 0.0001, + "loss": 1.5101, + "step": 14680 + }, + { + "epoch": 1.6864051461719602, + "grad_norm": 0.5977540016174316, + "learning_rate": 0.0001, + "loss": 1.4575, + "step": 14681 + }, + { + "epoch": 1.6865200160817873, + "grad_norm": 0.6138947606086731, + "learning_rate": 0.0001, + "loss": 1.5132, + "step": 14682 + }, + { + "epoch": 1.6866348859916145, + "grad_norm": 0.6411577463150024, + "learning_rate": 0.0001, + "loss": 1.5665, + "step": 14683 + }, + { + "epoch": 1.6867497559014417, + "grad_norm": 0.6003040671348572, + "learning_rate": 0.0001, + "loss": 1.3768, + "step": 14684 + }, + { + "epoch": 1.6868646258112687, + "grad_norm": 0.5844460129737854, + "learning_rate": 0.0001, + "loss": 1.351, + "step": 14685 + }, + { + "epoch": 1.6869794957210957, + "grad_norm": 0.7202532887458801, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 14686 + }, + { + "epoch": 1.687094365630923, + "grad_norm": 0.6601413488388062, + "learning_rate": 0.0001, + "loss": 1.793, + "step": 14687 + }, + { + "epoch": 1.6872092355407502, + "grad_norm": 0.6533530354499817, + "learning_rate": 0.0001, + "loss": 1.4675, + "step": 14688 + }, + { + "epoch": 1.6873241054505772, + "grad_norm": 0.5973615646362305, + "learning_rate": 0.0001, + "loss": 1.4479, + "step": 14689 + }, + { + "epoch": 1.6874389753604042, + "grad_norm": 0.6223400831222534, + "learning_rate": 0.0001, + "loss": 1.5005, + "step": 14690 + }, + { + "epoch": 1.6875538452702314, + "grad_norm": 0.7212362885475159, + "learning_rate": 0.0001, + "loss": 1.7079, + "step": 14691 + }, + { + "epoch": 1.6876687151800587, + "grad_norm": 0.6673148274421692, + "learning_rate": 0.0001, + "loss": 1.428, + "step": 14692 + }, + { + "epoch": 1.6877835850898857, + "grad_norm": 0.6568313241004944, + "learning_rate": 0.0001, + "loss": 1.3289, + "step": 14693 + }, + { + "epoch": 1.6878984549997127, + "grad_norm": 0.6275414228439331, + "learning_rate": 0.0001, + "loss": 1.4829, + "step": 14694 + }, + { + "epoch": 1.68801332490954, + "grad_norm": 0.6030680537223816, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 14695 + }, + { + "epoch": 1.6881281948193672, + "grad_norm": 0.6563968658447266, + "learning_rate": 0.0001, + "loss": 1.3393, + "step": 14696 + }, + { + "epoch": 1.6882430647291942, + "grad_norm": 0.683423638343811, + "learning_rate": 0.0001, + "loss": 1.5572, + "step": 14697 + }, + { + "epoch": 1.6883579346390212, + "grad_norm": 0.6408438086509705, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 14698 + }, + { + "epoch": 1.6884728045488484, + "grad_norm": 0.6201786398887634, + "learning_rate": 0.0001, + "loss": 1.3984, + "step": 14699 + }, + { + "epoch": 1.6885876744586756, + "grad_norm": 0.6087800860404968, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 14700 + }, + { + "epoch": 1.6887025443685026, + "grad_norm": 0.6137244701385498, + "learning_rate": 0.0001, + "loss": 1.3528, + "step": 14701 + }, + { + "epoch": 1.6888174142783297, + "grad_norm": 0.5812850594520569, + "learning_rate": 0.0001, + "loss": 1.3308, + "step": 14702 + }, + { + "epoch": 1.6889322841881569, + "grad_norm": 0.6570180058479309, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 14703 + }, + { + "epoch": 1.6890471540979841, + "grad_norm": 0.6571214199066162, + "learning_rate": 0.0001, + "loss": 1.3125, + "step": 14704 + }, + { + "epoch": 1.6891620240078111, + "grad_norm": 0.6408611536026001, + "learning_rate": 0.0001, + "loss": 1.3928, + "step": 14705 + }, + { + "epoch": 1.6892768939176381, + "grad_norm": 0.6647818088531494, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 14706 + }, + { + "epoch": 1.6893917638274654, + "grad_norm": 0.6333121061325073, + "learning_rate": 0.0001, + "loss": 1.4169, + "step": 14707 + }, + { + "epoch": 1.6895066337372926, + "grad_norm": 0.6847052574157715, + "learning_rate": 0.0001, + "loss": 1.383, + "step": 14708 + }, + { + "epoch": 1.6896215036471196, + "grad_norm": 0.6509214639663696, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 14709 + }, + { + "epoch": 1.6897363735569466, + "grad_norm": 0.6630228757858276, + "learning_rate": 0.0001, + "loss": 1.4381, + "step": 14710 + }, + { + "epoch": 1.6898512434667738, + "grad_norm": 0.5848700404167175, + "learning_rate": 0.0001, + "loss": 1.3055, + "step": 14711 + }, + { + "epoch": 1.689966113376601, + "grad_norm": 0.6231377720832825, + "learning_rate": 0.0001, + "loss": 1.5535, + "step": 14712 + }, + { + "epoch": 1.690080983286428, + "grad_norm": 0.5822433233261108, + "learning_rate": 0.0001, + "loss": 1.5534, + "step": 14713 + }, + { + "epoch": 1.690195853196255, + "grad_norm": 0.5917012095451355, + "learning_rate": 0.0001, + "loss": 1.4819, + "step": 14714 + }, + { + "epoch": 1.6903107231060823, + "grad_norm": 0.5557729601860046, + "learning_rate": 0.0001, + "loss": 1.4386, + "step": 14715 + }, + { + "epoch": 1.6904255930159096, + "grad_norm": 0.6072548627853394, + "learning_rate": 0.0001, + "loss": 1.6101, + "step": 14716 + }, + { + "epoch": 1.6905404629257366, + "grad_norm": 0.6061117053031921, + "learning_rate": 0.0001, + "loss": 1.2462, + "step": 14717 + }, + { + "epoch": 1.6906553328355636, + "grad_norm": 0.6177441477775574, + "learning_rate": 0.0001, + "loss": 1.5584, + "step": 14718 + }, + { + "epoch": 1.6907702027453908, + "grad_norm": 0.6135041117668152, + "learning_rate": 0.0001, + "loss": 1.4055, + "step": 14719 + }, + { + "epoch": 1.690885072655218, + "grad_norm": 0.5727534890174866, + "learning_rate": 0.0001, + "loss": 1.2838, + "step": 14720 + }, + { + "epoch": 1.690999942565045, + "grad_norm": 0.6418645977973938, + "learning_rate": 0.0001, + "loss": 1.3512, + "step": 14721 + }, + { + "epoch": 1.691114812474872, + "grad_norm": 0.6252636909484863, + "learning_rate": 0.0001, + "loss": 1.5915, + "step": 14722 + }, + { + "epoch": 1.6912296823846993, + "grad_norm": 0.6545863151550293, + "learning_rate": 0.0001, + "loss": 1.5227, + "step": 14723 + }, + { + "epoch": 1.6913445522945265, + "grad_norm": 0.6704011559486389, + "learning_rate": 0.0001, + "loss": 1.4523, + "step": 14724 + }, + { + "epoch": 1.6914594222043535, + "grad_norm": 0.6669784188270569, + "learning_rate": 0.0001, + "loss": 1.6168, + "step": 14725 + }, + { + "epoch": 1.6915742921141805, + "grad_norm": 0.5946208238601685, + "learning_rate": 0.0001, + "loss": 1.5255, + "step": 14726 + }, + { + "epoch": 1.6916891620240078, + "grad_norm": 0.6300133466720581, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 14727 + }, + { + "epoch": 1.691804031933835, + "grad_norm": 0.6053428053855896, + "learning_rate": 0.0001, + "loss": 1.6368, + "step": 14728 + }, + { + "epoch": 1.691918901843662, + "grad_norm": 0.5880074501037598, + "learning_rate": 0.0001, + "loss": 1.3086, + "step": 14729 + }, + { + "epoch": 1.692033771753489, + "grad_norm": 0.6411834955215454, + "learning_rate": 0.0001, + "loss": 1.3448, + "step": 14730 + }, + { + "epoch": 1.6921486416633162, + "grad_norm": 0.6445175409317017, + "learning_rate": 0.0001, + "loss": 1.5241, + "step": 14731 + }, + { + "epoch": 1.6922635115731435, + "grad_norm": 0.5757904052734375, + "learning_rate": 0.0001, + "loss": 1.3863, + "step": 14732 + }, + { + "epoch": 1.6923783814829705, + "grad_norm": 0.6143217086791992, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 14733 + }, + { + "epoch": 1.6924932513927975, + "grad_norm": 0.6285145282745361, + "learning_rate": 0.0001, + "loss": 1.4525, + "step": 14734 + }, + { + "epoch": 1.6926081213026247, + "grad_norm": 0.6272004246711731, + "learning_rate": 0.0001, + "loss": 1.3839, + "step": 14735 + }, + { + "epoch": 1.692722991212452, + "grad_norm": 0.6545289754867554, + "learning_rate": 0.0001, + "loss": 1.4322, + "step": 14736 + }, + { + "epoch": 1.692837861122279, + "grad_norm": 0.6652606725692749, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 14737 + }, + { + "epoch": 1.6929527310321062, + "grad_norm": 0.6375353932380676, + "learning_rate": 0.0001, + "loss": 1.4693, + "step": 14738 + }, + { + "epoch": 1.6930676009419332, + "grad_norm": 0.6303648948669434, + "learning_rate": 0.0001, + "loss": 1.4073, + "step": 14739 + }, + { + "epoch": 1.6931824708517604, + "grad_norm": 0.6370104551315308, + "learning_rate": 0.0001, + "loss": 1.2702, + "step": 14740 + }, + { + "epoch": 1.6932973407615877, + "grad_norm": 0.723127007484436, + "learning_rate": 0.0001, + "loss": 1.4016, + "step": 14741 + }, + { + "epoch": 1.6934122106714147, + "grad_norm": 0.628434419631958, + "learning_rate": 0.0001, + "loss": 1.3892, + "step": 14742 + }, + { + "epoch": 1.6935270805812417, + "grad_norm": 0.5962461233139038, + "learning_rate": 0.0001, + "loss": 1.4348, + "step": 14743 + }, + { + "epoch": 1.693641950491069, + "grad_norm": 0.6001591682434082, + "learning_rate": 0.0001, + "loss": 1.4057, + "step": 14744 + }, + { + "epoch": 1.6937568204008961, + "grad_norm": 0.6299112439155579, + "learning_rate": 0.0001, + "loss": 1.5806, + "step": 14745 + }, + { + "epoch": 1.6938716903107232, + "grad_norm": 0.6881070137023926, + "learning_rate": 0.0001, + "loss": 1.666, + "step": 14746 + }, + { + "epoch": 1.6939865602205502, + "grad_norm": 0.5913798809051514, + "learning_rate": 0.0001, + "loss": 1.3889, + "step": 14747 + }, + { + "epoch": 1.6941014301303774, + "grad_norm": 0.627502977848053, + "learning_rate": 0.0001, + "loss": 1.5266, + "step": 14748 + }, + { + "epoch": 1.6942163000402046, + "grad_norm": 0.6564575433731079, + "learning_rate": 0.0001, + "loss": 1.4907, + "step": 14749 + }, + { + "epoch": 1.6943311699500316, + "grad_norm": 0.5897772908210754, + "learning_rate": 0.0001, + "loss": 1.3894, + "step": 14750 + }, + { + "epoch": 1.6944460398598586, + "grad_norm": 0.5913833975791931, + "learning_rate": 0.0001, + "loss": 1.2416, + "step": 14751 + }, + { + "epoch": 1.6945609097696859, + "grad_norm": 0.658932626247406, + "learning_rate": 0.0001, + "loss": 1.4408, + "step": 14752 + }, + { + "epoch": 1.694675779679513, + "grad_norm": 0.6177611351013184, + "learning_rate": 0.0001, + "loss": 1.4164, + "step": 14753 + }, + { + "epoch": 1.6947906495893401, + "grad_norm": 0.6406357884407043, + "learning_rate": 0.0001, + "loss": 1.4451, + "step": 14754 + }, + { + "epoch": 1.6949055194991671, + "grad_norm": 0.6174083948135376, + "learning_rate": 0.0001, + "loss": 1.4722, + "step": 14755 + }, + { + "epoch": 1.6950203894089944, + "grad_norm": 0.653312087059021, + "learning_rate": 0.0001, + "loss": 1.4612, + "step": 14756 + }, + { + "epoch": 1.6951352593188216, + "grad_norm": 0.6030119061470032, + "learning_rate": 0.0001, + "loss": 1.3403, + "step": 14757 + }, + { + "epoch": 1.6952501292286486, + "grad_norm": 0.6403154730796814, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 14758 + }, + { + "epoch": 1.6953649991384756, + "grad_norm": 0.6424520015716553, + "learning_rate": 0.0001, + "loss": 1.5404, + "step": 14759 + }, + { + "epoch": 1.6954798690483028, + "grad_norm": 0.6321309804916382, + "learning_rate": 0.0001, + "loss": 1.3525, + "step": 14760 + }, + { + "epoch": 1.69559473895813, + "grad_norm": 0.5908375382423401, + "learning_rate": 0.0001, + "loss": 1.4181, + "step": 14761 + }, + { + "epoch": 1.695709608867957, + "grad_norm": 0.6151296496391296, + "learning_rate": 0.0001, + "loss": 1.4344, + "step": 14762 + }, + { + "epoch": 1.695824478777784, + "grad_norm": 0.5596523880958557, + "learning_rate": 0.0001, + "loss": 1.2521, + "step": 14763 + }, + { + "epoch": 1.6959393486876113, + "grad_norm": 0.6613996028900146, + "learning_rate": 0.0001, + "loss": 1.5143, + "step": 14764 + }, + { + "epoch": 1.6960542185974385, + "grad_norm": 0.6622706055641174, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 14765 + }, + { + "epoch": 1.6961690885072656, + "grad_norm": 0.5737489461898804, + "learning_rate": 0.0001, + "loss": 1.4419, + "step": 14766 + }, + { + "epoch": 1.6962839584170926, + "grad_norm": 0.6686919927597046, + "learning_rate": 0.0001, + "loss": 1.5794, + "step": 14767 + }, + { + "epoch": 1.6963988283269198, + "grad_norm": 0.6299922466278076, + "learning_rate": 0.0001, + "loss": 1.5279, + "step": 14768 + }, + { + "epoch": 1.696513698236747, + "grad_norm": 0.7300532460212708, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 14769 + }, + { + "epoch": 1.696628568146574, + "grad_norm": 0.5972233414649963, + "learning_rate": 0.0001, + "loss": 1.4763, + "step": 14770 + }, + { + "epoch": 1.696743438056401, + "grad_norm": 0.6257323026657104, + "learning_rate": 0.0001, + "loss": 1.3211, + "step": 14771 + }, + { + "epoch": 1.6968583079662283, + "grad_norm": 0.6275603175163269, + "learning_rate": 0.0001, + "loss": 1.5752, + "step": 14772 + }, + { + "epoch": 1.6969731778760555, + "grad_norm": 0.5872368216514587, + "learning_rate": 0.0001, + "loss": 1.3845, + "step": 14773 + }, + { + "epoch": 1.6970880477858825, + "grad_norm": 0.5436793565750122, + "learning_rate": 0.0001, + "loss": 1.1879, + "step": 14774 + }, + { + "epoch": 1.6972029176957095, + "grad_norm": 0.5925673246383667, + "learning_rate": 0.0001, + "loss": 1.4837, + "step": 14775 + }, + { + "epoch": 1.6973177876055368, + "grad_norm": 0.622356653213501, + "learning_rate": 0.0001, + "loss": 1.2646, + "step": 14776 + }, + { + "epoch": 1.697432657515364, + "grad_norm": 0.644891083240509, + "learning_rate": 0.0001, + "loss": 1.4162, + "step": 14777 + }, + { + "epoch": 1.697547527425191, + "grad_norm": 0.6052226424217224, + "learning_rate": 0.0001, + "loss": 1.3639, + "step": 14778 + }, + { + "epoch": 1.697662397335018, + "grad_norm": 0.6266080737113953, + "learning_rate": 0.0001, + "loss": 1.3751, + "step": 14779 + }, + { + "epoch": 1.6977772672448452, + "grad_norm": 0.6198167204856873, + "learning_rate": 0.0001, + "loss": 1.5003, + "step": 14780 + }, + { + "epoch": 1.6978921371546725, + "grad_norm": 0.6346505284309387, + "learning_rate": 0.0001, + "loss": 1.4329, + "step": 14781 + }, + { + "epoch": 1.6980070070644995, + "grad_norm": 0.6443406939506531, + "learning_rate": 0.0001, + "loss": 1.3899, + "step": 14782 + }, + { + "epoch": 1.6981218769743265, + "grad_norm": 0.6747581958770752, + "learning_rate": 0.0001, + "loss": 1.4359, + "step": 14783 + }, + { + "epoch": 1.6982367468841537, + "grad_norm": 0.6033055186271667, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 14784 + }, + { + "epoch": 1.698351616793981, + "grad_norm": 0.6114761829376221, + "learning_rate": 0.0001, + "loss": 1.4088, + "step": 14785 + }, + { + "epoch": 1.698466486703808, + "grad_norm": 0.5994529724121094, + "learning_rate": 0.0001, + "loss": 1.4139, + "step": 14786 + }, + { + "epoch": 1.698581356613635, + "grad_norm": 0.6032854318618774, + "learning_rate": 0.0001, + "loss": 1.3684, + "step": 14787 + }, + { + "epoch": 1.6986962265234622, + "grad_norm": 0.579774796962738, + "learning_rate": 0.0001, + "loss": 1.6101, + "step": 14788 + }, + { + "epoch": 1.6988110964332894, + "grad_norm": 0.600327730178833, + "learning_rate": 0.0001, + "loss": 1.3107, + "step": 14789 + }, + { + "epoch": 1.6989259663431164, + "grad_norm": 0.6498262882232666, + "learning_rate": 0.0001, + "loss": 1.5339, + "step": 14790 + }, + { + "epoch": 1.6990408362529434, + "grad_norm": 0.6453105211257935, + "learning_rate": 0.0001, + "loss": 1.534, + "step": 14791 + }, + { + "epoch": 1.6991557061627707, + "grad_norm": 0.6776260137557983, + "learning_rate": 0.0001, + "loss": 1.4309, + "step": 14792 + }, + { + "epoch": 1.699270576072598, + "grad_norm": 0.5969988107681274, + "learning_rate": 0.0001, + "loss": 1.4604, + "step": 14793 + }, + { + "epoch": 1.699385445982425, + "grad_norm": 0.585665762424469, + "learning_rate": 0.0001, + "loss": 1.3594, + "step": 14794 + }, + { + "epoch": 1.699500315892252, + "grad_norm": 0.6432570815086365, + "learning_rate": 0.0001, + "loss": 1.5177, + "step": 14795 + }, + { + "epoch": 1.6996151858020792, + "grad_norm": 0.6451871395111084, + "learning_rate": 0.0001, + "loss": 1.3841, + "step": 14796 + }, + { + "epoch": 1.6997300557119064, + "grad_norm": 0.6202559471130371, + "learning_rate": 0.0001, + "loss": 1.5245, + "step": 14797 + }, + { + "epoch": 1.6998449256217334, + "grad_norm": 0.6412597894668579, + "learning_rate": 0.0001, + "loss": 1.4781, + "step": 14798 + }, + { + "epoch": 1.6999597955315604, + "grad_norm": 0.6816568374633789, + "learning_rate": 0.0001, + "loss": 1.4192, + "step": 14799 + }, + { + "epoch": 1.7000746654413876, + "grad_norm": 0.6473395824432373, + "learning_rate": 0.0001, + "loss": 1.5152, + "step": 14800 + }, + { + "epoch": 1.7001895353512149, + "grad_norm": 0.6455778479576111, + "learning_rate": 0.0001, + "loss": 1.2928, + "step": 14801 + }, + { + "epoch": 1.7003044052610419, + "grad_norm": 0.5928786396980286, + "learning_rate": 0.0001, + "loss": 1.2802, + "step": 14802 + }, + { + "epoch": 1.7004192751708689, + "grad_norm": 0.6279940605163574, + "learning_rate": 0.0001, + "loss": 1.2901, + "step": 14803 + }, + { + "epoch": 1.7005341450806961, + "grad_norm": 0.6435730457305908, + "learning_rate": 0.0001, + "loss": 1.4583, + "step": 14804 + }, + { + "epoch": 1.7006490149905233, + "grad_norm": 0.6767624616622925, + "learning_rate": 0.0001, + "loss": 1.5811, + "step": 14805 + }, + { + "epoch": 1.7007638849003504, + "grad_norm": 0.6514124870300293, + "learning_rate": 0.0001, + "loss": 1.589, + "step": 14806 + }, + { + "epoch": 1.7008787548101774, + "grad_norm": 0.6798242926597595, + "learning_rate": 0.0001, + "loss": 1.5002, + "step": 14807 + }, + { + "epoch": 1.7009936247200046, + "grad_norm": 0.6122627258300781, + "learning_rate": 0.0001, + "loss": 1.6312, + "step": 14808 + }, + { + "epoch": 1.7011084946298318, + "grad_norm": 0.6929191946983337, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 14809 + }, + { + "epoch": 1.7012233645396588, + "grad_norm": 0.6241300702095032, + "learning_rate": 0.0001, + "loss": 1.6087, + "step": 14810 + }, + { + "epoch": 1.7013382344494858, + "grad_norm": 0.7182061076164246, + "learning_rate": 0.0001, + "loss": 1.5046, + "step": 14811 + }, + { + "epoch": 1.701453104359313, + "grad_norm": 0.6124372482299805, + "learning_rate": 0.0001, + "loss": 1.4548, + "step": 14812 + }, + { + "epoch": 1.7015679742691403, + "grad_norm": 0.766761064529419, + "learning_rate": 0.0001, + "loss": 1.4963, + "step": 14813 + }, + { + "epoch": 1.7016828441789673, + "grad_norm": 0.664625346660614, + "learning_rate": 0.0001, + "loss": 1.2332, + "step": 14814 + }, + { + "epoch": 1.7017977140887943, + "grad_norm": 0.641326367855072, + "learning_rate": 0.0001, + "loss": 1.4563, + "step": 14815 + }, + { + "epoch": 1.7019125839986216, + "grad_norm": 0.6896613240242004, + "learning_rate": 0.0001, + "loss": 1.4501, + "step": 14816 + }, + { + "epoch": 1.7020274539084488, + "grad_norm": 0.6627568602561951, + "learning_rate": 0.0001, + "loss": 1.7495, + "step": 14817 + }, + { + "epoch": 1.7021423238182758, + "grad_norm": 0.6433359384536743, + "learning_rate": 0.0001, + "loss": 1.5985, + "step": 14818 + }, + { + "epoch": 1.7022571937281028, + "grad_norm": 0.5926421880722046, + "learning_rate": 0.0001, + "loss": 1.419, + "step": 14819 + }, + { + "epoch": 1.70237206363793, + "grad_norm": 0.6341785192489624, + "learning_rate": 0.0001, + "loss": 1.3109, + "step": 14820 + }, + { + "epoch": 1.7024869335477573, + "grad_norm": 0.5665348768234253, + "learning_rate": 0.0001, + "loss": 1.2676, + "step": 14821 + }, + { + "epoch": 1.7026018034575843, + "grad_norm": 0.6543471217155457, + "learning_rate": 0.0001, + "loss": 1.6028, + "step": 14822 + }, + { + "epoch": 1.7027166733674113, + "grad_norm": 0.6324287056922913, + "learning_rate": 0.0001, + "loss": 1.5246, + "step": 14823 + }, + { + "epoch": 1.7028315432772385, + "grad_norm": 0.6068698167800903, + "learning_rate": 0.0001, + "loss": 1.3328, + "step": 14824 + }, + { + "epoch": 1.7029464131870657, + "grad_norm": 0.6536720991134644, + "learning_rate": 0.0001, + "loss": 1.422, + "step": 14825 + }, + { + "epoch": 1.7030612830968928, + "grad_norm": 0.681016206741333, + "learning_rate": 0.0001, + "loss": 1.5969, + "step": 14826 + }, + { + "epoch": 1.7031761530067198, + "grad_norm": 0.6520375609397888, + "learning_rate": 0.0001, + "loss": 1.565, + "step": 14827 + }, + { + "epoch": 1.703291022916547, + "grad_norm": 0.6231850385665894, + "learning_rate": 0.0001, + "loss": 1.2909, + "step": 14828 + }, + { + "epoch": 1.7034058928263742, + "grad_norm": 0.6289814114570618, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 14829 + }, + { + "epoch": 1.7035207627362012, + "grad_norm": 0.687447726726532, + "learning_rate": 0.0001, + "loss": 1.2311, + "step": 14830 + }, + { + "epoch": 1.7036356326460282, + "grad_norm": 0.6084256172180176, + "learning_rate": 0.0001, + "loss": 1.512, + "step": 14831 + }, + { + "epoch": 1.7037505025558555, + "grad_norm": 0.6083263754844666, + "learning_rate": 0.0001, + "loss": 1.4937, + "step": 14832 + }, + { + "epoch": 1.7038653724656827, + "grad_norm": 0.612534761428833, + "learning_rate": 0.0001, + "loss": 1.5857, + "step": 14833 + }, + { + "epoch": 1.7039802423755097, + "grad_norm": 0.6613317728042603, + "learning_rate": 0.0001, + "loss": 1.6517, + "step": 14834 + }, + { + "epoch": 1.7040951122853367, + "grad_norm": 0.6363469958305359, + "learning_rate": 0.0001, + "loss": 1.5, + "step": 14835 + }, + { + "epoch": 1.704209982195164, + "grad_norm": 0.6956435441970825, + "learning_rate": 0.0001, + "loss": 1.6757, + "step": 14836 + }, + { + "epoch": 1.7043248521049912, + "grad_norm": 0.649994969367981, + "learning_rate": 0.0001, + "loss": 1.6309, + "step": 14837 + }, + { + "epoch": 1.7044397220148182, + "grad_norm": 0.6042583584785461, + "learning_rate": 0.0001, + "loss": 1.3096, + "step": 14838 + }, + { + "epoch": 1.7045545919246452, + "grad_norm": 0.6033551692962646, + "learning_rate": 0.0001, + "loss": 1.4756, + "step": 14839 + }, + { + "epoch": 1.7046694618344724, + "grad_norm": 0.6003922820091248, + "learning_rate": 0.0001, + "loss": 1.5267, + "step": 14840 + }, + { + "epoch": 1.7047843317442997, + "grad_norm": 0.6671146154403687, + "learning_rate": 0.0001, + "loss": 1.6061, + "step": 14841 + }, + { + "epoch": 1.7048992016541267, + "grad_norm": 0.5957947373390198, + "learning_rate": 0.0001, + "loss": 1.3582, + "step": 14842 + }, + { + "epoch": 1.7050140715639537, + "grad_norm": 0.6179354786872864, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 14843 + }, + { + "epoch": 1.705128941473781, + "grad_norm": 0.6118759512901306, + "learning_rate": 0.0001, + "loss": 1.3776, + "step": 14844 + }, + { + "epoch": 1.7052438113836081, + "grad_norm": 0.5971879959106445, + "learning_rate": 0.0001, + "loss": 1.4897, + "step": 14845 + }, + { + "epoch": 1.7053586812934352, + "grad_norm": 0.5953097343444824, + "learning_rate": 0.0001, + "loss": 1.4166, + "step": 14846 + }, + { + "epoch": 1.7054735512032622, + "grad_norm": 0.6157338619232178, + "learning_rate": 0.0001, + "loss": 1.4949, + "step": 14847 + }, + { + "epoch": 1.7055884211130894, + "grad_norm": 0.6230495572090149, + "learning_rate": 0.0001, + "loss": 1.4628, + "step": 14848 + }, + { + "epoch": 1.7057032910229166, + "grad_norm": 0.6791070699691772, + "learning_rate": 0.0001, + "loss": 1.5388, + "step": 14849 + }, + { + "epoch": 1.7058181609327436, + "grad_norm": 0.5627061128616333, + "learning_rate": 0.0001, + "loss": 1.2639, + "step": 14850 + }, + { + "epoch": 1.7059330308425706, + "grad_norm": 0.6061202883720398, + "learning_rate": 0.0001, + "loss": 1.34, + "step": 14851 + }, + { + "epoch": 1.7060479007523979, + "grad_norm": 0.6992841362953186, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 14852 + }, + { + "epoch": 1.706162770662225, + "grad_norm": 0.6365556120872498, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 14853 + }, + { + "epoch": 1.7062776405720521, + "grad_norm": 0.6153625845909119, + "learning_rate": 0.0001, + "loss": 1.2521, + "step": 14854 + }, + { + "epoch": 1.7063925104818791, + "grad_norm": 0.6573269963264465, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 14855 + }, + { + "epoch": 1.7065073803917064, + "grad_norm": 0.6061968207359314, + "learning_rate": 0.0001, + "loss": 1.4127, + "step": 14856 + }, + { + "epoch": 1.7066222503015336, + "grad_norm": 0.6206647753715515, + "learning_rate": 0.0001, + "loss": 1.3738, + "step": 14857 + }, + { + "epoch": 1.7067371202113606, + "grad_norm": 0.5990536212921143, + "learning_rate": 0.0001, + "loss": 1.5828, + "step": 14858 + }, + { + "epoch": 1.7068519901211876, + "grad_norm": 0.6412786245346069, + "learning_rate": 0.0001, + "loss": 1.6806, + "step": 14859 + }, + { + "epoch": 1.7069668600310148, + "grad_norm": 0.6077423691749573, + "learning_rate": 0.0001, + "loss": 1.4176, + "step": 14860 + }, + { + "epoch": 1.707081729940842, + "grad_norm": 0.6050223708152771, + "learning_rate": 0.0001, + "loss": 1.3676, + "step": 14861 + }, + { + "epoch": 1.707196599850669, + "grad_norm": 0.6094151735305786, + "learning_rate": 0.0001, + "loss": 1.4076, + "step": 14862 + }, + { + "epoch": 1.707311469760496, + "grad_norm": 0.6265443563461304, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 14863 + }, + { + "epoch": 1.7074263396703233, + "grad_norm": 0.6976631283760071, + "learning_rate": 0.0001, + "loss": 1.3129, + "step": 14864 + }, + { + "epoch": 1.7075412095801505, + "grad_norm": 0.6556214094161987, + "learning_rate": 0.0001, + "loss": 1.5914, + "step": 14865 + }, + { + "epoch": 1.7076560794899776, + "grad_norm": 0.5818542838096619, + "learning_rate": 0.0001, + "loss": 1.3679, + "step": 14866 + }, + { + "epoch": 1.7077709493998046, + "grad_norm": 0.5856722593307495, + "learning_rate": 0.0001, + "loss": 1.3271, + "step": 14867 + }, + { + "epoch": 1.7078858193096318, + "grad_norm": 0.5733777284622192, + "learning_rate": 0.0001, + "loss": 1.3968, + "step": 14868 + }, + { + "epoch": 1.708000689219459, + "grad_norm": 0.6631439924240112, + "learning_rate": 0.0001, + "loss": 1.5482, + "step": 14869 + }, + { + "epoch": 1.708115559129286, + "grad_norm": 0.6170898079872131, + "learning_rate": 0.0001, + "loss": 1.4636, + "step": 14870 + }, + { + "epoch": 1.708230429039113, + "grad_norm": 0.700430154800415, + "learning_rate": 0.0001, + "loss": 1.4296, + "step": 14871 + }, + { + "epoch": 1.7083452989489403, + "grad_norm": 0.6631823182106018, + "learning_rate": 0.0001, + "loss": 1.4876, + "step": 14872 + }, + { + "epoch": 1.7084601688587675, + "grad_norm": 0.6962304711341858, + "learning_rate": 0.0001, + "loss": 1.4368, + "step": 14873 + }, + { + "epoch": 1.7085750387685945, + "grad_norm": 0.5741162896156311, + "learning_rate": 0.0001, + "loss": 1.3373, + "step": 14874 + }, + { + "epoch": 1.7086899086784217, + "grad_norm": 0.6080014109611511, + "learning_rate": 0.0001, + "loss": 1.3203, + "step": 14875 + }, + { + "epoch": 1.7088047785882488, + "grad_norm": 0.6642729043960571, + "learning_rate": 0.0001, + "loss": 1.5342, + "step": 14876 + }, + { + "epoch": 1.708919648498076, + "grad_norm": 0.659485936164856, + "learning_rate": 0.0001, + "loss": 1.4116, + "step": 14877 + }, + { + "epoch": 1.7090345184079032, + "grad_norm": 0.6787165999412537, + "learning_rate": 0.0001, + "loss": 1.4067, + "step": 14878 + }, + { + "epoch": 1.7091493883177302, + "grad_norm": 0.6123861074447632, + "learning_rate": 0.0001, + "loss": 1.4821, + "step": 14879 + }, + { + "epoch": 1.7092642582275572, + "grad_norm": 0.608695387840271, + "learning_rate": 0.0001, + "loss": 1.4714, + "step": 14880 + }, + { + "epoch": 1.7093791281373845, + "grad_norm": 0.5870606899261475, + "learning_rate": 0.0001, + "loss": 1.4441, + "step": 14881 + }, + { + "epoch": 1.7094939980472117, + "grad_norm": 0.6159781813621521, + "learning_rate": 0.0001, + "loss": 1.2932, + "step": 14882 + }, + { + "epoch": 1.7096088679570387, + "grad_norm": 0.6811216473579407, + "learning_rate": 0.0001, + "loss": 1.4543, + "step": 14883 + }, + { + "epoch": 1.7097237378668657, + "grad_norm": 0.601915180683136, + "learning_rate": 0.0001, + "loss": 1.2939, + "step": 14884 + }, + { + "epoch": 1.709838607776693, + "grad_norm": 0.5753575563430786, + "learning_rate": 0.0001, + "loss": 1.4496, + "step": 14885 + }, + { + "epoch": 1.7099534776865202, + "grad_norm": 0.6104464530944824, + "learning_rate": 0.0001, + "loss": 1.4767, + "step": 14886 + }, + { + "epoch": 1.7100683475963472, + "grad_norm": 0.5839319229125977, + "learning_rate": 0.0001, + "loss": 1.2978, + "step": 14887 + }, + { + "epoch": 1.7101832175061742, + "grad_norm": 0.6496050953865051, + "learning_rate": 0.0001, + "loss": 1.6834, + "step": 14888 + }, + { + "epoch": 1.7102980874160014, + "grad_norm": 0.6293419599533081, + "learning_rate": 0.0001, + "loss": 1.4529, + "step": 14889 + }, + { + "epoch": 1.7104129573258287, + "grad_norm": 0.6812129616737366, + "learning_rate": 0.0001, + "loss": 1.534, + "step": 14890 + }, + { + "epoch": 1.7105278272356557, + "grad_norm": 0.614952802658081, + "learning_rate": 0.0001, + "loss": 1.5508, + "step": 14891 + }, + { + "epoch": 1.7106426971454827, + "grad_norm": 0.6444807648658752, + "learning_rate": 0.0001, + "loss": 1.3534, + "step": 14892 + }, + { + "epoch": 1.71075756705531, + "grad_norm": 0.7069858312606812, + "learning_rate": 0.0001, + "loss": 1.6134, + "step": 14893 + }, + { + "epoch": 1.7108724369651371, + "grad_norm": 0.6311841607093811, + "learning_rate": 0.0001, + "loss": 1.4556, + "step": 14894 + }, + { + "epoch": 1.7109873068749641, + "grad_norm": 0.645004153251648, + "learning_rate": 0.0001, + "loss": 1.4576, + "step": 14895 + }, + { + "epoch": 1.7111021767847912, + "grad_norm": 0.613746166229248, + "learning_rate": 0.0001, + "loss": 1.351, + "step": 14896 + }, + { + "epoch": 1.7112170466946184, + "grad_norm": 0.653880774974823, + "learning_rate": 0.0001, + "loss": 1.4966, + "step": 14897 + }, + { + "epoch": 1.7113319166044456, + "grad_norm": 0.6353497505187988, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 14898 + }, + { + "epoch": 1.7114467865142726, + "grad_norm": 0.6185076236724854, + "learning_rate": 0.0001, + "loss": 1.3926, + "step": 14899 + }, + { + "epoch": 1.7115616564240996, + "grad_norm": 0.6185590028762817, + "learning_rate": 0.0001, + "loss": 1.4641, + "step": 14900 + }, + { + "epoch": 1.7116765263339269, + "grad_norm": 0.6211035847663879, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 14901 + }, + { + "epoch": 1.711791396243754, + "grad_norm": 0.5859823226928711, + "learning_rate": 0.0001, + "loss": 1.3688, + "step": 14902 + }, + { + "epoch": 1.711906266153581, + "grad_norm": 0.5817895531654358, + "learning_rate": 0.0001, + "loss": 1.3958, + "step": 14903 + }, + { + "epoch": 1.7120211360634081, + "grad_norm": 0.6351556181907654, + "learning_rate": 0.0001, + "loss": 1.5479, + "step": 14904 + }, + { + "epoch": 1.7121360059732353, + "grad_norm": 0.6283468008041382, + "learning_rate": 0.0001, + "loss": 1.496, + "step": 14905 + }, + { + "epoch": 1.7122508758830626, + "grad_norm": 0.6462883353233337, + "learning_rate": 0.0001, + "loss": 1.5196, + "step": 14906 + }, + { + "epoch": 1.7123657457928896, + "grad_norm": 0.6173494458198547, + "learning_rate": 0.0001, + "loss": 1.4087, + "step": 14907 + }, + { + "epoch": 1.7124806157027166, + "grad_norm": 0.5699242949485779, + "learning_rate": 0.0001, + "loss": 1.5066, + "step": 14908 + }, + { + "epoch": 1.7125954856125438, + "grad_norm": 0.6764625310897827, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 14909 + }, + { + "epoch": 1.712710355522371, + "grad_norm": 0.6505712866783142, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 14910 + }, + { + "epoch": 1.712825225432198, + "grad_norm": 0.6406113505363464, + "learning_rate": 0.0001, + "loss": 1.4806, + "step": 14911 + }, + { + "epoch": 1.712940095342025, + "grad_norm": 0.621518075466156, + "learning_rate": 0.0001, + "loss": 1.5465, + "step": 14912 + }, + { + "epoch": 1.7130549652518523, + "grad_norm": 0.6753479838371277, + "learning_rate": 0.0001, + "loss": 1.7465, + "step": 14913 + }, + { + "epoch": 1.7131698351616795, + "grad_norm": 0.7874284982681274, + "learning_rate": 0.0001, + "loss": 1.5676, + "step": 14914 + }, + { + "epoch": 1.7132847050715065, + "grad_norm": 0.6016520857810974, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 14915 + }, + { + "epoch": 1.7133995749813336, + "grad_norm": 0.6585147976875305, + "learning_rate": 0.0001, + "loss": 1.6068, + "step": 14916 + }, + { + "epoch": 1.7135144448911608, + "grad_norm": 0.5877991318702698, + "learning_rate": 0.0001, + "loss": 1.4722, + "step": 14917 + }, + { + "epoch": 1.713629314800988, + "grad_norm": 0.5988264083862305, + "learning_rate": 0.0001, + "loss": 1.6484, + "step": 14918 + }, + { + "epoch": 1.713744184710815, + "grad_norm": 0.7021918296813965, + "learning_rate": 0.0001, + "loss": 1.5297, + "step": 14919 + }, + { + "epoch": 1.713859054620642, + "grad_norm": 0.6114031076431274, + "learning_rate": 0.0001, + "loss": 1.4106, + "step": 14920 + }, + { + "epoch": 1.7139739245304693, + "grad_norm": 0.646767258644104, + "learning_rate": 0.0001, + "loss": 1.4992, + "step": 14921 + }, + { + "epoch": 1.7140887944402965, + "grad_norm": 0.598789393901825, + "learning_rate": 0.0001, + "loss": 1.3379, + "step": 14922 + }, + { + "epoch": 1.7142036643501235, + "grad_norm": 0.6237581372261047, + "learning_rate": 0.0001, + "loss": 1.4198, + "step": 14923 + }, + { + "epoch": 1.7143185342599505, + "grad_norm": 0.613361120223999, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 14924 + }, + { + "epoch": 1.7144334041697777, + "grad_norm": 0.654420018196106, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 14925 + }, + { + "epoch": 1.714548274079605, + "grad_norm": 0.6176064014434814, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 14926 + }, + { + "epoch": 1.714663143989432, + "grad_norm": 0.6290966272354126, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 14927 + }, + { + "epoch": 1.714778013899259, + "grad_norm": 0.6438091397285461, + "learning_rate": 0.0001, + "loss": 1.3984, + "step": 14928 + }, + { + "epoch": 1.7148928838090862, + "grad_norm": 0.6490173935890198, + "learning_rate": 0.0001, + "loss": 1.5162, + "step": 14929 + }, + { + "epoch": 1.7150077537189135, + "grad_norm": 0.5619537830352783, + "learning_rate": 0.0001, + "loss": 1.4052, + "step": 14930 + }, + { + "epoch": 1.7151226236287405, + "grad_norm": 0.5740304589271545, + "learning_rate": 0.0001, + "loss": 1.3887, + "step": 14931 + }, + { + "epoch": 1.7152374935385675, + "grad_norm": 0.5972532629966736, + "learning_rate": 0.0001, + "loss": 1.488, + "step": 14932 + }, + { + "epoch": 1.7153523634483947, + "grad_norm": 0.6191779375076294, + "learning_rate": 0.0001, + "loss": 1.3594, + "step": 14933 + }, + { + "epoch": 1.715467233358222, + "grad_norm": 0.6333027482032776, + "learning_rate": 0.0001, + "loss": 1.3897, + "step": 14934 + }, + { + "epoch": 1.715582103268049, + "grad_norm": 0.5880884528160095, + "learning_rate": 0.0001, + "loss": 1.243, + "step": 14935 + }, + { + "epoch": 1.715696973177876, + "grad_norm": 0.7410986423492432, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 14936 + }, + { + "epoch": 1.7158118430877032, + "grad_norm": 0.7331770062446594, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 14937 + }, + { + "epoch": 1.7159267129975304, + "grad_norm": 0.6188743114471436, + "learning_rate": 0.0001, + "loss": 1.5405, + "step": 14938 + }, + { + "epoch": 1.7160415829073574, + "grad_norm": 0.6449677348136902, + "learning_rate": 0.0001, + "loss": 1.8204, + "step": 14939 + }, + { + "epoch": 1.7161564528171844, + "grad_norm": 0.6289303302764893, + "learning_rate": 0.0001, + "loss": 1.5189, + "step": 14940 + }, + { + "epoch": 1.7162713227270117, + "grad_norm": 0.5749282836914062, + "learning_rate": 0.0001, + "loss": 1.3974, + "step": 14941 + }, + { + "epoch": 1.716386192636839, + "grad_norm": 0.6385175585746765, + "learning_rate": 0.0001, + "loss": 1.4408, + "step": 14942 + }, + { + "epoch": 1.716501062546666, + "grad_norm": 0.6189382672309875, + "learning_rate": 0.0001, + "loss": 1.5049, + "step": 14943 + }, + { + "epoch": 1.716615932456493, + "grad_norm": 0.7417494058609009, + "learning_rate": 0.0001, + "loss": 1.6693, + "step": 14944 + }, + { + "epoch": 1.7167308023663201, + "grad_norm": 0.5887095332145691, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 14945 + }, + { + "epoch": 1.7168456722761474, + "grad_norm": 0.591689944267273, + "learning_rate": 0.0001, + "loss": 1.3988, + "step": 14946 + }, + { + "epoch": 1.7169605421859744, + "grad_norm": 0.7112393975257874, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 14947 + }, + { + "epoch": 1.7170754120958014, + "grad_norm": 0.6112138032913208, + "learning_rate": 0.0001, + "loss": 1.3103, + "step": 14948 + }, + { + "epoch": 1.7171902820056286, + "grad_norm": 0.6022821664810181, + "learning_rate": 0.0001, + "loss": 1.2665, + "step": 14949 + }, + { + "epoch": 1.7173051519154559, + "grad_norm": 0.6003796458244324, + "learning_rate": 0.0001, + "loss": 1.3495, + "step": 14950 + }, + { + "epoch": 1.7174200218252829, + "grad_norm": 0.7552711963653564, + "learning_rate": 0.0001, + "loss": 1.4782, + "step": 14951 + }, + { + "epoch": 1.7175348917351099, + "grad_norm": 0.6487937569618225, + "learning_rate": 0.0001, + "loss": 1.4913, + "step": 14952 + }, + { + "epoch": 1.717649761644937, + "grad_norm": 0.5925029516220093, + "learning_rate": 0.0001, + "loss": 1.3924, + "step": 14953 + }, + { + "epoch": 1.7177646315547643, + "grad_norm": 0.6977973580360413, + "learning_rate": 0.0001, + "loss": 1.534, + "step": 14954 + }, + { + "epoch": 1.7178795014645913, + "grad_norm": 0.6343498826026917, + "learning_rate": 0.0001, + "loss": 1.5123, + "step": 14955 + }, + { + "epoch": 1.7179943713744183, + "grad_norm": 0.608635663986206, + "learning_rate": 0.0001, + "loss": 1.3089, + "step": 14956 + }, + { + "epoch": 1.7181092412842456, + "grad_norm": 0.6102705001831055, + "learning_rate": 0.0001, + "loss": 1.4239, + "step": 14957 + }, + { + "epoch": 1.7182241111940728, + "grad_norm": 0.6915038824081421, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 14958 + }, + { + "epoch": 1.7183389811038998, + "grad_norm": 0.6009780764579773, + "learning_rate": 0.0001, + "loss": 1.5457, + "step": 14959 + }, + { + "epoch": 1.7184538510137268, + "grad_norm": 0.5998890399932861, + "learning_rate": 0.0001, + "loss": 1.4216, + "step": 14960 + }, + { + "epoch": 1.718568720923554, + "grad_norm": 0.6564615964889526, + "learning_rate": 0.0001, + "loss": 1.5023, + "step": 14961 + }, + { + "epoch": 1.7186835908333813, + "grad_norm": 0.5995262861251831, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 14962 + }, + { + "epoch": 1.7187984607432083, + "grad_norm": 0.5520354509353638, + "learning_rate": 0.0001, + "loss": 1.3103, + "step": 14963 + }, + { + "epoch": 1.7189133306530353, + "grad_norm": 0.5722663402557373, + "learning_rate": 0.0001, + "loss": 1.2938, + "step": 14964 + }, + { + "epoch": 1.7190282005628625, + "grad_norm": 0.6511684060096741, + "learning_rate": 0.0001, + "loss": 1.5674, + "step": 14965 + }, + { + "epoch": 1.7191430704726898, + "grad_norm": 0.6239314675331116, + "learning_rate": 0.0001, + "loss": 1.4849, + "step": 14966 + }, + { + "epoch": 1.7192579403825168, + "grad_norm": 0.6765071153640747, + "learning_rate": 0.0001, + "loss": 1.7592, + "step": 14967 + }, + { + "epoch": 1.7193728102923438, + "grad_norm": 0.6002155542373657, + "learning_rate": 0.0001, + "loss": 1.4651, + "step": 14968 + }, + { + "epoch": 1.719487680202171, + "grad_norm": 0.6126995086669922, + "learning_rate": 0.0001, + "loss": 1.3875, + "step": 14969 + }, + { + "epoch": 1.7196025501119983, + "grad_norm": 0.6200347542762756, + "learning_rate": 0.0001, + "loss": 1.4829, + "step": 14970 + }, + { + "epoch": 1.7197174200218253, + "grad_norm": 0.6229084730148315, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 14971 + }, + { + "epoch": 1.7198322899316523, + "grad_norm": 0.5846944451332092, + "learning_rate": 0.0001, + "loss": 1.2907, + "step": 14972 + }, + { + "epoch": 1.7199471598414795, + "grad_norm": 0.6202731132507324, + "learning_rate": 0.0001, + "loss": 1.4005, + "step": 14973 + }, + { + "epoch": 1.7200620297513067, + "grad_norm": 0.5932221412658691, + "learning_rate": 0.0001, + "loss": 1.3362, + "step": 14974 + }, + { + "epoch": 1.7201768996611337, + "grad_norm": 0.5857172012329102, + "learning_rate": 0.0001, + "loss": 1.3536, + "step": 14975 + }, + { + "epoch": 1.7202917695709607, + "grad_norm": 0.6796727776527405, + "learning_rate": 0.0001, + "loss": 1.5061, + "step": 14976 + }, + { + "epoch": 1.720406639480788, + "grad_norm": 0.6333981156349182, + "learning_rate": 0.0001, + "loss": 1.5462, + "step": 14977 + }, + { + "epoch": 1.7205215093906152, + "grad_norm": 0.6612197160720825, + "learning_rate": 0.0001, + "loss": 1.3956, + "step": 14978 + }, + { + "epoch": 1.7206363793004422, + "grad_norm": 0.6356850862503052, + "learning_rate": 0.0001, + "loss": 1.4098, + "step": 14979 + }, + { + "epoch": 1.7207512492102692, + "grad_norm": 0.6078984141349792, + "learning_rate": 0.0001, + "loss": 1.4597, + "step": 14980 + }, + { + "epoch": 1.7208661191200965, + "grad_norm": 0.6189718842506409, + "learning_rate": 0.0001, + "loss": 1.4904, + "step": 14981 + }, + { + "epoch": 1.7209809890299237, + "grad_norm": 0.5823473334312439, + "learning_rate": 0.0001, + "loss": 1.3974, + "step": 14982 + }, + { + "epoch": 1.7210958589397507, + "grad_norm": 0.6768457889556885, + "learning_rate": 0.0001, + "loss": 1.5662, + "step": 14983 + }, + { + "epoch": 1.7212107288495777, + "grad_norm": 0.6085696220397949, + "learning_rate": 0.0001, + "loss": 1.4566, + "step": 14984 + }, + { + "epoch": 1.721325598759405, + "grad_norm": 0.6197189092636108, + "learning_rate": 0.0001, + "loss": 1.4594, + "step": 14985 + }, + { + "epoch": 1.7214404686692322, + "grad_norm": 0.6606781482696533, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 14986 + }, + { + "epoch": 1.7215553385790592, + "grad_norm": 0.5829959511756897, + "learning_rate": 0.0001, + "loss": 1.2158, + "step": 14987 + }, + { + "epoch": 1.7216702084888862, + "grad_norm": 0.6431682705879211, + "learning_rate": 0.0001, + "loss": 1.3648, + "step": 14988 + }, + { + "epoch": 1.7217850783987134, + "grad_norm": 0.6825764775276184, + "learning_rate": 0.0001, + "loss": 1.5493, + "step": 14989 + }, + { + "epoch": 1.7218999483085407, + "grad_norm": 0.5983292460441589, + "learning_rate": 0.0001, + "loss": 1.3756, + "step": 14990 + }, + { + "epoch": 1.7220148182183677, + "grad_norm": 0.6294005513191223, + "learning_rate": 0.0001, + "loss": 1.2658, + "step": 14991 + }, + { + "epoch": 1.7221296881281947, + "grad_norm": 0.6360632181167603, + "learning_rate": 0.0001, + "loss": 1.5104, + "step": 14992 + }, + { + "epoch": 1.722244558038022, + "grad_norm": 0.5845391154289246, + "learning_rate": 0.0001, + "loss": 1.2789, + "step": 14993 + }, + { + "epoch": 1.7223594279478491, + "grad_norm": 0.7193369269371033, + "learning_rate": 0.0001, + "loss": 1.5247, + "step": 14994 + }, + { + "epoch": 1.7224742978576761, + "grad_norm": 0.583917498588562, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 14995 + }, + { + "epoch": 1.7225891677675031, + "grad_norm": 0.6654466986656189, + "learning_rate": 0.0001, + "loss": 1.5591, + "step": 14996 + }, + { + "epoch": 1.7227040376773304, + "grad_norm": 0.5956998467445374, + "learning_rate": 0.0001, + "loss": 1.2864, + "step": 14997 + }, + { + "epoch": 1.7228189075871576, + "grad_norm": 0.6005557179450989, + "learning_rate": 0.0001, + "loss": 1.2126, + "step": 14998 + }, + { + "epoch": 1.7229337774969846, + "grad_norm": 0.6229847073554993, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 14999 + }, + { + "epoch": 1.7230486474068116, + "grad_norm": 0.5950932502746582, + "learning_rate": 0.0001, + "loss": 1.4798, + "step": 15000 + }, + { + "epoch": 1.7231635173166389, + "grad_norm": 0.627495288848877, + "learning_rate": 0.0001, + "loss": 1.4347, + "step": 15001 + }, + { + "epoch": 1.723278387226466, + "grad_norm": 0.5470762252807617, + "learning_rate": 0.0001, + "loss": 1.1846, + "step": 15002 + }, + { + "epoch": 1.723393257136293, + "grad_norm": 0.5681390166282654, + "learning_rate": 0.0001, + "loss": 1.3358, + "step": 15003 + }, + { + "epoch": 1.72350812704612, + "grad_norm": 0.5831446647644043, + "learning_rate": 0.0001, + "loss": 1.4402, + "step": 15004 + }, + { + "epoch": 1.7236229969559473, + "grad_norm": 0.6146191358566284, + "learning_rate": 0.0001, + "loss": 1.4376, + "step": 15005 + }, + { + "epoch": 1.7237378668657746, + "grad_norm": 0.6659471988677979, + "learning_rate": 0.0001, + "loss": 1.3841, + "step": 15006 + }, + { + "epoch": 1.7238527367756016, + "grad_norm": 0.6301655173301697, + "learning_rate": 0.0001, + "loss": 1.4554, + "step": 15007 + }, + { + "epoch": 1.7239676066854286, + "grad_norm": 0.6276340484619141, + "learning_rate": 0.0001, + "loss": 1.5274, + "step": 15008 + }, + { + "epoch": 1.7240824765952558, + "grad_norm": 0.5837733745574951, + "learning_rate": 0.0001, + "loss": 1.4584, + "step": 15009 + }, + { + "epoch": 1.724197346505083, + "grad_norm": 0.5841119885444641, + "learning_rate": 0.0001, + "loss": 1.5087, + "step": 15010 + }, + { + "epoch": 1.72431221641491, + "grad_norm": 0.6044458746910095, + "learning_rate": 0.0001, + "loss": 1.5023, + "step": 15011 + }, + { + "epoch": 1.7244270863247373, + "grad_norm": 0.6079664826393127, + "learning_rate": 0.0001, + "loss": 1.4079, + "step": 15012 + }, + { + "epoch": 1.7245419562345643, + "grad_norm": 0.8847506642341614, + "learning_rate": 0.0001, + "loss": 1.6107, + "step": 15013 + }, + { + "epoch": 1.7246568261443915, + "grad_norm": 0.6034465432167053, + "learning_rate": 0.0001, + "loss": 1.2711, + "step": 15014 + }, + { + "epoch": 1.7247716960542188, + "grad_norm": 0.8095591068267822, + "learning_rate": 0.0001, + "loss": 1.4583, + "step": 15015 + }, + { + "epoch": 1.7248865659640458, + "grad_norm": 0.5719621777534485, + "learning_rate": 0.0001, + "loss": 1.427, + "step": 15016 + }, + { + "epoch": 1.7250014358738728, + "grad_norm": 0.6097038388252258, + "learning_rate": 0.0001, + "loss": 1.5195, + "step": 15017 + }, + { + "epoch": 1.7251163057837, + "grad_norm": 0.64508455991745, + "learning_rate": 0.0001, + "loss": 1.5549, + "step": 15018 + }, + { + "epoch": 1.7252311756935272, + "grad_norm": 0.5853515863418579, + "learning_rate": 0.0001, + "loss": 1.3652, + "step": 15019 + }, + { + "epoch": 1.7253460456033543, + "grad_norm": 0.6391353011131287, + "learning_rate": 0.0001, + "loss": 1.4132, + "step": 15020 + }, + { + "epoch": 1.7254609155131813, + "grad_norm": 0.6568958163261414, + "learning_rate": 0.0001, + "loss": 1.5182, + "step": 15021 + }, + { + "epoch": 1.7255757854230085, + "grad_norm": 0.6337956190109253, + "learning_rate": 0.0001, + "loss": 1.5081, + "step": 15022 + }, + { + "epoch": 1.7256906553328357, + "grad_norm": 0.682131290435791, + "learning_rate": 0.0001, + "loss": 1.4523, + "step": 15023 + }, + { + "epoch": 1.7258055252426627, + "grad_norm": 0.6506146192550659, + "learning_rate": 0.0001, + "loss": 1.5401, + "step": 15024 + }, + { + "epoch": 1.7259203951524897, + "grad_norm": 0.6392216086387634, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 15025 + }, + { + "epoch": 1.726035265062317, + "grad_norm": 0.6349377632141113, + "learning_rate": 0.0001, + "loss": 1.3924, + "step": 15026 + }, + { + "epoch": 1.7261501349721442, + "grad_norm": 0.588093638420105, + "learning_rate": 0.0001, + "loss": 1.5297, + "step": 15027 + }, + { + "epoch": 1.7262650048819712, + "grad_norm": 0.6122344732284546, + "learning_rate": 0.0001, + "loss": 1.537, + "step": 15028 + }, + { + "epoch": 1.7263798747917982, + "grad_norm": 0.6537800431251526, + "learning_rate": 0.0001, + "loss": 1.5847, + "step": 15029 + }, + { + "epoch": 1.7264947447016255, + "grad_norm": 0.6109773516654968, + "learning_rate": 0.0001, + "loss": 1.4354, + "step": 15030 + }, + { + "epoch": 1.7266096146114527, + "grad_norm": 0.576797604560852, + "learning_rate": 0.0001, + "loss": 1.3143, + "step": 15031 + }, + { + "epoch": 1.7267244845212797, + "grad_norm": 0.6462949514389038, + "learning_rate": 0.0001, + "loss": 1.4408, + "step": 15032 + }, + { + "epoch": 1.7268393544311067, + "grad_norm": 0.6329190731048584, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 15033 + }, + { + "epoch": 1.726954224340934, + "grad_norm": 0.5958088040351868, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 15034 + }, + { + "epoch": 1.7270690942507612, + "grad_norm": 0.630052387714386, + "learning_rate": 0.0001, + "loss": 1.3768, + "step": 15035 + }, + { + "epoch": 1.7271839641605882, + "grad_norm": 0.667178750038147, + "learning_rate": 0.0001, + "loss": 1.3865, + "step": 15036 + }, + { + "epoch": 1.7272988340704152, + "grad_norm": 0.7113081216812134, + "learning_rate": 0.0001, + "loss": 1.7048, + "step": 15037 + }, + { + "epoch": 1.7274137039802424, + "grad_norm": 0.6631393432617188, + "learning_rate": 0.0001, + "loss": 1.4899, + "step": 15038 + }, + { + "epoch": 1.7275285738900696, + "grad_norm": 0.6248582005500793, + "learning_rate": 0.0001, + "loss": 1.5667, + "step": 15039 + }, + { + "epoch": 1.7276434437998966, + "grad_norm": 0.6079177260398865, + "learning_rate": 0.0001, + "loss": 1.3492, + "step": 15040 + }, + { + "epoch": 1.7277583137097237, + "grad_norm": 0.5928921699523926, + "learning_rate": 0.0001, + "loss": 1.4487, + "step": 15041 + }, + { + "epoch": 1.727873183619551, + "grad_norm": 0.6063787341117859, + "learning_rate": 0.0001, + "loss": 1.225, + "step": 15042 + }, + { + "epoch": 1.7279880535293781, + "grad_norm": 0.6443949937820435, + "learning_rate": 0.0001, + "loss": 1.5223, + "step": 15043 + }, + { + "epoch": 1.7281029234392051, + "grad_norm": 0.6002311110496521, + "learning_rate": 0.0001, + "loss": 1.474, + "step": 15044 + }, + { + "epoch": 1.7282177933490321, + "grad_norm": 0.6543359756469727, + "learning_rate": 0.0001, + "loss": 1.4626, + "step": 15045 + }, + { + "epoch": 1.7283326632588594, + "grad_norm": 0.6799443364143372, + "learning_rate": 0.0001, + "loss": 1.5299, + "step": 15046 + }, + { + "epoch": 1.7284475331686866, + "grad_norm": 0.6205649971961975, + "learning_rate": 0.0001, + "loss": 1.2928, + "step": 15047 + }, + { + "epoch": 1.7285624030785136, + "grad_norm": 0.589119553565979, + "learning_rate": 0.0001, + "loss": 1.5165, + "step": 15048 + }, + { + "epoch": 1.7286772729883406, + "grad_norm": 0.6019067168235779, + "learning_rate": 0.0001, + "loss": 1.3567, + "step": 15049 + }, + { + "epoch": 1.7287921428981678, + "grad_norm": 0.5993046760559082, + "learning_rate": 0.0001, + "loss": 1.3804, + "step": 15050 + }, + { + "epoch": 1.728907012807995, + "grad_norm": 0.6053866147994995, + "learning_rate": 0.0001, + "loss": 1.4004, + "step": 15051 + }, + { + "epoch": 1.729021882717822, + "grad_norm": 0.5996253490447998, + "learning_rate": 0.0001, + "loss": 1.1198, + "step": 15052 + }, + { + "epoch": 1.729136752627649, + "grad_norm": 0.6605576872825623, + "learning_rate": 0.0001, + "loss": 1.4068, + "step": 15053 + }, + { + "epoch": 1.7292516225374763, + "grad_norm": 0.5937269926071167, + "learning_rate": 0.0001, + "loss": 1.4028, + "step": 15054 + }, + { + "epoch": 1.7293664924473036, + "grad_norm": 0.6884070038795471, + "learning_rate": 0.0001, + "loss": 1.5806, + "step": 15055 + }, + { + "epoch": 1.7294813623571306, + "grad_norm": 0.6444214582443237, + "learning_rate": 0.0001, + "loss": 1.4598, + "step": 15056 + }, + { + "epoch": 1.7295962322669576, + "grad_norm": 0.6065829992294312, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 15057 + }, + { + "epoch": 1.7297111021767848, + "grad_norm": 0.6256981492042542, + "learning_rate": 0.0001, + "loss": 1.2783, + "step": 15058 + }, + { + "epoch": 1.729825972086612, + "grad_norm": 0.641852617263794, + "learning_rate": 0.0001, + "loss": 1.6474, + "step": 15059 + }, + { + "epoch": 1.729940841996439, + "grad_norm": 0.5563915967941284, + "learning_rate": 0.0001, + "loss": 1.4621, + "step": 15060 + }, + { + "epoch": 1.730055711906266, + "grad_norm": 0.6508216261863708, + "learning_rate": 0.0001, + "loss": 1.3978, + "step": 15061 + }, + { + "epoch": 1.7301705818160933, + "grad_norm": 0.61888188123703, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 15062 + }, + { + "epoch": 1.7302854517259205, + "grad_norm": 0.6529124975204468, + "learning_rate": 0.0001, + "loss": 1.3718, + "step": 15063 + }, + { + "epoch": 1.7304003216357475, + "grad_norm": 0.5647428631782532, + "learning_rate": 0.0001, + "loss": 1.4557, + "step": 15064 + }, + { + "epoch": 1.7305151915455745, + "grad_norm": 0.710871696472168, + "learning_rate": 0.0001, + "loss": 1.6701, + "step": 15065 + }, + { + "epoch": 1.7306300614554018, + "grad_norm": 0.6646068692207336, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 15066 + }, + { + "epoch": 1.730744931365229, + "grad_norm": 0.6500128507614136, + "learning_rate": 0.0001, + "loss": 1.5042, + "step": 15067 + }, + { + "epoch": 1.730859801275056, + "grad_norm": 0.6322208642959595, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 15068 + }, + { + "epoch": 1.730974671184883, + "grad_norm": 0.5904911756515503, + "learning_rate": 0.0001, + "loss": 1.4433, + "step": 15069 + }, + { + "epoch": 1.7310895410947102, + "grad_norm": 0.639840841293335, + "learning_rate": 0.0001, + "loss": 1.5974, + "step": 15070 + }, + { + "epoch": 1.7312044110045375, + "grad_norm": 0.6180384159088135, + "learning_rate": 0.0001, + "loss": 1.4879, + "step": 15071 + }, + { + "epoch": 1.7313192809143645, + "grad_norm": 0.5702844858169556, + "learning_rate": 0.0001, + "loss": 1.3738, + "step": 15072 + }, + { + "epoch": 1.7314341508241915, + "grad_norm": 0.6036345362663269, + "learning_rate": 0.0001, + "loss": 1.5192, + "step": 15073 + }, + { + "epoch": 1.7315490207340187, + "grad_norm": 0.6159979104995728, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 15074 + }, + { + "epoch": 1.731663890643846, + "grad_norm": 0.5626546740531921, + "learning_rate": 0.0001, + "loss": 1.422, + "step": 15075 + }, + { + "epoch": 1.731778760553673, + "grad_norm": 0.6199283599853516, + "learning_rate": 0.0001, + "loss": 1.5527, + "step": 15076 + }, + { + "epoch": 1.7318936304635, + "grad_norm": 0.6209295392036438, + "learning_rate": 0.0001, + "loss": 1.5764, + "step": 15077 + }, + { + "epoch": 1.7320085003733272, + "grad_norm": 0.574235200881958, + "learning_rate": 0.0001, + "loss": 1.4249, + "step": 15078 + }, + { + "epoch": 1.7321233702831544, + "grad_norm": 0.6106351017951965, + "learning_rate": 0.0001, + "loss": 1.5343, + "step": 15079 + }, + { + "epoch": 1.7322382401929814, + "grad_norm": 0.6253929138183594, + "learning_rate": 0.0001, + "loss": 1.3034, + "step": 15080 + }, + { + "epoch": 1.7323531101028085, + "grad_norm": 0.6872732639312744, + "learning_rate": 0.0001, + "loss": 1.5674, + "step": 15081 + }, + { + "epoch": 1.7324679800126357, + "grad_norm": 0.6001285314559937, + "learning_rate": 0.0001, + "loss": 1.3798, + "step": 15082 + }, + { + "epoch": 1.732582849922463, + "grad_norm": 0.5744822025299072, + "learning_rate": 0.0001, + "loss": 1.5976, + "step": 15083 + }, + { + "epoch": 1.73269771983229, + "grad_norm": 0.5871395468711853, + "learning_rate": 0.0001, + "loss": 1.4265, + "step": 15084 + }, + { + "epoch": 1.732812589742117, + "grad_norm": 0.586462140083313, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 15085 + }, + { + "epoch": 1.7329274596519442, + "grad_norm": 0.6341403722763062, + "learning_rate": 0.0001, + "loss": 1.4599, + "step": 15086 + }, + { + "epoch": 1.7330423295617714, + "grad_norm": 0.5986180901527405, + "learning_rate": 0.0001, + "loss": 1.3682, + "step": 15087 + }, + { + "epoch": 1.7331571994715984, + "grad_norm": 0.6208639144897461, + "learning_rate": 0.0001, + "loss": 1.5485, + "step": 15088 + }, + { + "epoch": 1.7332720693814254, + "grad_norm": 0.684834361076355, + "learning_rate": 0.0001, + "loss": 1.5264, + "step": 15089 + }, + { + "epoch": 1.7333869392912526, + "grad_norm": 0.692686915397644, + "learning_rate": 0.0001, + "loss": 1.441, + "step": 15090 + }, + { + "epoch": 1.7335018092010799, + "grad_norm": 0.621494710445404, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 15091 + }, + { + "epoch": 1.7336166791109069, + "grad_norm": 0.6340914368629456, + "learning_rate": 0.0001, + "loss": 1.5213, + "step": 15092 + }, + { + "epoch": 1.733731549020734, + "grad_norm": 0.6137570142745972, + "learning_rate": 0.0001, + "loss": 1.3492, + "step": 15093 + }, + { + "epoch": 1.7338464189305611, + "grad_norm": 0.6438025832176208, + "learning_rate": 0.0001, + "loss": 1.4728, + "step": 15094 + }, + { + "epoch": 1.7339612888403884, + "grad_norm": 0.5989691615104675, + "learning_rate": 0.0001, + "loss": 1.4918, + "step": 15095 + }, + { + "epoch": 1.7340761587502154, + "grad_norm": 0.6587399840354919, + "learning_rate": 0.0001, + "loss": 1.2788, + "step": 15096 + }, + { + "epoch": 1.7341910286600424, + "grad_norm": 0.6119810342788696, + "learning_rate": 0.0001, + "loss": 1.3324, + "step": 15097 + }, + { + "epoch": 1.7343058985698696, + "grad_norm": 0.6475637555122375, + "learning_rate": 0.0001, + "loss": 1.6146, + "step": 15098 + }, + { + "epoch": 1.7344207684796968, + "grad_norm": 0.5902281403541565, + "learning_rate": 0.0001, + "loss": 1.4495, + "step": 15099 + }, + { + "epoch": 1.7345356383895238, + "grad_norm": 0.5807440876960754, + "learning_rate": 0.0001, + "loss": 1.3451, + "step": 15100 + }, + { + "epoch": 1.7346505082993509, + "grad_norm": 0.6495022773742676, + "learning_rate": 0.0001, + "loss": 1.5186, + "step": 15101 + }, + { + "epoch": 1.734765378209178, + "grad_norm": 0.65216064453125, + "learning_rate": 0.0001, + "loss": 1.4218, + "step": 15102 + }, + { + "epoch": 1.7348802481190053, + "grad_norm": 0.6560716032981873, + "learning_rate": 0.0001, + "loss": 1.4779, + "step": 15103 + }, + { + "epoch": 1.7349951180288323, + "grad_norm": 0.5937643051147461, + "learning_rate": 0.0001, + "loss": 1.3291, + "step": 15104 + }, + { + "epoch": 1.7351099879386593, + "grad_norm": 0.6294172406196594, + "learning_rate": 0.0001, + "loss": 1.2777, + "step": 15105 + }, + { + "epoch": 1.7352248578484866, + "grad_norm": 0.6666632890701294, + "learning_rate": 0.0001, + "loss": 1.3235, + "step": 15106 + }, + { + "epoch": 1.7353397277583138, + "grad_norm": 0.6566066741943359, + "learning_rate": 0.0001, + "loss": 1.3513, + "step": 15107 + }, + { + "epoch": 1.7354545976681408, + "grad_norm": 0.6714562773704529, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 15108 + }, + { + "epoch": 1.7355694675779678, + "grad_norm": 0.5992361307144165, + "learning_rate": 0.0001, + "loss": 1.3927, + "step": 15109 + }, + { + "epoch": 1.735684337487795, + "grad_norm": 0.5907565951347351, + "learning_rate": 0.0001, + "loss": 1.5398, + "step": 15110 + }, + { + "epoch": 1.7357992073976223, + "grad_norm": 0.6381534934043884, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 15111 + }, + { + "epoch": 1.7359140773074493, + "grad_norm": 0.5985099077224731, + "learning_rate": 0.0001, + "loss": 1.4673, + "step": 15112 + }, + { + "epoch": 1.7360289472172763, + "grad_norm": 0.566951334476471, + "learning_rate": 0.0001, + "loss": 1.4824, + "step": 15113 + }, + { + "epoch": 1.7361438171271035, + "grad_norm": 0.6164551973342896, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 15114 + }, + { + "epoch": 1.7362586870369308, + "grad_norm": 0.5941165089607239, + "learning_rate": 0.0001, + "loss": 1.3014, + "step": 15115 + }, + { + "epoch": 1.7363735569467578, + "grad_norm": 0.5940799117088318, + "learning_rate": 0.0001, + "loss": 1.6121, + "step": 15116 + }, + { + "epoch": 1.7364884268565848, + "grad_norm": 0.629859209060669, + "learning_rate": 0.0001, + "loss": 1.6983, + "step": 15117 + }, + { + "epoch": 1.736603296766412, + "grad_norm": 0.6390464305877686, + "learning_rate": 0.0001, + "loss": 1.5392, + "step": 15118 + }, + { + "epoch": 1.7367181666762392, + "grad_norm": 0.6173864006996155, + "learning_rate": 0.0001, + "loss": 1.3426, + "step": 15119 + }, + { + "epoch": 1.7368330365860662, + "grad_norm": 0.5565081238746643, + "learning_rate": 0.0001, + "loss": 1.4207, + "step": 15120 + }, + { + "epoch": 1.7369479064958933, + "grad_norm": 0.6064382791519165, + "learning_rate": 0.0001, + "loss": 1.5435, + "step": 15121 + }, + { + "epoch": 1.7370627764057205, + "grad_norm": 0.6307348608970642, + "learning_rate": 0.0001, + "loss": 1.3269, + "step": 15122 + }, + { + "epoch": 1.7371776463155477, + "grad_norm": 0.63545823097229, + "learning_rate": 0.0001, + "loss": 1.5386, + "step": 15123 + }, + { + "epoch": 1.7372925162253747, + "grad_norm": 0.6427724957466125, + "learning_rate": 0.0001, + "loss": 1.4303, + "step": 15124 + }, + { + "epoch": 1.7374073861352017, + "grad_norm": 0.5886901021003723, + "learning_rate": 0.0001, + "loss": 1.2971, + "step": 15125 + }, + { + "epoch": 1.737522256045029, + "grad_norm": 0.6295827031135559, + "learning_rate": 0.0001, + "loss": 1.2244, + "step": 15126 + }, + { + "epoch": 1.7376371259548562, + "grad_norm": 0.5705087780952454, + "learning_rate": 0.0001, + "loss": 1.2789, + "step": 15127 + }, + { + "epoch": 1.7377519958646832, + "grad_norm": 0.6834404468536377, + "learning_rate": 0.0001, + "loss": 1.3771, + "step": 15128 + }, + { + "epoch": 1.7378668657745102, + "grad_norm": 0.6291454434394836, + "learning_rate": 0.0001, + "loss": 1.3769, + "step": 15129 + }, + { + "epoch": 1.7379817356843374, + "grad_norm": 0.7055854201316833, + "learning_rate": 0.0001, + "loss": 1.609, + "step": 15130 + }, + { + "epoch": 1.7380966055941647, + "grad_norm": 0.6519612669944763, + "learning_rate": 0.0001, + "loss": 1.3599, + "step": 15131 + }, + { + "epoch": 1.7382114755039917, + "grad_norm": 0.7939478754997253, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 15132 + }, + { + "epoch": 1.7383263454138187, + "grad_norm": 0.5996159315109253, + "learning_rate": 0.0001, + "loss": 1.4736, + "step": 15133 + }, + { + "epoch": 1.738441215323646, + "grad_norm": 0.6544761061668396, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 15134 + }, + { + "epoch": 1.7385560852334732, + "grad_norm": 0.5604496002197266, + "learning_rate": 0.0001, + "loss": 1.2495, + "step": 15135 + }, + { + "epoch": 1.7386709551433002, + "grad_norm": 0.6021955609321594, + "learning_rate": 0.0001, + "loss": 1.4273, + "step": 15136 + }, + { + "epoch": 1.7387858250531272, + "grad_norm": 0.6894492506980896, + "learning_rate": 0.0001, + "loss": 1.4778, + "step": 15137 + }, + { + "epoch": 1.7389006949629544, + "grad_norm": 0.5934672355651855, + "learning_rate": 0.0001, + "loss": 1.4288, + "step": 15138 + }, + { + "epoch": 1.7390155648727816, + "grad_norm": 0.6965713500976562, + "learning_rate": 0.0001, + "loss": 1.5802, + "step": 15139 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.6328989863395691, + "learning_rate": 0.0001, + "loss": 1.3815, + "step": 15140 + }, + { + "epoch": 1.7392453046924357, + "grad_norm": 0.6331512928009033, + "learning_rate": 0.0001, + "loss": 1.5845, + "step": 15141 + }, + { + "epoch": 1.7393601746022629, + "grad_norm": 0.7313364148139954, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 15142 + }, + { + "epoch": 1.7394750445120901, + "grad_norm": 0.6274045705795288, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 15143 + }, + { + "epoch": 1.7395899144219171, + "grad_norm": 0.6259190440177917, + "learning_rate": 0.0001, + "loss": 1.4399, + "step": 15144 + }, + { + "epoch": 1.7397047843317441, + "grad_norm": 0.6094817519187927, + "learning_rate": 0.0001, + "loss": 1.5018, + "step": 15145 + }, + { + "epoch": 1.7398196542415714, + "grad_norm": 0.6577267646789551, + "learning_rate": 0.0001, + "loss": 1.3699, + "step": 15146 + }, + { + "epoch": 1.7399345241513986, + "grad_norm": 0.6378180980682373, + "learning_rate": 0.0001, + "loss": 1.469, + "step": 15147 + }, + { + "epoch": 1.7400493940612256, + "grad_norm": 0.6114896535873413, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 15148 + }, + { + "epoch": 1.7401642639710528, + "grad_norm": 0.6004807353019714, + "learning_rate": 0.0001, + "loss": 1.4736, + "step": 15149 + }, + { + "epoch": 1.7402791338808798, + "grad_norm": 0.5777592658996582, + "learning_rate": 0.0001, + "loss": 1.463, + "step": 15150 + }, + { + "epoch": 1.740394003790707, + "grad_norm": 0.667898416519165, + "learning_rate": 0.0001, + "loss": 1.4367, + "step": 15151 + }, + { + "epoch": 1.7405088737005343, + "grad_norm": 0.6375964879989624, + "learning_rate": 0.0001, + "loss": 1.4712, + "step": 15152 + }, + { + "epoch": 1.7406237436103613, + "grad_norm": 0.6091217398643494, + "learning_rate": 0.0001, + "loss": 1.4241, + "step": 15153 + }, + { + "epoch": 1.7407386135201883, + "grad_norm": 0.6524693965911865, + "learning_rate": 0.0001, + "loss": 1.3071, + "step": 15154 + }, + { + "epoch": 1.7408534834300156, + "grad_norm": 0.6394875645637512, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 15155 + }, + { + "epoch": 1.7409683533398428, + "grad_norm": 0.6616272926330566, + "learning_rate": 0.0001, + "loss": 1.4982, + "step": 15156 + }, + { + "epoch": 1.7410832232496698, + "grad_norm": 0.6061643958091736, + "learning_rate": 0.0001, + "loss": 1.4134, + "step": 15157 + }, + { + "epoch": 1.7411980931594968, + "grad_norm": 0.5694348216056824, + "learning_rate": 0.0001, + "loss": 1.44, + "step": 15158 + }, + { + "epoch": 1.741312963069324, + "grad_norm": 0.651995062828064, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 15159 + }, + { + "epoch": 1.7414278329791513, + "grad_norm": 0.6059070229530334, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 15160 + }, + { + "epoch": 1.7415427028889783, + "grad_norm": 0.58353590965271, + "learning_rate": 0.0001, + "loss": 1.4756, + "step": 15161 + }, + { + "epoch": 1.7416575727988053, + "grad_norm": 0.5728119611740112, + "learning_rate": 0.0001, + "loss": 1.3305, + "step": 15162 + }, + { + "epoch": 1.7417724427086325, + "grad_norm": 0.5594245791435242, + "learning_rate": 0.0001, + "loss": 1.295, + "step": 15163 + }, + { + "epoch": 1.7418873126184597, + "grad_norm": 0.6906068325042725, + "learning_rate": 0.0001, + "loss": 1.5825, + "step": 15164 + }, + { + "epoch": 1.7420021825282868, + "grad_norm": 0.6946594715118408, + "learning_rate": 0.0001, + "loss": 1.5164, + "step": 15165 + }, + { + "epoch": 1.7421170524381138, + "grad_norm": 0.6139326095581055, + "learning_rate": 0.0001, + "loss": 1.519, + "step": 15166 + }, + { + "epoch": 1.742231922347941, + "grad_norm": 0.6352965235710144, + "learning_rate": 0.0001, + "loss": 1.4031, + "step": 15167 + }, + { + "epoch": 1.7423467922577682, + "grad_norm": 0.6800305843353271, + "learning_rate": 0.0001, + "loss": 1.5563, + "step": 15168 + }, + { + "epoch": 1.7424616621675952, + "grad_norm": 0.5998824834823608, + "learning_rate": 0.0001, + "loss": 1.3361, + "step": 15169 + }, + { + "epoch": 1.7425765320774222, + "grad_norm": 0.6264121532440186, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 15170 + }, + { + "epoch": 1.7426914019872495, + "grad_norm": 0.6085196733474731, + "learning_rate": 0.0001, + "loss": 1.3164, + "step": 15171 + }, + { + "epoch": 1.7428062718970767, + "grad_norm": 0.6375176310539246, + "learning_rate": 0.0001, + "loss": 1.4239, + "step": 15172 + }, + { + "epoch": 1.7429211418069037, + "grad_norm": 0.6847028732299805, + "learning_rate": 0.0001, + "loss": 1.5247, + "step": 15173 + }, + { + "epoch": 1.7430360117167307, + "grad_norm": 0.8053979873657227, + "learning_rate": 0.0001, + "loss": 1.5727, + "step": 15174 + }, + { + "epoch": 1.743150881626558, + "grad_norm": 0.6131575107574463, + "learning_rate": 0.0001, + "loss": 1.3573, + "step": 15175 + }, + { + "epoch": 1.7432657515363852, + "grad_norm": 0.570138692855835, + "learning_rate": 0.0001, + "loss": 1.2953, + "step": 15176 + }, + { + "epoch": 1.7433806214462122, + "grad_norm": 0.6050834059715271, + "learning_rate": 0.0001, + "loss": 1.2771, + "step": 15177 + }, + { + "epoch": 1.7434954913560392, + "grad_norm": 0.6091510653495789, + "learning_rate": 0.0001, + "loss": 1.4857, + "step": 15178 + }, + { + "epoch": 1.7436103612658664, + "grad_norm": 0.7901420593261719, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 15179 + }, + { + "epoch": 1.7437252311756937, + "grad_norm": 0.675471842288971, + "learning_rate": 0.0001, + "loss": 1.3616, + "step": 15180 + }, + { + "epoch": 1.7438401010855207, + "grad_norm": 0.6008798480033875, + "learning_rate": 0.0001, + "loss": 1.4139, + "step": 15181 + }, + { + "epoch": 1.7439549709953477, + "grad_norm": 0.6464613080024719, + "learning_rate": 0.0001, + "loss": 1.3417, + "step": 15182 + }, + { + "epoch": 1.744069840905175, + "grad_norm": 0.6285551190376282, + "learning_rate": 0.0001, + "loss": 1.5489, + "step": 15183 + }, + { + "epoch": 1.7441847108150021, + "grad_norm": 0.6213048100471497, + "learning_rate": 0.0001, + "loss": 1.3166, + "step": 15184 + }, + { + "epoch": 1.7442995807248292, + "grad_norm": 0.6194262504577637, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 15185 + }, + { + "epoch": 1.7444144506346562, + "grad_norm": 0.6530826687812805, + "learning_rate": 0.0001, + "loss": 1.563, + "step": 15186 + }, + { + "epoch": 1.7445293205444834, + "grad_norm": 0.6780787110328674, + "learning_rate": 0.0001, + "loss": 1.2973, + "step": 15187 + }, + { + "epoch": 1.7446441904543106, + "grad_norm": 0.7618767023086548, + "learning_rate": 0.0001, + "loss": 1.5906, + "step": 15188 + }, + { + "epoch": 1.7447590603641376, + "grad_norm": 0.6271486282348633, + "learning_rate": 0.0001, + "loss": 1.5238, + "step": 15189 + }, + { + "epoch": 1.7448739302739646, + "grad_norm": 0.5610907077789307, + "learning_rate": 0.0001, + "loss": 1.2431, + "step": 15190 + }, + { + "epoch": 1.7449888001837919, + "grad_norm": 0.6444764733314514, + "learning_rate": 0.0001, + "loss": 1.3007, + "step": 15191 + }, + { + "epoch": 1.745103670093619, + "grad_norm": 0.653785765171051, + "learning_rate": 0.0001, + "loss": 1.7039, + "step": 15192 + }, + { + "epoch": 1.7452185400034461, + "grad_norm": 0.6614614129066467, + "learning_rate": 0.0001, + "loss": 1.5011, + "step": 15193 + }, + { + "epoch": 1.7453334099132731, + "grad_norm": 0.6090264320373535, + "learning_rate": 0.0001, + "loss": 1.4303, + "step": 15194 + }, + { + "epoch": 1.7454482798231004, + "grad_norm": 0.5970935821533203, + "learning_rate": 0.0001, + "loss": 1.4341, + "step": 15195 + }, + { + "epoch": 1.7455631497329276, + "grad_norm": 0.6474558711051941, + "learning_rate": 0.0001, + "loss": 1.5577, + "step": 15196 + }, + { + "epoch": 1.7456780196427546, + "grad_norm": 0.6696446537971497, + "learning_rate": 0.0001, + "loss": 1.3981, + "step": 15197 + }, + { + "epoch": 1.7457928895525816, + "grad_norm": 0.6368044018745422, + "learning_rate": 0.0001, + "loss": 1.431, + "step": 15198 + }, + { + "epoch": 1.7459077594624088, + "grad_norm": 0.5889632105827332, + "learning_rate": 0.0001, + "loss": 1.3663, + "step": 15199 + }, + { + "epoch": 1.746022629372236, + "grad_norm": 0.6324240565299988, + "learning_rate": 0.0001, + "loss": 1.4025, + "step": 15200 + }, + { + "epoch": 1.746137499282063, + "grad_norm": 0.664838969707489, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 15201 + }, + { + "epoch": 1.74625236919189, + "grad_norm": 0.609592616558075, + "learning_rate": 0.0001, + "loss": 1.4405, + "step": 15202 + }, + { + "epoch": 1.7463672391017173, + "grad_norm": 0.64817214012146, + "learning_rate": 0.0001, + "loss": 1.5301, + "step": 15203 + }, + { + "epoch": 1.7464821090115445, + "grad_norm": 0.5915722846984863, + "learning_rate": 0.0001, + "loss": 1.5131, + "step": 15204 + }, + { + "epoch": 1.7465969789213716, + "grad_norm": 0.6245658993721008, + "learning_rate": 0.0001, + "loss": 1.4266, + "step": 15205 + }, + { + "epoch": 1.7467118488311986, + "grad_norm": 0.5552368760108948, + "learning_rate": 0.0001, + "loss": 1.3063, + "step": 15206 + }, + { + "epoch": 1.7468267187410258, + "grad_norm": 0.6671764254570007, + "learning_rate": 0.0001, + "loss": 1.455, + "step": 15207 + }, + { + "epoch": 1.746941588650853, + "grad_norm": 0.6359453201293945, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 15208 + }, + { + "epoch": 1.74705645856068, + "grad_norm": 0.6473332643508911, + "learning_rate": 0.0001, + "loss": 1.4903, + "step": 15209 + }, + { + "epoch": 1.747171328470507, + "grad_norm": 0.609463095664978, + "learning_rate": 0.0001, + "loss": 1.4419, + "step": 15210 + }, + { + "epoch": 1.7472861983803343, + "grad_norm": 0.7541261911392212, + "learning_rate": 0.0001, + "loss": 1.4196, + "step": 15211 + }, + { + "epoch": 1.7474010682901615, + "grad_norm": 0.6469170451164246, + "learning_rate": 0.0001, + "loss": 1.4974, + "step": 15212 + }, + { + "epoch": 1.7475159381999885, + "grad_norm": 0.6460253000259399, + "learning_rate": 0.0001, + "loss": 1.4009, + "step": 15213 + }, + { + "epoch": 1.7476308081098155, + "grad_norm": 0.6805127859115601, + "learning_rate": 0.0001, + "loss": 1.4109, + "step": 15214 + }, + { + "epoch": 1.7477456780196428, + "grad_norm": 0.6314157843589783, + "learning_rate": 0.0001, + "loss": 1.3916, + "step": 15215 + }, + { + "epoch": 1.74786054792947, + "grad_norm": 0.5894502997398376, + "learning_rate": 0.0001, + "loss": 1.2586, + "step": 15216 + }, + { + "epoch": 1.747975417839297, + "grad_norm": 0.6559074521064758, + "learning_rate": 0.0001, + "loss": 1.6868, + "step": 15217 + }, + { + "epoch": 1.748090287749124, + "grad_norm": 0.624843955039978, + "learning_rate": 0.0001, + "loss": 1.3526, + "step": 15218 + }, + { + "epoch": 1.7482051576589512, + "grad_norm": 0.6877935528755188, + "learning_rate": 0.0001, + "loss": 1.4833, + "step": 15219 + }, + { + "epoch": 1.7483200275687785, + "grad_norm": 0.6488064527511597, + "learning_rate": 0.0001, + "loss": 1.5458, + "step": 15220 + }, + { + "epoch": 1.7484348974786055, + "grad_norm": 0.7320221066474915, + "learning_rate": 0.0001, + "loss": 1.6404, + "step": 15221 + }, + { + "epoch": 1.7485497673884325, + "grad_norm": 0.6499512195587158, + "learning_rate": 0.0001, + "loss": 1.4216, + "step": 15222 + }, + { + "epoch": 1.7486646372982597, + "grad_norm": 0.6216921806335449, + "learning_rate": 0.0001, + "loss": 1.4012, + "step": 15223 + }, + { + "epoch": 1.748779507208087, + "grad_norm": 0.6483284831047058, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 15224 + }, + { + "epoch": 1.748894377117914, + "grad_norm": 0.6150051951408386, + "learning_rate": 0.0001, + "loss": 1.5839, + "step": 15225 + }, + { + "epoch": 1.749009247027741, + "grad_norm": 0.5937765836715698, + "learning_rate": 0.0001, + "loss": 1.4394, + "step": 15226 + }, + { + "epoch": 1.7491241169375682, + "grad_norm": 0.5825061202049255, + "learning_rate": 0.0001, + "loss": 1.3277, + "step": 15227 + }, + { + "epoch": 1.7492389868473954, + "grad_norm": 0.7533363103866577, + "learning_rate": 0.0001, + "loss": 1.7251, + "step": 15228 + }, + { + "epoch": 1.7493538567572224, + "grad_norm": 0.6395822167396545, + "learning_rate": 0.0001, + "loss": 1.444, + "step": 15229 + }, + { + "epoch": 1.7494687266670494, + "grad_norm": 0.5838786959648132, + "learning_rate": 0.0001, + "loss": 1.5254, + "step": 15230 + }, + { + "epoch": 1.7495835965768767, + "grad_norm": 0.6399153470993042, + "learning_rate": 0.0001, + "loss": 1.2255, + "step": 15231 + }, + { + "epoch": 1.749698466486704, + "grad_norm": 0.667523205280304, + "learning_rate": 0.0001, + "loss": 1.4569, + "step": 15232 + }, + { + "epoch": 1.749813336396531, + "grad_norm": 0.6431367993354797, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 15233 + }, + { + "epoch": 1.749928206306358, + "grad_norm": 0.6183831095695496, + "learning_rate": 0.0001, + "loss": 1.3978, + "step": 15234 + }, + { + "epoch": 1.7500430762161852, + "grad_norm": 0.655053436756134, + "learning_rate": 0.0001, + "loss": 1.4848, + "step": 15235 + }, + { + "epoch": 1.7501579461260124, + "grad_norm": 0.7972126603126526, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 15236 + }, + { + "epoch": 1.7502728160358394, + "grad_norm": 0.6024256944656372, + "learning_rate": 0.0001, + "loss": 1.4008, + "step": 15237 + }, + { + "epoch": 1.7503876859456664, + "grad_norm": 0.5974478125572205, + "learning_rate": 0.0001, + "loss": 1.453, + "step": 15238 + }, + { + "epoch": 1.7505025558554936, + "grad_norm": 0.7210254669189453, + "learning_rate": 0.0001, + "loss": 1.6979, + "step": 15239 + }, + { + "epoch": 1.7506174257653209, + "grad_norm": 0.6723774075508118, + "learning_rate": 0.0001, + "loss": 1.4211, + "step": 15240 + }, + { + "epoch": 1.7507322956751479, + "grad_norm": 0.6472156047821045, + "learning_rate": 0.0001, + "loss": 1.5018, + "step": 15241 + }, + { + "epoch": 1.7508471655849749, + "grad_norm": 0.5895910859107971, + "learning_rate": 0.0001, + "loss": 1.2851, + "step": 15242 + }, + { + "epoch": 1.7509620354948021, + "grad_norm": 0.6794853210449219, + "learning_rate": 0.0001, + "loss": 1.3592, + "step": 15243 + }, + { + "epoch": 1.7510769054046293, + "grad_norm": 0.6932422518730164, + "learning_rate": 0.0001, + "loss": 1.6878, + "step": 15244 + }, + { + "epoch": 1.7511917753144564, + "grad_norm": 0.6949918866157532, + "learning_rate": 0.0001, + "loss": 1.581, + "step": 15245 + }, + { + "epoch": 1.7513066452242834, + "grad_norm": 0.6456189751625061, + "learning_rate": 0.0001, + "loss": 1.4974, + "step": 15246 + }, + { + "epoch": 1.7514215151341106, + "grad_norm": 0.6524791121482849, + "learning_rate": 0.0001, + "loss": 1.5143, + "step": 15247 + }, + { + "epoch": 1.7515363850439378, + "grad_norm": 0.5790715217590332, + "learning_rate": 0.0001, + "loss": 1.5119, + "step": 15248 + }, + { + "epoch": 1.7516512549537648, + "grad_norm": 0.5909721255302429, + "learning_rate": 0.0001, + "loss": 1.3346, + "step": 15249 + }, + { + "epoch": 1.7517661248635918, + "grad_norm": 0.5983031988143921, + "learning_rate": 0.0001, + "loss": 1.3218, + "step": 15250 + }, + { + "epoch": 1.751880994773419, + "grad_norm": 0.5888960361480713, + "learning_rate": 0.0001, + "loss": 1.4113, + "step": 15251 + }, + { + "epoch": 1.7519958646832463, + "grad_norm": 0.6211053729057312, + "learning_rate": 0.0001, + "loss": 1.4408, + "step": 15252 + }, + { + "epoch": 1.7521107345930733, + "grad_norm": 0.6472274661064148, + "learning_rate": 0.0001, + "loss": 1.6397, + "step": 15253 + }, + { + "epoch": 1.7522256045029003, + "grad_norm": 0.5877479314804077, + "learning_rate": 0.0001, + "loss": 1.3772, + "step": 15254 + }, + { + "epoch": 1.7523404744127276, + "grad_norm": 0.6007200479507446, + "learning_rate": 0.0001, + "loss": 1.3258, + "step": 15255 + }, + { + "epoch": 1.7524553443225548, + "grad_norm": 0.6134718656539917, + "learning_rate": 0.0001, + "loss": 1.3475, + "step": 15256 + }, + { + "epoch": 1.7525702142323818, + "grad_norm": 0.6493116021156311, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 15257 + }, + { + "epoch": 1.7526850841422088, + "grad_norm": 0.6316721439361572, + "learning_rate": 0.0001, + "loss": 1.5339, + "step": 15258 + }, + { + "epoch": 1.752799954052036, + "grad_norm": 0.63750159740448, + "learning_rate": 0.0001, + "loss": 1.4438, + "step": 15259 + }, + { + "epoch": 1.7529148239618633, + "grad_norm": 0.6934375762939453, + "learning_rate": 0.0001, + "loss": 1.5165, + "step": 15260 + }, + { + "epoch": 1.7530296938716903, + "grad_norm": 0.6297123432159424, + "learning_rate": 0.0001, + "loss": 1.4865, + "step": 15261 + }, + { + "epoch": 1.7531445637815173, + "grad_norm": 0.582813560962677, + "learning_rate": 0.0001, + "loss": 1.1494, + "step": 15262 + }, + { + "epoch": 1.7532594336913445, + "grad_norm": 0.6479847431182861, + "learning_rate": 0.0001, + "loss": 1.3812, + "step": 15263 + }, + { + "epoch": 1.7533743036011717, + "grad_norm": 0.6570015549659729, + "learning_rate": 0.0001, + "loss": 1.5119, + "step": 15264 + }, + { + "epoch": 1.7534891735109988, + "grad_norm": 0.6873630285263062, + "learning_rate": 0.0001, + "loss": 1.2798, + "step": 15265 + }, + { + "epoch": 1.7536040434208258, + "grad_norm": 0.6358373165130615, + "learning_rate": 0.0001, + "loss": 1.4256, + "step": 15266 + }, + { + "epoch": 1.753718913330653, + "grad_norm": 0.5748741626739502, + "learning_rate": 0.0001, + "loss": 1.3127, + "step": 15267 + }, + { + "epoch": 1.7538337832404802, + "grad_norm": 0.6858731508255005, + "learning_rate": 0.0001, + "loss": 1.4542, + "step": 15268 + }, + { + "epoch": 1.7539486531503072, + "grad_norm": 0.5939129590988159, + "learning_rate": 0.0001, + "loss": 1.266, + "step": 15269 + }, + { + "epoch": 1.7540635230601342, + "grad_norm": 0.65522301197052, + "learning_rate": 0.0001, + "loss": 1.3433, + "step": 15270 + }, + { + "epoch": 1.7541783929699615, + "grad_norm": 0.6629993915557861, + "learning_rate": 0.0001, + "loss": 1.4215, + "step": 15271 + }, + { + "epoch": 1.7542932628797887, + "grad_norm": 0.6635041236877441, + "learning_rate": 0.0001, + "loss": 1.5512, + "step": 15272 + }, + { + "epoch": 1.7544081327896157, + "grad_norm": 0.6313446164131165, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 15273 + }, + { + "epoch": 1.7545230026994427, + "grad_norm": 0.6612875461578369, + "learning_rate": 0.0001, + "loss": 1.3453, + "step": 15274 + }, + { + "epoch": 1.75463787260927, + "grad_norm": 0.6233786940574646, + "learning_rate": 0.0001, + "loss": 1.4155, + "step": 15275 + }, + { + "epoch": 1.7547527425190972, + "grad_norm": 0.688138484954834, + "learning_rate": 0.0001, + "loss": 1.4463, + "step": 15276 + }, + { + "epoch": 1.7548676124289242, + "grad_norm": 0.5722640156745911, + "learning_rate": 0.0001, + "loss": 1.173, + "step": 15277 + }, + { + "epoch": 1.7549824823387512, + "grad_norm": 0.6150445342063904, + "learning_rate": 0.0001, + "loss": 1.4503, + "step": 15278 + }, + { + "epoch": 1.7550973522485784, + "grad_norm": 0.6374548673629761, + "learning_rate": 0.0001, + "loss": 1.4994, + "step": 15279 + }, + { + "epoch": 1.7552122221584057, + "grad_norm": 0.5925512313842773, + "learning_rate": 0.0001, + "loss": 1.3682, + "step": 15280 + }, + { + "epoch": 1.7553270920682327, + "grad_norm": 0.6822425723075867, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 15281 + }, + { + "epoch": 1.7554419619780597, + "grad_norm": 0.6651631593704224, + "learning_rate": 0.0001, + "loss": 1.7616, + "step": 15282 + }, + { + "epoch": 1.755556831887887, + "grad_norm": 0.6492800712585449, + "learning_rate": 0.0001, + "loss": 1.4807, + "step": 15283 + }, + { + "epoch": 1.7556717017977141, + "grad_norm": 0.5847377777099609, + "learning_rate": 0.0001, + "loss": 1.4534, + "step": 15284 + }, + { + "epoch": 1.7557865717075412, + "grad_norm": 0.6187418699264526, + "learning_rate": 0.0001, + "loss": 1.3685, + "step": 15285 + }, + { + "epoch": 1.7559014416173682, + "grad_norm": 0.5958352088928223, + "learning_rate": 0.0001, + "loss": 1.3842, + "step": 15286 + }, + { + "epoch": 1.7560163115271954, + "grad_norm": 0.6693896055221558, + "learning_rate": 0.0001, + "loss": 1.4296, + "step": 15287 + }, + { + "epoch": 1.7561311814370226, + "grad_norm": 0.6221320033073425, + "learning_rate": 0.0001, + "loss": 1.3786, + "step": 15288 + }, + { + "epoch": 1.7562460513468499, + "grad_norm": 0.6341426968574524, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 15289 + }, + { + "epoch": 1.7563609212566769, + "grad_norm": 0.5881104469299316, + "learning_rate": 0.0001, + "loss": 1.4077, + "step": 15290 + }, + { + "epoch": 1.7564757911665039, + "grad_norm": 0.6449071764945984, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 15291 + }, + { + "epoch": 1.756590661076331, + "grad_norm": 0.6435044407844543, + "learning_rate": 0.0001, + "loss": 1.486, + "step": 15292 + }, + { + "epoch": 1.7567055309861583, + "grad_norm": 0.5955443978309631, + "learning_rate": 0.0001, + "loss": 1.4916, + "step": 15293 + }, + { + "epoch": 1.7568204008959853, + "grad_norm": 0.6793741583824158, + "learning_rate": 0.0001, + "loss": 1.5893, + "step": 15294 + }, + { + "epoch": 1.7569352708058124, + "grad_norm": 0.6085398197174072, + "learning_rate": 0.0001, + "loss": 1.4329, + "step": 15295 + }, + { + "epoch": 1.7570501407156396, + "grad_norm": 0.6038925051689148, + "learning_rate": 0.0001, + "loss": 1.3851, + "step": 15296 + }, + { + "epoch": 1.7571650106254668, + "grad_norm": 0.6389901041984558, + "learning_rate": 0.0001, + "loss": 1.4429, + "step": 15297 + }, + { + "epoch": 1.7572798805352938, + "grad_norm": 0.5799313187599182, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 15298 + }, + { + "epoch": 1.7573947504451208, + "grad_norm": 0.6270938515663147, + "learning_rate": 0.0001, + "loss": 1.4155, + "step": 15299 + }, + { + "epoch": 1.757509620354948, + "grad_norm": 0.5713139176368713, + "learning_rate": 0.0001, + "loss": 1.4485, + "step": 15300 + }, + { + "epoch": 1.7576244902647753, + "grad_norm": 0.5795127153396606, + "learning_rate": 0.0001, + "loss": 1.2822, + "step": 15301 + }, + { + "epoch": 1.7577393601746023, + "grad_norm": 0.5726675391197205, + "learning_rate": 0.0001, + "loss": 1.0326, + "step": 15302 + }, + { + "epoch": 1.7578542300844293, + "grad_norm": 0.6082470417022705, + "learning_rate": 0.0001, + "loss": 1.4728, + "step": 15303 + }, + { + "epoch": 1.7579690999942565, + "grad_norm": 0.6625812649726868, + "learning_rate": 0.0001, + "loss": 1.4903, + "step": 15304 + }, + { + "epoch": 1.7580839699040838, + "grad_norm": 0.6669724583625793, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 15305 + }, + { + "epoch": 1.7581988398139108, + "grad_norm": 0.6124973893165588, + "learning_rate": 0.0001, + "loss": 1.5021, + "step": 15306 + }, + { + "epoch": 1.7583137097237378, + "grad_norm": 0.6731737852096558, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 15307 + }, + { + "epoch": 1.758428579633565, + "grad_norm": 0.597639799118042, + "learning_rate": 0.0001, + "loss": 1.2609, + "step": 15308 + }, + { + "epoch": 1.7585434495433923, + "grad_norm": 0.6299818158149719, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 15309 + }, + { + "epoch": 1.7586583194532193, + "grad_norm": 0.578934371471405, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 15310 + }, + { + "epoch": 1.7587731893630463, + "grad_norm": 0.6151703596115112, + "learning_rate": 0.0001, + "loss": 1.3717, + "step": 15311 + }, + { + "epoch": 1.7588880592728735, + "grad_norm": 0.6442793011665344, + "learning_rate": 0.0001, + "loss": 1.4778, + "step": 15312 + }, + { + "epoch": 1.7590029291827007, + "grad_norm": 0.6389909982681274, + "learning_rate": 0.0001, + "loss": 1.4665, + "step": 15313 + }, + { + "epoch": 1.7591177990925277, + "grad_norm": 0.5759516954421997, + "learning_rate": 0.0001, + "loss": 1.354, + "step": 15314 + }, + { + "epoch": 1.7592326690023548, + "grad_norm": 0.6577784419059753, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 15315 + }, + { + "epoch": 1.759347538912182, + "grad_norm": 0.6161843538284302, + "learning_rate": 0.0001, + "loss": 1.3574, + "step": 15316 + }, + { + "epoch": 1.7594624088220092, + "grad_norm": 0.6002083420753479, + "learning_rate": 0.0001, + "loss": 1.51, + "step": 15317 + }, + { + "epoch": 1.7595772787318362, + "grad_norm": 0.6231681704521179, + "learning_rate": 0.0001, + "loss": 1.2959, + "step": 15318 + }, + { + "epoch": 1.7596921486416632, + "grad_norm": 0.6722519397735596, + "learning_rate": 0.0001, + "loss": 1.3182, + "step": 15319 + }, + { + "epoch": 1.7598070185514905, + "grad_norm": 0.6488606929779053, + "learning_rate": 0.0001, + "loss": 1.249, + "step": 15320 + }, + { + "epoch": 1.7599218884613177, + "grad_norm": 0.6526792049407959, + "learning_rate": 0.0001, + "loss": 1.389, + "step": 15321 + }, + { + "epoch": 1.7600367583711447, + "grad_norm": 0.8293418884277344, + "learning_rate": 0.0001, + "loss": 1.5196, + "step": 15322 + }, + { + "epoch": 1.7601516282809717, + "grad_norm": 0.6284173130989075, + "learning_rate": 0.0001, + "loss": 1.4126, + "step": 15323 + }, + { + "epoch": 1.760266498190799, + "grad_norm": 0.6085017919540405, + "learning_rate": 0.0001, + "loss": 1.5337, + "step": 15324 + }, + { + "epoch": 1.7603813681006262, + "grad_norm": 0.6490882039070129, + "learning_rate": 0.0001, + "loss": 1.5043, + "step": 15325 + }, + { + "epoch": 1.7604962380104532, + "grad_norm": 0.6944969296455383, + "learning_rate": 0.0001, + "loss": 1.4436, + "step": 15326 + }, + { + "epoch": 1.7606111079202802, + "grad_norm": 0.5754250288009644, + "learning_rate": 0.0001, + "loss": 1.4301, + "step": 15327 + }, + { + "epoch": 1.7607259778301074, + "grad_norm": 0.5691632628440857, + "learning_rate": 0.0001, + "loss": 1.2594, + "step": 15328 + }, + { + "epoch": 1.7608408477399347, + "grad_norm": 0.6530894041061401, + "learning_rate": 0.0001, + "loss": 1.7345, + "step": 15329 + }, + { + "epoch": 1.7609557176497617, + "grad_norm": 0.5945919156074524, + "learning_rate": 0.0001, + "loss": 1.5005, + "step": 15330 + }, + { + "epoch": 1.7610705875595887, + "grad_norm": 0.6224496364593506, + "learning_rate": 0.0001, + "loss": 1.371, + "step": 15331 + }, + { + "epoch": 1.761185457469416, + "grad_norm": 0.5837387442588806, + "learning_rate": 0.0001, + "loss": 1.2299, + "step": 15332 + }, + { + "epoch": 1.7613003273792431, + "grad_norm": 0.6480647325515747, + "learning_rate": 0.0001, + "loss": 1.4484, + "step": 15333 + }, + { + "epoch": 1.7614151972890701, + "grad_norm": 0.655102014541626, + "learning_rate": 0.0001, + "loss": 1.6464, + "step": 15334 + }, + { + "epoch": 1.7615300671988972, + "grad_norm": 0.624885618686676, + "learning_rate": 0.0001, + "loss": 1.504, + "step": 15335 + }, + { + "epoch": 1.7616449371087244, + "grad_norm": 0.7076594233512878, + "learning_rate": 0.0001, + "loss": 1.2598, + "step": 15336 + }, + { + "epoch": 1.7617598070185516, + "grad_norm": 0.6876631379127502, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 15337 + }, + { + "epoch": 1.7618746769283786, + "grad_norm": 0.642560601234436, + "learning_rate": 0.0001, + "loss": 1.474, + "step": 15338 + }, + { + "epoch": 1.7619895468382056, + "grad_norm": 0.6945380568504333, + "learning_rate": 0.0001, + "loss": 1.4465, + "step": 15339 + }, + { + "epoch": 1.7621044167480329, + "grad_norm": 0.6342966556549072, + "learning_rate": 0.0001, + "loss": 1.5273, + "step": 15340 + }, + { + "epoch": 1.76221928665786, + "grad_norm": 0.6334778666496277, + "learning_rate": 0.0001, + "loss": 1.4535, + "step": 15341 + }, + { + "epoch": 1.762334156567687, + "grad_norm": 0.5924774408340454, + "learning_rate": 0.0001, + "loss": 1.4886, + "step": 15342 + }, + { + "epoch": 1.7624490264775141, + "grad_norm": 0.6480969190597534, + "learning_rate": 0.0001, + "loss": 1.3745, + "step": 15343 + }, + { + "epoch": 1.7625638963873413, + "grad_norm": 0.635136067867279, + "learning_rate": 0.0001, + "loss": 1.4212, + "step": 15344 + }, + { + "epoch": 1.7626787662971686, + "grad_norm": 0.6227051615715027, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 15345 + }, + { + "epoch": 1.7627936362069956, + "grad_norm": 0.5813191533088684, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 15346 + }, + { + "epoch": 1.7629085061168226, + "grad_norm": 0.7114011645317078, + "learning_rate": 0.0001, + "loss": 1.4692, + "step": 15347 + }, + { + "epoch": 1.7630233760266498, + "grad_norm": 0.6176375150680542, + "learning_rate": 0.0001, + "loss": 1.4317, + "step": 15348 + }, + { + "epoch": 1.763138245936477, + "grad_norm": 0.7010447382926941, + "learning_rate": 0.0001, + "loss": 1.6766, + "step": 15349 + }, + { + "epoch": 1.763253115846304, + "grad_norm": 0.62742680311203, + "learning_rate": 0.0001, + "loss": 1.3897, + "step": 15350 + }, + { + "epoch": 1.763367985756131, + "grad_norm": 0.6509758830070496, + "learning_rate": 0.0001, + "loss": 1.5885, + "step": 15351 + }, + { + "epoch": 1.7634828556659583, + "grad_norm": 0.6291651725769043, + "learning_rate": 0.0001, + "loss": 1.1953, + "step": 15352 + }, + { + "epoch": 1.7635977255757855, + "grad_norm": 0.6242035031318665, + "learning_rate": 0.0001, + "loss": 1.4124, + "step": 15353 + }, + { + "epoch": 1.7637125954856125, + "grad_norm": 0.5866750478744507, + "learning_rate": 0.0001, + "loss": 1.5409, + "step": 15354 + }, + { + "epoch": 1.7638274653954396, + "grad_norm": 0.6242676973342896, + "learning_rate": 0.0001, + "loss": 1.4799, + "step": 15355 + }, + { + "epoch": 1.7639423353052668, + "grad_norm": 0.6475937366485596, + "learning_rate": 0.0001, + "loss": 1.3933, + "step": 15356 + }, + { + "epoch": 1.764057205215094, + "grad_norm": 0.6821995377540588, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 15357 + }, + { + "epoch": 1.764172075124921, + "grad_norm": 0.6403782963752747, + "learning_rate": 0.0001, + "loss": 1.4328, + "step": 15358 + }, + { + "epoch": 1.764286945034748, + "grad_norm": 0.5568740963935852, + "learning_rate": 0.0001, + "loss": 1.3501, + "step": 15359 + }, + { + "epoch": 1.7644018149445753, + "grad_norm": 0.7482900619506836, + "learning_rate": 0.0001, + "loss": 1.7237, + "step": 15360 + }, + { + "epoch": 1.7645166848544025, + "grad_norm": 0.7028294801712036, + "learning_rate": 0.0001, + "loss": 1.4546, + "step": 15361 + }, + { + "epoch": 1.7646315547642295, + "grad_norm": 0.6432153582572937, + "learning_rate": 0.0001, + "loss": 1.4991, + "step": 15362 + }, + { + "epoch": 1.7647464246740565, + "grad_norm": 0.5955196619033813, + "learning_rate": 0.0001, + "loss": 1.4428, + "step": 15363 + }, + { + "epoch": 1.7648612945838837, + "grad_norm": 0.6378939747810364, + "learning_rate": 0.0001, + "loss": 1.5622, + "step": 15364 + }, + { + "epoch": 1.764976164493711, + "grad_norm": 0.6353983283042908, + "learning_rate": 0.0001, + "loss": 1.4364, + "step": 15365 + }, + { + "epoch": 1.765091034403538, + "grad_norm": 0.6289456486701965, + "learning_rate": 0.0001, + "loss": 1.5238, + "step": 15366 + }, + { + "epoch": 1.765205904313365, + "grad_norm": 0.6229878664016724, + "learning_rate": 0.0001, + "loss": 1.2323, + "step": 15367 + }, + { + "epoch": 1.7653207742231922, + "grad_norm": 0.6140499114990234, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 15368 + }, + { + "epoch": 1.7654356441330195, + "grad_norm": 0.6184989213943481, + "learning_rate": 0.0001, + "loss": 1.3525, + "step": 15369 + }, + { + "epoch": 1.7655505140428465, + "grad_norm": 0.6055330634117126, + "learning_rate": 0.0001, + "loss": 1.2379, + "step": 15370 + }, + { + "epoch": 1.7656653839526735, + "grad_norm": 0.6226028203964233, + "learning_rate": 0.0001, + "loss": 1.511, + "step": 15371 + }, + { + "epoch": 1.7657802538625007, + "grad_norm": 0.6356282830238342, + "learning_rate": 0.0001, + "loss": 1.281, + "step": 15372 + }, + { + "epoch": 1.765895123772328, + "grad_norm": 0.5963003635406494, + "learning_rate": 0.0001, + "loss": 1.3596, + "step": 15373 + }, + { + "epoch": 1.766009993682155, + "grad_norm": 0.598003625869751, + "learning_rate": 0.0001, + "loss": 1.4494, + "step": 15374 + }, + { + "epoch": 1.766124863591982, + "grad_norm": 0.6972695589065552, + "learning_rate": 0.0001, + "loss": 1.2956, + "step": 15375 + }, + { + "epoch": 1.7662397335018092, + "grad_norm": 0.6252386569976807, + "learning_rate": 0.0001, + "loss": 1.4696, + "step": 15376 + }, + { + "epoch": 1.7663546034116364, + "grad_norm": 0.7368474006652832, + "learning_rate": 0.0001, + "loss": 1.6705, + "step": 15377 + }, + { + "epoch": 1.7664694733214634, + "grad_norm": 0.7342138886451721, + "learning_rate": 0.0001, + "loss": 1.4238, + "step": 15378 + }, + { + "epoch": 1.7665843432312904, + "grad_norm": 0.5836483836174011, + "learning_rate": 0.0001, + "loss": 1.3567, + "step": 15379 + }, + { + "epoch": 1.7666992131411177, + "grad_norm": 0.6157330274581909, + "learning_rate": 0.0001, + "loss": 1.4845, + "step": 15380 + }, + { + "epoch": 1.766814083050945, + "grad_norm": 0.630933940410614, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 15381 + }, + { + "epoch": 1.766928952960772, + "grad_norm": 0.6310692429542542, + "learning_rate": 0.0001, + "loss": 1.4669, + "step": 15382 + }, + { + "epoch": 1.767043822870599, + "grad_norm": 0.6385482549667358, + "learning_rate": 0.0001, + "loss": 1.2872, + "step": 15383 + }, + { + "epoch": 1.7671586927804261, + "grad_norm": 0.6223406195640564, + "learning_rate": 0.0001, + "loss": 1.2195, + "step": 15384 + }, + { + "epoch": 1.7672735626902534, + "grad_norm": 0.5795210003852844, + "learning_rate": 0.0001, + "loss": 1.3071, + "step": 15385 + }, + { + "epoch": 1.7673884326000804, + "grad_norm": 0.6244056224822998, + "learning_rate": 0.0001, + "loss": 1.3275, + "step": 15386 + }, + { + "epoch": 1.7675033025099074, + "grad_norm": 0.7372899055480957, + "learning_rate": 0.0001, + "loss": 1.6583, + "step": 15387 + }, + { + "epoch": 1.7676181724197346, + "grad_norm": 0.6304470300674438, + "learning_rate": 0.0001, + "loss": 1.3243, + "step": 15388 + }, + { + "epoch": 1.7677330423295619, + "grad_norm": 0.6503053307533264, + "learning_rate": 0.0001, + "loss": 1.3891, + "step": 15389 + }, + { + "epoch": 1.7678479122393889, + "grad_norm": 0.6643422245979309, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 15390 + }, + { + "epoch": 1.7679627821492159, + "grad_norm": 0.6259305477142334, + "learning_rate": 0.0001, + "loss": 1.3976, + "step": 15391 + }, + { + "epoch": 1.768077652059043, + "grad_norm": 0.6734678745269775, + "learning_rate": 0.0001, + "loss": 1.4114, + "step": 15392 + }, + { + "epoch": 1.7681925219688703, + "grad_norm": 0.6242077946662903, + "learning_rate": 0.0001, + "loss": 1.4955, + "step": 15393 + }, + { + "epoch": 1.7683073918786973, + "grad_norm": 0.6714116930961609, + "learning_rate": 0.0001, + "loss": 1.3507, + "step": 15394 + }, + { + "epoch": 1.7684222617885244, + "grad_norm": 0.5949355363845825, + "learning_rate": 0.0001, + "loss": 1.4515, + "step": 15395 + }, + { + "epoch": 1.7685371316983516, + "grad_norm": 0.5709308981895447, + "learning_rate": 0.0001, + "loss": 1.3621, + "step": 15396 + }, + { + "epoch": 1.7686520016081788, + "grad_norm": 0.6260430812835693, + "learning_rate": 0.0001, + "loss": 1.4699, + "step": 15397 + }, + { + "epoch": 1.7687668715180058, + "grad_norm": 0.6341519355773926, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 15398 + }, + { + "epoch": 1.7688817414278328, + "grad_norm": 0.6372659802436829, + "learning_rate": 0.0001, + "loss": 1.4443, + "step": 15399 + }, + { + "epoch": 1.76899661133766, + "grad_norm": 0.5935454964637756, + "learning_rate": 0.0001, + "loss": 1.2865, + "step": 15400 + }, + { + "epoch": 1.7691114812474873, + "grad_norm": 0.6392391324043274, + "learning_rate": 0.0001, + "loss": 1.5012, + "step": 15401 + }, + { + "epoch": 1.7692263511573143, + "grad_norm": 0.6767097115516663, + "learning_rate": 0.0001, + "loss": 1.5747, + "step": 15402 + }, + { + "epoch": 1.7693412210671413, + "grad_norm": 0.575113832950592, + "learning_rate": 0.0001, + "loss": 1.4338, + "step": 15403 + }, + { + "epoch": 1.7694560909769685, + "grad_norm": 0.6148155927658081, + "learning_rate": 0.0001, + "loss": 1.4446, + "step": 15404 + }, + { + "epoch": 1.7695709608867958, + "grad_norm": 0.6530346274375916, + "learning_rate": 0.0001, + "loss": 1.5008, + "step": 15405 + }, + { + "epoch": 1.7696858307966228, + "grad_norm": 0.624439001083374, + "learning_rate": 0.0001, + "loss": 1.4565, + "step": 15406 + }, + { + "epoch": 1.7698007007064498, + "grad_norm": 0.633822500705719, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 15407 + }, + { + "epoch": 1.769915570616277, + "grad_norm": 0.6398473381996155, + "learning_rate": 0.0001, + "loss": 1.3496, + "step": 15408 + }, + { + "epoch": 1.7700304405261043, + "grad_norm": 0.6082612872123718, + "learning_rate": 0.0001, + "loss": 1.2266, + "step": 15409 + }, + { + "epoch": 1.7701453104359313, + "grad_norm": 0.6723073124885559, + "learning_rate": 0.0001, + "loss": 1.3755, + "step": 15410 + }, + { + "epoch": 1.7702601803457583, + "grad_norm": 0.6552845239639282, + "learning_rate": 0.0001, + "loss": 1.5456, + "step": 15411 + }, + { + "epoch": 1.7703750502555855, + "grad_norm": 0.682786226272583, + "learning_rate": 0.0001, + "loss": 1.4209, + "step": 15412 + }, + { + "epoch": 1.7704899201654127, + "grad_norm": 0.6715489625930786, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 15413 + }, + { + "epoch": 1.7706047900752397, + "grad_norm": 0.6266937851905823, + "learning_rate": 0.0001, + "loss": 1.4389, + "step": 15414 + }, + { + "epoch": 1.7707196599850668, + "grad_norm": 0.6482228636741638, + "learning_rate": 0.0001, + "loss": 1.4746, + "step": 15415 + }, + { + "epoch": 1.770834529894894, + "grad_norm": 0.6691463589668274, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 15416 + }, + { + "epoch": 1.7709493998047212, + "grad_norm": 0.6793537735939026, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 15417 + }, + { + "epoch": 1.7710642697145482, + "grad_norm": 0.604053258895874, + "learning_rate": 0.0001, + "loss": 1.4415, + "step": 15418 + }, + { + "epoch": 1.7711791396243752, + "grad_norm": 0.7533746361732483, + "learning_rate": 0.0001, + "loss": 1.586, + "step": 15419 + }, + { + "epoch": 1.7712940095342025, + "grad_norm": 0.6014366745948792, + "learning_rate": 0.0001, + "loss": 1.37, + "step": 15420 + }, + { + "epoch": 1.7714088794440297, + "grad_norm": 0.6073466539382935, + "learning_rate": 0.0001, + "loss": 1.4946, + "step": 15421 + }, + { + "epoch": 1.7715237493538567, + "grad_norm": 0.6218917965888977, + "learning_rate": 0.0001, + "loss": 1.5332, + "step": 15422 + }, + { + "epoch": 1.7716386192636837, + "grad_norm": 0.6328446865081787, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 15423 + }, + { + "epoch": 1.771753489173511, + "grad_norm": 0.5783063173294067, + "learning_rate": 0.0001, + "loss": 1.2782, + "step": 15424 + }, + { + "epoch": 1.7718683590833382, + "grad_norm": 0.5905358791351318, + "learning_rate": 0.0001, + "loss": 1.528, + "step": 15425 + }, + { + "epoch": 1.7719832289931654, + "grad_norm": 0.6171038150787354, + "learning_rate": 0.0001, + "loss": 1.4173, + "step": 15426 + }, + { + "epoch": 1.7720980989029924, + "grad_norm": 0.5971978902816772, + "learning_rate": 0.0001, + "loss": 1.401, + "step": 15427 + }, + { + "epoch": 1.7722129688128194, + "grad_norm": 0.6217857003211975, + "learning_rate": 0.0001, + "loss": 1.4225, + "step": 15428 + }, + { + "epoch": 1.7723278387226467, + "grad_norm": 0.6341108679771423, + "learning_rate": 0.0001, + "loss": 1.3223, + "step": 15429 + }, + { + "epoch": 1.7724427086324739, + "grad_norm": 0.5544351935386658, + "learning_rate": 0.0001, + "loss": 1.4596, + "step": 15430 + }, + { + "epoch": 1.772557578542301, + "grad_norm": 0.6070437431335449, + "learning_rate": 0.0001, + "loss": 1.4921, + "step": 15431 + }, + { + "epoch": 1.772672448452128, + "grad_norm": 0.679899275302887, + "learning_rate": 0.0001, + "loss": 1.5184, + "step": 15432 + }, + { + "epoch": 1.7727873183619551, + "grad_norm": 0.6571292877197266, + "learning_rate": 0.0001, + "loss": 1.3402, + "step": 15433 + }, + { + "epoch": 1.7729021882717824, + "grad_norm": 0.6752861738204956, + "learning_rate": 0.0001, + "loss": 1.4057, + "step": 15434 + }, + { + "epoch": 1.7730170581816094, + "grad_norm": 0.6188561916351318, + "learning_rate": 0.0001, + "loss": 1.4021, + "step": 15435 + }, + { + "epoch": 1.7731319280914364, + "grad_norm": 0.645605206489563, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 15436 + }, + { + "epoch": 1.7732467980012636, + "grad_norm": 0.6331257820129395, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 15437 + }, + { + "epoch": 1.7733616679110908, + "grad_norm": 0.6232604384422302, + "learning_rate": 0.0001, + "loss": 1.6332, + "step": 15438 + }, + { + "epoch": 1.7734765378209179, + "grad_norm": 0.6299999952316284, + "learning_rate": 0.0001, + "loss": 1.4565, + "step": 15439 + }, + { + "epoch": 1.7735914077307449, + "grad_norm": 0.6201632618904114, + "learning_rate": 0.0001, + "loss": 1.4274, + "step": 15440 + }, + { + "epoch": 1.773706277640572, + "grad_norm": 0.6291703581809998, + "learning_rate": 0.0001, + "loss": 1.5339, + "step": 15441 + }, + { + "epoch": 1.7738211475503993, + "grad_norm": 0.6148052215576172, + "learning_rate": 0.0001, + "loss": 1.4622, + "step": 15442 + }, + { + "epoch": 1.7739360174602263, + "grad_norm": 0.6915506720542908, + "learning_rate": 0.0001, + "loss": 1.4758, + "step": 15443 + }, + { + "epoch": 1.7740508873700533, + "grad_norm": 0.6690899133682251, + "learning_rate": 0.0001, + "loss": 1.3532, + "step": 15444 + }, + { + "epoch": 1.7741657572798806, + "grad_norm": 0.6253287196159363, + "learning_rate": 0.0001, + "loss": 1.5029, + "step": 15445 + }, + { + "epoch": 1.7742806271897078, + "grad_norm": 0.66769939661026, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 15446 + }, + { + "epoch": 1.7743954970995348, + "grad_norm": 0.6366974115371704, + "learning_rate": 0.0001, + "loss": 1.4671, + "step": 15447 + }, + { + "epoch": 1.7745103670093618, + "grad_norm": 0.6763840913772583, + "learning_rate": 0.0001, + "loss": 1.575, + "step": 15448 + }, + { + "epoch": 1.774625236919189, + "grad_norm": 0.6336223483085632, + "learning_rate": 0.0001, + "loss": 1.5122, + "step": 15449 + }, + { + "epoch": 1.7747401068290163, + "grad_norm": 0.6298586130142212, + "learning_rate": 0.0001, + "loss": 1.5169, + "step": 15450 + }, + { + "epoch": 1.7748549767388433, + "grad_norm": 0.6808599829673767, + "learning_rate": 0.0001, + "loss": 1.4164, + "step": 15451 + }, + { + "epoch": 1.7749698466486703, + "grad_norm": 0.638803243637085, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 15452 + }, + { + "epoch": 1.7750847165584975, + "grad_norm": 0.6190632581710815, + "learning_rate": 0.0001, + "loss": 1.3554, + "step": 15453 + }, + { + "epoch": 1.7751995864683248, + "grad_norm": 0.6603758335113525, + "learning_rate": 0.0001, + "loss": 1.5177, + "step": 15454 + }, + { + "epoch": 1.7753144563781518, + "grad_norm": 0.6279389262199402, + "learning_rate": 0.0001, + "loss": 1.5894, + "step": 15455 + }, + { + "epoch": 1.7754293262879788, + "grad_norm": 0.6289408206939697, + "learning_rate": 0.0001, + "loss": 1.5424, + "step": 15456 + }, + { + "epoch": 1.775544196197806, + "grad_norm": 0.5870251059532166, + "learning_rate": 0.0001, + "loss": 1.3102, + "step": 15457 + }, + { + "epoch": 1.7756590661076332, + "grad_norm": 0.6192272305488586, + "learning_rate": 0.0001, + "loss": 1.3527, + "step": 15458 + }, + { + "epoch": 1.7757739360174603, + "grad_norm": 0.626162588596344, + "learning_rate": 0.0001, + "loss": 1.4285, + "step": 15459 + }, + { + "epoch": 1.7758888059272873, + "grad_norm": 0.6371445059776306, + "learning_rate": 0.0001, + "loss": 1.4435, + "step": 15460 + }, + { + "epoch": 1.7760036758371145, + "grad_norm": 0.7196854948997498, + "learning_rate": 0.0001, + "loss": 1.5672, + "step": 15461 + }, + { + "epoch": 1.7761185457469417, + "grad_norm": 0.6295327544212341, + "learning_rate": 0.0001, + "loss": 1.4168, + "step": 15462 + }, + { + "epoch": 1.7762334156567687, + "grad_norm": 0.6012369394302368, + "learning_rate": 0.0001, + "loss": 1.1423, + "step": 15463 + }, + { + "epoch": 1.7763482855665957, + "grad_norm": 0.6109943985939026, + "learning_rate": 0.0001, + "loss": 1.7749, + "step": 15464 + }, + { + "epoch": 1.776463155476423, + "grad_norm": 0.6386968493461609, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 15465 + }, + { + "epoch": 1.7765780253862502, + "grad_norm": 0.6312034130096436, + "learning_rate": 0.0001, + "loss": 1.3813, + "step": 15466 + }, + { + "epoch": 1.7766928952960772, + "grad_norm": 0.6818085312843323, + "learning_rate": 0.0001, + "loss": 1.5537, + "step": 15467 + }, + { + "epoch": 1.7768077652059042, + "grad_norm": 0.616159975528717, + "learning_rate": 0.0001, + "loss": 1.3302, + "step": 15468 + }, + { + "epoch": 1.7769226351157315, + "grad_norm": 0.6063972115516663, + "learning_rate": 0.0001, + "loss": 1.6571, + "step": 15469 + }, + { + "epoch": 1.7770375050255587, + "grad_norm": 0.6804771423339844, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 15470 + }, + { + "epoch": 1.7771523749353857, + "grad_norm": 0.6419492959976196, + "learning_rate": 0.0001, + "loss": 1.6362, + "step": 15471 + }, + { + "epoch": 1.7772672448452127, + "grad_norm": 0.6010777354240417, + "learning_rate": 0.0001, + "loss": 1.419, + "step": 15472 + }, + { + "epoch": 1.77738211475504, + "grad_norm": 0.6770652532577515, + "learning_rate": 0.0001, + "loss": 1.523, + "step": 15473 + }, + { + "epoch": 1.7774969846648672, + "grad_norm": 0.7224645614624023, + "learning_rate": 0.0001, + "loss": 1.4757, + "step": 15474 + }, + { + "epoch": 1.7776118545746942, + "grad_norm": 0.6119104623794556, + "learning_rate": 0.0001, + "loss": 1.3286, + "step": 15475 + }, + { + "epoch": 1.7777267244845212, + "grad_norm": 0.6447218060493469, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 15476 + }, + { + "epoch": 1.7778415943943484, + "grad_norm": 0.6411008238792419, + "learning_rate": 0.0001, + "loss": 1.5644, + "step": 15477 + }, + { + "epoch": 1.7779564643041756, + "grad_norm": 0.6330268383026123, + "learning_rate": 0.0001, + "loss": 1.5924, + "step": 15478 + }, + { + "epoch": 1.7780713342140027, + "grad_norm": 0.5914243459701538, + "learning_rate": 0.0001, + "loss": 1.5313, + "step": 15479 + }, + { + "epoch": 1.7781862041238297, + "grad_norm": 0.6504284739494324, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 15480 + }, + { + "epoch": 1.778301074033657, + "grad_norm": 0.6057482361793518, + "learning_rate": 0.0001, + "loss": 1.3601, + "step": 15481 + }, + { + "epoch": 1.7784159439434841, + "grad_norm": 0.6217951774597168, + "learning_rate": 0.0001, + "loss": 1.5181, + "step": 15482 + }, + { + "epoch": 1.7785308138533111, + "grad_norm": 0.6592935919761658, + "learning_rate": 0.0001, + "loss": 1.5075, + "step": 15483 + }, + { + "epoch": 1.7786456837631381, + "grad_norm": 0.6222397089004517, + "learning_rate": 0.0001, + "loss": 1.5401, + "step": 15484 + }, + { + "epoch": 1.7787605536729654, + "grad_norm": 0.6175488233566284, + "learning_rate": 0.0001, + "loss": 1.4577, + "step": 15485 + }, + { + "epoch": 1.7788754235827926, + "grad_norm": 0.6524683833122253, + "learning_rate": 0.0001, + "loss": 1.45, + "step": 15486 + }, + { + "epoch": 1.7789902934926196, + "grad_norm": 0.6026191711425781, + "learning_rate": 0.0001, + "loss": 1.3723, + "step": 15487 + }, + { + "epoch": 1.7791051634024466, + "grad_norm": 0.6724210977554321, + "learning_rate": 0.0001, + "loss": 1.3315, + "step": 15488 + }, + { + "epoch": 1.7792200333122739, + "grad_norm": 0.738828718662262, + "learning_rate": 0.0001, + "loss": 1.6497, + "step": 15489 + }, + { + "epoch": 1.779334903222101, + "grad_norm": 0.7495474219322205, + "learning_rate": 0.0001, + "loss": 1.5393, + "step": 15490 + }, + { + "epoch": 1.779449773131928, + "grad_norm": 0.5927098989486694, + "learning_rate": 0.0001, + "loss": 1.4034, + "step": 15491 + }, + { + "epoch": 1.779564643041755, + "grad_norm": 0.6002185940742493, + "learning_rate": 0.0001, + "loss": 1.4098, + "step": 15492 + }, + { + "epoch": 1.7796795129515823, + "grad_norm": 0.6237373948097229, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 15493 + }, + { + "epoch": 1.7797943828614096, + "grad_norm": 0.606011152267456, + "learning_rate": 0.0001, + "loss": 1.4736, + "step": 15494 + }, + { + "epoch": 1.7799092527712366, + "grad_norm": 0.6501420736312866, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 15495 + }, + { + "epoch": 1.7800241226810636, + "grad_norm": 0.6354816555976868, + "learning_rate": 0.0001, + "loss": 1.4597, + "step": 15496 + }, + { + "epoch": 1.7801389925908908, + "grad_norm": 0.5931168794631958, + "learning_rate": 0.0001, + "loss": 1.4077, + "step": 15497 + }, + { + "epoch": 1.780253862500718, + "grad_norm": 0.626844048500061, + "learning_rate": 0.0001, + "loss": 1.5079, + "step": 15498 + }, + { + "epoch": 1.780368732410545, + "grad_norm": 0.6105011105537415, + "learning_rate": 0.0001, + "loss": 1.2362, + "step": 15499 + }, + { + "epoch": 1.780483602320372, + "grad_norm": 0.6420881152153015, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 15500 + }, + { + "epoch": 1.7805984722301993, + "grad_norm": 0.6629682779312134, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 15501 + }, + { + "epoch": 1.7807133421400265, + "grad_norm": 0.5871655941009521, + "learning_rate": 0.0001, + "loss": 1.3112, + "step": 15502 + }, + { + "epoch": 1.7808282120498535, + "grad_norm": 0.6966809034347534, + "learning_rate": 0.0001, + "loss": 1.5694, + "step": 15503 + }, + { + "epoch": 1.7809430819596805, + "grad_norm": 0.6312387585639954, + "learning_rate": 0.0001, + "loss": 1.5059, + "step": 15504 + }, + { + "epoch": 1.7810579518695078, + "grad_norm": 0.6314889192581177, + "learning_rate": 0.0001, + "loss": 1.452, + "step": 15505 + }, + { + "epoch": 1.781172821779335, + "grad_norm": 0.657773494720459, + "learning_rate": 0.0001, + "loss": 1.2146, + "step": 15506 + }, + { + "epoch": 1.781287691689162, + "grad_norm": 0.6220430731773376, + "learning_rate": 0.0001, + "loss": 1.3241, + "step": 15507 + }, + { + "epoch": 1.781402561598989, + "grad_norm": 0.7031171321868896, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 15508 + }, + { + "epoch": 1.7815174315088163, + "grad_norm": 0.6239733099937439, + "learning_rate": 0.0001, + "loss": 1.481, + "step": 15509 + }, + { + "epoch": 1.7816323014186435, + "grad_norm": 0.5512365698814392, + "learning_rate": 0.0001, + "loss": 1.1257, + "step": 15510 + }, + { + "epoch": 1.7817471713284705, + "grad_norm": 0.6748344302177429, + "learning_rate": 0.0001, + "loss": 1.5039, + "step": 15511 + }, + { + "epoch": 1.7818620412382975, + "grad_norm": 0.6460611820220947, + "learning_rate": 0.0001, + "loss": 1.3985, + "step": 15512 + }, + { + "epoch": 1.7819769111481247, + "grad_norm": 0.6497007608413696, + "learning_rate": 0.0001, + "loss": 1.5167, + "step": 15513 + }, + { + "epoch": 1.782091781057952, + "grad_norm": 0.6263741850852966, + "learning_rate": 0.0001, + "loss": 1.4769, + "step": 15514 + }, + { + "epoch": 1.782206650967779, + "grad_norm": 0.6193414330482483, + "learning_rate": 0.0001, + "loss": 1.4357, + "step": 15515 + }, + { + "epoch": 1.782321520877606, + "grad_norm": 0.6849316358566284, + "learning_rate": 0.0001, + "loss": 1.3156, + "step": 15516 + }, + { + "epoch": 1.7824363907874332, + "grad_norm": 0.5991883873939514, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 15517 + }, + { + "epoch": 1.7825512606972604, + "grad_norm": 0.608816921710968, + "learning_rate": 0.0001, + "loss": 1.4711, + "step": 15518 + }, + { + "epoch": 1.7826661306070875, + "grad_norm": 0.6820626258850098, + "learning_rate": 0.0001, + "loss": 1.4933, + "step": 15519 + }, + { + "epoch": 1.7827810005169145, + "grad_norm": 0.6117185354232788, + "learning_rate": 0.0001, + "loss": 1.4344, + "step": 15520 + }, + { + "epoch": 1.7828958704267417, + "grad_norm": 0.5957820415496826, + "learning_rate": 0.0001, + "loss": 1.5231, + "step": 15521 + }, + { + "epoch": 1.783010740336569, + "grad_norm": 0.6155253648757935, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 15522 + }, + { + "epoch": 1.783125610246396, + "grad_norm": 0.5900723934173584, + "learning_rate": 0.0001, + "loss": 1.4767, + "step": 15523 + }, + { + "epoch": 1.783240480156223, + "grad_norm": 0.6127877831459045, + "learning_rate": 0.0001, + "loss": 1.485, + "step": 15524 + }, + { + "epoch": 1.7833553500660502, + "grad_norm": 0.6130275130271912, + "learning_rate": 0.0001, + "loss": 1.2872, + "step": 15525 + }, + { + "epoch": 1.7834702199758774, + "grad_norm": 0.6365898251533508, + "learning_rate": 0.0001, + "loss": 1.5839, + "step": 15526 + }, + { + "epoch": 1.7835850898857044, + "grad_norm": 0.6927840113639832, + "learning_rate": 0.0001, + "loss": 1.4844, + "step": 15527 + }, + { + "epoch": 1.7836999597955314, + "grad_norm": 0.6556539535522461, + "learning_rate": 0.0001, + "loss": 1.489, + "step": 15528 + }, + { + "epoch": 1.7838148297053587, + "grad_norm": 0.6017742156982422, + "learning_rate": 0.0001, + "loss": 1.3716, + "step": 15529 + }, + { + "epoch": 1.7839296996151859, + "grad_norm": 0.6629507541656494, + "learning_rate": 0.0001, + "loss": 1.5471, + "step": 15530 + }, + { + "epoch": 1.784044569525013, + "grad_norm": 0.6213143467903137, + "learning_rate": 0.0001, + "loss": 1.6124, + "step": 15531 + }, + { + "epoch": 1.78415943943484, + "grad_norm": 0.6131148338317871, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 15532 + }, + { + "epoch": 1.7842743093446671, + "grad_norm": 0.6498699188232422, + "learning_rate": 0.0001, + "loss": 1.5769, + "step": 15533 + }, + { + "epoch": 1.7843891792544944, + "grad_norm": 0.6236017346382141, + "learning_rate": 0.0001, + "loss": 1.5537, + "step": 15534 + }, + { + "epoch": 1.7845040491643214, + "grad_norm": 0.6245088577270508, + "learning_rate": 0.0001, + "loss": 1.4866, + "step": 15535 + }, + { + "epoch": 1.7846189190741484, + "grad_norm": 0.6300262212753296, + "learning_rate": 0.0001, + "loss": 1.5246, + "step": 15536 + }, + { + "epoch": 1.7847337889839756, + "grad_norm": 0.6133743524551392, + "learning_rate": 0.0001, + "loss": 1.339, + "step": 15537 + }, + { + "epoch": 1.7848486588938028, + "grad_norm": 0.5978740453720093, + "learning_rate": 0.0001, + "loss": 1.38, + "step": 15538 + }, + { + "epoch": 1.7849635288036299, + "grad_norm": 0.583224892616272, + "learning_rate": 0.0001, + "loss": 1.4004, + "step": 15539 + }, + { + "epoch": 1.7850783987134569, + "grad_norm": 0.6395849585533142, + "learning_rate": 0.0001, + "loss": 1.2943, + "step": 15540 + }, + { + "epoch": 1.785193268623284, + "grad_norm": 0.5944806933403015, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 15541 + }, + { + "epoch": 1.7853081385331113, + "grad_norm": 0.5802217125892639, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 15542 + }, + { + "epoch": 1.7854230084429383, + "grad_norm": 0.6466507911682129, + "learning_rate": 0.0001, + "loss": 1.5448, + "step": 15543 + }, + { + "epoch": 1.7855378783527653, + "grad_norm": 0.6032712459564209, + "learning_rate": 0.0001, + "loss": 1.3153, + "step": 15544 + }, + { + "epoch": 1.7856527482625926, + "grad_norm": 0.6779415011405945, + "learning_rate": 0.0001, + "loss": 1.3325, + "step": 15545 + }, + { + "epoch": 1.7857676181724198, + "grad_norm": 0.6788262128829956, + "learning_rate": 0.0001, + "loss": 1.4666, + "step": 15546 + }, + { + "epoch": 1.7858824880822468, + "grad_norm": 0.6456349492073059, + "learning_rate": 0.0001, + "loss": 1.5217, + "step": 15547 + }, + { + "epoch": 1.7859973579920738, + "grad_norm": 0.6493527889251709, + "learning_rate": 0.0001, + "loss": 1.5493, + "step": 15548 + }, + { + "epoch": 1.786112227901901, + "grad_norm": 0.5926679968833923, + "learning_rate": 0.0001, + "loss": 1.2383, + "step": 15549 + }, + { + "epoch": 1.7862270978117283, + "grad_norm": 0.6278696656227112, + "learning_rate": 0.0001, + "loss": 1.4126, + "step": 15550 + }, + { + "epoch": 1.7863419677215553, + "grad_norm": 0.6778453588485718, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 15551 + }, + { + "epoch": 1.7864568376313823, + "grad_norm": 0.6517847180366516, + "learning_rate": 0.0001, + "loss": 1.576, + "step": 15552 + }, + { + "epoch": 1.7865717075412095, + "grad_norm": 0.6085006594657898, + "learning_rate": 0.0001, + "loss": 1.3612, + "step": 15553 + }, + { + "epoch": 1.7866865774510368, + "grad_norm": 0.6282052397727966, + "learning_rate": 0.0001, + "loss": 1.5781, + "step": 15554 + }, + { + "epoch": 1.7868014473608638, + "grad_norm": 0.6484823226928711, + "learning_rate": 0.0001, + "loss": 1.6014, + "step": 15555 + }, + { + "epoch": 1.7869163172706908, + "grad_norm": 0.6144720315933228, + "learning_rate": 0.0001, + "loss": 1.4621, + "step": 15556 + }, + { + "epoch": 1.787031187180518, + "grad_norm": 0.6251539587974548, + "learning_rate": 0.0001, + "loss": 1.5167, + "step": 15557 + }, + { + "epoch": 1.7871460570903452, + "grad_norm": 0.7098304033279419, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 15558 + }, + { + "epoch": 1.7872609270001723, + "grad_norm": 0.6878634095191956, + "learning_rate": 0.0001, + "loss": 1.5715, + "step": 15559 + }, + { + "epoch": 1.7873757969099993, + "grad_norm": 0.6232366561889648, + "learning_rate": 0.0001, + "loss": 1.4746, + "step": 15560 + }, + { + "epoch": 1.7874906668198265, + "grad_norm": 0.6680793166160583, + "learning_rate": 0.0001, + "loss": 1.4533, + "step": 15561 + }, + { + "epoch": 1.7876055367296537, + "grad_norm": 0.5885732173919678, + "learning_rate": 0.0001, + "loss": 1.3305, + "step": 15562 + }, + { + "epoch": 1.787720406639481, + "grad_norm": 0.7211229801177979, + "learning_rate": 0.0001, + "loss": 1.6991, + "step": 15563 + }, + { + "epoch": 1.787835276549308, + "grad_norm": 0.6472933888435364, + "learning_rate": 0.0001, + "loss": 1.5143, + "step": 15564 + }, + { + "epoch": 1.787950146459135, + "grad_norm": 0.6679675579071045, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 15565 + }, + { + "epoch": 1.7880650163689622, + "grad_norm": 0.6244038939476013, + "learning_rate": 0.0001, + "loss": 1.4162, + "step": 15566 + }, + { + "epoch": 1.7881798862787894, + "grad_norm": 0.6286173462867737, + "learning_rate": 0.0001, + "loss": 1.2787, + "step": 15567 + }, + { + "epoch": 1.7882947561886164, + "grad_norm": 0.6652150750160217, + "learning_rate": 0.0001, + "loss": 1.3333, + "step": 15568 + }, + { + "epoch": 1.7884096260984435, + "grad_norm": 0.5869207382202148, + "learning_rate": 0.0001, + "loss": 1.3693, + "step": 15569 + }, + { + "epoch": 1.7885244960082707, + "grad_norm": 0.6084496974945068, + "learning_rate": 0.0001, + "loss": 1.419, + "step": 15570 + }, + { + "epoch": 1.788639365918098, + "grad_norm": 0.6759893894195557, + "learning_rate": 0.0001, + "loss": 1.2883, + "step": 15571 + }, + { + "epoch": 1.788754235827925, + "grad_norm": 0.5952495336532593, + "learning_rate": 0.0001, + "loss": 1.4366, + "step": 15572 + }, + { + "epoch": 1.788869105737752, + "grad_norm": 0.5997838377952576, + "learning_rate": 0.0001, + "loss": 1.4331, + "step": 15573 + }, + { + "epoch": 1.7889839756475792, + "grad_norm": 0.6798581480979919, + "learning_rate": 0.0001, + "loss": 1.6547, + "step": 15574 + }, + { + "epoch": 1.7890988455574064, + "grad_norm": 0.6956671476364136, + "learning_rate": 0.0001, + "loss": 1.7073, + "step": 15575 + }, + { + "epoch": 1.7892137154672334, + "grad_norm": 0.642201840877533, + "learning_rate": 0.0001, + "loss": 1.3273, + "step": 15576 + }, + { + "epoch": 1.7893285853770604, + "grad_norm": 0.6423424482345581, + "learning_rate": 0.0001, + "loss": 1.4838, + "step": 15577 + }, + { + "epoch": 1.7894434552868876, + "grad_norm": 0.6451569199562073, + "learning_rate": 0.0001, + "loss": 1.4567, + "step": 15578 + }, + { + "epoch": 1.7895583251967149, + "grad_norm": 0.623618483543396, + "learning_rate": 0.0001, + "loss": 1.3361, + "step": 15579 + }, + { + "epoch": 1.7896731951065419, + "grad_norm": 0.6370881795883179, + "learning_rate": 0.0001, + "loss": 1.4506, + "step": 15580 + }, + { + "epoch": 1.789788065016369, + "grad_norm": 0.6714301705360413, + "learning_rate": 0.0001, + "loss": 1.48, + "step": 15581 + }, + { + "epoch": 1.7899029349261961, + "grad_norm": 0.7274426817893982, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 15582 + }, + { + "epoch": 1.7900178048360234, + "grad_norm": 0.7094860672950745, + "learning_rate": 0.0001, + "loss": 1.5049, + "step": 15583 + }, + { + "epoch": 1.7901326747458504, + "grad_norm": 0.6510686278343201, + "learning_rate": 0.0001, + "loss": 1.1147, + "step": 15584 + }, + { + "epoch": 1.7902475446556774, + "grad_norm": 0.6724952459335327, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 15585 + }, + { + "epoch": 1.7903624145655046, + "grad_norm": 0.6730976700782776, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 15586 + }, + { + "epoch": 1.7904772844753318, + "grad_norm": 0.5927944779396057, + "learning_rate": 0.0001, + "loss": 1.269, + "step": 15587 + }, + { + "epoch": 1.7905921543851588, + "grad_norm": 0.6580035090446472, + "learning_rate": 0.0001, + "loss": 1.4699, + "step": 15588 + }, + { + "epoch": 1.7907070242949858, + "grad_norm": 0.5773756504058838, + "learning_rate": 0.0001, + "loss": 1.4074, + "step": 15589 + }, + { + "epoch": 1.790821894204813, + "grad_norm": 0.660759449005127, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 15590 + }, + { + "epoch": 1.7909367641146403, + "grad_norm": 0.7201364040374756, + "learning_rate": 0.0001, + "loss": 1.7218, + "step": 15591 + }, + { + "epoch": 1.7910516340244673, + "grad_norm": 0.6264855861663818, + "learning_rate": 0.0001, + "loss": 1.5182, + "step": 15592 + }, + { + "epoch": 1.7911665039342943, + "grad_norm": 0.6170802712440491, + "learning_rate": 0.0001, + "loss": 1.3783, + "step": 15593 + }, + { + "epoch": 1.7912813738441216, + "grad_norm": 0.6500568389892578, + "learning_rate": 0.0001, + "loss": 1.449, + "step": 15594 + }, + { + "epoch": 1.7913962437539488, + "grad_norm": 0.6140516996383667, + "learning_rate": 0.0001, + "loss": 1.4694, + "step": 15595 + }, + { + "epoch": 1.7915111136637758, + "grad_norm": 0.5915245413780212, + "learning_rate": 0.0001, + "loss": 1.3078, + "step": 15596 + }, + { + "epoch": 1.7916259835736028, + "grad_norm": 0.5879004001617432, + "learning_rate": 0.0001, + "loss": 1.3216, + "step": 15597 + }, + { + "epoch": 1.79174085348343, + "grad_norm": 0.6277405619621277, + "learning_rate": 0.0001, + "loss": 1.4248, + "step": 15598 + }, + { + "epoch": 1.7918557233932573, + "grad_norm": 0.5921669006347656, + "learning_rate": 0.0001, + "loss": 1.3723, + "step": 15599 + }, + { + "epoch": 1.7919705933030843, + "grad_norm": 0.5975223183631897, + "learning_rate": 0.0001, + "loss": 1.4798, + "step": 15600 + }, + { + "epoch": 1.7920854632129113, + "grad_norm": 0.6387137770652771, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 15601 + }, + { + "epoch": 1.7922003331227385, + "grad_norm": 0.5730849504470825, + "learning_rate": 0.0001, + "loss": 1.3881, + "step": 15602 + }, + { + "epoch": 1.7923152030325658, + "grad_norm": 0.5885117650032043, + "learning_rate": 0.0001, + "loss": 1.5356, + "step": 15603 + }, + { + "epoch": 1.7924300729423928, + "grad_norm": 0.6096714735031128, + "learning_rate": 0.0001, + "loss": 1.4063, + "step": 15604 + }, + { + "epoch": 1.7925449428522198, + "grad_norm": 0.6324706077575684, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 15605 + }, + { + "epoch": 1.792659812762047, + "grad_norm": 0.6507951617240906, + "learning_rate": 0.0001, + "loss": 1.4797, + "step": 15606 + }, + { + "epoch": 1.7927746826718742, + "grad_norm": 0.6201066970825195, + "learning_rate": 0.0001, + "loss": 1.5646, + "step": 15607 + }, + { + "epoch": 1.7928895525817012, + "grad_norm": 0.6061670780181885, + "learning_rate": 0.0001, + "loss": 1.455, + "step": 15608 + }, + { + "epoch": 1.7930044224915282, + "grad_norm": 0.6483253240585327, + "learning_rate": 0.0001, + "loss": 1.3321, + "step": 15609 + }, + { + "epoch": 1.7931192924013555, + "grad_norm": 0.608324408531189, + "learning_rate": 0.0001, + "loss": 1.5719, + "step": 15610 + }, + { + "epoch": 1.7932341623111827, + "grad_norm": 0.5885353684425354, + "learning_rate": 0.0001, + "loss": 1.4886, + "step": 15611 + }, + { + "epoch": 1.7933490322210097, + "grad_norm": 0.612770676612854, + "learning_rate": 0.0001, + "loss": 1.4332, + "step": 15612 + }, + { + "epoch": 1.7934639021308367, + "grad_norm": 0.656052827835083, + "learning_rate": 0.0001, + "loss": 1.3348, + "step": 15613 + }, + { + "epoch": 1.793578772040664, + "grad_norm": 0.6733161211013794, + "learning_rate": 0.0001, + "loss": 1.5747, + "step": 15614 + }, + { + "epoch": 1.7936936419504912, + "grad_norm": 0.6594151258468628, + "learning_rate": 0.0001, + "loss": 1.5171, + "step": 15615 + }, + { + "epoch": 1.7938085118603182, + "grad_norm": 0.6126337051391602, + "learning_rate": 0.0001, + "loss": 1.3746, + "step": 15616 + }, + { + "epoch": 1.7939233817701452, + "grad_norm": 0.613390326499939, + "learning_rate": 0.0001, + "loss": 1.4174, + "step": 15617 + }, + { + "epoch": 1.7940382516799724, + "grad_norm": 0.6970869898796082, + "learning_rate": 0.0001, + "loss": 1.4908, + "step": 15618 + }, + { + "epoch": 1.7941531215897997, + "grad_norm": 0.5497614145278931, + "learning_rate": 0.0001, + "loss": 1.3542, + "step": 15619 + }, + { + "epoch": 1.7942679914996267, + "grad_norm": 0.5766857862472534, + "learning_rate": 0.0001, + "loss": 1.5123, + "step": 15620 + }, + { + "epoch": 1.7943828614094537, + "grad_norm": 0.6197019219398499, + "learning_rate": 0.0001, + "loss": 1.2267, + "step": 15621 + }, + { + "epoch": 1.794497731319281, + "grad_norm": 0.6978227496147156, + "learning_rate": 0.0001, + "loss": 1.3925, + "step": 15622 + }, + { + "epoch": 1.7946126012291082, + "grad_norm": 0.6291922330856323, + "learning_rate": 0.0001, + "loss": 1.3681, + "step": 15623 + }, + { + "epoch": 1.7947274711389352, + "grad_norm": 0.6435182690620422, + "learning_rate": 0.0001, + "loss": 1.5943, + "step": 15624 + }, + { + "epoch": 1.7948423410487622, + "grad_norm": 0.6218845248222351, + "learning_rate": 0.0001, + "loss": 1.4538, + "step": 15625 + }, + { + "epoch": 1.7949572109585894, + "grad_norm": 0.5924429297447205, + "learning_rate": 0.0001, + "loss": 1.4747, + "step": 15626 + }, + { + "epoch": 1.7950720808684166, + "grad_norm": 0.632346510887146, + "learning_rate": 0.0001, + "loss": 1.5101, + "step": 15627 + }, + { + "epoch": 1.7951869507782436, + "grad_norm": 0.5771130323410034, + "learning_rate": 0.0001, + "loss": 1.4203, + "step": 15628 + }, + { + "epoch": 1.7953018206880706, + "grad_norm": 0.5830177068710327, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 15629 + }, + { + "epoch": 1.7954166905978979, + "grad_norm": 0.685839831829071, + "learning_rate": 0.0001, + "loss": 1.3029, + "step": 15630 + }, + { + "epoch": 1.795531560507725, + "grad_norm": 0.5754895806312561, + "learning_rate": 0.0001, + "loss": 1.4049, + "step": 15631 + }, + { + "epoch": 1.7956464304175521, + "grad_norm": 0.6010079979896545, + "learning_rate": 0.0001, + "loss": 1.3411, + "step": 15632 + }, + { + "epoch": 1.7957613003273791, + "grad_norm": 0.6301531791687012, + "learning_rate": 0.0001, + "loss": 1.3216, + "step": 15633 + }, + { + "epoch": 1.7958761702372064, + "grad_norm": 0.5766407251358032, + "learning_rate": 0.0001, + "loss": 1.3345, + "step": 15634 + }, + { + "epoch": 1.7959910401470336, + "grad_norm": 0.6998764872550964, + "learning_rate": 0.0001, + "loss": 1.7205, + "step": 15635 + }, + { + "epoch": 1.7961059100568606, + "grad_norm": 0.6410335898399353, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 15636 + }, + { + "epoch": 1.7962207799666876, + "grad_norm": 0.5890425443649292, + "learning_rate": 0.0001, + "loss": 1.4822, + "step": 15637 + }, + { + "epoch": 1.7963356498765148, + "grad_norm": 0.717514157295227, + "learning_rate": 0.0001, + "loss": 1.4209, + "step": 15638 + }, + { + "epoch": 1.796450519786342, + "grad_norm": 0.6638051271438599, + "learning_rate": 0.0001, + "loss": 1.6087, + "step": 15639 + }, + { + "epoch": 1.796565389696169, + "grad_norm": 0.6315420269966125, + "learning_rate": 0.0001, + "loss": 1.3122, + "step": 15640 + }, + { + "epoch": 1.796680259605996, + "grad_norm": 0.6300930380821228, + "learning_rate": 0.0001, + "loss": 1.4313, + "step": 15641 + }, + { + "epoch": 1.7967951295158233, + "grad_norm": 0.6064184904098511, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 15642 + }, + { + "epoch": 1.7969099994256506, + "grad_norm": 0.5937897562980652, + "learning_rate": 0.0001, + "loss": 1.44, + "step": 15643 + }, + { + "epoch": 1.7970248693354776, + "grad_norm": 0.5919926166534424, + "learning_rate": 0.0001, + "loss": 1.4487, + "step": 15644 + }, + { + "epoch": 1.7971397392453046, + "grad_norm": 0.6598802208900452, + "learning_rate": 0.0001, + "loss": 1.3712, + "step": 15645 + }, + { + "epoch": 1.7972546091551318, + "grad_norm": 0.6512289047241211, + "learning_rate": 0.0001, + "loss": 1.2151, + "step": 15646 + }, + { + "epoch": 1.797369479064959, + "grad_norm": 0.6600479483604431, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 15647 + }, + { + "epoch": 1.797484348974786, + "grad_norm": 0.6048285365104675, + "learning_rate": 0.0001, + "loss": 1.4262, + "step": 15648 + }, + { + "epoch": 1.797599218884613, + "grad_norm": 0.6369686126708984, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 15649 + }, + { + "epoch": 1.7977140887944403, + "grad_norm": 0.6395129561424255, + "learning_rate": 0.0001, + "loss": 1.4658, + "step": 15650 + }, + { + "epoch": 1.7978289587042675, + "grad_norm": 0.6294239163398743, + "learning_rate": 0.0001, + "loss": 1.3939, + "step": 15651 + }, + { + "epoch": 1.7979438286140945, + "grad_norm": 0.6349182724952698, + "learning_rate": 0.0001, + "loss": 1.4647, + "step": 15652 + }, + { + "epoch": 1.7980586985239215, + "grad_norm": 0.7371272444725037, + "learning_rate": 0.0001, + "loss": 1.5782, + "step": 15653 + }, + { + "epoch": 1.7981735684337488, + "grad_norm": 0.724577784538269, + "learning_rate": 0.0001, + "loss": 1.3706, + "step": 15654 + }, + { + "epoch": 1.798288438343576, + "grad_norm": 0.6599369645118713, + "learning_rate": 0.0001, + "loss": 1.4384, + "step": 15655 + }, + { + "epoch": 1.798403308253403, + "grad_norm": 0.6410521864891052, + "learning_rate": 0.0001, + "loss": 1.4447, + "step": 15656 + }, + { + "epoch": 1.79851817816323, + "grad_norm": 0.6453815698623657, + "learning_rate": 0.0001, + "loss": 1.2947, + "step": 15657 + }, + { + "epoch": 1.7986330480730572, + "grad_norm": 0.5987499356269836, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 15658 + }, + { + "epoch": 1.7987479179828845, + "grad_norm": 0.6878007054328918, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 15659 + }, + { + "epoch": 1.7988627878927115, + "grad_norm": 0.6186679005622864, + "learning_rate": 0.0001, + "loss": 1.4337, + "step": 15660 + }, + { + "epoch": 1.7989776578025385, + "grad_norm": 0.6373911499977112, + "learning_rate": 0.0001, + "loss": 1.3869, + "step": 15661 + }, + { + "epoch": 1.7990925277123657, + "grad_norm": 0.626069962978363, + "learning_rate": 0.0001, + "loss": 1.4759, + "step": 15662 + }, + { + "epoch": 1.799207397622193, + "grad_norm": 0.6917441487312317, + "learning_rate": 0.0001, + "loss": 1.6685, + "step": 15663 + }, + { + "epoch": 1.79932226753202, + "grad_norm": 0.6298771500587463, + "learning_rate": 0.0001, + "loss": 1.4287, + "step": 15664 + }, + { + "epoch": 1.799437137441847, + "grad_norm": 0.6612205505371094, + "learning_rate": 0.0001, + "loss": 1.4862, + "step": 15665 + }, + { + "epoch": 1.7995520073516742, + "grad_norm": 0.6308835744857788, + "learning_rate": 0.0001, + "loss": 1.3969, + "step": 15666 + }, + { + "epoch": 1.7996668772615014, + "grad_norm": 0.6707907319068909, + "learning_rate": 0.0001, + "loss": 1.5247, + "step": 15667 + }, + { + "epoch": 1.7997817471713284, + "grad_norm": 0.5699948072433472, + "learning_rate": 0.0001, + "loss": 1.2569, + "step": 15668 + }, + { + "epoch": 1.7998966170811554, + "grad_norm": 0.6121656894683838, + "learning_rate": 0.0001, + "loss": 1.5105, + "step": 15669 + }, + { + "epoch": 1.8000114869909827, + "grad_norm": 0.640649676322937, + "learning_rate": 0.0001, + "loss": 1.63, + "step": 15670 + }, + { + "epoch": 1.80012635690081, + "grad_norm": 0.595537543296814, + "learning_rate": 0.0001, + "loss": 1.5195, + "step": 15671 + }, + { + "epoch": 1.800241226810637, + "grad_norm": 0.6392516493797302, + "learning_rate": 0.0001, + "loss": 1.5501, + "step": 15672 + }, + { + "epoch": 1.800356096720464, + "grad_norm": 0.6374161243438721, + "learning_rate": 0.0001, + "loss": 1.3964, + "step": 15673 + }, + { + "epoch": 1.8004709666302912, + "grad_norm": 0.6486839652061462, + "learning_rate": 0.0001, + "loss": 1.6107, + "step": 15674 + }, + { + "epoch": 1.8005858365401184, + "grad_norm": 0.7054831981658936, + "learning_rate": 0.0001, + "loss": 1.5579, + "step": 15675 + }, + { + "epoch": 1.8007007064499454, + "grad_norm": 0.6817604899406433, + "learning_rate": 0.0001, + "loss": 1.5474, + "step": 15676 + }, + { + "epoch": 1.8008155763597724, + "grad_norm": 0.6403077840805054, + "learning_rate": 0.0001, + "loss": 1.4614, + "step": 15677 + }, + { + "epoch": 1.8009304462695996, + "grad_norm": 0.6834919452667236, + "learning_rate": 0.0001, + "loss": 1.3062, + "step": 15678 + }, + { + "epoch": 1.8010453161794269, + "grad_norm": 0.6138955354690552, + "learning_rate": 0.0001, + "loss": 1.235, + "step": 15679 + }, + { + "epoch": 1.8011601860892539, + "grad_norm": 0.6135299205780029, + "learning_rate": 0.0001, + "loss": 1.3112, + "step": 15680 + }, + { + "epoch": 1.8012750559990809, + "grad_norm": 0.6524873971939087, + "learning_rate": 0.0001, + "loss": 1.3524, + "step": 15681 + }, + { + "epoch": 1.8013899259089081, + "grad_norm": 0.635270357131958, + "learning_rate": 0.0001, + "loss": 1.1356, + "step": 15682 + }, + { + "epoch": 1.8015047958187353, + "grad_norm": 0.6525683403015137, + "learning_rate": 0.0001, + "loss": 1.3779, + "step": 15683 + }, + { + "epoch": 1.8016196657285624, + "grad_norm": 0.6439868807792664, + "learning_rate": 0.0001, + "loss": 1.4131, + "step": 15684 + }, + { + "epoch": 1.8017345356383894, + "grad_norm": 0.6394991278648376, + "learning_rate": 0.0001, + "loss": 1.5015, + "step": 15685 + }, + { + "epoch": 1.8018494055482166, + "grad_norm": 0.6692410707473755, + "learning_rate": 0.0001, + "loss": 1.477, + "step": 15686 + }, + { + "epoch": 1.8019642754580438, + "grad_norm": 0.5906667709350586, + "learning_rate": 0.0001, + "loss": 1.2574, + "step": 15687 + }, + { + "epoch": 1.8020791453678708, + "grad_norm": 0.6098186373710632, + "learning_rate": 0.0001, + "loss": 1.3739, + "step": 15688 + }, + { + "epoch": 1.8021940152776978, + "grad_norm": 0.636909544467926, + "learning_rate": 0.0001, + "loss": 1.4122, + "step": 15689 + }, + { + "epoch": 1.802308885187525, + "grad_norm": 0.606322169303894, + "learning_rate": 0.0001, + "loss": 1.4376, + "step": 15690 + }, + { + "epoch": 1.8024237550973523, + "grad_norm": 0.6577963829040527, + "learning_rate": 0.0001, + "loss": 1.3516, + "step": 15691 + }, + { + "epoch": 1.8025386250071793, + "grad_norm": 0.6309029459953308, + "learning_rate": 0.0001, + "loss": 1.516, + "step": 15692 + }, + { + "epoch": 1.8026534949170063, + "grad_norm": 0.585844099521637, + "learning_rate": 0.0001, + "loss": 1.4676, + "step": 15693 + }, + { + "epoch": 1.8027683648268336, + "grad_norm": 0.6345848441123962, + "learning_rate": 0.0001, + "loss": 1.3576, + "step": 15694 + }, + { + "epoch": 1.8028832347366608, + "grad_norm": 0.6085816621780396, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 15695 + }, + { + "epoch": 1.8029981046464878, + "grad_norm": 0.5922706127166748, + "learning_rate": 0.0001, + "loss": 1.4294, + "step": 15696 + }, + { + "epoch": 1.8031129745563148, + "grad_norm": 0.7549859285354614, + "learning_rate": 0.0001, + "loss": 1.6641, + "step": 15697 + }, + { + "epoch": 1.803227844466142, + "grad_norm": 0.5932414531707764, + "learning_rate": 0.0001, + "loss": 1.3871, + "step": 15698 + }, + { + "epoch": 1.8033427143759693, + "grad_norm": 0.6508196592330933, + "learning_rate": 0.0001, + "loss": 1.4249, + "step": 15699 + }, + { + "epoch": 1.8034575842857965, + "grad_norm": 0.6208270788192749, + "learning_rate": 0.0001, + "loss": 1.5313, + "step": 15700 + }, + { + "epoch": 1.8035724541956235, + "grad_norm": 0.5751149654388428, + "learning_rate": 0.0001, + "loss": 1.333, + "step": 15701 + }, + { + "epoch": 1.8036873241054505, + "grad_norm": 0.6106820702552795, + "learning_rate": 0.0001, + "loss": 1.2523, + "step": 15702 + }, + { + "epoch": 1.8038021940152777, + "grad_norm": 0.6373929977416992, + "learning_rate": 0.0001, + "loss": 1.4284, + "step": 15703 + }, + { + "epoch": 1.803917063925105, + "grad_norm": 0.6269072890281677, + "learning_rate": 0.0001, + "loss": 1.4818, + "step": 15704 + }, + { + "epoch": 1.804031933834932, + "grad_norm": 0.6745073199272156, + "learning_rate": 0.0001, + "loss": 1.3644, + "step": 15705 + }, + { + "epoch": 1.804146803744759, + "grad_norm": 0.7023970484733582, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 15706 + }, + { + "epoch": 1.8042616736545862, + "grad_norm": 0.6291683912277222, + "learning_rate": 0.0001, + "loss": 1.3183, + "step": 15707 + }, + { + "epoch": 1.8043765435644135, + "grad_norm": 0.6405826807022095, + "learning_rate": 0.0001, + "loss": 1.5059, + "step": 15708 + }, + { + "epoch": 1.8044914134742405, + "grad_norm": 0.6145171523094177, + "learning_rate": 0.0001, + "loss": 1.4438, + "step": 15709 + }, + { + "epoch": 1.8046062833840675, + "grad_norm": 0.6371951103210449, + "learning_rate": 0.0001, + "loss": 1.562, + "step": 15710 + }, + { + "epoch": 1.8047211532938947, + "grad_norm": 0.6146504878997803, + "learning_rate": 0.0001, + "loss": 1.4305, + "step": 15711 + }, + { + "epoch": 1.804836023203722, + "grad_norm": 0.6177845597267151, + "learning_rate": 0.0001, + "loss": 1.2665, + "step": 15712 + }, + { + "epoch": 1.804950893113549, + "grad_norm": 0.6438102722167969, + "learning_rate": 0.0001, + "loss": 1.5661, + "step": 15713 + }, + { + "epoch": 1.805065763023376, + "grad_norm": 0.6426578164100647, + "learning_rate": 0.0001, + "loss": 1.4181, + "step": 15714 + }, + { + "epoch": 1.8051806329332032, + "grad_norm": 0.7150061130523682, + "learning_rate": 0.0001, + "loss": 1.4365, + "step": 15715 + }, + { + "epoch": 1.8052955028430304, + "grad_norm": 0.6319580674171448, + "learning_rate": 0.0001, + "loss": 1.4184, + "step": 15716 + }, + { + "epoch": 1.8054103727528574, + "grad_norm": 0.6289169788360596, + "learning_rate": 0.0001, + "loss": 1.4949, + "step": 15717 + }, + { + "epoch": 1.8055252426626844, + "grad_norm": 0.6072092056274414, + "learning_rate": 0.0001, + "loss": 1.5016, + "step": 15718 + }, + { + "epoch": 1.8056401125725117, + "grad_norm": 0.71548992395401, + "learning_rate": 0.0001, + "loss": 1.7456, + "step": 15719 + }, + { + "epoch": 1.805754982482339, + "grad_norm": 0.622413694858551, + "learning_rate": 0.0001, + "loss": 1.4269, + "step": 15720 + }, + { + "epoch": 1.805869852392166, + "grad_norm": 0.6449538469314575, + "learning_rate": 0.0001, + "loss": 1.2798, + "step": 15721 + }, + { + "epoch": 1.805984722301993, + "grad_norm": 0.6331883668899536, + "learning_rate": 0.0001, + "loss": 1.4222, + "step": 15722 + }, + { + "epoch": 1.8060995922118201, + "grad_norm": 0.57768315076828, + "learning_rate": 0.0001, + "loss": 1.2396, + "step": 15723 + }, + { + "epoch": 1.8062144621216474, + "grad_norm": 0.6328497529029846, + "learning_rate": 0.0001, + "loss": 1.3806, + "step": 15724 + }, + { + "epoch": 1.8063293320314744, + "grad_norm": 0.69939124584198, + "learning_rate": 0.0001, + "loss": 1.3731, + "step": 15725 + }, + { + "epoch": 1.8064442019413014, + "grad_norm": 0.6895657181739807, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 15726 + }, + { + "epoch": 1.8065590718511286, + "grad_norm": 0.6120685935020447, + "learning_rate": 0.0001, + "loss": 1.1978, + "step": 15727 + }, + { + "epoch": 1.8066739417609559, + "grad_norm": 0.6782362461090088, + "learning_rate": 0.0001, + "loss": 1.2976, + "step": 15728 + }, + { + "epoch": 1.8067888116707829, + "grad_norm": 0.6087343096733093, + "learning_rate": 0.0001, + "loss": 1.3276, + "step": 15729 + }, + { + "epoch": 1.8069036815806099, + "grad_norm": 0.6072776913642883, + "learning_rate": 0.0001, + "loss": 1.4559, + "step": 15730 + }, + { + "epoch": 1.807018551490437, + "grad_norm": 0.6058279275894165, + "learning_rate": 0.0001, + "loss": 1.3737, + "step": 15731 + }, + { + "epoch": 1.8071334214002643, + "grad_norm": 0.63099604845047, + "learning_rate": 0.0001, + "loss": 1.413, + "step": 15732 + }, + { + "epoch": 1.8072482913100913, + "grad_norm": 0.6059746146202087, + "learning_rate": 0.0001, + "loss": 1.4017, + "step": 15733 + }, + { + "epoch": 1.8073631612199184, + "grad_norm": 0.6642100214958191, + "learning_rate": 0.0001, + "loss": 1.5973, + "step": 15734 + }, + { + "epoch": 1.8074780311297456, + "grad_norm": 0.6303525567054749, + "learning_rate": 0.0001, + "loss": 1.5459, + "step": 15735 + }, + { + "epoch": 1.8075929010395728, + "grad_norm": 0.605383038520813, + "learning_rate": 0.0001, + "loss": 1.3552, + "step": 15736 + }, + { + "epoch": 1.8077077709493998, + "grad_norm": 0.6636791229248047, + "learning_rate": 0.0001, + "loss": 1.3234, + "step": 15737 + }, + { + "epoch": 1.8078226408592268, + "grad_norm": 0.6717275381088257, + "learning_rate": 0.0001, + "loss": 1.5471, + "step": 15738 + }, + { + "epoch": 1.807937510769054, + "grad_norm": 0.6487286686897278, + "learning_rate": 0.0001, + "loss": 1.5063, + "step": 15739 + }, + { + "epoch": 1.8080523806788813, + "grad_norm": 0.6421818137168884, + "learning_rate": 0.0001, + "loss": 1.4977, + "step": 15740 + }, + { + "epoch": 1.8081672505887083, + "grad_norm": 0.6361395716667175, + "learning_rate": 0.0001, + "loss": 1.416, + "step": 15741 + }, + { + "epoch": 1.8082821204985353, + "grad_norm": 0.6091173887252808, + "learning_rate": 0.0001, + "loss": 1.5454, + "step": 15742 + }, + { + "epoch": 1.8083969904083625, + "grad_norm": 0.6464824080467224, + "learning_rate": 0.0001, + "loss": 1.4512, + "step": 15743 + }, + { + "epoch": 1.8085118603181898, + "grad_norm": 0.6446046829223633, + "learning_rate": 0.0001, + "loss": 1.451, + "step": 15744 + }, + { + "epoch": 1.8086267302280168, + "grad_norm": 0.7475030422210693, + "learning_rate": 0.0001, + "loss": 1.7174, + "step": 15745 + }, + { + "epoch": 1.8087416001378438, + "grad_norm": 0.6352036595344543, + "learning_rate": 0.0001, + "loss": 1.3802, + "step": 15746 + }, + { + "epoch": 1.808856470047671, + "grad_norm": 0.65043044090271, + "learning_rate": 0.0001, + "loss": 1.4297, + "step": 15747 + }, + { + "epoch": 1.8089713399574983, + "grad_norm": 0.6558632254600525, + "learning_rate": 0.0001, + "loss": 1.4044, + "step": 15748 + }, + { + "epoch": 1.8090862098673253, + "grad_norm": 0.6217056512832642, + "learning_rate": 0.0001, + "loss": 1.2797, + "step": 15749 + }, + { + "epoch": 1.8092010797771523, + "grad_norm": 0.6086947917938232, + "learning_rate": 0.0001, + "loss": 1.3942, + "step": 15750 + }, + { + "epoch": 1.8093159496869795, + "grad_norm": 0.5924387574195862, + "learning_rate": 0.0001, + "loss": 1.5433, + "step": 15751 + }, + { + "epoch": 1.8094308195968067, + "grad_norm": 0.6517341732978821, + "learning_rate": 0.0001, + "loss": 1.4954, + "step": 15752 + }, + { + "epoch": 1.8095456895066337, + "grad_norm": 0.6108118891716003, + "learning_rate": 0.0001, + "loss": 1.3201, + "step": 15753 + }, + { + "epoch": 1.8096605594164608, + "grad_norm": 0.6316987872123718, + "learning_rate": 0.0001, + "loss": 1.5334, + "step": 15754 + }, + { + "epoch": 1.809775429326288, + "grad_norm": 0.6387416124343872, + "learning_rate": 0.0001, + "loss": 1.5644, + "step": 15755 + }, + { + "epoch": 1.8098902992361152, + "grad_norm": 0.604150116443634, + "learning_rate": 0.0001, + "loss": 1.185, + "step": 15756 + }, + { + "epoch": 1.8100051691459422, + "grad_norm": 0.6756969690322876, + "learning_rate": 0.0001, + "loss": 1.5415, + "step": 15757 + }, + { + "epoch": 1.8101200390557692, + "grad_norm": 0.6240161061286926, + "learning_rate": 0.0001, + "loss": 1.3972, + "step": 15758 + }, + { + "epoch": 1.8102349089655965, + "grad_norm": 0.6883102655410767, + "learning_rate": 0.0001, + "loss": 1.5476, + "step": 15759 + }, + { + "epoch": 1.8103497788754237, + "grad_norm": 0.5605351328849792, + "learning_rate": 0.0001, + "loss": 1.3037, + "step": 15760 + }, + { + "epoch": 1.8104646487852507, + "grad_norm": 0.6002980470657349, + "learning_rate": 0.0001, + "loss": 1.2442, + "step": 15761 + }, + { + "epoch": 1.8105795186950777, + "grad_norm": 0.6384274959564209, + "learning_rate": 0.0001, + "loss": 1.4306, + "step": 15762 + }, + { + "epoch": 1.810694388604905, + "grad_norm": 0.6188052296638489, + "learning_rate": 0.0001, + "loss": 1.4172, + "step": 15763 + }, + { + "epoch": 1.8108092585147322, + "grad_norm": 0.6061704158782959, + "learning_rate": 0.0001, + "loss": 1.2811, + "step": 15764 + }, + { + "epoch": 1.8109241284245592, + "grad_norm": 0.6506741642951965, + "learning_rate": 0.0001, + "loss": 1.4402, + "step": 15765 + }, + { + "epoch": 1.8110389983343862, + "grad_norm": 0.6865687966346741, + "learning_rate": 0.0001, + "loss": 1.4626, + "step": 15766 + }, + { + "epoch": 1.8111538682442134, + "grad_norm": 0.6647955179214478, + "learning_rate": 0.0001, + "loss": 1.3912, + "step": 15767 + }, + { + "epoch": 1.8112687381540407, + "grad_norm": 0.6481913924217224, + "learning_rate": 0.0001, + "loss": 1.4325, + "step": 15768 + }, + { + "epoch": 1.8113836080638677, + "grad_norm": 0.7807742953300476, + "learning_rate": 0.0001, + "loss": 1.6238, + "step": 15769 + }, + { + "epoch": 1.8114984779736947, + "grad_norm": 0.663368284702301, + "learning_rate": 0.0001, + "loss": 1.2071, + "step": 15770 + }, + { + "epoch": 1.811613347883522, + "grad_norm": 0.6617883443832397, + "learning_rate": 0.0001, + "loss": 1.462, + "step": 15771 + }, + { + "epoch": 1.8117282177933491, + "grad_norm": 0.6506766676902771, + "learning_rate": 0.0001, + "loss": 1.4079, + "step": 15772 + }, + { + "epoch": 1.8118430877031761, + "grad_norm": 0.655314564704895, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 15773 + }, + { + "epoch": 1.8119579576130032, + "grad_norm": 0.5995591878890991, + "learning_rate": 0.0001, + "loss": 1.4349, + "step": 15774 + }, + { + "epoch": 1.8120728275228304, + "grad_norm": 0.7130681276321411, + "learning_rate": 0.0001, + "loss": 1.28, + "step": 15775 + }, + { + "epoch": 1.8121876974326576, + "grad_norm": 0.6497383713722229, + "learning_rate": 0.0001, + "loss": 1.3618, + "step": 15776 + }, + { + "epoch": 1.8123025673424846, + "grad_norm": 0.6399600505828857, + "learning_rate": 0.0001, + "loss": 1.5071, + "step": 15777 + }, + { + "epoch": 1.8124174372523116, + "grad_norm": 0.6517605185508728, + "learning_rate": 0.0001, + "loss": 1.6146, + "step": 15778 + }, + { + "epoch": 1.8125323071621389, + "grad_norm": 0.7210836410522461, + "learning_rate": 0.0001, + "loss": 1.3193, + "step": 15779 + }, + { + "epoch": 1.812647177071966, + "grad_norm": 0.6529654264450073, + "learning_rate": 0.0001, + "loss": 1.3761, + "step": 15780 + }, + { + "epoch": 1.812762046981793, + "grad_norm": 0.5776856541633606, + "learning_rate": 0.0001, + "loss": 1.2645, + "step": 15781 + }, + { + "epoch": 1.8128769168916201, + "grad_norm": 0.6165531277656555, + "learning_rate": 0.0001, + "loss": 1.5794, + "step": 15782 + }, + { + "epoch": 1.8129917868014473, + "grad_norm": 0.6355653405189514, + "learning_rate": 0.0001, + "loss": 1.4083, + "step": 15783 + }, + { + "epoch": 1.8131066567112746, + "grad_norm": 0.6272133588790894, + "learning_rate": 0.0001, + "loss": 1.5475, + "step": 15784 + }, + { + "epoch": 1.8132215266211016, + "grad_norm": 0.6231955885887146, + "learning_rate": 0.0001, + "loss": 1.1145, + "step": 15785 + }, + { + "epoch": 1.8133363965309286, + "grad_norm": 0.5937713980674744, + "learning_rate": 0.0001, + "loss": 1.3303, + "step": 15786 + }, + { + "epoch": 1.8134512664407558, + "grad_norm": 0.6835618615150452, + "learning_rate": 0.0001, + "loss": 1.5671, + "step": 15787 + }, + { + "epoch": 1.813566136350583, + "grad_norm": 0.700341522693634, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 15788 + }, + { + "epoch": 1.81368100626041, + "grad_norm": 0.6277741193771362, + "learning_rate": 0.0001, + "loss": 1.5733, + "step": 15789 + }, + { + "epoch": 1.813795876170237, + "grad_norm": 0.6309356689453125, + "learning_rate": 0.0001, + "loss": 1.5235, + "step": 15790 + }, + { + "epoch": 1.8139107460800643, + "grad_norm": 0.5971214175224304, + "learning_rate": 0.0001, + "loss": 1.3632, + "step": 15791 + }, + { + "epoch": 1.8140256159898915, + "grad_norm": 0.7022644281387329, + "learning_rate": 0.0001, + "loss": 1.6092, + "step": 15792 + }, + { + "epoch": 1.8141404858997185, + "grad_norm": 0.5725907683372498, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 15793 + }, + { + "epoch": 1.8142553558095456, + "grad_norm": 0.6333192586898804, + "learning_rate": 0.0001, + "loss": 1.3691, + "step": 15794 + }, + { + "epoch": 1.8143702257193728, + "grad_norm": 0.6314082145690918, + "learning_rate": 0.0001, + "loss": 1.5763, + "step": 15795 + }, + { + "epoch": 1.8144850956292, + "grad_norm": 0.6020960211753845, + "learning_rate": 0.0001, + "loss": 1.3315, + "step": 15796 + }, + { + "epoch": 1.814599965539027, + "grad_norm": 0.636551022529602, + "learning_rate": 0.0001, + "loss": 1.2842, + "step": 15797 + }, + { + "epoch": 1.814714835448854, + "grad_norm": 0.6058814525604248, + "learning_rate": 0.0001, + "loss": 1.4964, + "step": 15798 + }, + { + "epoch": 1.8148297053586813, + "grad_norm": 0.581272304058075, + "learning_rate": 0.0001, + "loss": 1.4039, + "step": 15799 + }, + { + "epoch": 1.8149445752685085, + "grad_norm": 0.6171589493751526, + "learning_rate": 0.0001, + "loss": 1.4822, + "step": 15800 + }, + { + "epoch": 1.8150594451783355, + "grad_norm": 0.6367219090461731, + "learning_rate": 0.0001, + "loss": 1.3735, + "step": 15801 + }, + { + "epoch": 1.8151743150881625, + "grad_norm": 0.6684643030166626, + "learning_rate": 0.0001, + "loss": 1.7442, + "step": 15802 + }, + { + "epoch": 1.8152891849979897, + "grad_norm": 0.5805231332778931, + "learning_rate": 0.0001, + "loss": 1.4201, + "step": 15803 + }, + { + "epoch": 1.815404054907817, + "grad_norm": 0.6159259676933289, + "learning_rate": 0.0001, + "loss": 1.2066, + "step": 15804 + }, + { + "epoch": 1.815518924817644, + "grad_norm": 0.5856978297233582, + "learning_rate": 0.0001, + "loss": 1.3786, + "step": 15805 + }, + { + "epoch": 1.815633794727471, + "grad_norm": 0.6367132663726807, + "learning_rate": 0.0001, + "loss": 1.3513, + "step": 15806 + }, + { + "epoch": 1.8157486646372982, + "grad_norm": 0.6610946655273438, + "learning_rate": 0.0001, + "loss": 1.4352, + "step": 15807 + }, + { + "epoch": 1.8158635345471255, + "grad_norm": 0.6742769479751587, + "learning_rate": 0.0001, + "loss": 1.4922, + "step": 15808 + }, + { + "epoch": 1.8159784044569525, + "grad_norm": 0.6435980200767517, + "learning_rate": 0.0001, + "loss": 1.5316, + "step": 15809 + }, + { + "epoch": 1.8160932743667795, + "grad_norm": 0.6399965882301331, + "learning_rate": 0.0001, + "loss": 1.4247, + "step": 15810 + }, + { + "epoch": 1.8162081442766067, + "grad_norm": 0.6221297979354858, + "learning_rate": 0.0001, + "loss": 1.4666, + "step": 15811 + }, + { + "epoch": 1.816323014186434, + "grad_norm": 0.6029366254806519, + "learning_rate": 0.0001, + "loss": 1.448, + "step": 15812 + }, + { + "epoch": 1.816437884096261, + "grad_norm": 0.628516435623169, + "learning_rate": 0.0001, + "loss": 1.1754, + "step": 15813 + }, + { + "epoch": 1.816552754006088, + "grad_norm": 0.7524257898330688, + "learning_rate": 0.0001, + "loss": 1.6957, + "step": 15814 + }, + { + "epoch": 1.8166676239159152, + "grad_norm": 0.6547483205795288, + "learning_rate": 0.0001, + "loss": 1.3974, + "step": 15815 + }, + { + "epoch": 1.8167824938257424, + "grad_norm": 0.7047604918479919, + "learning_rate": 0.0001, + "loss": 1.3708, + "step": 15816 + }, + { + "epoch": 1.8168973637355694, + "grad_norm": 0.6523260474205017, + "learning_rate": 0.0001, + "loss": 1.4342, + "step": 15817 + }, + { + "epoch": 1.8170122336453964, + "grad_norm": 0.6819579005241394, + "learning_rate": 0.0001, + "loss": 1.4529, + "step": 15818 + }, + { + "epoch": 1.8171271035552237, + "grad_norm": 0.6569975018501282, + "learning_rate": 0.0001, + "loss": 1.3422, + "step": 15819 + }, + { + "epoch": 1.817241973465051, + "grad_norm": 0.6896679401397705, + "learning_rate": 0.0001, + "loss": 1.4424, + "step": 15820 + }, + { + "epoch": 1.817356843374878, + "grad_norm": 0.6512749195098877, + "learning_rate": 0.0001, + "loss": 1.3904, + "step": 15821 + }, + { + "epoch": 1.817471713284705, + "grad_norm": 0.6123222708702087, + "learning_rate": 0.0001, + "loss": 1.3856, + "step": 15822 + }, + { + "epoch": 1.8175865831945321, + "grad_norm": 0.5971189141273499, + "learning_rate": 0.0001, + "loss": 1.4344, + "step": 15823 + }, + { + "epoch": 1.8177014531043594, + "grad_norm": 0.5992557406425476, + "learning_rate": 0.0001, + "loss": 1.4017, + "step": 15824 + }, + { + "epoch": 1.8178163230141864, + "grad_norm": 0.6497430205345154, + "learning_rate": 0.0001, + "loss": 1.4175, + "step": 15825 + }, + { + "epoch": 1.8179311929240134, + "grad_norm": 0.6796419620513916, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 15826 + }, + { + "epoch": 1.8180460628338406, + "grad_norm": 0.6003844738006592, + "learning_rate": 0.0001, + "loss": 1.3818, + "step": 15827 + }, + { + "epoch": 1.8181609327436679, + "grad_norm": 0.7161847949028015, + "learning_rate": 0.0001, + "loss": 1.4173, + "step": 15828 + }, + { + "epoch": 1.8182758026534949, + "grad_norm": 0.6299294829368591, + "learning_rate": 0.0001, + "loss": 1.4459, + "step": 15829 + }, + { + "epoch": 1.8183906725633219, + "grad_norm": 0.7790576815605164, + "learning_rate": 0.0001, + "loss": 1.7595, + "step": 15830 + }, + { + "epoch": 1.818505542473149, + "grad_norm": 0.5734109282493591, + "learning_rate": 0.0001, + "loss": 1.3666, + "step": 15831 + }, + { + "epoch": 1.8186204123829763, + "grad_norm": 0.6383972764015198, + "learning_rate": 0.0001, + "loss": 1.6129, + "step": 15832 + }, + { + "epoch": 1.8187352822928033, + "grad_norm": 0.6514372825622559, + "learning_rate": 0.0001, + "loss": 1.4179, + "step": 15833 + }, + { + "epoch": 1.8188501522026304, + "grad_norm": 0.6539475321769714, + "learning_rate": 0.0001, + "loss": 1.3834, + "step": 15834 + }, + { + "epoch": 1.8189650221124576, + "grad_norm": 0.6508055925369263, + "learning_rate": 0.0001, + "loss": 1.1476, + "step": 15835 + }, + { + "epoch": 1.8190798920222848, + "grad_norm": 0.6467891335487366, + "learning_rate": 0.0001, + "loss": 1.4836, + "step": 15836 + }, + { + "epoch": 1.819194761932112, + "grad_norm": 0.6416442394256592, + "learning_rate": 0.0001, + "loss": 1.4443, + "step": 15837 + }, + { + "epoch": 1.819309631841939, + "grad_norm": 0.6353035569190979, + "learning_rate": 0.0001, + "loss": 1.3113, + "step": 15838 + }, + { + "epoch": 1.819424501751766, + "grad_norm": 0.6308345198631287, + "learning_rate": 0.0001, + "loss": 1.5127, + "step": 15839 + }, + { + "epoch": 1.8195393716615933, + "grad_norm": 0.66878741979599, + "learning_rate": 0.0001, + "loss": 1.5101, + "step": 15840 + }, + { + "epoch": 1.8196542415714205, + "grad_norm": 0.6357544660568237, + "learning_rate": 0.0001, + "loss": 1.4058, + "step": 15841 + }, + { + "epoch": 1.8197691114812475, + "grad_norm": 0.6938048601150513, + "learning_rate": 0.0001, + "loss": 1.3966, + "step": 15842 + }, + { + "epoch": 1.8198839813910745, + "grad_norm": 0.6165516376495361, + "learning_rate": 0.0001, + "loss": 1.392, + "step": 15843 + }, + { + "epoch": 1.8199988513009018, + "grad_norm": 0.6373364329338074, + "learning_rate": 0.0001, + "loss": 1.2863, + "step": 15844 + }, + { + "epoch": 1.820113721210729, + "grad_norm": 0.6185095310211182, + "learning_rate": 0.0001, + "loss": 1.3833, + "step": 15845 + }, + { + "epoch": 1.820228591120556, + "grad_norm": 0.6635664105415344, + "learning_rate": 0.0001, + "loss": 1.3862, + "step": 15846 + }, + { + "epoch": 1.820343461030383, + "grad_norm": 0.6329711675643921, + "learning_rate": 0.0001, + "loss": 1.3956, + "step": 15847 + }, + { + "epoch": 1.8204583309402103, + "grad_norm": 0.6014331579208374, + "learning_rate": 0.0001, + "loss": 1.41, + "step": 15848 + }, + { + "epoch": 1.8205732008500375, + "grad_norm": 0.6105222702026367, + "learning_rate": 0.0001, + "loss": 1.234, + "step": 15849 + }, + { + "epoch": 1.8206880707598645, + "grad_norm": 0.648215651512146, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 15850 + }, + { + "epoch": 1.8208029406696915, + "grad_norm": 0.5845022201538086, + "learning_rate": 0.0001, + "loss": 1.141, + "step": 15851 + }, + { + "epoch": 1.8209178105795187, + "grad_norm": 0.6215102076530457, + "learning_rate": 0.0001, + "loss": 1.472, + "step": 15852 + }, + { + "epoch": 1.821032680489346, + "grad_norm": 0.6202508807182312, + "learning_rate": 0.0001, + "loss": 1.3187, + "step": 15853 + }, + { + "epoch": 1.821147550399173, + "grad_norm": 0.638676106929779, + "learning_rate": 0.0001, + "loss": 1.3897, + "step": 15854 + }, + { + "epoch": 1.821262420309, + "grad_norm": 0.7090446352958679, + "learning_rate": 0.0001, + "loss": 1.4151, + "step": 15855 + }, + { + "epoch": 1.8213772902188272, + "grad_norm": 0.650833249092102, + "learning_rate": 0.0001, + "loss": 1.4007, + "step": 15856 + }, + { + "epoch": 1.8214921601286544, + "grad_norm": 0.7622561454772949, + "learning_rate": 0.0001, + "loss": 1.5458, + "step": 15857 + }, + { + "epoch": 1.8216070300384815, + "grad_norm": 0.6946950554847717, + "learning_rate": 0.0001, + "loss": 1.416, + "step": 15858 + }, + { + "epoch": 1.8217218999483085, + "grad_norm": 0.6929587721824646, + "learning_rate": 0.0001, + "loss": 1.3965, + "step": 15859 + }, + { + "epoch": 1.8218367698581357, + "grad_norm": 0.6364197731018066, + "learning_rate": 0.0001, + "loss": 1.4878, + "step": 15860 + }, + { + "epoch": 1.821951639767963, + "grad_norm": 0.7382113933563232, + "learning_rate": 0.0001, + "loss": 1.5489, + "step": 15861 + }, + { + "epoch": 1.82206650967779, + "grad_norm": 0.6655716896057129, + "learning_rate": 0.0001, + "loss": 1.4976, + "step": 15862 + }, + { + "epoch": 1.822181379587617, + "grad_norm": 0.597642719745636, + "learning_rate": 0.0001, + "loss": 1.3425, + "step": 15863 + }, + { + "epoch": 1.8222962494974442, + "grad_norm": 0.6217376589775085, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 15864 + }, + { + "epoch": 1.8224111194072714, + "grad_norm": 0.6502073407173157, + "learning_rate": 0.0001, + "loss": 1.5697, + "step": 15865 + }, + { + "epoch": 1.8225259893170984, + "grad_norm": 0.6081299185752869, + "learning_rate": 0.0001, + "loss": 1.3242, + "step": 15866 + }, + { + "epoch": 1.8226408592269254, + "grad_norm": 0.6899523735046387, + "learning_rate": 0.0001, + "loss": 1.093, + "step": 15867 + }, + { + "epoch": 1.8227557291367527, + "grad_norm": 0.6164363622665405, + "learning_rate": 0.0001, + "loss": 1.5016, + "step": 15868 + }, + { + "epoch": 1.8228705990465799, + "grad_norm": 0.6042718887329102, + "learning_rate": 0.0001, + "loss": 1.4358, + "step": 15869 + }, + { + "epoch": 1.822985468956407, + "grad_norm": 0.6475842595100403, + "learning_rate": 0.0001, + "loss": 1.595, + "step": 15870 + }, + { + "epoch": 1.823100338866234, + "grad_norm": 0.6268951892852783, + "learning_rate": 0.0001, + "loss": 1.4257, + "step": 15871 + }, + { + "epoch": 1.8232152087760611, + "grad_norm": 0.6121448874473572, + "learning_rate": 0.0001, + "loss": 1.4889, + "step": 15872 + }, + { + "epoch": 1.8233300786858884, + "grad_norm": 0.6150333881378174, + "learning_rate": 0.0001, + "loss": 1.3803, + "step": 15873 + }, + { + "epoch": 1.8234449485957154, + "grad_norm": 0.6406527757644653, + "learning_rate": 0.0001, + "loss": 1.3256, + "step": 15874 + }, + { + "epoch": 1.8235598185055424, + "grad_norm": 0.6126410961151123, + "learning_rate": 0.0001, + "loss": 1.3716, + "step": 15875 + }, + { + "epoch": 1.8236746884153696, + "grad_norm": 0.6979601383209229, + "learning_rate": 0.0001, + "loss": 1.5394, + "step": 15876 + }, + { + "epoch": 1.8237895583251968, + "grad_norm": 0.6526153087615967, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 15877 + }, + { + "epoch": 1.8239044282350239, + "grad_norm": 0.6208426356315613, + "learning_rate": 0.0001, + "loss": 1.3515, + "step": 15878 + }, + { + "epoch": 1.8240192981448509, + "grad_norm": 0.5797458291053772, + "learning_rate": 0.0001, + "loss": 1.393, + "step": 15879 + }, + { + "epoch": 1.824134168054678, + "grad_norm": 0.5991585850715637, + "learning_rate": 0.0001, + "loss": 1.4375, + "step": 15880 + }, + { + "epoch": 1.8242490379645053, + "grad_norm": 0.6601519584655762, + "learning_rate": 0.0001, + "loss": 1.5897, + "step": 15881 + }, + { + "epoch": 1.8243639078743323, + "grad_norm": 0.6615204811096191, + "learning_rate": 0.0001, + "loss": 1.5966, + "step": 15882 + }, + { + "epoch": 1.8244787777841593, + "grad_norm": 0.6218351125717163, + "learning_rate": 0.0001, + "loss": 1.478, + "step": 15883 + }, + { + "epoch": 1.8245936476939866, + "grad_norm": 0.6256325244903564, + "learning_rate": 0.0001, + "loss": 1.6471, + "step": 15884 + }, + { + "epoch": 1.8247085176038138, + "grad_norm": 0.6107257008552551, + "learning_rate": 0.0001, + "loss": 1.4236, + "step": 15885 + }, + { + "epoch": 1.8248233875136408, + "grad_norm": 0.7533160448074341, + "learning_rate": 0.0001, + "loss": 1.425, + "step": 15886 + }, + { + "epoch": 1.8249382574234678, + "grad_norm": 0.6379408836364746, + "learning_rate": 0.0001, + "loss": 1.363, + "step": 15887 + }, + { + "epoch": 1.825053127333295, + "grad_norm": 0.598540723323822, + "learning_rate": 0.0001, + "loss": 1.4424, + "step": 15888 + }, + { + "epoch": 1.8251679972431223, + "grad_norm": 0.6070877313613892, + "learning_rate": 0.0001, + "loss": 1.3913, + "step": 15889 + }, + { + "epoch": 1.8252828671529493, + "grad_norm": 0.6327671408653259, + "learning_rate": 0.0001, + "loss": 1.4302, + "step": 15890 + }, + { + "epoch": 1.8253977370627763, + "grad_norm": 0.6539307236671448, + "learning_rate": 0.0001, + "loss": 1.4466, + "step": 15891 + }, + { + "epoch": 1.8255126069726035, + "grad_norm": 0.6575409173965454, + "learning_rate": 0.0001, + "loss": 1.2643, + "step": 15892 + }, + { + "epoch": 1.8256274768824308, + "grad_norm": 0.6441643834114075, + "learning_rate": 0.0001, + "loss": 1.4879, + "step": 15893 + }, + { + "epoch": 1.8257423467922578, + "grad_norm": 0.6457522511482239, + "learning_rate": 0.0001, + "loss": 1.3737, + "step": 15894 + }, + { + "epoch": 1.8258572167020848, + "grad_norm": 0.6049162149429321, + "learning_rate": 0.0001, + "loss": 1.4517, + "step": 15895 + }, + { + "epoch": 1.825972086611912, + "grad_norm": 0.6564713716506958, + "learning_rate": 0.0001, + "loss": 1.3928, + "step": 15896 + }, + { + "epoch": 1.8260869565217392, + "grad_norm": 0.6829112768173218, + "learning_rate": 0.0001, + "loss": 1.5156, + "step": 15897 + }, + { + "epoch": 1.8262018264315663, + "grad_norm": 0.6410605311393738, + "learning_rate": 0.0001, + "loss": 1.3843, + "step": 15898 + }, + { + "epoch": 1.8263166963413933, + "grad_norm": 0.6501928567886353, + "learning_rate": 0.0001, + "loss": 1.4678, + "step": 15899 + }, + { + "epoch": 1.8264315662512205, + "grad_norm": 0.6255066990852356, + "learning_rate": 0.0001, + "loss": 1.4146, + "step": 15900 + }, + { + "epoch": 1.8265464361610477, + "grad_norm": 0.6358596086502075, + "learning_rate": 0.0001, + "loss": 1.462, + "step": 15901 + }, + { + "epoch": 1.8266613060708747, + "grad_norm": 0.6268904805183411, + "learning_rate": 0.0001, + "loss": 1.4233, + "step": 15902 + }, + { + "epoch": 1.8267761759807017, + "grad_norm": 0.5953584909439087, + "learning_rate": 0.0001, + "loss": 1.5563, + "step": 15903 + }, + { + "epoch": 1.826891045890529, + "grad_norm": 0.5962085127830505, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 15904 + }, + { + "epoch": 1.8270059158003562, + "grad_norm": 0.6590402722358704, + "learning_rate": 0.0001, + "loss": 1.4404, + "step": 15905 + }, + { + "epoch": 1.8271207857101832, + "grad_norm": 0.5863921642303467, + "learning_rate": 0.0001, + "loss": 1.4666, + "step": 15906 + }, + { + "epoch": 1.8272356556200102, + "grad_norm": 0.5984483361244202, + "learning_rate": 0.0001, + "loss": 1.4999, + "step": 15907 + }, + { + "epoch": 1.8273505255298375, + "grad_norm": 0.6065948009490967, + "learning_rate": 0.0001, + "loss": 1.2615, + "step": 15908 + }, + { + "epoch": 1.8274653954396647, + "grad_norm": 0.5679669976234436, + "learning_rate": 0.0001, + "loss": 1.4875, + "step": 15909 + }, + { + "epoch": 1.8275802653494917, + "grad_norm": 0.5973514318466187, + "learning_rate": 0.0001, + "loss": 1.3, + "step": 15910 + }, + { + "epoch": 1.8276951352593187, + "grad_norm": 0.644055962562561, + "learning_rate": 0.0001, + "loss": 1.4505, + "step": 15911 + }, + { + "epoch": 1.827810005169146, + "grad_norm": 0.6656803488731384, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 15912 + }, + { + "epoch": 1.8279248750789732, + "grad_norm": 0.6298542022705078, + "learning_rate": 0.0001, + "loss": 1.561, + "step": 15913 + }, + { + "epoch": 1.8280397449888002, + "grad_norm": 0.649831235408783, + "learning_rate": 0.0001, + "loss": 1.4071, + "step": 15914 + }, + { + "epoch": 1.8281546148986272, + "grad_norm": 0.675029456615448, + "learning_rate": 0.0001, + "loss": 1.5452, + "step": 15915 + }, + { + "epoch": 1.8282694848084544, + "grad_norm": 0.5778508186340332, + "learning_rate": 0.0001, + "loss": 1.365, + "step": 15916 + }, + { + "epoch": 1.8283843547182816, + "grad_norm": 0.6754083633422852, + "learning_rate": 0.0001, + "loss": 1.464, + "step": 15917 + }, + { + "epoch": 1.8284992246281087, + "grad_norm": 0.6041424870491028, + "learning_rate": 0.0001, + "loss": 1.4145, + "step": 15918 + }, + { + "epoch": 1.8286140945379357, + "grad_norm": 0.6053421497344971, + "learning_rate": 0.0001, + "loss": 1.3666, + "step": 15919 + }, + { + "epoch": 1.828728964447763, + "grad_norm": 0.5872570276260376, + "learning_rate": 0.0001, + "loss": 1.4642, + "step": 15920 + }, + { + "epoch": 1.8288438343575901, + "grad_norm": 0.663306474685669, + "learning_rate": 0.0001, + "loss": 1.5113, + "step": 15921 + }, + { + "epoch": 1.8289587042674171, + "grad_norm": 0.6472460627555847, + "learning_rate": 0.0001, + "loss": 1.4313, + "step": 15922 + }, + { + "epoch": 1.8290735741772441, + "grad_norm": 0.6650756001472473, + "learning_rate": 0.0001, + "loss": 1.4514, + "step": 15923 + }, + { + "epoch": 1.8291884440870714, + "grad_norm": 0.5926016569137573, + "learning_rate": 0.0001, + "loss": 1.4217, + "step": 15924 + }, + { + "epoch": 1.8293033139968986, + "grad_norm": 0.7057727575302124, + "learning_rate": 0.0001, + "loss": 1.4764, + "step": 15925 + }, + { + "epoch": 1.8294181839067256, + "grad_norm": 0.6436575055122375, + "learning_rate": 0.0001, + "loss": 1.4334, + "step": 15926 + }, + { + "epoch": 1.8295330538165526, + "grad_norm": 0.6020593643188477, + "learning_rate": 0.0001, + "loss": 1.4711, + "step": 15927 + }, + { + "epoch": 1.8296479237263799, + "grad_norm": 0.5867114663124084, + "learning_rate": 0.0001, + "loss": 1.3176, + "step": 15928 + }, + { + "epoch": 1.829762793636207, + "grad_norm": 0.6493946313858032, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 15929 + }, + { + "epoch": 1.829877663546034, + "grad_norm": 0.6781548261642456, + "learning_rate": 0.0001, + "loss": 1.4961, + "step": 15930 + }, + { + "epoch": 1.829992533455861, + "grad_norm": 0.5895503759384155, + "learning_rate": 0.0001, + "loss": 1.2183, + "step": 15931 + }, + { + "epoch": 1.8301074033656883, + "grad_norm": 0.7099341750144958, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 15932 + }, + { + "epoch": 1.8302222732755156, + "grad_norm": 0.7078137397766113, + "learning_rate": 0.0001, + "loss": 1.5794, + "step": 15933 + }, + { + "epoch": 1.8303371431853426, + "grad_norm": 0.6459853649139404, + "learning_rate": 0.0001, + "loss": 1.5702, + "step": 15934 + }, + { + "epoch": 1.8304520130951696, + "grad_norm": 0.6130335330963135, + "learning_rate": 0.0001, + "loss": 1.3144, + "step": 15935 + }, + { + "epoch": 1.8305668830049968, + "grad_norm": 0.6197957396507263, + "learning_rate": 0.0001, + "loss": 1.4387, + "step": 15936 + }, + { + "epoch": 1.830681752914824, + "grad_norm": 0.667946457862854, + "learning_rate": 0.0001, + "loss": 1.4988, + "step": 15937 + }, + { + "epoch": 1.830796622824651, + "grad_norm": 0.5950310230255127, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 15938 + }, + { + "epoch": 1.830911492734478, + "grad_norm": 0.632938802242279, + "learning_rate": 0.0001, + "loss": 1.4152, + "step": 15939 + }, + { + "epoch": 1.8310263626443053, + "grad_norm": 0.5838363766670227, + "learning_rate": 0.0001, + "loss": 1.4048, + "step": 15940 + }, + { + "epoch": 1.8311412325541325, + "grad_norm": 0.7285007834434509, + "learning_rate": 0.0001, + "loss": 1.5162, + "step": 15941 + }, + { + "epoch": 1.8312561024639595, + "grad_norm": 0.6085467338562012, + "learning_rate": 0.0001, + "loss": 1.5558, + "step": 15942 + }, + { + "epoch": 1.8313709723737865, + "grad_norm": 0.6019995808601379, + "learning_rate": 0.0001, + "loss": 1.438, + "step": 15943 + }, + { + "epoch": 1.8314858422836138, + "grad_norm": 0.6792658567428589, + "learning_rate": 0.0001, + "loss": 1.6684, + "step": 15944 + }, + { + "epoch": 1.831600712193441, + "grad_norm": 0.602642834186554, + "learning_rate": 0.0001, + "loss": 1.4328, + "step": 15945 + }, + { + "epoch": 1.831715582103268, + "grad_norm": 0.6296089887619019, + "learning_rate": 0.0001, + "loss": 1.4179, + "step": 15946 + }, + { + "epoch": 1.831830452013095, + "grad_norm": 0.5773423314094543, + "learning_rate": 0.0001, + "loss": 1.5549, + "step": 15947 + }, + { + "epoch": 1.8319453219229223, + "grad_norm": 0.6150087714195251, + "learning_rate": 0.0001, + "loss": 1.4553, + "step": 15948 + }, + { + "epoch": 1.8320601918327495, + "grad_norm": 0.6208438277244568, + "learning_rate": 0.0001, + "loss": 1.4832, + "step": 15949 + }, + { + "epoch": 1.8321750617425765, + "grad_norm": 0.6598103046417236, + "learning_rate": 0.0001, + "loss": 1.3849, + "step": 15950 + }, + { + "epoch": 1.8322899316524035, + "grad_norm": 0.5817321538925171, + "learning_rate": 0.0001, + "loss": 1.2799, + "step": 15951 + }, + { + "epoch": 1.8324048015622307, + "grad_norm": 0.6446582078933716, + "learning_rate": 0.0001, + "loss": 1.4087, + "step": 15952 + }, + { + "epoch": 1.832519671472058, + "grad_norm": 0.5805476903915405, + "learning_rate": 0.0001, + "loss": 1.2953, + "step": 15953 + }, + { + "epoch": 1.832634541381885, + "grad_norm": 0.6229280233383179, + "learning_rate": 0.0001, + "loss": 1.4529, + "step": 15954 + }, + { + "epoch": 1.832749411291712, + "grad_norm": 0.6367110013961792, + "learning_rate": 0.0001, + "loss": 1.4145, + "step": 15955 + }, + { + "epoch": 1.8328642812015392, + "grad_norm": 0.6071845889091492, + "learning_rate": 0.0001, + "loss": 1.2801, + "step": 15956 + }, + { + "epoch": 1.8329791511113664, + "grad_norm": 0.6400860548019409, + "learning_rate": 0.0001, + "loss": 1.4126, + "step": 15957 + }, + { + "epoch": 1.8330940210211935, + "grad_norm": 0.6502026915550232, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 15958 + }, + { + "epoch": 1.8332088909310205, + "grad_norm": 0.5755997896194458, + "learning_rate": 0.0001, + "loss": 1.3486, + "step": 15959 + }, + { + "epoch": 1.8333237608408477, + "grad_norm": 0.6484699845314026, + "learning_rate": 0.0001, + "loss": 1.5004, + "step": 15960 + }, + { + "epoch": 1.833438630750675, + "grad_norm": 0.6342812180519104, + "learning_rate": 0.0001, + "loss": 1.3965, + "step": 15961 + }, + { + "epoch": 1.833553500660502, + "grad_norm": 0.676463782787323, + "learning_rate": 0.0001, + "loss": 1.5334, + "step": 15962 + }, + { + "epoch": 1.833668370570329, + "grad_norm": 0.6405801773071289, + "learning_rate": 0.0001, + "loss": 1.5181, + "step": 15963 + }, + { + "epoch": 1.8337832404801562, + "grad_norm": 0.614596426486969, + "learning_rate": 0.0001, + "loss": 1.3663, + "step": 15964 + }, + { + "epoch": 1.8338981103899834, + "grad_norm": 0.6203700304031372, + "learning_rate": 0.0001, + "loss": 1.4567, + "step": 15965 + }, + { + "epoch": 1.8340129802998104, + "grad_norm": 0.6081563234329224, + "learning_rate": 0.0001, + "loss": 1.5254, + "step": 15966 + }, + { + "epoch": 1.8341278502096374, + "grad_norm": 0.6504876017570496, + "learning_rate": 0.0001, + "loss": 1.4266, + "step": 15967 + }, + { + "epoch": 1.8342427201194647, + "grad_norm": 0.6089107990264893, + "learning_rate": 0.0001, + "loss": 1.4368, + "step": 15968 + }, + { + "epoch": 1.8343575900292919, + "grad_norm": 0.696449875831604, + "learning_rate": 0.0001, + "loss": 1.6924, + "step": 15969 + }, + { + "epoch": 1.834472459939119, + "grad_norm": 0.6738762259483337, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 15970 + }, + { + "epoch": 1.834587329848946, + "grad_norm": 0.6419105529785156, + "learning_rate": 0.0001, + "loss": 1.465, + "step": 15971 + }, + { + "epoch": 1.8347021997587731, + "grad_norm": 0.6349779963493347, + "learning_rate": 0.0001, + "loss": 1.5807, + "step": 15972 + }, + { + "epoch": 1.8348170696686004, + "grad_norm": 0.6129407286643982, + "learning_rate": 0.0001, + "loss": 1.3928, + "step": 15973 + }, + { + "epoch": 1.8349319395784276, + "grad_norm": 0.6540585160255432, + "learning_rate": 0.0001, + "loss": 1.701, + "step": 15974 + }, + { + "epoch": 1.8350468094882546, + "grad_norm": 0.6090661287307739, + "learning_rate": 0.0001, + "loss": 1.4129, + "step": 15975 + }, + { + "epoch": 1.8351616793980816, + "grad_norm": 0.6051062345504761, + "learning_rate": 0.0001, + "loss": 1.4449, + "step": 15976 + }, + { + "epoch": 1.8352765493079088, + "grad_norm": 0.6473617553710938, + "learning_rate": 0.0001, + "loss": 1.5488, + "step": 15977 + }, + { + "epoch": 1.835391419217736, + "grad_norm": 0.6278075575828552, + "learning_rate": 0.0001, + "loss": 1.3918, + "step": 15978 + }, + { + "epoch": 1.835506289127563, + "grad_norm": 0.6662875413894653, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 15979 + }, + { + "epoch": 1.83562115903739, + "grad_norm": 0.6201448440551758, + "learning_rate": 0.0001, + "loss": 1.4804, + "step": 15980 + }, + { + "epoch": 1.8357360289472173, + "grad_norm": 0.6352674961090088, + "learning_rate": 0.0001, + "loss": 1.4066, + "step": 15981 + }, + { + "epoch": 1.8358508988570446, + "grad_norm": 0.5949738621711731, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 15982 + }, + { + "epoch": 1.8359657687668716, + "grad_norm": 0.6130930781364441, + "learning_rate": 0.0001, + "loss": 1.4186, + "step": 15983 + }, + { + "epoch": 1.8360806386766986, + "grad_norm": 0.5820370316505432, + "learning_rate": 0.0001, + "loss": 1.223, + "step": 15984 + }, + { + "epoch": 1.8361955085865258, + "grad_norm": 0.6435645222663879, + "learning_rate": 0.0001, + "loss": 1.4747, + "step": 15985 + }, + { + "epoch": 1.836310378496353, + "grad_norm": 0.5986513495445251, + "learning_rate": 0.0001, + "loss": 1.4789, + "step": 15986 + }, + { + "epoch": 1.83642524840618, + "grad_norm": 0.6947880983352661, + "learning_rate": 0.0001, + "loss": 1.5822, + "step": 15987 + }, + { + "epoch": 1.836540118316007, + "grad_norm": 0.6280114054679871, + "learning_rate": 0.0001, + "loss": 1.3834, + "step": 15988 + }, + { + "epoch": 1.8366549882258343, + "grad_norm": 0.669037401676178, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 15989 + }, + { + "epoch": 1.8367698581356615, + "grad_norm": 0.7081412076950073, + "learning_rate": 0.0001, + "loss": 1.4418, + "step": 15990 + }, + { + "epoch": 1.8368847280454885, + "grad_norm": 0.6335996985435486, + "learning_rate": 0.0001, + "loss": 1.4933, + "step": 15991 + }, + { + "epoch": 1.8369995979553155, + "grad_norm": 0.68028724193573, + "learning_rate": 0.0001, + "loss": 1.4157, + "step": 15992 + }, + { + "epoch": 1.8371144678651428, + "grad_norm": 0.6293099522590637, + "learning_rate": 0.0001, + "loss": 1.5406, + "step": 15993 + }, + { + "epoch": 1.83722933777497, + "grad_norm": 0.6450070738792419, + "learning_rate": 0.0001, + "loss": 1.3165, + "step": 15994 + }, + { + "epoch": 1.837344207684797, + "grad_norm": 0.6043501496315002, + "learning_rate": 0.0001, + "loss": 1.3544, + "step": 15995 + }, + { + "epoch": 1.837459077594624, + "grad_norm": 0.5740723013877869, + "learning_rate": 0.0001, + "loss": 1.3516, + "step": 15996 + }, + { + "epoch": 1.8375739475044512, + "grad_norm": 0.6358054876327515, + "learning_rate": 0.0001, + "loss": 1.3855, + "step": 15997 + }, + { + "epoch": 1.8376888174142785, + "grad_norm": 0.5969364047050476, + "learning_rate": 0.0001, + "loss": 1.3998, + "step": 15998 + }, + { + "epoch": 1.8378036873241055, + "grad_norm": 0.6494911909103394, + "learning_rate": 0.0001, + "loss": 1.4437, + "step": 15999 + }, + { + "epoch": 1.8379185572339325, + "grad_norm": 0.6509250402450562, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 16000 + } + ], + "logging_steps": 1.0, + "max_steps": 17410, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4136217259933696e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}