{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.987443401951174, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001493721700975587, "grad_norm": 2.397458076477051, "learning_rate": 4.9975e-05, "loss": 5.0666, "step": 1 }, { "epoch": 0.002987443401951174, "grad_norm": 2.328670024871826, "learning_rate": 4.995e-05, "loss": 4.9326, "step": 2 }, { "epoch": 0.004481165102926761, "grad_norm": 2.459200859069824, "learning_rate": 4.992500000000001e-05, "loss": 4.8339, "step": 3 }, { "epoch": 0.005974886803902348, "grad_norm": 2.4593193531036377, "learning_rate": 4.99e-05, "loss": 4.4933, "step": 4 }, { "epoch": 0.007468608504877935, "grad_norm": 2.6492974758148193, "learning_rate": 4.9875000000000006e-05, "loss": 4.1761, "step": 5 }, { "epoch": 0.008962330205853523, "grad_norm": 3.116227865219116, "learning_rate": 4.9850000000000006e-05, "loss": 4.3921, "step": 6 }, { "epoch": 0.01045605190682911, "grad_norm": 3.4623594284057617, "learning_rate": 4.9825000000000005e-05, "loss": 4.3244, "step": 7 }, { "epoch": 0.011949773607804696, "grad_norm": 3.3238677978515625, "learning_rate": 4.9800000000000004e-05, "loss": 3.8163, "step": 8 }, { "epoch": 0.013443495308780282, "grad_norm": 3.4421119689941406, "learning_rate": 4.9775000000000004e-05, "loss": 3.887, "step": 9 }, { "epoch": 0.01493721700975587, "grad_norm": 3.9978528022766113, "learning_rate": 4.975e-05, "loss": 3.7532, "step": 10 }, { "epoch": 0.016430938710731455, "grad_norm": 4.331995010375977, "learning_rate": 4.9725e-05, "loss": 3.6696, "step": 11 }, { "epoch": 0.017924660411707045, "grad_norm": 4.154501438140869, "learning_rate": 4.97e-05, "loss": 3.3019, "step": 12 }, { "epoch": 0.019418382112682632, "grad_norm": 4.620331764221191, "learning_rate": 4.967500000000001e-05, "loss": 3.2128, "step": 13 }, { "epoch": 0.02091210381365822, "grad_norm": 5.804087162017822, "learning_rate": 4.965e-05, "loss": 3.0165, "step": 14 }, { "epoch": 0.022405825514633805, "grad_norm": 5.3895416259765625, "learning_rate": 4.962500000000001e-05, "loss": 2.7503, "step": 15 }, { "epoch": 0.02389954721560939, "grad_norm": 5.3698225021362305, "learning_rate": 4.96e-05, "loss": 2.4331, "step": 16 }, { "epoch": 0.025393268916584978, "grad_norm": 6.4892401695251465, "learning_rate": 4.9575000000000006e-05, "loss": 2.1955, "step": 17 }, { "epoch": 0.026886990617560565, "grad_norm": 5.283592700958252, "learning_rate": 4.9550000000000005e-05, "loss": 1.8662, "step": 18 }, { "epoch": 0.02838071231853615, "grad_norm": 5.430540561676025, "learning_rate": 4.9525000000000004e-05, "loss": 1.7324, "step": 19 }, { "epoch": 0.02987443401951174, "grad_norm": 3.8396499156951904, "learning_rate": 4.9500000000000004e-05, "loss": 1.5458, "step": 20 }, { "epoch": 0.031368155720487324, "grad_norm": 4.121278762817383, "learning_rate": 4.9475e-05, "loss": 1.2528, "step": 21 }, { "epoch": 0.03286187742146291, "grad_norm": 2.8719050884246826, "learning_rate": 4.945e-05, "loss": 1.1511, "step": 22 }, { "epoch": 0.0343555991224385, "grad_norm": 1.7733224630355835, "learning_rate": 4.9425e-05, "loss": 1.0996, "step": 23 }, { "epoch": 0.03584932082341409, "grad_norm": 1.1547821760177612, "learning_rate": 4.94e-05, "loss": 1.0233, "step": 24 }, { "epoch": 0.03734304252438968, "grad_norm": 0.5447912812232971, "learning_rate": 4.937500000000001e-05, "loss": 1.0291, "step": 25 }, { "epoch": 0.038836764225365264, "grad_norm": 0.38917800784111023, "learning_rate": 4.935e-05, "loss": 1.0335, "step": 26 }, { "epoch": 0.04033048592634085, "grad_norm": 0.33916524052619934, "learning_rate": 4.9325000000000006e-05, "loss": 0.9689, "step": 27 }, { "epoch": 0.04182420762731644, "grad_norm": 0.3054722249507904, "learning_rate": 4.93e-05, "loss": 0.9479, "step": 28 }, { "epoch": 0.043317929328292024, "grad_norm": 0.3788245618343353, "learning_rate": 4.9275000000000005e-05, "loss": 0.992, "step": 29 }, { "epoch": 0.04481165102926761, "grad_norm": 0.31767305731773376, "learning_rate": 4.9250000000000004e-05, "loss": 0.8584, "step": 30 }, { "epoch": 0.0463053727302432, "grad_norm": 0.3323226869106293, "learning_rate": 4.9225000000000004e-05, "loss": 0.9977, "step": 31 }, { "epoch": 0.04779909443121878, "grad_norm": 0.3760989308357239, "learning_rate": 4.92e-05, "loss": 1.0603, "step": 32 }, { "epoch": 0.04929281613219437, "grad_norm": 0.35704895853996277, "learning_rate": 4.9175e-05, "loss": 1.0202, "step": 33 }, { "epoch": 0.050786537833169956, "grad_norm": 0.2768382728099823, "learning_rate": 4.915e-05, "loss": 0.9831, "step": 34 }, { "epoch": 0.05228025953414554, "grad_norm": 0.2571275234222412, "learning_rate": 4.9125e-05, "loss": 0.8108, "step": 35 }, { "epoch": 0.05377398123512113, "grad_norm": 0.2469836175441742, "learning_rate": 4.91e-05, "loss": 0.9268, "step": 36 }, { "epoch": 0.055267702936096716, "grad_norm": 0.22755832970142365, "learning_rate": 4.907500000000001e-05, "loss": 0.9889, "step": 37 }, { "epoch": 0.0567614246370723, "grad_norm": 0.21316394209861755, "learning_rate": 4.905e-05, "loss": 0.8872, "step": 38 }, { "epoch": 0.05825514633804789, "grad_norm": 0.19494163990020752, "learning_rate": 4.9025000000000006e-05, "loss": 0.897, "step": 39 }, { "epoch": 0.05974886803902348, "grad_norm": 0.21571218967437744, "learning_rate": 4.9e-05, "loss": 1.0476, "step": 40 }, { "epoch": 0.06124258973999907, "grad_norm": 0.17231720685958862, "learning_rate": 4.8975000000000005e-05, "loss": 0.9006, "step": 41 }, { "epoch": 0.06273631144097465, "grad_norm": 0.18778854608535767, "learning_rate": 4.8950000000000004e-05, "loss": 1.0526, "step": 42 }, { "epoch": 0.06423003314195024, "grad_norm": 0.15987561643123627, "learning_rate": 4.8925e-05, "loss": 0.951, "step": 43 }, { "epoch": 0.06572375484292582, "grad_norm": 0.15474233031272888, "learning_rate": 4.89e-05, "loss": 1.0503, "step": 44 }, { "epoch": 0.06721747654390141, "grad_norm": 0.1613498330116272, "learning_rate": 4.8875e-05, "loss": 0.9509, "step": 45 }, { "epoch": 0.068711198244877, "grad_norm": 0.15148626267910004, "learning_rate": 4.885e-05, "loss": 0.9517, "step": 46 }, { "epoch": 0.0702049199458526, "grad_norm": 0.130628764629364, "learning_rate": 4.8825e-05, "loss": 0.8603, "step": 47 }, { "epoch": 0.07169864164682818, "grad_norm": 0.13919195532798767, "learning_rate": 4.88e-05, "loss": 0.811, "step": 48 }, { "epoch": 0.07319236334780377, "grad_norm": 0.14678949117660522, "learning_rate": 4.8775000000000007e-05, "loss": 0.9475, "step": 49 }, { "epoch": 0.07468608504877935, "grad_norm": 0.14091861248016357, "learning_rate": 4.875e-05, "loss": 0.9563, "step": 50 }, { "epoch": 0.07617980674975494, "grad_norm": 0.13221223652362823, "learning_rate": 4.8725000000000005e-05, "loss": 0.9796, "step": 51 }, { "epoch": 0.07767352845073053, "grad_norm": 0.14871284365653992, "learning_rate": 4.87e-05, "loss": 0.9789, "step": 52 }, { "epoch": 0.07916725015170611, "grad_norm": 0.1283031851053238, "learning_rate": 4.8675000000000004e-05, "loss": 0.9167, "step": 53 }, { "epoch": 0.0806609718526817, "grad_norm": 0.1367301195859909, "learning_rate": 4.8650000000000003e-05, "loss": 0.994, "step": 54 }, { "epoch": 0.08215469355365729, "grad_norm": 0.1318984478712082, "learning_rate": 4.8625e-05, "loss": 0.993, "step": 55 }, { "epoch": 0.08364841525463287, "grad_norm": 0.13208089768886566, "learning_rate": 4.86e-05, "loss": 0.9828, "step": 56 }, { "epoch": 0.08514213695560846, "grad_norm": 0.11922671645879745, "learning_rate": 4.8575e-05, "loss": 0.8764, "step": 57 }, { "epoch": 0.08663585865658405, "grad_norm": 0.12606212496757507, "learning_rate": 4.855e-05, "loss": 1.0353, "step": 58 }, { "epoch": 0.08812958035755963, "grad_norm": 0.1159941777586937, "learning_rate": 4.8525e-05, "loss": 0.9036, "step": 59 }, { "epoch": 0.08962330205853522, "grad_norm": 0.1234147921204567, "learning_rate": 4.85e-05, "loss": 0.9378, "step": 60 }, { "epoch": 0.0911170237595108, "grad_norm": 0.11809991300106049, "learning_rate": 4.8475000000000006e-05, "loss": 0.9775, "step": 61 }, { "epoch": 0.0926107454604864, "grad_norm": 0.12164122611284256, "learning_rate": 4.845e-05, "loss": 1.1027, "step": 62 }, { "epoch": 0.09410446716146198, "grad_norm": 0.10909315198659897, "learning_rate": 4.8425000000000005e-05, "loss": 0.9166, "step": 63 }, { "epoch": 0.09559818886243757, "grad_norm": 0.10812599211931229, "learning_rate": 4.8400000000000004e-05, "loss": 0.8632, "step": 64 }, { "epoch": 0.09709191056341315, "grad_norm": 0.12244202196598053, "learning_rate": 4.8375000000000004e-05, "loss": 1.0082, "step": 65 }, { "epoch": 0.09858563226438874, "grad_norm": 0.10574106127023697, "learning_rate": 4.835e-05, "loss": 0.7978, "step": 66 }, { "epoch": 0.10007935396536433, "grad_norm": 0.10976710170507431, "learning_rate": 4.8325e-05, "loss": 0.8964, "step": 67 }, { "epoch": 0.10157307566633991, "grad_norm": 0.10399210453033447, "learning_rate": 4.83e-05, "loss": 0.8365, "step": 68 }, { "epoch": 0.1030667973673155, "grad_norm": 0.11445824056863785, "learning_rate": 4.8275e-05, "loss": 0.9975, "step": 69 }, { "epoch": 0.10456051906829109, "grad_norm": 0.11264859884977341, "learning_rate": 4.825e-05, "loss": 0.9251, "step": 70 }, { "epoch": 0.10605424076926667, "grad_norm": 0.10865999013185501, "learning_rate": 4.822500000000001e-05, "loss": 0.9228, "step": 71 }, { "epoch": 0.10754796247024226, "grad_norm": 0.10771652311086655, "learning_rate": 4.82e-05, "loss": 1.0065, "step": 72 }, { "epoch": 0.10904168417121785, "grad_norm": 0.11236055195331573, "learning_rate": 4.8175000000000005e-05, "loss": 0.9589, "step": 73 }, { "epoch": 0.11053540587219343, "grad_norm": 0.10882117599248886, "learning_rate": 4.815e-05, "loss": 0.9339, "step": 74 }, { "epoch": 0.11202912757316902, "grad_norm": 0.10776744037866592, "learning_rate": 4.8125000000000004e-05, "loss": 0.8473, "step": 75 }, { "epoch": 0.1135228492741446, "grad_norm": 0.1170664131641388, "learning_rate": 4.8100000000000004e-05, "loss": 0.8097, "step": 76 }, { "epoch": 0.11501657097512019, "grad_norm": 0.11874318867921829, "learning_rate": 4.8075e-05, "loss": 0.9786, "step": 77 }, { "epoch": 0.11651029267609578, "grad_norm": 0.10675764083862305, "learning_rate": 4.805e-05, "loss": 0.917, "step": 78 }, { "epoch": 0.11800401437707138, "grad_norm": 0.10391847789287567, "learning_rate": 4.8025e-05, "loss": 0.924, "step": 79 }, { "epoch": 0.11949773607804696, "grad_norm": 0.10289454460144043, "learning_rate": 4.8e-05, "loss": 0.8643, "step": 80 }, { "epoch": 0.12099145777902255, "grad_norm": 0.1011955738067627, "learning_rate": 4.7975e-05, "loss": 0.8585, "step": 81 }, { "epoch": 0.12248517947999814, "grad_norm": 0.11620628088712692, "learning_rate": 4.795e-05, "loss": 1.0519, "step": 82 }, { "epoch": 0.12397890118097372, "grad_norm": 0.09800004214048386, "learning_rate": 4.7925000000000006e-05, "loss": 0.9244, "step": 83 }, { "epoch": 0.1254726228819493, "grad_norm": 0.10274948179721832, "learning_rate": 4.79e-05, "loss": 0.957, "step": 84 }, { "epoch": 0.12696634458292488, "grad_norm": 0.10691899061203003, "learning_rate": 4.7875000000000005e-05, "loss": 0.9163, "step": 85 }, { "epoch": 0.12846006628390047, "grad_norm": 0.10854547470808029, "learning_rate": 4.785e-05, "loss": 1.0005, "step": 86 }, { "epoch": 0.12995378798487606, "grad_norm": 0.10459049791097641, "learning_rate": 4.7825000000000004e-05, "loss": 0.9621, "step": 87 }, { "epoch": 0.13144750968585164, "grad_norm": 0.1125258207321167, "learning_rate": 4.78e-05, "loss": 0.9976, "step": 88 }, { "epoch": 0.13294123138682723, "grad_norm": 0.09712523967027664, "learning_rate": 4.7775e-05, "loss": 0.7269, "step": 89 }, { "epoch": 0.13443495308780282, "grad_norm": 0.10515626519918442, "learning_rate": 4.775e-05, "loss": 0.904, "step": 90 }, { "epoch": 0.1359286747887784, "grad_norm": 0.1067182794213295, "learning_rate": 4.7725e-05, "loss": 0.9089, "step": 91 }, { "epoch": 0.137422396489754, "grad_norm": 0.1133221834897995, "learning_rate": 4.77e-05, "loss": 0.9656, "step": 92 }, { "epoch": 0.13891611819072958, "grad_norm": 0.09998854994773865, "learning_rate": 4.7675e-05, "loss": 0.8708, "step": 93 }, { "epoch": 0.1404098398917052, "grad_norm": 0.09762108325958252, "learning_rate": 4.765e-05, "loss": 0.8857, "step": 94 }, { "epoch": 0.14190356159268078, "grad_norm": 0.11542832851409912, "learning_rate": 4.7625000000000006e-05, "loss": 0.8996, "step": 95 }, { "epoch": 0.14339728329365636, "grad_norm": 0.10830055177211761, "learning_rate": 4.76e-05, "loss": 0.8938, "step": 96 }, { "epoch": 0.14489100499463195, "grad_norm": 0.11351759731769562, "learning_rate": 4.7575000000000004e-05, "loss": 0.9222, "step": 97 }, { "epoch": 0.14638472669560754, "grad_norm": 0.1044660434126854, "learning_rate": 4.755e-05, "loss": 0.8635, "step": 98 }, { "epoch": 0.14787844839658312, "grad_norm": 0.10970848053693771, "learning_rate": 4.7525e-05, "loss": 0.9714, "step": 99 }, { "epoch": 0.1493721700975587, "grad_norm": 0.10212516784667969, "learning_rate": 4.75e-05, "loss": 0.8692, "step": 100 }, { "epoch": 0.1508658917985343, "grad_norm": 0.10050351172685623, "learning_rate": 4.7475e-05, "loss": 0.8647, "step": 101 }, { "epoch": 0.15235961349950988, "grad_norm": 0.1008468046784401, "learning_rate": 4.745e-05, "loss": 0.9392, "step": 102 }, { "epoch": 0.15385333520048547, "grad_norm": 0.10778848826885223, "learning_rate": 4.7425e-05, "loss": 0.9258, "step": 103 }, { "epoch": 0.15534705690146106, "grad_norm": 0.09798528254032135, "learning_rate": 4.74e-05, "loss": 0.7945, "step": 104 }, { "epoch": 0.15684077860243664, "grad_norm": 0.11388903111219406, "learning_rate": 4.7375e-05, "loss": 0.9239, "step": 105 }, { "epoch": 0.15833450030341223, "grad_norm": 0.1399800032377243, "learning_rate": 4.735e-05, "loss": 0.8475, "step": 106 }, { "epoch": 0.15982822200438782, "grad_norm": 0.0931653156876564, "learning_rate": 4.7325000000000005e-05, "loss": 0.6971, "step": 107 }, { "epoch": 0.1613219437053634, "grad_norm": 0.10741148889064789, "learning_rate": 4.73e-05, "loss": 1.0422, "step": 108 }, { "epoch": 0.162815665406339, "grad_norm": 0.11818183958530426, "learning_rate": 4.7275000000000004e-05, "loss": 1.0061, "step": 109 }, { "epoch": 0.16430938710731458, "grad_norm": 0.11258810013532639, "learning_rate": 4.7249999999999997e-05, "loss": 0.9357, "step": 110 }, { "epoch": 0.16580310880829016, "grad_norm": 0.10160631686449051, "learning_rate": 4.7225e-05, "loss": 0.907, "step": 111 }, { "epoch": 0.16729683050926575, "grad_norm": 0.12003883719444275, "learning_rate": 4.72e-05, "loss": 0.9181, "step": 112 }, { "epoch": 0.16879055221024133, "grad_norm": 0.10592487454414368, "learning_rate": 4.7175e-05, "loss": 0.9789, "step": 113 }, { "epoch": 0.17028427391121692, "grad_norm": 0.11402179300785065, "learning_rate": 4.715e-05, "loss": 1.0105, "step": 114 }, { "epoch": 0.1717779956121925, "grad_norm": 0.10185225307941437, "learning_rate": 4.7125e-05, "loss": 0.8031, "step": 115 }, { "epoch": 0.1732717173131681, "grad_norm": 0.1073046550154686, "learning_rate": 4.71e-05, "loss": 0.9133, "step": 116 }, { "epoch": 0.17476543901414368, "grad_norm": 0.1191958636045456, "learning_rate": 4.7075e-05, "loss": 0.9415, "step": 117 }, { "epoch": 0.17625916071511927, "grad_norm": 0.10835791379213333, "learning_rate": 4.705e-05, "loss": 0.9283, "step": 118 }, { "epoch": 0.17775288241609485, "grad_norm": 0.10264922678470612, "learning_rate": 4.7025000000000005e-05, "loss": 0.8221, "step": 119 }, { "epoch": 0.17924660411707044, "grad_norm": 0.10879279673099518, "learning_rate": 4.7e-05, "loss": 0.8532, "step": 120 }, { "epoch": 0.18074032581804603, "grad_norm": 0.11400317400693893, "learning_rate": 4.6975000000000003e-05, "loss": 0.9331, "step": 121 }, { "epoch": 0.1822340475190216, "grad_norm": 0.10696861147880554, "learning_rate": 4.695e-05, "loss": 0.8527, "step": 122 }, { "epoch": 0.1837277692199972, "grad_norm": 0.11114850640296936, "learning_rate": 4.6925e-05, "loss": 0.954, "step": 123 }, { "epoch": 0.1852214909209728, "grad_norm": 0.11028968542814255, "learning_rate": 4.69e-05, "loss": 0.9625, "step": 124 }, { "epoch": 0.18671521262194837, "grad_norm": 0.1157396137714386, "learning_rate": 4.6875e-05, "loss": 0.9468, "step": 125 }, { "epoch": 0.18820893432292396, "grad_norm": 0.10750457644462585, "learning_rate": 4.685000000000001e-05, "loss": 0.8925, "step": 126 }, { "epoch": 0.18970265602389955, "grad_norm": 0.10025347024202347, "learning_rate": 4.6825e-05, "loss": 0.786, "step": 127 }, { "epoch": 0.19119637772487513, "grad_norm": 0.104389488697052, "learning_rate": 4.6800000000000006e-05, "loss": 0.8929, "step": 128 }, { "epoch": 0.19269009942585072, "grad_norm": 0.11046409606933594, "learning_rate": 4.6775000000000005e-05, "loss": 0.9153, "step": 129 }, { "epoch": 0.1941838211268263, "grad_norm": 0.10921323299407959, "learning_rate": 4.6750000000000005e-05, "loss": 0.9171, "step": 130 }, { "epoch": 0.1956775428278019, "grad_norm": 0.11169510334730148, "learning_rate": 4.6725000000000004e-05, "loss": 1.0669, "step": 131 }, { "epoch": 0.19717126452877748, "grad_norm": 0.11414733529090881, "learning_rate": 4.6700000000000003e-05, "loss": 0.886, "step": 132 }, { "epoch": 0.19866498622975307, "grad_norm": 0.10438405722379684, "learning_rate": 4.6675e-05, "loss": 0.8437, "step": 133 }, { "epoch": 0.20015870793072865, "grad_norm": 0.12781815230846405, "learning_rate": 4.665e-05, "loss": 0.9431, "step": 134 }, { "epoch": 0.20165242963170424, "grad_norm": 0.11023996770381927, "learning_rate": 4.6625e-05, "loss": 0.9443, "step": 135 }, { "epoch": 0.20314615133267983, "grad_norm": 0.10717877000570297, "learning_rate": 4.660000000000001e-05, "loss": 0.9123, "step": 136 }, { "epoch": 0.2046398730336554, "grad_norm": 0.11436127871274948, "learning_rate": 4.6575e-05, "loss": 0.934, "step": 137 }, { "epoch": 0.206133594734631, "grad_norm": 0.10982613265514374, "learning_rate": 4.655000000000001e-05, "loss": 0.8414, "step": 138 }, { "epoch": 0.20762731643560658, "grad_norm": 0.10516718029975891, "learning_rate": 4.6525e-05, "loss": 0.8775, "step": 139 }, { "epoch": 0.20912103813658217, "grad_norm": 0.1115696057677269, "learning_rate": 4.6500000000000005e-05, "loss": 0.9033, "step": 140 }, { "epoch": 0.21061475983755776, "grad_norm": 0.10917678475379944, "learning_rate": 4.6475000000000005e-05, "loss": 1.0296, "step": 141 }, { "epoch": 0.21210848153853334, "grad_norm": 0.10398805886507034, "learning_rate": 4.6450000000000004e-05, "loss": 0.8614, "step": 142 }, { "epoch": 0.21360220323950893, "grad_norm": 0.1079210638999939, "learning_rate": 4.6425000000000004e-05, "loss": 0.9044, "step": 143 }, { "epoch": 0.21509592494048452, "grad_norm": 0.11536914855241776, "learning_rate": 4.64e-05, "loss": 1.0376, "step": 144 }, { "epoch": 0.2165896466414601, "grad_norm": 0.11732814460992813, "learning_rate": 4.6375e-05, "loss": 0.9523, "step": 145 }, { "epoch": 0.2180833683424357, "grad_norm": 0.10777144134044647, "learning_rate": 4.635e-05, "loss": 0.941, "step": 146 }, { "epoch": 0.21957709004341128, "grad_norm": 0.10128272324800491, "learning_rate": 4.6325e-05, "loss": 0.8771, "step": 147 }, { "epoch": 0.22107081174438686, "grad_norm": 0.1069737896323204, "learning_rate": 4.630000000000001e-05, "loss": 0.9069, "step": 148 }, { "epoch": 0.22256453344536245, "grad_norm": 0.1078038215637207, "learning_rate": 4.6275e-05, "loss": 0.886, "step": 149 }, { "epoch": 0.22405825514633804, "grad_norm": 0.11785303056240082, "learning_rate": 4.6250000000000006e-05, "loss": 0.8347, "step": 150 }, { "epoch": 0.22555197684731362, "grad_norm": 0.10890822112560272, "learning_rate": 4.6225e-05, "loss": 0.8305, "step": 151 }, { "epoch": 0.2270456985482892, "grad_norm": 0.1118755117058754, "learning_rate": 4.6200000000000005e-05, "loss": 0.8914, "step": 152 }, { "epoch": 0.2285394202492648, "grad_norm": 0.11616045236587524, "learning_rate": 4.6175000000000004e-05, "loss": 0.9325, "step": 153 }, { "epoch": 0.23003314195024038, "grad_norm": 0.1134185716509819, "learning_rate": 4.6150000000000004e-05, "loss": 0.8821, "step": 154 }, { "epoch": 0.23152686365121597, "grad_norm": 0.09868773072957993, "learning_rate": 4.6125e-05, "loss": 0.7917, "step": 155 }, { "epoch": 0.23302058535219156, "grad_norm": 0.10579439252614975, "learning_rate": 4.61e-05, "loss": 0.902, "step": 156 }, { "epoch": 0.23451430705316717, "grad_norm": 0.11106543987989426, "learning_rate": 4.6075e-05, "loss": 0.9253, "step": 157 }, { "epoch": 0.23600802875414276, "grad_norm": 0.11669851839542389, "learning_rate": 4.605e-05, "loss": 0.975, "step": 158 }, { "epoch": 0.23750175045511834, "grad_norm": 0.11690078675746918, "learning_rate": 4.6025e-05, "loss": 0.9083, "step": 159 }, { "epoch": 0.23899547215609393, "grad_norm": 0.10988520830869675, "learning_rate": 4.600000000000001e-05, "loss": 0.8583, "step": 160 }, { "epoch": 0.24048919385706952, "grad_norm": 0.10101921111345291, "learning_rate": 4.5975e-05, "loss": 0.8172, "step": 161 }, { "epoch": 0.2419829155580451, "grad_norm": 0.09915332496166229, "learning_rate": 4.5950000000000006e-05, "loss": 0.7665, "step": 162 }, { "epoch": 0.2434766372590207, "grad_norm": 0.10048062354326248, "learning_rate": 4.5925e-05, "loss": 0.8362, "step": 163 }, { "epoch": 0.24497035895999628, "grad_norm": 0.10701285302639008, "learning_rate": 4.5900000000000004e-05, "loss": 0.7849, "step": 164 }, { "epoch": 0.24646408066097186, "grad_norm": 0.10674900561571121, "learning_rate": 4.5875000000000004e-05, "loss": 0.8692, "step": 165 }, { "epoch": 0.24795780236194745, "grad_norm": 0.12537144124507904, "learning_rate": 4.585e-05, "loss": 1.0081, "step": 166 }, { "epoch": 0.24945152406292304, "grad_norm": 0.10654605180025101, "learning_rate": 4.5825e-05, "loss": 0.9109, "step": 167 }, { "epoch": 0.2509452457638986, "grad_norm": 0.09973759949207306, "learning_rate": 4.58e-05, "loss": 0.7605, "step": 168 }, { "epoch": 0.2524389674648742, "grad_norm": 0.11967126280069351, "learning_rate": 4.5775e-05, "loss": 0.9229, "step": 169 }, { "epoch": 0.25393268916584977, "grad_norm": 0.12726350128650665, "learning_rate": 4.575e-05, "loss": 0.9705, "step": 170 }, { "epoch": 0.2554264108668254, "grad_norm": 0.11374322324991226, "learning_rate": 4.5725e-05, "loss": 0.8227, "step": 171 }, { "epoch": 0.25692013256780094, "grad_norm": 0.1116652712225914, "learning_rate": 4.5700000000000006e-05, "loss": 0.9811, "step": 172 }, { "epoch": 0.25841385426877655, "grad_norm": 0.11246800422668457, "learning_rate": 4.5675e-05, "loss": 0.9001, "step": 173 }, { "epoch": 0.2599075759697521, "grad_norm": 0.10981420427560806, "learning_rate": 4.5650000000000005e-05, "loss": 0.8803, "step": 174 }, { "epoch": 0.26140129767072773, "grad_norm": 0.11196296662092209, "learning_rate": 4.5625e-05, "loss": 0.8303, "step": 175 }, { "epoch": 0.2628950193717033, "grad_norm": 0.10695374011993408, "learning_rate": 4.5600000000000004e-05, "loss": 0.8036, "step": 176 }, { "epoch": 0.2643887410726789, "grad_norm": 0.10652667284011841, "learning_rate": 4.5575e-05, "loss": 0.84, "step": 177 }, { "epoch": 0.26588246277365446, "grad_norm": 0.12381096929311752, "learning_rate": 4.555e-05, "loss": 0.8633, "step": 178 }, { "epoch": 0.2673761844746301, "grad_norm": 0.12000639736652374, "learning_rate": 4.5525e-05, "loss": 0.8905, "step": 179 }, { "epoch": 0.26886990617560563, "grad_norm": 0.10701263695955276, "learning_rate": 4.55e-05, "loss": 0.8506, "step": 180 }, { "epoch": 0.27036362787658125, "grad_norm": 0.11427432298660278, "learning_rate": 4.5475e-05, "loss": 0.8793, "step": 181 }, { "epoch": 0.2718573495775568, "grad_norm": 0.1173078790307045, "learning_rate": 4.545000000000001e-05, "loss": 0.7852, "step": 182 }, { "epoch": 0.2733510712785324, "grad_norm": 0.10567096620798111, "learning_rate": 4.5425e-05, "loss": 0.7985, "step": 183 }, { "epoch": 0.274844792979508, "grad_norm": 0.11402223259210587, "learning_rate": 4.5400000000000006e-05, "loss": 0.8679, "step": 184 }, { "epoch": 0.2763385146804836, "grad_norm": 0.11106009781360626, "learning_rate": 4.5375e-05, "loss": 0.77, "step": 185 }, { "epoch": 0.27783223638145915, "grad_norm": 0.10463316738605499, "learning_rate": 4.5350000000000005e-05, "loss": 0.7956, "step": 186 }, { "epoch": 0.27932595808243477, "grad_norm": 0.11327042430639267, "learning_rate": 4.5325000000000004e-05, "loss": 0.9362, "step": 187 }, { "epoch": 0.2808196797834104, "grad_norm": 0.1250493824481964, "learning_rate": 4.53e-05, "loss": 0.9698, "step": 188 }, { "epoch": 0.28231340148438594, "grad_norm": 0.11732323467731476, "learning_rate": 4.5275e-05, "loss": 0.8804, "step": 189 }, { "epoch": 0.28380712318536155, "grad_norm": 0.11572263389825821, "learning_rate": 4.525e-05, "loss": 0.9142, "step": 190 }, { "epoch": 0.2853008448863371, "grad_norm": 0.13714352250099182, "learning_rate": 4.5225e-05, "loss": 0.8059, "step": 191 }, { "epoch": 0.2867945665873127, "grad_norm": 0.10406294465065002, "learning_rate": 4.52e-05, "loss": 0.8634, "step": 192 }, { "epoch": 0.2882882882882883, "grad_norm": 0.10896674543619156, "learning_rate": 4.5175e-05, "loss": 0.8315, "step": 193 }, { "epoch": 0.2897820099892639, "grad_norm": 0.10760944336652756, "learning_rate": 4.5150000000000006e-05, "loss": 0.7441, "step": 194 }, { "epoch": 0.29127573169023946, "grad_norm": 0.11878028512001038, "learning_rate": 4.5125e-05, "loss": 0.9309, "step": 195 }, { "epoch": 0.2927694533912151, "grad_norm": 0.111594058573246, "learning_rate": 4.5100000000000005e-05, "loss": 0.8531, "step": 196 }, { "epoch": 0.29426317509219063, "grad_norm": 0.11237120628356934, "learning_rate": 4.5075e-05, "loss": 0.9093, "step": 197 }, { "epoch": 0.29575689679316625, "grad_norm": 0.11011101305484772, "learning_rate": 4.5050000000000004e-05, "loss": 0.8081, "step": 198 }, { "epoch": 0.2972506184941418, "grad_norm": 0.11488550156354904, "learning_rate": 4.5025000000000003e-05, "loss": 0.88, "step": 199 }, { "epoch": 0.2987443401951174, "grad_norm": 0.10571623593568802, "learning_rate": 4.5e-05, "loss": 0.8372, "step": 200 }, { "epoch": 0.300238061896093, "grad_norm": 0.11315234750509262, "learning_rate": 4.4975e-05, "loss": 0.9472, "step": 201 }, { "epoch": 0.3017317835970686, "grad_norm": 0.11714547127485275, "learning_rate": 4.495e-05, "loss": 0.8431, "step": 202 }, { "epoch": 0.30322550529804415, "grad_norm": 0.1091802567243576, "learning_rate": 4.4925e-05, "loss": 0.8523, "step": 203 }, { "epoch": 0.30471922699901977, "grad_norm": 0.10574609786272049, "learning_rate": 4.49e-05, "loss": 0.8482, "step": 204 }, { "epoch": 0.3062129486999953, "grad_norm": 0.12275499105453491, "learning_rate": 4.4875e-05, "loss": 0.8098, "step": 205 }, { "epoch": 0.30770667040097094, "grad_norm": 0.10528004169464111, "learning_rate": 4.4850000000000006e-05, "loss": 0.8165, "step": 206 }, { "epoch": 0.3092003921019465, "grad_norm": 0.10836642980575562, "learning_rate": 4.4825e-05, "loss": 0.8495, "step": 207 }, { "epoch": 0.3106941138029221, "grad_norm": 0.12764981389045715, "learning_rate": 4.4800000000000005e-05, "loss": 0.8733, "step": 208 }, { "epoch": 0.31218783550389767, "grad_norm": 0.11036060750484467, "learning_rate": 4.4775e-05, "loss": 0.7968, "step": 209 }, { "epoch": 0.3136815572048733, "grad_norm": 0.11040372401475906, "learning_rate": 4.4750000000000004e-05, "loss": 0.8437, "step": 210 }, { "epoch": 0.31517527890584884, "grad_norm": 0.10680075734853745, "learning_rate": 4.4725e-05, "loss": 0.7951, "step": 211 }, { "epoch": 0.31666900060682446, "grad_norm": 0.12390894442796707, "learning_rate": 4.47e-05, "loss": 0.8538, "step": 212 }, { "epoch": 0.3181627223078, "grad_norm": 0.10876529663801193, "learning_rate": 4.4675e-05, "loss": 0.8034, "step": 213 }, { "epoch": 0.31965644400877563, "grad_norm": 0.11314105242490768, "learning_rate": 4.465e-05, "loss": 0.8257, "step": 214 }, { "epoch": 0.3211501657097512, "grad_norm": 0.10273638367652893, "learning_rate": 4.4625e-05, "loss": 0.8153, "step": 215 }, { "epoch": 0.3226438874107268, "grad_norm": 0.11245308071374893, "learning_rate": 4.46e-05, "loss": 0.9359, "step": 216 }, { "epoch": 0.32413760911170236, "grad_norm": 0.10547469556331635, "learning_rate": 4.4575e-05, "loss": 0.7864, "step": 217 }, { "epoch": 0.325631330812678, "grad_norm": 0.11543171852827072, "learning_rate": 4.4550000000000005e-05, "loss": 0.9156, "step": 218 }, { "epoch": 0.32712505251365354, "grad_norm": 0.11760671436786652, "learning_rate": 4.4525e-05, "loss": 0.8921, "step": 219 }, { "epoch": 0.32861877421462915, "grad_norm": 0.11226698011159897, "learning_rate": 4.4500000000000004e-05, "loss": 0.8803, "step": 220 }, { "epoch": 0.3301124959156047, "grad_norm": 0.09755794703960419, "learning_rate": 4.4475e-05, "loss": 0.7245, "step": 221 }, { "epoch": 0.3316062176165803, "grad_norm": 0.11775672435760498, "learning_rate": 4.445e-05, "loss": 0.8504, "step": 222 }, { "epoch": 0.3330999393175559, "grad_norm": 0.1138814389705658, "learning_rate": 4.4425e-05, "loss": 0.8781, "step": 223 }, { "epoch": 0.3345936610185315, "grad_norm": 0.12460143864154816, "learning_rate": 4.44e-05, "loss": 0.9356, "step": 224 }, { "epoch": 0.33608738271950706, "grad_norm": 0.1040855422616005, "learning_rate": 4.4375e-05, "loss": 0.7766, "step": 225 }, { "epoch": 0.33758110442048267, "grad_norm": 0.11422121524810791, "learning_rate": 4.435e-05, "loss": 0.9442, "step": 226 }, { "epoch": 0.33907482612145823, "grad_norm": 0.11283203214406967, "learning_rate": 4.4325e-05, "loss": 0.9004, "step": 227 }, { "epoch": 0.34056854782243384, "grad_norm": 0.13057462871074677, "learning_rate": 4.43e-05, "loss": 0.7747, "step": 228 }, { "epoch": 0.3420622695234094, "grad_norm": 0.11758129298686981, "learning_rate": 4.4275e-05, "loss": 0.8414, "step": 229 }, { "epoch": 0.343555991224385, "grad_norm": 0.10636349022388458, "learning_rate": 4.4250000000000005e-05, "loss": 0.7771, "step": 230 }, { "epoch": 0.3450497129253606, "grad_norm": 0.11190015077590942, "learning_rate": 4.4225e-05, "loss": 0.8398, "step": 231 }, { "epoch": 0.3465434346263362, "grad_norm": 0.13087163865566254, "learning_rate": 4.4200000000000004e-05, "loss": 0.8869, "step": 232 }, { "epoch": 0.34803715632731175, "grad_norm": 0.12220033258199692, "learning_rate": 4.4174999999999996e-05, "loss": 0.9523, "step": 233 }, { "epoch": 0.34953087802828736, "grad_norm": 0.12126144766807556, "learning_rate": 4.415e-05, "loss": 0.8274, "step": 234 }, { "epoch": 0.3510245997292629, "grad_norm": 0.1194160133600235, "learning_rate": 4.4125e-05, "loss": 0.8134, "step": 235 }, { "epoch": 0.35251832143023853, "grad_norm": 0.12468134611845016, "learning_rate": 4.41e-05, "loss": 0.917, "step": 236 }, { "epoch": 0.3540120431312141, "grad_norm": 0.11102744191884995, "learning_rate": 4.4075e-05, "loss": 0.728, "step": 237 }, { "epoch": 0.3555057648321897, "grad_norm": 0.10800369083881378, "learning_rate": 4.405e-05, "loss": 0.8309, "step": 238 }, { "epoch": 0.35699948653316527, "grad_norm": 0.10854049026966095, "learning_rate": 4.4025e-05, "loss": 0.7817, "step": 239 }, { "epoch": 0.3584932082341409, "grad_norm": 0.12646059691905975, "learning_rate": 4.4000000000000006e-05, "loss": 0.8945, "step": 240 }, { "epoch": 0.35998692993511644, "grad_norm": 0.11164715141057968, "learning_rate": 4.3975e-05, "loss": 0.9002, "step": 241 }, { "epoch": 0.36148065163609205, "grad_norm": 0.11623437702655792, "learning_rate": 4.3950000000000004e-05, "loss": 0.9211, "step": 242 }, { "epoch": 0.3629743733370676, "grad_norm": 0.10776624083518982, "learning_rate": 4.3925e-05, "loss": 0.7586, "step": 243 }, { "epoch": 0.3644680950380432, "grad_norm": 0.11604306101799011, "learning_rate": 4.39e-05, "loss": 0.9307, "step": 244 }, { "epoch": 0.3659618167390188, "grad_norm": 0.12260506302118301, "learning_rate": 4.3875e-05, "loss": 0.9839, "step": 245 }, { "epoch": 0.3674555384399944, "grad_norm": 0.10707111656665802, "learning_rate": 4.385e-05, "loss": 0.8564, "step": 246 }, { "epoch": 0.36894926014096996, "grad_norm": 0.1081305518746376, "learning_rate": 4.3825e-05, "loss": 0.7938, "step": 247 }, { "epoch": 0.3704429818419456, "grad_norm": 0.1222182884812355, "learning_rate": 4.38e-05, "loss": 0.8779, "step": 248 }, { "epoch": 0.37193670354292113, "grad_norm": 0.11179394274950027, "learning_rate": 4.3775e-05, "loss": 0.9, "step": 249 }, { "epoch": 0.37343042524389675, "grad_norm": 0.11558545380830765, "learning_rate": 4.375e-05, "loss": 0.9335, "step": 250 }, { "epoch": 0.37492414694487236, "grad_norm": 0.12621845304965973, "learning_rate": 4.3725000000000006e-05, "loss": 0.856, "step": 251 }, { "epoch": 0.3764178686458479, "grad_norm": 0.10921687632799149, "learning_rate": 4.3700000000000005e-05, "loss": 0.8086, "step": 252 }, { "epoch": 0.37791159034682353, "grad_norm": 0.11954590678215027, "learning_rate": 4.3675000000000005e-05, "loss": 0.8394, "step": 253 }, { "epoch": 0.3794053120477991, "grad_norm": 0.1265055388212204, "learning_rate": 4.3650000000000004e-05, "loss": 0.9417, "step": 254 }, { "epoch": 0.3808990337487747, "grad_norm": 0.11341366916894913, "learning_rate": 4.3625e-05, "loss": 0.8648, "step": 255 }, { "epoch": 0.38239275544975027, "grad_norm": 0.12063136696815491, "learning_rate": 4.36e-05, "loss": 0.8097, "step": 256 }, { "epoch": 0.3838864771507259, "grad_norm": 0.1287822425365448, "learning_rate": 4.3575e-05, "loss": 0.9054, "step": 257 }, { "epoch": 0.38538019885170144, "grad_norm": 0.1067572608590126, "learning_rate": 4.355e-05, "loss": 0.8304, "step": 258 }, { "epoch": 0.38687392055267705, "grad_norm": 0.11348453909158707, "learning_rate": 4.352500000000001e-05, "loss": 0.8382, "step": 259 }, { "epoch": 0.3883676422536526, "grad_norm": 0.11681089550256729, "learning_rate": 4.35e-05, "loss": 0.8529, "step": 260 }, { "epoch": 0.3898613639546282, "grad_norm": 0.12092391401529312, "learning_rate": 4.3475000000000006e-05, "loss": 0.9417, "step": 261 }, { "epoch": 0.3913550856556038, "grad_norm": 0.11473096162080765, "learning_rate": 4.345e-05, "loss": 0.9006, "step": 262 }, { "epoch": 0.3928488073565794, "grad_norm": 0.10663125663995743, "learning_rate": 4.3425000000000005e-05, "loss": 0.777, "step": 263 }, { "epoch": 0.39434252905755496, "grad_norm": 0.11494694650173187, "learning_rate": 4.3400000000000005e-05, "loss": 0.8492, "step": 264 }, { "epoch": 0.3958362507585306, "grad_norm": 0.12147645652294159, "learning_rate": 4.3375000000000004e-05, "loss": 0.971, "step": 265 }, { "epoch": 0.39732997245950613, "grad_norm": 0.12208539247512817, "learning_rate": 4.335e-05, "loss": 0.8908, "step": 266 }, { "epoch": 0.39882369416048175, "grad_norm": 0.12308327853679657, "learning_rate": 4.3325e-05, "loss": 0.9551, "step": 267 }, { "epoch": 0.4003174158614573, "grad_norm": 0.12025439739227295, "learning_rate": 4.33e-05, "loss": 0.8434, "step": 268 }, { "epoch": 0.4018111375624329, "grad_norm": 0.11372730135917664, "learning_rate": 4.3275e-05, "loss": 0.887, "step": 269 }, { "epoch": 0.4033048592634085, "grad_norm": 0.11306744068861008, "learning_rate": 4.325e-05, "loss": 0.8162, "step": 270 }, { "epoch": 0.4047985809643841, "grad_norm": 0.12204046547412872, "learning_rate": 4.322500000000001e-05, "loss": 0.8983, "step": 271 }, { "epoch": 0.40629230266535965, "grad_norm": 0.11165154725313187, "learning_rate": 4.32e-05, "loss": 0.8439, "step": 272 }, { "epoch": 0.40778602436633526, "grad_norm": 0.13003522157669067, "learning_rate": 4.3175000000000006e-05, "loss": 0.9496, "step": 273 }, { "epoch": 0.4092797460673108, "grad_norm": 0.12857121229171753, "learning_rate": 4.315e-05, "loss": 1.0183, "step": 274 }, { "epoch": 0.41077346776828644, "grad_norm": 0.11140503734350204, "learning_rate": 4.3125000000000005e-05, "loss": 0.7793, "step": 275 }, { "epoch": 0.412267189469262, "grad_norm": 0.11036542057991028, "learning_rate": 4.3100000000000004e-05, "loss": 0.8609, "step": 276 }, { "epoch": 0.4137609111702376, "grad_norm": 0.11418310552835464, "learning_rate": 4.3075000000000003e-05, "loss": 0.8999, "step": 277 }, { "epoch": 0.41525463287121317, "grad_norm": 0.11190186440944672, "learning_rate": 4.305e-05, "loss": 0.8269, "step": 278 }, { "epoch": 0.4167483545721888, "grad_norm": 0.11850057542324066, "learning_rate": 4.3025e-05, "loss": 0.9137, "step": 279 }, { "epoch": 0.41824207627316434, "grad_norm": 0.12087133526802063, "learning_rate": 4.3e-05, "loss": 0.9278, "step": 280 }, { "epoch": 0.41973579797413996, "grad_norm": 0.11531470715999603, "learning_rate": 4.2975e-05, "loss": 0.927, "step": 281 }, { "epoch": 0.4212295196751155, "grad_norm": 0.1118212342262268, "learning_rate": 4.295e-05, "loss": 0.8573, "step": 282 }, { "epoch": 0.42272324137609113, "grad_norm": 0.1070399060845375, "learning_rate": 4.2925000000000007e-05, "loss": 0.8476, "step": 283 }, { "epoch": 0.4242169630770667, "grad_norm": 0.11764728277921677, "learning_rate": 4.29e-05, "loss": 0.8476, "step": 284 }, { "epoch": 0.4257106847780423, "grad_norm": 0.1171267032623291, "learning_rate": 4.2875000000000005e-05, "loss": 0.9089, "step": 285 }, { "epoch": 0.42720440647901786, "grad_norm": 0.11109969764947891, "learning_rate": 4.285e-05, "loss": 0.8645, "step": 286 }, { "epoch": 0.4286981281799935, "grad_norm": 0.11542622745037079, "learning_rate": 4.2825000000000004e-05, "loss": 0.8355, "step": 287 }, { "epoch": 0.43019184988096903, "grad_norm": 0.11716468632221222, "learning_rate": 4.2800000000000004e-05, "loss": 0.7861, "step": 288 }, { "epoch": 0.43168557158194465, "grad_norm": 0.10457771271467209, "learning_rate": 4.2775e-05, "loss": 0.7788, "step": 289 }, { "epoch": 0.4331792932829202, "grad_norm": 0.13224861025810242, "learning_rate": 4.275e-05, "loss": 1.0643, "step": 290 }, { "epoch": 0.4346730149838958, "grad_norm": 0.11003108322620392, "learning_rate": 4.2725e-05, "loss": 0.7568, "step": 291 }, { "epoch": 0.4361667366848714, "grad_norm": 0.12000851333141327, "learning_rate": 4.27e-05, "loss": 0.9287, "step": 292 }, { "epoch": 0.437660458385847, "grad_norm": 0.1216108500957489, "learning_rate": 4.2675e-05, "loss": 0.9249, "step": 293 }, { "epoch": 0.43915418008682255, "grad_norm": 0.1211746335029602, "learning_rate": 4.265e-05, "loss": 0.7919, "step": 294 }, { "epoch": 0.44064790178779817, "grad_norm": 0.12024874240159988, "learning_rate": 4.2625000000000006e-05, "loss": 0.8795, "step": 295 }, { "epoch": 0.4421416234887737, "grad_norm": 0.1250150203704834, "learning_rate": 4.26e-05, "loss": 0.8786, "step": 296 }, { "epoch": 0.44363534518974934, "grad_norm": 0.1234525665640831, "learning_rate": 4.2575000000000005e-05, "loss": 0.8517, "step": 297 }, { "epoch": 0.4451290668907249, "grad_norm": 0.11786121129989624, "learning_rate": 4.2550000000000004e-05, "loss": 0.8676, "step": 298 }, { "epoch": 0.4466227885917005, "grad_norm": 0.1235240027308464, "learning_rate": 4.2525000000000004e-05, "loss": 0.8803, "step": 299 }, { "epoch": 0.4481165102926761, "grad_norm": 0.10511527955532074, "learning_rate": 4.25e-05, "loss": 0.7864, "step": 300 }, { "epoch": 0.4496102319936517, "grad_norm": 0.12800796329975128, "learning_rate": 4.2475e-05, "loss": 0.8883, "step": 301 }, { "epoch": 0.45110395369462725, "grad_norm": 0.10764234513044357, "learning_rate": 4.245e-05, "loss": 0.8636, "step": 302 }, { "epoch": 0.45259767539560286, "grad_norm": 0.11941605061292648, "learning_rate": 4.2425e-05, "loss": 0.8351, "step": 303 }, { "epoch": 0.4540913970965784, "grad_norm": 0.1105790063738823, "learning_rate": 4.24e-05, "loss": 0.9018, "step": 304 }, { "epoch": 0.45558511879755403, "grad_norm": 0.10682832449674606, "learning_rate": 4.237500000000001e-05, "loss": 0.797, "step": 305 }, { "epoch": 0.4570788404985296, "grad_norm": 0.11704114824533463, "learning_rate": 4.235e-05, "loss": 0.7985, "step": 306 }, { "epoch": 0.4585725621995052, "grad_norm": 0.10500568896532059, "learning_rate": 4.2325000000000006e-05, "loss": 0.7665, "step": 307 }, { "epoch": 0.46006628390048077, "grad_norm": 0.11335790902376175, "learning_rate": 4.23e-05, "loss": 0.94, "step": 308 }, { "epoch": 0.4615600056014564, "grad_norm": 0.11582304537296295, "learning_rate": 4.2275000000000004e-05, "loss": 0.9077, "step": 309 }, { "epoch": 0.46305372730243194, "grad_norm": 0.10909160226583481, "learning_rate": 4.2250000000000004e-05, "loss": 0.8209, "step": 310 }, { "epoch": 0.46454744900340755, "grad_norm": 0.11509093642234802, "learning_rate": 4.2225e-05, "loss": 0.7911, "step": 311 }, { "epoch": 0.4660411707043831, "grad_norm": 0.12319629639387131, "learning_rate": 4.22e-05, "loss": 0.8353, "step": 312 }, { "epoch": 0.4675348924053587, "grad_norm": 0.11463336646556854, "learning_rate": 4.2175e-05, "loss": 0.7597, "step": 313 }, { "epoch": 0.46902861410633434, "grad_norm": 0.12351766973733902, "learning_rate": 4.215e-05, "loss": 0.828, "step": 314 }, { "epoch": 0.4705223358073099, "grad_norm": 0.14272412657737732, "learning_rate": 4.2125e-05, "loss": 0.9552, "step": 315 }, { "epoch": 0.4720160575082855, "grad_norm": 0.1274389773607254, "learning_rate": 4.21e-05, "loss": 0.9272, "step": 316 }, { "epoch": 0.4735097792092611, "grad_norm": 0.11254029721021652, "learning_rate": 4.2075000000000006e-05, "loss": 0.7865, "step": 317 }, { "epoch": 0.4750035009102367, "grad_norm": 0.10742338001728058, "learning_rate": 4.205e-05, "loss": 0.8052, "step": 318 }, { "epoch": 0.47649722261121225, "grad_norm": 0.12696552276611328, "learning_rate": 4.2025000000000005e-05, "loss": 0.8486, "step": 319 }, { "epoch": 0.47799094431218786, "grad_norm": 0.12039877474308014, "learning_rate": 4.2e-05, "loss": 0.8238, "step": 320 }, { "epoch": 0.4794846660131634, "grad_norm": 0.1166485920548439, "learning_rate": 4.1975000000000004e-05, "loss": 0.824, "step": 321 }, { "epoch": 0.48097838771413903, "grad_norm": 0.11079639941453934, "learning_rate": 4.195e-05, "loss": 0.7797, "step": 322 }, { "epoch": 0.4824721094151146, "grad_norm": 0.11130478233098984, "learning_rate": 4.1925e-05, "loss": 0.8346, "step": 323 }, { "epoch": 0.4839658311160902, "grad_norm": 0.12550784647464752, "learning_rate": 4.19e-05, "loss": 0.8799, "step": 324 }, { "epoch": 0.48545955281706576, "grad_norm": 0.1134837418794632, "learning_rate": 4.1875e-05, "loss": 0.8237, "step": 325 }, { "epoch": 0.4869532745180414, "grad_norm": 0.16124998033046722, "learning_rate": 4.185e-05, "loss": 0.8629, "step": 326 }, { "epoch": 0.48844699621901694, "grad_norm": 0.10293842852115631, "learning_rate": 4.1825e-05, "loss": 0.7071, "step": 327 }, { "epoch": 0.48994071791999255, "grad_norm": 0.11426858603954315, "learning_rate": 4.18e-05, "loss": 0.8125, "step": 328 }, { "epoch": 0.4914344396209681, "grad_norm": 0.12414971739053726, "learning_rate": 4.1775000000000006e-05, "loss": 0.8976, "step": 329 }, { "epoch": 0.4929281613219437, "grad_norm": 0.12173853069543839, "learning_rate": 4.175e-05, "loss": 0.8994, "step": 330 }, { "epoch": 0.4944218830229193, "grad_norm": 0.12688115239143372, "learning_rate": 4.1725000000000005e-05, "loss": 0.782, "step": 331 }, { "epoch": 0.4959156047238949, "grad_norm": 0.11825509369373322, "learning_rate": 4.17e-05, "loss": 0.846, "step": 332 }, { "epoch": 0.49740932642487046, "grad_norm": 0.1330428421497345, "learning_rate": 4.1675e-05, "loss": 1.0506, "step": 333 }, { "epoch": 0.49890304812584607, "grad_norm": 0.11930572241544724, "learning_rate": 4.165e-05, "loss": 0.8988, "step": 334 }, { "epoch": 0.5003967698268217, "grad_norm": 0.1190861314535141, "learning_rate": 4.1625e-05, "loss": 0.896, "step": 335 }, { "epoch": 0.5018904915277972, "grad_norm": 0.12587320804595947, "learning_rate": 4.16e-05, "loss": 0.75, "step": 336 }, { "epoch": 0.5033842132287728, "grad_norm": 0.10933604836463928, "learning_rate": 4.1575e-05, "loss": 0.7152, "step": 337 }, { "epoch": 0.5048779349297484, "grad_norm": 0.11405114829540253, "learning_rate": 4.155e-05, "loss": 0.8104, "step": 338 }, { "epoch": 0.506371656630724, "grad_norm": 0.12306026369333267, "learning_rate": 4.1525e-05, "loss": 0.8156, "step": 339 }, { "epoch": 0.5078653783316995, "grad_norm": 0.11889629811048508, "learning_rate": 4.15e-05, "loss": 0.8642, "step": 340 }, { "epoch": 0.5093591000326751, "grad_norm": 0.1187397912144661, "learning_rate": 4.1475000000000005e-05, "loss": 0.82, "step": 341 }, { "epoch": 0.5108528217336508, "grad_norm": 0.10686516016721725, "learning_rate": 4.145e-05, "loss": 0.7757, "step": 342 }, { "epoch": 0.5123465434346264, "grad_norm": 0.11999212950468063, "learning_rate": 4.1425000000000004e-05, "loss": 0.8424, "step": 343 }, { "epoch": 0.5138402651356019, "grad_norm": 0.11294475942850113, "learning_rate": 4.14e-05, "loss": 0.8206, "step": 344 }, { "epoch": 0.5153339868365775, "grad_norm": 0.11675215512514114, "learning_rate": 4.1375e-05, "loss": 0.8371, "step": 345 }, { "epoch": 0.5168277085375531, "grad_norm": 0.1168551966547966, "learning_rate": 4.135e-05, "loss": 0.7877, "step": 346 }, { "epoch": 0.5183214302385287, "grad_norm": 0.1252433806657791, "learning_rate": 4.1325e-05, "loss": 0.9285, "step": 347 }, { "epoch": 0.5198151519395042, "grad_norm": 0.11625576764345169, "learning_rate": 4.13e-05, "loss": 0.8144, "step": 348 }, { "epoch": 0.5213088736404798, "grad_norm": 0.1211349293589592, "learning_rate": 4.1275e-05, "loss": 0.9195, "step": 349 }, { "epoch": 0.5228025953414555, "grad_norm": 0.10717800259590149, "learning_rate": 4.125e-05, "loss": 0.8825, "step": 350 }, { "epoch": 0.5242963170424311, "grad_norm": 0.1134132668375969, "learning_rate": 4.1225e-05, "loss": 0.8071, "step": 351 }, { "epoch": 0.5257900387434066, "grad_norm": 0.11666160076856613, "learning_rate": 4.12e-05, "loss": 0.8803, "step": 352 }, { "epoch": 0.5272837604443822, "grad_norm": 0.11602052301168442, "learning_rate": 4.1175000000000005e-05, "loss": 0.8139, "step": 353 }, { "epoch": 0.5287774821453578, "grad_norm": 0.1190742626786232, "learning_rate": 4.115e-05, "loss": 0.8627, "step": 354 }, { "epoch": 0.5302712038463334, "grad_norm": 0.11163876205682755, "learning_rate": 4.1125000000000004e-05, "loss": 0.8413, "step": 355 }, { "epoch": 0.5317649255473089, "grad_norm": 0.11343089491128922, "learning_rate": 4.11e-05, "loss": 0.8723, "step": 356 }, { "epoch": 0.5332586472482845, "grad_norm": 0.12053605169057846, "learning_rate": 4.1075e-05, "loss": 0.8757, "step": 357 }, { "epoch": 0.5347523689492601, "grad_norm": 0.11563614010810852, "learning_rate": 4.105e-05, "loss": 0.8476, "step": 358 }, { "epoch": 0.5362460906502358, "grad_norm": 0.11387767642736435, "learning_rate": 4.1025e-05, "loss": 0.8334, "step": 359 }, { "epoch": 0.5377398123512113, "grad_norm": 0.11910226196050644, "learning_rate": 4.1e-05, "loss": 0.9067, "step": 360 }, { "epoch": 0.5392335340521869, "grad_norm": 0.11455752700567245, "learning_rate": 4.0975e-05, "loss": 0.8926, "step": 361 }, { "epoch": 0.5407272557531625, "grad_norm": 0.10180629789829254, "learning_rate": 4.095e-05, "loss": 0.6831, "step": 362 }, { "epoch": 0.5422209774541381, "grad_norm": 0.11310421675443649, "learning_rate": 4.0925000000000005e-05, "loss": 0.8902, "step": 363 }, { "epoch": 0.5437146991551136, "grad_norm": 0.13089844584465027, "learning_rate": 4.09e-05, "loss": 0.8915, "step": 364 }, { "epoch": 0.5452084208560892, "grad_norm": 0.11993226408958435, "learning_rate": 4.0875000000000004e-05, "loss": 0.9405, "step": 365 }, { "epoch": 0.5467021425570648, "grad_norm": 0.11476049572229385, "learning_rate": 4.085e-05, "loss": 0.9095, "step": 366 }, { "epoch": 0.5481958642580405, "grad_norm": 0.11394521594047546, "learning_rate": 4.0825e-05, "loss": 0.859, "step": 367 }, { "epoch": 0.549689585959016, "grad_norm": 0.11385123431682587, "learning_rate": 4.08e-05, "loss": 0.7903, "step": 368 }, { "epoch": 0.5511833076599916, "grad_norm": 0.12233684957027435, "learning_rate": 4.0775e-05, "loss": 0.9519, "step": 369 }, { "epoch": 0.5526770293609672, "grad_norm": 0.11814061552286148, "learning_rate": 4.075e-05, "loss": 0.8981, "step": 370 }, { "epoch": 0.5541707510619428, "grad_norm": 0.11834059655666351, "learning_rate": 4.0725e-05, "loss": 0.778, "step": 371 }, { "epoch": 0.5556644727629183, "grad_norm": 0.11565592885017395, "learning_rate": 4.07e-05, "loss": 0.9093, "step": 372 }, { "epoch": 0.5571581944638939, "grad_norm": 0.11408364027738571, "learning_rate": 4.0675e-05, "loss": 0.8727, "step": 373 }, { "epoch": 0.5586519161648695, "grad_norm": 0.11535526812076569, "learning_rate": 4.065e-05, "loss": 0.8431, "step": 374 }, { "epoch": 0.5601456378658451, "grad_norm": 0.1137579008936882, "learning_rate": 4.0625000000000005e-05, "loss": 0.8523, "step": 375 }, { "epoch": 0.5616393595668208, "grad_norm": 0.11451197415590286, "learning_rate": 4.0600000000000004e-05, "loss": 0.8557, "step": 376 }, { "epoch": 0.5631330812677963, "grad_norm": 0.11988858878612518, "learning_rate": 4.0575000000000004e-05, "loss": 0.9092, "step": 377 }, { "epoch": 0.5646268029687719, "grad_norm": 0.11066543310880661, "learning_rate": 4.055e-05, "loss": 0.787, "step": 378 }, { "epoch": 0.5661205246697475, "grad_norm": 0.12285677343606949, "learning_rate": 4.0525e-05, "loss": 0.9088, "step": 379 }, { "epoch": 0.5676142463707231, "grad_norm": 0.12474570423364639, "learning_rate": 4.05e-05, "loss": 0.9328, "step": 380 }, { "epoch": 0.5691079680716986, "grad_norm": 0.12218405306339264, "learning_rate": 4.0475e-05, "loss": 0.8662, "step": 381 }, { "epoch": 0.5706016897726742, "grad_norm": 0.12658347189426422, "learning_rate": 4.045000000000001e-05, "loss": 0.822, "step": 382 }, { "epoch": 0.5720954114736498, "grad_norm": 0.14566221833229065, "learning_rate": 4.0425e-05, "loss": 0.9784, "step": 383 }, { "epoch": 0.5735891331746255, "grad_norm": 0.11796669661998749, "learning_rate": 4.0400000000000006e-05, "loss": 0.9466, "step": 384 }, { "epoch": 0.575082854875601, "grad_norm": 0.11644032597541809, "learning_rate": 4.0375e-05, "loss": 0.7447, "step": 385 }, { "epoch": 0.5765765765765766, "grad_norm": 0.12011049687862396, "learning_rate": 4.0350000000000005e-05, "loss": 0.9198, "step": 386 }, { "epoch": 0.5780702982775522, "grad_norm": 0.11838594824075699, "learning_rate": 4.0325000000000004e-05, "loss": 0.8999, "step": 387 }, { "epoch": 0.5795640199785278, "grad_norm": 0.11792827397584915, "learning_rate": 4.0300000000000004e-05, "loss": 0.9151, "step": 388 }, { "epoch": 0.5810577416795033, "grad_norm": 0.11458005011081696, "learning_rate": 4.0275e-05, "loss": 0.8013, "step": 389 }, { "epoch": 0.5825514633804789, "grad_norm": 0.12026382982730865, "learning_rate": 4.025e-05, "loss": 0.8639, "step": 390 }, { "epoch": 0.5840451850814545, "grad_norm": 0.11109666526317596, "learning_rate": 4.0225e-05, "loss": 0.7676, "step": 391 }, { "epoch": 0.5855389067824301, "grad_norm": 0.13359348475933075, "learning_rate": 4.02e-05, "loss": 0.9278, "step": 392 }, { "epoch": 0.5870326284834056, "grad_norm": 0.11813811212778091, "learning_rate": 4.0175e-05, "loss": 0.8806, "step": 393 }, { "epoch": 0.5885263501843813, "grad_norm": 0.1196754202246666, "learning_rate": 4.015000000000001e-05, "loss": 0.9506, "step": 394 }, { "epoch": 0.5900200718853569, "grad_norm": 0.11270725727081299, "learning_rate": 4.0125e-05, "loss": 0.7666, "step": 395 }, { "epoch": 0.5915137935863325, "grad_norm": 0.12819893658161163, "learning_rate": 4.0100000000000006e-05, "loss": 0.9169, "step": 396 }, { "epoch": 0.593007515287308, "grad_norm": 0.1208658218383789, "learning_rate": 4.0075e-05, "loss": 0.8227, "step": 397 }, { "epoch": 0.5945012369882836, "grad_norm": 0.11468334496021271, "learning_rate": 4.0050000000000004e-05, "loss": 0.8576, "step": 398 }, { "epoch": 0.5959949586892592, "grad_norm": 0.1133088767528534, "learning_rate": 4.0025000000000004e-05, "loss": 0.8458, "step": 399 }, { "epoch": 0.5974886803902348, "grad_norm": 0.12465198338031769, "learning_rate": 4e-05, "loss": 1.0, "step": 400 }, { "epoch": 0.5989824020912103, "grad_norm": 0.11511408537626266, "learning_rate": 3.9975e-05, "loss": 0.8169, "step": 401 }, { "epoch": 0.600476123792186, "grad_norm": 0.12595906853675842, "learning_rate": 3.995e-05, "loss": 0.807, "step": 402 }, { "epoch": 0.6019698454931616, "grad_norm": 0.11657946556806564, "learning_rate": 3.9925e-05, "loss": 0.8446, "step": 403 }, { "epoch": 0.6034635671941372, "grad_norm": 0.1156841441988945, "learning_rate": 3.99e-05, "loss": 0.8418, "step": 404 }, { "epoch": 0.6049572888951127, "grad_norm": 0.1356658935546875, "learning_rate": 3.9875e-05, "loss": 0.9672, "step": 405 }, { "epoch": 0.6064510105960883, "grad_norm": 0.11688660830259323, "learning_rate": 3.9850000000000006e-05, "loss": 0.8833, "step": 406 }, { "epoch": 0.6079447322970639, "grad_norm": 0.11496143043041229, "learning_rate": 3.9825e-05, "loss": 0.7495, "step": 407 }, { "epoch": 0.6094384539980395, "grad_norm": 0.11638246476650238, "learning_rate": 3.9800000000000005e-05, "loss": 0.8645, "step": 408 }, { "epoch": 0.610932175699015, "grad_norm": 0.12243250757455826, "learning_rate": 3.9775e-05, "loss": 0.9789, "step": 409 }, { "epoch": 0.6124258973999906, "grad_norm": 0.12460099905729294, "learning_rate": 3.9750000000000004e-05, "loss": 0.983, "step": 410 }, { "epoch": 0.6139196191009663, "grad_norm": 0.11668864637613297, "learning_rate": 3.9725e-05, "loss": 0.9561, "step": 411 }, { "epoch": 0.6154133408019419, "grad_norm": 0.12145492434501648, "learning_rate": 3.97e-05, "loss": 0.7781, "step": 412 }, { "epoch": 0.6169070625029174, "grad_norm": 0.12386345863342285, "learning_rate": 3.9675e-05, "loss": 0.8093, "step": 413 }, { "epoch": 0.618400784203893, "grad_norm": 0.11989139020442963, "learning_rate": 3.965e-05, "loss": 0.8715, "step": 414 }, { "epoch": 0.6198945059048686, "grad_norm": 0.10321169346570969, "learning_rate": 3.9625e-05, "loss": 0.7498, "step": 415 }, { "epoch": 0.6213882276058442, "grad_norm": 0.11457707732915878, "learning_rate": 3.960000000000001e-05, "loss": 0.7743, "step": 416 }, { "epoch": 0.6228819493068197, "grad_norm": 0.13037341833114624, "learning_rate": 3.9575e-05, "loss": 0.8378, "step": 417 }, { "epoch": 0.6243756710077953, "grad_norm": 0.1215771809220314, "learning_rate": 3.9550000000000006e-05, "loss": 0.8679, "step": 418 }, { "epoch": 0.625869392708771, "grad_norm": 0.10914719104766846, "learning_rate": 3.9525e-05, "loss": 0.7888, "step": 419 }, { "epoch": 0.6273631144097466, "grad_norm": 0.11485666781663895, "learning_rate": 3.9500000000000005e-05, "loss": 0.9687, "step": 420 }, { "epoch": 0.6288568361107221, "grad_norm": 0.11686632037162781, "learning_rate": 3.9475000000000004e-05, "loss": 0.8355, "step": 421 }, { "epoch": 0.6303505578116977, "grad_norm": 0.1543799787759781, "learning_rate": 3.9450000000000003e-05, "loss": 0.8957, "step": 422 }, { "epoch": 0.6318442795126733, "grad_norm": 0.6196660399436951, "learning_rate": 3.9425e-05, "loss": 0.9791, "step": 423 }, { "epoch": 0.6333380012136489, "grad_norm": 0.11365729570388794, "learning_rate": 3.94e-05, "loss": 0.7078, "step": 424 }, { "epoch": 0.6348317229146244, "grad_norm": 0.1275215744972229, "learning_rate": 3.9375e-05, "loss": 0.9238, "step": 425 }, { "epoch": 0.6363254446156, "grad_norm": 0.1132238358259201, "learning_rate": 3.935e-05, "loss": 0.8223, "step": 426 }, { "epoch": 0.6378191663165756, "grad_norm": 0.11624520272016525, "learning_rate": 3.9325e-05, "loss": 0.8275, "step": 427 }, { "epoch": 0.6393128880175513, "grad_norm": 0.1252676099538803, "learning_rate": 3.9300000000000007e-05, "loss": 0.8921, "step": 428 }, { "epoch": 0.6408066097185268, "grad_norm": 0.13340823352336884, "learning_rate": 3.9275e-05, "loss": 0.9065, "step": 429 }, { "epoch": 0.6423003314195024, "grad_norm": 0.10909580439329147, "learning_rate": 3.9250000000000005e-05, "loss": 0.8379, "step": 430 }, { "epoch": 0.643794053120478, "grad_norm": 0.12243309617042542, "learning_rate": 3.9225e-05, "loss": 0.9401, "step": 431 }, { "epoch": 0.6452877748214536, "grad_norm": 0.11474932730197906, "learning_rate": 3.9200000000000004e-05, "loss": 0.8308, "step": 432 }, { "epoch": 0.6467814965224291, "grad_norm": 0.11664999276399612, "learning_rate": 3.9175000000000004e-05, "loss": 0.8755, "step": 433 }, { "epoch": 0.6482752182234047, "grad_norm": 0.117766372859478, "learning_rate": 3.915e-05, "loss": 0.8346, "step": 434 }, { "epoch": 0.6497689399243803, "grad_norm": 0.12336030602455139, "learning_rate": 3.9125e-05, "loss": 0.9515, "step": 435 }, { "epoch": 0.651262661625356, "grad_norm": 0.10965977609157562, "learning_rate": 3.91e-05, "loss": 0.8288, "step": 436 }, { "epoch": 0.6527563833263315, "grad_norm": 0.12656274437904358, "learning_rate": 3.9075e-05, "loss": 0.79, "step": 437 }, { "epoch": 0.6542501050273071, "grad_norm": 0.11523326486349106, "learning_rate": 3.905e-05, "loss": 0.8078, "step": 438 }, { "epoch": 0.6557438267282827, "grad_norm": 0.10973573476076126, "learning_rate": 3.9025e-05, "loss": 0.7972, "step": 439 }, { "epoch": 0.6572375484292583, "grad_norm": 0.11799229681491852, "learning_rate": 3.9000000000000006e-05, "loss": 0.858, "step": 440 }, { "epoch": 0.6587312701302339, "grad_norm": 0.11250176280736923, "learning_rate": 3.8975e-05, "loss": 0.8699, "step": 441 }, { "epoch": 0.6602249918312094, "grad_norm": 0.10535162687301636, "learning_rate": 3.8950000000000005e-05, "loss": 0.7591, "step": 442 }, { "epoch": 0.661718713532185, "grad_norm": 0.11573795229196548, "learning_rate": 3.8925e-05, "loss": 0.8387, "step": 443 }, { "epoch": 0.6632124352331606, "grad_norm": 0.11178013682365417, "learning_rate": 3.8900000000000004e-05, "loss": 0.8117, "step": 444 }, { "epoch": 0.6647061569341363, "grad_norm": 0.1056046113371849, "learning_rate": 3.8875e-05, "loss": 0.7877, "step": 445 }, { "epoch": 0.6661998786351118, "grad_norm": 0.1198507621884346, "learning_rate": 3.885e-05, "loss": 0.8305, "step": 446 }, { "epoch": 0.6676936003360874, "grad_norm": 0.12153521925210953, "learning_rate": 3.8825e-05, "loss": 0.9062, "step": 447 }, { "epoch": 0.669187322037063, "grad_norm": 0.11427931487560272, "learning_rate": 3.88e-05, "loss": 0.7546, "step": 448 }, { "epoch": 0.6706810437380386, "grad_norm": 0.13701508939266205, "learning_rate": 3.8775e-05, "loss": 0.7877, "step": 449 }, { "epoch": 0.6721747654390141, "grad_norm": 0.10716225206851959, "learning_rate": 3.875e-05, "loss": 0.7557, "step": 450 }, { "epoch": 0.6736684871399897, "grad_norm": 0.11461592465639114, "learning_rate": 3.8725e-05, "loss": 0.8498, "step": 451 }, { "epoch": 0.6751622088409653, "grad_norm": 0.1283111423254013, "learning_rate": 3.8700000000000006e-05, "loss": 1.0314, "step": 452 }, { "epoch": 0.676655930541941, "grad_norm": 0.12076213955879211, "learning_rate": 3.8675e-05, "loss": 0.8668, "step": 453 }, { "epoch": 0.6781496522429165, "grad_norm": 0.1114586666226387, "learning_rate": 3.8650000000000004e-05, "loss": 0.7883, "step": 454 }, { "epoch": 0.6796433739438921, "grad_norm": 0.11499448120594025, "learning_rate": 3.8625e-05, "loss": 0.8661, "step": 455 }, { "epoch": 0.6811370956448677, "grad_norm": 0.12389688193798065, "learning_rate": 3.86e-05, "loss": 0.8676, "step": 456 }, { "epoch": 0.6826308173458433, "grad_norm": 0.1087414100766182, "learning_rate": 3.8575e-05, "loss": 0.8046, "step": 457 }, { "epoch": 0.6841245390468188, "grad_norm": 0.11512427031993866, "learning_rate": 3.855e-05, "loss": 0.895, "step": 458 }, { "epoch": 0.6856182607477944, "grad_norm": 0.1182154193520546, "learning_rate": 3.8525e-05, "loss": 0.8813, "step": 459 }, { "epoch": 0.68711198244877, "grad_norm": 0.1151411384344101, "learning_rate": 3.85e-05, "loss": 0.9107, "step": 460 }, { "epoch": 0.6886057041497456, "grad_norm": 0.11687251925468445, "learning_rate": 3.8475e-05, "loss": 0.929, "step": 461 }, { "epoch": 0.6900994258507211, "grad_norm": 0.11749271303415298, "learning_rate": 3.845e-05, "loss": 0.8725, "step": 462 }, { "epoch": 0.6915931475516968, "grad_norm": 0.11867329478263855, "learning_rate": 3.8425e-05, "loss": 0.8705, "step": 463 }, { "epoch": 0.6930868692526724, "grad_norm": 0.12454242259263992, "learning_rate": 3.8400000000000005e-05, "loss": 0.8834, "step": 464 }, { "epoch": 0.694580590953648, "grad_norm": 0.1037922352552414, "learning_rate": 3.8375e-05, "loss": 0.7575, "step": 465 }, { "epoch": 0.6960743126546235, "grad_norm": 0.10541266947984695, "learning_rate": 3.8350000000000004e-05, "loss": 0.7176, "step": 466 }, { "epoch": 0.6975680343555991, "grad_norm": 0.1267130821943283, "learning_rate": 3.8324999999999996e-05, "loss": 0.9245, "step": 467 }, { "epoch": 0.6990617560565747, "grad_norm": 0.11470147967338562, "learning_rate": 3.83e-05, "loss": 0.8349, "step": 468 }, { "epoch": 0.7005554777575503, "grad_norm": 0.1261100023984909, "learning_rate": 3.8275e-05, "loss": 0.8784, "step": 469 }, { "epoch": 0.7020491994585258, "grad_norm": 0.12683509290218353, "learning_rate": 3.825e-05, "loss": 0.9547, "step": 470 }, { "epoch": 0.7035429211595015, "grad_norm": 0.11357103288173676, "learning_rate": 3.8225e-05, "loss": 0.8779, "step": 471 }, { "epoch": 0.7050366428604771, "grad_norm": 0.11892116069793701, "learning_rate": 3.82e-05, "loss": 0.8899, "step": 472 }, { "epoch": 0.7065303645614527, "grad_norm": 0.11261481791734695, "learning_rate": 3.8175e-05, "loss": 0.7613, "step": 473 }, { "epoch": 0.7080240862624282, "grad_norm": 0.11560933291912079, "learning_rate": 3.8150000000000006e-05, "loss": 0.8729, "step": 474 }, { "epoch": 0.7095178079634038, "grad_norm": 0.12955224514007568, "learning_rate": 3.8125e-05, "loss": 1.0153, "step": 475 }, { "epoch": 0.7110115296643794, "grad_norm": 0.11023367941379547, "learning_rate": 3.8100000000000005e-05, "loss": 0.8312, "step": 476 }, { "epoch": 0.712505251365355, "grad_norm": 0.11561868339776993, "learning_rate": 3.8075e-05, "loss": 0.8186, "step": 477 }, { "epoch": 0.7139989730663305, "grad_norm": 0.13367782533168793, "learning_rate": 3.805e-05, "loss": 0.9757, "step": 478 }, { "epoch": 0.7154926947673061, "grad_norm": 0.11734066158533096, "learning_rate": 3.8025e-05, "loss": 0.8952, "step": 479 }, { "epoch": 0.7169864164682818, "grad_norm": 0.11982716619968414, "learning_rate": 3.8e-05, "loss": 0.8943, "step": 480 }, { "epoch": 0.7184801381692574, "grad_norm": 0.12294933944940567, "learning_rate": 3.7975e-05, "loss": 0.9242, "step": 481 }, { "epoch": 0.7199738598702329, "grad_norm": 0.1289445161819458, "learning_rate": 3.795e-05, "loss": 0.9635, "step": 482 }, { "epoch": 0.7214675815712085, "grad_norm": 0.11124879866838455, "learning_rate": 3.7925e-05, "loss": 0.7845, "step": 483 }, { "epoch": 0.7229613032721841, "grad_norm": 0.11613549292087555, "learning_rate": 3.79e-05, "loss": 0.8241, "step": 484 }, { "epoch": 0.7244550249731597, "grad_norm": 0.1195501908659935, "learning_rate": 3.7875e-05, "loss": 0.8908, "step": 485 }, { "epoch": 0.7259487466741352, "grad_norm": 0.11978666484355927, "learning_rate": 3.7850000000000005e-05, "loss": 0.8888, "step": 486 }, { "epoch": 0.7274424683751108, "grad_norm": 0.12248118221759796, "learning_rate": 3.7825e-05, "loss": 0.7841, "step": 487 }, { "epoch": 0.7289361900760865, "grad_norm": 0.12103201448917389, "learning_rate": 3.7800000000000004e-05, "loss": 0.7902, "step": 488 }, { "epoch": 0.7304299117770621, "grad_norm": 0.11651565134525299, "learning_rate": 3.7775e-05, "loss": 0.7761, "step": 489 }, { "epoch": 0.7319236334780376, "grad_norm": 0.1345234215259552, "learning_rate": 3.775e-05, "loss": 0.9341, "step": 490 }, { "epoch": 0.7334173551790132, "grad_norm": 0.11269031465053558, "learning_rate": 3.7725e-05, "loss": 0.8183, "step": 491 }, { "epoch": 0.7349110768799888, "grad_norm": 0.12275443971157074, "learning_rate": 3.77e-05, "loss": 0.9798, "step": 492 }, { "epoch": 0.7364047985809644, "grad_norm": 0.12342868000268936, "learning_rate": 3.7675e-05, "loss": 0.7782, "step": 493 }, { "epoch": 0.7378985202819399, "grad_norm": 0.1261453479528427, "learning_rate": 3.765e-05, "loss": 0.8695, "step": 494 }, { "epoch": 0.7393922419829155, "grad_norm": 0.12151317298412323, "learning_rate": 3.7625e-05, "loss": 0.8409, "step": 495 }, { "epoch": 0.7408859636838911, "grad_norm": 0.11639870703220367, "learning_rate": 3.76e-05, "loss": 0.9152, "step": 496 }, { "epoch": 0.7423796853848668, "grad_norm": 0.12525513768196106, "learning_rate": 3.7575e-05, "loss": 0.8905, "step": 497 }, { "epoch": 0.7438734070858423, "grad_norm": 0.11412794888019562, "learning_rate": 3.7550000000000005e-05, "loss": 0.8063, "step": 498 }, { "epoch": 0.7453671287868179, "grad_norm": 0.1221984401345253, "learning_rate": 3.7525e-05, "loss": 0.8971, "step": 499 }, { "epoch": 0.7468608504877935, "grad_norm": 0.12317992001771927, "learning_rate": 3.7500000000000003e-05, "loss": 0.8958, "step": 500 }, { "epoch": 0.7483545721887691, "grad_norm": 0.11764492839574814, "learning_rate": 3.7475e-05, "loss": 0.9008, "step": 501 }, { "epoch": 0.7498482938897447, "grad_norm": 0.1282505989074707, "learning_rate": 3.745e-05, "loss": 0.974, "step": 502 }, { "epoch": 0.7513420155907202, "grad_norm": 0.10983335971832275, "learning_rate": 3.7425e-05, "loss": 0.7541, "step": 503 }, { "epoch": 0.7528357372916958, "grad_norm": 0.11976707726716995, "learning_rate": 3.74e-05, "loss": 0.8805, "step": 504 }, { "epoch": 0.7543294589926715, "grad_norm": 0.10717479884624481, "learning_rate": 3.737500000000001e-05, "loss": 0.6829, "step": 505 }, { "epoch": 0.7558231806936471, "grad_norm": 0.11701679974794388, "learning_rate": 3.735e-05, "loss": 0.9184, "step": 506 }, { "epoch": 0.7573169023946226, "grad_norm": 0.12355269491672516, "learning_rate": 3.7325000000000006e-05, "loss": 0.9209, "step": 507 }, { "epoch": 0.7588106240955982, "grad_norm": 0.10710404813289642, "learning_rate": 3.73e-05, "loss": 0.7395, "step": 508 }, { "epoch": 0.7603043457965738, "grad_norm": 0.11838013678789139, "learning_rate": 3.7275000000000005e-05, "loss": 0.8773, "step": 509 }, { "epoch": 0.7617980674975494, "grad_norm": 0.10798650234937668, "learning_rate": 3.7250000000000004e-05, "loss": 0.6963, "step": 510 }, { "epoch": 0.7632917891985249, "grad_norm": 0.12685714662075043, "learning_rate": 3.7225000000000004e-05, "loss": 0.8306, "step": 511 }, { "epoch": 0.7647855108995005, "grad_norm": 0.11536894738674164, "learning_rate": 3.72e-05, "loss": 0.7918, "step": 512 }, { "epoch": 0.7662792326004761, "grad_norm": 0.12276559323072433, "learning_rate": 3.7175e-05, "loss": 0.8764, "step": 513 }, { "epoch": 0.7677729543014518, "grad_norm": 0.11633279919624329, "learning_rate": 3.715e-05, "loss": 0.8683, "step": 514 }, { "epoch": 0.7692666760024273, "grad_norm": 0.1106327474117279, "learning_rate": 3.7125e-05, "loss": 0.8597, "step": 515 }, { "epoch": 0.7707603977034029, "grad_norm": 0.1287788301706314, "learning_rate": 3.71e-05, "loss": 0.8783, "step": 516 }, { "epoch": 0.7722541194043785, "grad_norm": 0.11713037639856339, "learning_rate": 3.707500000000001e-05, "loss": 0.8059, "step": 517 }, { "epoch": 0.7737478411053541, "grad_norm": 0.11426515877246857, "learning_rate": 3.705e-05, "loss": 0.7684, "step": 518 }, { "epoch": 0.7752415628063296, "grad_norm": 0.12040442228317261, "learning_rate": 3.7025000000000005e-05, "loss": 0.8971, "step": 519 }, { "epoch": 0.7767352845073052, "grad_norm": 0.11353293061256409, "learning_rate": 3.7e-05, "loss": 0.8397, "step": 520 }, { "epoch": 0.7782290062082808, "grad_norm": 0.11656889319419861, "learning_rate": 3.6975000000000004e-05, "loss": 0.9349, "step": 521 }, { "epoch": 0.7797227279092565, "grad_norm": 0.12571312487125397, "learning_rate": 3.6950000000000004e-05, "loss": 0.845, "step": 522 }, { "epoch": 0.781216449610232, "grad_norm": 0.12603648006916046, "learning_rate": 3.6925e-05, "loss": 0.8908, "step": 523 }, { "epoch": 0.7827101713112076, "grad_norm": 0.12142994999885559, "learning_rate": 3.69e-05, "loss": 0.9071, "step": 524 }, { "epoch": 0.7842038930121832, "grad_norm": 0.1182866320014, "learning_rate": 3.6875e-05, "loss": 0.761, "step": 525 }, { "epoch": 0.7856976147131588, "grad_norm": 0.12061905860900879, "learning_rate": 3.685e-05, "loss": 0.85, "step": 526 }, { "epoch": 0.7871913364141343, "grad_norm": 0.11107086390256882, "learning_rate": 3.6825e-05, "loss": 0.8021, "step": 527 }, { "epoch": 0.7886850581151099, "grad_norm": 0.11587297171354294, "learning_rate": 3.68e-05, "loss": 0.8789, "step": 528 }, { "epoch": 0.7901787798160855, "grad_norm": 0.12360134720802307, "learning_rate": 3.6775000000000006e-05, "loss": 0.8226, "step": 529 }, { "epoch": 0.7916725015170611, "grad_norm": 0.11649927496910095, "learning_rate": 3.675e-05, "loss": 0.8282, "step": 530 }, { "epoch": 0.7931662232180366, "grad_norm": 0.11669060587882996, "learning_rate": 3.6725000000000005e-05, "loss": 0.8378, "step": 531 }, { "epoch": 0.7946599449190123, "grad_norm": 0.11731521040201187, "learning_rate": 3.6700000000000004e-05, "loss": 0.7924, "step": 532 }, { "epoch": 0.7961536666199879, "grad_norm": 0.12249942868947983, "learning_rate": 3.6675000000000004e-05, "loss": 0.876, "step": 533 }, { "epoch": 0.7976473883209635, "grad_norm": 0.11808062344789505, "learning_rate": 3.665e-05, "loss": 0.825, "step": 534 }, { "epoch": 0.799141110021939, "grad_norm": 0.11000221967697144, "learning_rate": 3.6625e-05, "loss": 0.8145, "step": 535 }, { "epoch": 0.8006348317229146, "grad_norm": 0.12330187857151031, "learning_rate": 3.66e-05, "loss": 0.8602, "step": 536 }, { "epoch": 0.8021285534238902, "grad_norm": 0.12026950716972351, "learning_rate": 3.6575e-05, "loss": 0.8213, "step": 537 }, { "epoch": 0.8036222751248658, "grad_norm": 0.13032597303390503, "learning_rate": 3.655e-05, "loss": 0.936, "step": 538 }, { "epoch": 0.8051159968258413, "grad_norm": 0.1326933354139328, "learning_rate": 3.652500000000001e-05, "loss": 0.9442, "step": 539 }, { "epoch": 0.806609718526817, "grad_norm": 0.11534080654382706, "learning_rate": 3.65e-05, "loss": 0.8085, "step": 540 }, { "epoch": 0.8081034402277926, "grad_norm": 0.11813143640756607, "learning_rate": 3.6475000000000006e-05, "loss": 0.8806, "step": 541 }, { "epoch": 0.8095971619287682, "grad_norm": 0.11518147587776184, "learning_rate": 3.645e-05, "loss": 0.8068, "step": 542 }, { "epoch": 0.8110908836297437, "grad_norm": 0.12047188729047775, "learning_rate": 3.6425000000000004e-05, "loss": 1.0127, "step": 543 }, { "epoch": 0.8125846053307193, "grad_norm": 0.1105237528681755, "learning_rate": 3.6400000000000004e-05, "loss": 0.7684, "step": 544 }, { "epoch": 0.8140783270316949, "grad_norm": 0.12371355295181274, "learning_rate": 3.6375e-05, "loss": 0.9584, "step": 545 }, { "epoch": 0.8155720487326705, "grad_norm": 0.11969756335020065, "learning_rate": 3.635e-05, "loss": 0.9173, "step": 546 }, { "epoch": 0.817065770433646, "grad_norm": 0.12263534218072891, "learning_rate": 3.6325e-05, "loss": 0.8786, "step": 547 }, { "epoch": 0.8185594921346216, "grad_norm": 0.10284276306629181, "learning_rate": 3.63e-05, "loss": 0.7357, "step": 548 }, { "epoch": 0.8200532138355973, "grad_norm": 0.11864078044891357, "learning_rate": 3.6275e-05, "loss": 0.9063, "step": 549 }, { "epoch": 0.8215469355365729, "grad_norm": 0.12443932145833969, "learning_rate": 3.625e-05, "loss": 0.9466, "step": 550 }, { "epoch": 0.8230406572375484, "grad_norm": 0.11863391846418381, "learning_rate": 3.6225000000000006e-05, "loss": 0.825, "step": 551 }, { "epoch": 0.824534378938524, "grad_norm": 0.11169655621051788, "learning_rate": 3.62e-05, "loss": 0.7377, "step": 552 }, { "epoch": 0.8260281006394996, "grad_norm": 0.13002797961235046, "learning_rate": 3.6175000000000005e-05, "loss": 0.7923, "step": 553 }, { "epoch": 0.8275218223404752, "grad_norm": 0.13428978621959686, "learning_rate": 3.615e-05, "loss": 0.896, "step": 554 }, { "epoch": 0.8290155440414507, "grad_norm": 0.12754176557064056, "learning_rate": 3.6125000000000004e-05, "loss": 0.9498, "step": 555 }, { "epoch": 0.8305092657424263, "grad_norm": 0.11907877027988434, "learning_rate": 3.61e-05, "loss": 0.8611, "step": 556 }, { "epoch": 0.832002987443402, "grad_norm": 0.12236854434013367, "learning_rate": 3.6075e-05, "loss": 0.9561, "step": 557 }, { "epoch": 0.8334967091443776, "grad_norm": 0.12349528819322586, "learning_rate": 3.605e-05, "loss": 0.8795, "step": 558 }, { "epoch": 0.8349904308453531, "grad_norm": 0.11345000565052032, "learning_rate": 3.6025e-05, "loss": 0.8006, "step": 559 }, { "epoch": 0.8364841525463287, "grad_norm": 0.1231880635023117, "learning_rate": 3.6e-05, "loss": 0.8751, "step": 560 }, { "epoch": 0.8379778742473043, "grad_norm": 0.11477159708738327, "learning_rate": 3.5975e-05, "loss": 0.7424, "step": 561 }, { "epoch": 0.8394715959482799, "grad_norm": 0.13144049048423767, "learning_rate": 3.595e-05, "loss": 0.9217, "step": 562 }, { "epoch": 0.8409653176492555, "grad_norm": 0.12108039110898972, "learning_rate": 3.5925000000000006e-05, "loss": 0.8198, "step": 563 }, { "epoch": 0.842459039350231, "grad_norm": 0.1132141500711441, "learning_rate": 3.59e-05, "loss": 0.8852, "step": 564 }, { "epoch": 0.8439527610512066, "grad_norm": 0.12968137860298157, "learning_rate": 3.5875000000000005e-05, "loss": 0.7796, "step": 565 }, { "epoch": 0.8454464827521823, "grad_norm": 0.11468145996332169, "learning_rate": 3.585e-05, "loss": 0.7932, "step": 566 }, { "epoch": 0.8469402044531579, "grad_norm": 0.1138664111495018, "learning_rate": 3.5825000000000003e-05, "loss": 0.8335, "step": 567 }, { "epoch": 0.8484339261541334, "grad_norm": 0.12407484650611877, "learning_rate": 3.58e-05, "loss": 0.8805, "step": 568 }, { "epoch": 0.849927647855109, "grad_norm": 0.11880036443471909, "learning_rate": 3.5775e-05, "loss": 0.8864, "step": 569 }, { "epoch": 0.8514213695560846, "grad_norm": 0.11420074105262756, "learning_rate": 3.575e-05, "loss": 0.8585, "step": 570 }, { "epoch": 0.8529150912570602, "grad_norm": 0.12508556246757507, "learning_rate": 3.5725e-05, "loss": 0.8332, "step": 571 }, { "epoch": 0.8544088129580357, "grad_norm": 0.12482929974794388, "learning_rate": 3.57e-05, "loss": 0.9317, "step": 572 }, { "epoch": 0.8559025346590113, "grad_norm": 0.10810864716768265, "learning_rate": 3.5675e-05, "loss": 0.7932, "step": 573 }, { "epoch": 0.857396256359987, "grad_norm": 0.10674960911273956, "learning_rate": 3.565e-05, "loss": 0.7211, "step": 574 }, { "epoch": 0.8588899780609626, "grad_norm": 0.11384154111146927, "learning_rate": 3.5625000000000005e-05, "loss": 0.769, "step": 575 }, { "epoch": 0.8603836997619381, "grad_norm": 0.11769191175699234, "learning_rate": 3.56e-05, "loss": 0.8262, "step": 576 }, { "epoch": 0.8618774214629137, "grad_norm": 0.11831361055374146, "learning_rate": 3.5575000000000004e-05, "loss": 0.8132, "step": 577 }, { "epoch": 0.8633711431638893, "grad_norm": 0.15097826719284058, "learning_rate": 3.555e-05, "loss": 0.9268, "step": 578 }, { "epoch": 0.8648648648648649, "grad_norm": 0.1274557113647461, "learning_rate": 3.5525e-05, "loss": 0.8879, "step": 579 }, { "epoch": 0.8663585865658404, "grad_norm": 0.11343612521886826, "learning_rate": 3.55e-05, "loss": 0.8362, "step": 580 }, { "epoch": 0.867852308266816, "grad_norm": 0.1187431812286377, "learning_rate": 3.5475e-05, "loss": 0.8398, "step": 581 }, { "epoch": 0.8693460299677916, "grad_norm": 0.10682875663042068, "learning_rate": 3.545e-05, "loss": 0.7654, "step": 582 }, { "epoch": 0.8708397516687673, "grad_norm": 0.12039945274591446, "learning_rate": 3.5425e-05, "loss": 0.724, "step": 583 }, { "epoch": 0.8723334733697428, "grad_norm": 0.12316028028726578, "learning_rate": 3.54e-05, "loss": 0.8519, "step": 584 }, { "epoch": 0.8738271950707184, "grad_norm": 0.11559374630451202, "learning_rate": 3.5375e-05, "loss": 0.8396, "step": 585 }, { "epoch": 0.875320916771694, "grad_norm": 0.11352609097957611, "learning_rate": 3.535e-05, "loss": 0.8643, "step": 586 }, { "epoch": 0.8768146384726696, "grad_norm": 0.11973422765731812, "learning_rate": 3.5325000000000005e-05, "loss": 0.896, "step": 587 }, { "epoch": 0.8783083601736451, "grad_norm": 0.12587222456932068, "learning_rate": 3.53e-05, "loss": 0.9564, "step": 588 }, { "epoch": 0.8798020818746207, "grad_norm": 0.11478114128112793, "learning_rate": 3.5275000000000004e-05, "loss": 0.8551, "step": 589 }, { "epoch": 0.8812958035755963, "grad_norm": 0.12392482161521912, "learning_rate": 3.525e-05, "loss": 0.8744, "step": 590 }, { "epoch": 0.882789525276572, "grad_norm": 0.11020272970199585, "learning_rate": 3.5225e-05, "loss": 0.6782, "step": 591 }, { "epoch": 0.8842832469775475, "grad_norm": 0.11438829451799393, "learning_rate": 3.52e-05, "loss": 0.9272, "step": 592 }, { "epoch": 0.8857769686785231, "grad_norm": 0.12205393612384796, "learning_rate": 3.5175e-05, "loss": 0.8617, "step": 593 }, { "epoch": 0.8872706903794987, "grad_norm": 0.12319011986255646, "learning_rate": 3.515e-05, "loss": 0.835, "step": 594 }, { "epoch": 0.8887644120804743, "grad_norm": 0.11685352027416229, "learning_rate": 3.5125e-05, "loss": 0.8677, "step": 595 }, { "epoch": 0.8902581337814498, "grad_norm": 0.12064828723669052, "learning_rate": 3.51e-05, "loss": 0.8558, "step": 596 }, { "epoch": 0.8917518554824254, "grad_norm": 0.11435768008232117, "learning_rate": 3.5075000000000006e-05, "loss": 0.8139, "step": 597 }, { "epoch": 0.893245577183401, "grad_norm": 0.11450018733739853, "learning_rate": 3.505e-05, "loss": 0.7819, "step": 598 }, { "epoch": 0.8947392988843766, "grad_norm": 0.116747185587883, "learning_rate": 3.5025000000000004e-05, "loss": 0.8457, "step": 599 }, { "epoch": 0.8962330205853521, "grad_norm": 0.11780279874801636, "learning_rate": 3.5e-05, "loss": 0.8887, "step": 600 }, { "epoch": 0.8977267422863278, "grad_norm": 0.11975818127393723, "learning_rate": 3.4975e-05, "loss": 0.7357, "step": 601 }, { "epoch": 0.8992204639873034, "grad_norm": 0.11866550147533417, "learning_rate": 3.495e-05, "loss": 0.8184, "step": 602 }, { "epoch": 0.900714185688279, "grad_norm": 0.12063805758953094, "learning_rate": 3.4925e-05, "loss": 0.8461, "step": 603 }, { "epoch": 0.9022079073892545, "grad_norm": 0.1259380280971527, "learning_rate": 3.49e-05, "loss": 0.9423, "step": 604 }, { "epoch": 0.9037016290902301, "grad_norm": 0.11953550577163696, "learning_rate": 3.4875e-05, "loss": 0.799, "step": 605 }, { "epoch": 0.9051953507912057, "grad_norm": 0.12620405852794647, "learning_rate": 3.485e-05, "loss": 0.744, "step": 606 }, { "epoch": 0.9066890724921813, "grad_norm": 0.12804405391216278, "learning_rate": 3.4825e-05, "loss": 0.8804, "step": 607 }, { "epoch": 0.9081827941931568, "grad_norm": 0.12298737466335297, "learning_rate": 3.48e-05, "loss": 0.9239, "step": 608 }, { "epoch": 0.9096765158941325, "grad_norm": 0.1212417334318161, "learning_rate": 3.4775000000000005e-05, "loss": 0.8775, "step": 609 }, { "epoch": 0.9111702375951081, "grad_norm": 0.12126675993204117, "learning_rate": 3.475e-05, "loss": 0.8853, "step": 610 }, { "epoch": 0.9126639592960837, "grad_norm": 0.11589443683624268, "learning_rate": 3.4725000000000004e-05, "loss": 0.7673, "step": 611 }, { "epoch": 0.9141576809970592, "grad_norm": 0.1263202726840973, "learning_rate": 3.4699999999999996e-05, "loss": 0.7982, "step": 612 }, { "epoch": 0.9156514026980348, "grad_norm": 0.12341882288455963, "learning_rate": 3.4675e-05, "loss": 0.9832, "step": 613 }, { "epoch": 0.9171451243990104, "grad_norm": 0.1159672960639, "learning_rate": 3.465e-05, "loss": 0.8012, "step": 614 }, { "epoch": 0.918638846099986, "grad_norm": 0.11675315350294113, "learning_rate": 3.4625e-05, "loss": 0.8807, "step": 615 }, { "epoch": 0.9201325678009615, "grad_norm": 0.12747396528720856, "learning_rate": 3.46e-05, "loss": 0.6777, "step": 616 }, { "epoch": 0.9216262895019371, "grad_norm": 0.12141313403844833, "learning_rate": 3.4575e-05, "loss": 0.9125, "step": 617 }, { "epoch": 0.9231200112029128, "grad_norm": 0.11567840725183487, "learning_rate": 3.455e-05, "loss": 0.8532, "step": 618 }, { "epoch": 0.9246137329038884, "grad_norm": 0.12084275484085083, "learning_rate": 3.4525e-05, "loss": 0.843, "step": 619 }, { "epoch": 0.9261074546048639, "grad_norm": 0.10760965943336487, "learning_rate": 3.45e-05, "loss": 0.7013, "step": 620 }, { "epoch": 0.9276011763058395, "grad_norm": 0.12319813668727875, "learning_rate": 3.4475000000000005e-05, "loss": 0.7791, "step": 621 }, { "epoch": 0.9290948980068151, "grad_norm": 0.12831102311611176, "learning_rate": 3.445e-05, "loss": 0.8905, "step": 622 }, { "epoch": 0.9305886197077907, "grad_norm": 0.1304023116827011, "learning_rate": 3.4425e-05, "loss": 0.9467, "step": 623 }, { "epoch": 0.9320823414087662, "grad_norm": 0.12220504879951477, "learning_rate": 3.4399999999999996e-05, "loss": 0.8783, "step": 624 }, { "epoch": 0.9335760631097418, "grad_norm": 0.11791114509105682, "learning_rate": 3.4375e-05, "loss": 0.7554, "step": 625 }, { "epoch": 0.9350697848107175, "grad_norm": 0.12192322313785553, "learning_rate": 3.435e-05, "loss": 0.8509, "step": 626 }, { "epoch": 0.9365635065116931, "grad_norm": 0.1260639876127243, "learning_rate": 3.4325e-05, "loss": 0.8744, "step": 627 }, { "epoch": 0.9380572282126687, "grad_norm": 0.12728920578956604, "learning_rate": 3.430000000000001e-05, "loss": 0.8664, "step": 628 }, { "epoch": 0.9395509499136442, "grad_norm": 0.12565158307552338, "learning_rate": 3.4275e-05, "loss": 0.8952, "step": 629 }, { "epoch": 0.9410446716146198, "grad_norm": 0.14639417827129364, "learning_rate": 3.4250000000000006e-05, "loss": 1.0019, "step": 630 }, { "epoch": 0.9425383933155954, "grad_norm": 0.12126431614160538, "learning_rate": 3.4225e-05, "loss": 0.9245, "step": 631 }, { "epoch": 0.944032115016571, "grad_norm": 0.12372881174087524, "learning_rate": 3.4200000000000005e-05, "loss": 0.8185, "step": 632 }, { "epoch": 0.9455258367175465, "grad_norm": 0.11828556656837463, "learning_rate": 3.4175000000000004e-05, "loss": 0.8957, "step": 633 }, { "epoch": 0.9470195584185221, "grad_norm": 0.14459969103336334, "learning_rate": 3.415e-05, "loss": 0.9213, "step": 634 }, { "epoch": 0.9485132801194978, "grad_norm": 0.1269361674785614, "learning_rate": 3.4125e-05, "loss": 0.8896, "step": 635 }, { "epoch": 0.9500070018204734, "grad_norm": 0.12435188889503479, "learning_rate": 3.41e-05, "loss": 0.8027, "step": 636 }, { "epoch": 0.9515007235214489, "grad_norm": 0.12179242074489594, "learning_rate": 3.4075e-05, "loss": 0.9499, "step": 637 }, { "epoch": 0.9529944452224245, "grad_norm": 0.11750852316617966, "learning_rate": 3.405e-05, "loss": 0.7887, "step": 638 }, { "epoch": 0.9544881669234001, "grad_norm": 0.126454159617424, "learning_rate": 3.4025e-05, "loss": 0.7979, "step": 639 }, { "epoch": 0.9559818886243757, "grad_norm": 0.11420923471450806, "learning_rate": 3.4000000000000007e-05, "loss": 0.8199, "step": 640 }, { "epoch": 0.9574756103253512, "grad_norm": 0.11463257670402527, "learning_rate": 3.3975e-05, "loss": 0.7809, "step": 641 }, { "epoch": 0.9589693320263268, "grad_norm": 0.1227886974811554, "learning_rate": 3.3950000000000005e-05, "loss": 0.8588, "step": 642 }, { "epoch": 0.9604630537273025, "grad_norm": 0.1278667002916336, "learning_rate": 3.3925e-05, "loss": 0.9328, "step": 643 }, { "epoch": 0.9619567754282781, "grad_norm": 0.11934813112020493, "learning_rate": 3.3900000000000004e-05, "loss": 0.7978, "step": 644 }, { "epoch": 0.9634504971292536, "grad_norm": 0.14180444180965424, "learning_rate": 3.3875000000000003e-05, "loss": 0.9193, "step": 645 }, { "epoch": 0.9649442188302292, "grad_norm": 0.11786319315433502, "learning_rate": 3.385e-05, "loss": 0.7871, "step": 646 }, { "epoch": 0.9664379405312048, "grad_norm": 0.12217289209365845, "learning_rate": 3.3825e-05, "loss": 0.7881, "step": 647 }, { "epoch": 0.9679316622321804, "grad_norm": 0.10589991509914398, "learning_rate": 3.38e-05, "loss": 0.7407, "step": 648 }, { "epoch": 0.9694253839331559, "grad_norm": 0.1312999725341797, "learning_rate": 3.3775e-05, "loss": 0.7883, "step": 649 }, { "epoch": 0.9709191056341315, "grad_norm": 0.11625451594591141, "learning_rate": 3.375000000000001e-05, "loss": 0.8124, "step": 650 }, { "epoch": 0.9724128273351071, "grad_norm": 0.1165098249912262, "learning_rate": 3.3725e-05, "loss": 0.8417, "step": 651 }, { "epoch": 0.9739065490360828, "grad_norm": 0.11921747028827667, "learning_rate": 3.3700000000000006e-05, "loss": 0.814, "step": 652 }, { "epoch": 0.9754002707370583, "grad_norm": 0.11282332241535187, "learning_rate": 3.3675e-05, "loss": 0.7288, "step": 653 }, { "epoch": 0.9768939924380339, "grad_norm": 0.11913489550352097, "learning_rate": 3.3650000000000005e-05, "loss": 0.8657, "step": 654 }, { "epoch": 0.9783877141390095, "grad_norm": 0.12776823341846466, "learning_rate": 3.3625000000000004e-05, "loss": 0.9329, "step": 655 }, { "epoch": 0.9798814358399851, "grad_norm": 0.11090242117643356, "learning_rate": 3.3600000000000004e-05, "loss": 0.7553, "step": 656 }, { "epoch": 0.9813751575409606, "grad_norm": 0.11579754203557968, "learning_rate": 3.3575e-05, "loss": 0.825, "step": 657 }, { "epoch": 0.9828688792419362, "grad_norm": 0.12329933047294617, "learning_rate": 3.355e-05, "loss": 0.8213, "step": 658 }, { "epoch": 0.9843626009429118, "grad_norm": 0.126212939620018, "learning_rate": 3.3525e-05, "loss": 0.7825, "step": 659 }, { "epoch": 0.9858563226438875, "grad_norm": 0.11927678436040878, "learning_rate": 3.35e-05, "loss": 0.9122, "step": 660 }, { "epoch": 0.987350044344863, "grad_norm": 0.12385684251785278, "learning_rate": 3.3475e-05, "loss": 0.8525, "step": 661 }, { "epoch": 0.9888437660458386, "grad_norm": 0.12204636633396149, "learning_rate": 3.345000000000001e-05, "loss": 0.9851, "step": 662 }, { "epoch": 0.9903374877468142, "grad_norm": 0.10816507786512375, "learning_rate": 3.3425e-05, "loss": 0.7869, "step": 663 }, { "epoch": 0.9918312094477898, "grad_norm": 0.11395607888698578, "learning_rate": 3.3400000000000005e-05, "loss": 0.8748, "step": 664 }, { "epoch": 0.9933249311487653, "grad_norm": 0.1163991168141365, "learning_rate": 3.3375e-05, "loss": 0.8626, "step": 665 }, { "epoch": 0.9948186528497409, "grad_norm": 0.11782239377498627, "learning_rate": 3.3350000000000004e-05, "loss": 0.8891, "step": 666 }, { "epoch": 0.9963123745507165, "grad_norm": 0.11901789158582687, "learning_rate": 3.3325000000000004e-05, "loss": 0.8252, "step": 667 }, { "epoch": 0.9978060962516921, "grad_norm": 0.12020383030176163, "learning_rate": 3.33e-05, "loss": 0.9567, "step": 668 }, { "epoch": 0.9992998179526676, "grad_norm": 0.1184314712882042, "learning_rate": 3.3275e-05, "loss": 0.7563, "step": 669 }, { "epoch": 1.0007935396536434, "grad_norm": 0.11987176537513733, "learning_rate": 3.325e-05, "loss": 0.8388, "step": 670 }, { "epoch": 1.0022872613546188, "grad_norm": 0.10995893180370331, "learning_rate": 3.3225e-05, "loss": 0.862, "step": 671 }, { "epoch": 1.0037809830555944, "grad_norm": 0.11009721457958221, "learning_rate": 3.32e-05, "loss": 0.7643, "step": 672 }, { "epoch": 1.00527470475657, "grad_norm": 0.11379164457321167, "learning_rate": 3.3175e-05, "loss": 0.7523, "step": 673 }, { "epoch": 1.0067684264575456, "grad_norm": 0.10723863542079926, "learning_rate": 3.3150000000000006e-05, "loss": 0.7339, "step": 674 }, { "epoch": 1.0082621481585212, "grad_norm": 0.11779145896434784, "learning_rate": 3.3125e-05, "loss": 0.8702, "step": 675 }, { "epoch": 1.0097558698594968, "grad_norm": 0.1139538437128067, "learning_rate": 3.3100000000000005e-05, "loss": 0.7524, "step": 676 }, { "epoch": 1.0112495915604724, "grad_norm": 0.12899470329284668, "learning_rate": 3.3075e-05, "loss": 0.8773, "step": 677 }, { "epoch": 1.012743313261448, "grad_norm": 0.10587508976459503, "learning_rate": 3.3050000000000004e-05, "loss": 0.8331, "step": 678 }, { "epoch": 1.0142370349624235, "grad_norm": 0.11931108683347702, "learning_rate": 3.3025e-05, "loss": 0.8372, "step": 679 }, { "epoch": 1.015730756663399, "grad_norm": 0.11700265854597092, "learning_rate": 3.3e-05, "loss": 0.7712, "step": 680 }, { "epoch": 1.0172244783643747, "grad_norm": 0.13131257891654968, "learning_rate": 3.2975e-05, "loss": 0.8945, "step": 681 }, { "epoch": 1.0187182000653503, "grad_norm": 0.11664092540740967, "learning_rate": 3.295e-05, "loss": 0.7756, "step": 682 }, { "epoch": 1.020211921766326, "grad_norm": 0.13824814558029175, "learning_rate": 3.2925e-05, "loss": 0.8837, "step": 683 }, { "epoch": 1.0217056434673015, "grad_norm": 0.12166835367679596, "learning_rate": 3.29e-05, "loss": 0.8789, "step": 684 }, { "epoch": 1.0231993651682771, "grad_norm": 0.11586460471153259, "learning_rate": 3.2875e-05, "loss": 0.8061, "step": 685 }, { "epoch": 1.0246930868692528, "grad_norm": 0.14503873884677887, "learning_rate": 3.2850000000000006e-05, "loss": 1.0775, "step": 686 }, { "epoch": 1.0261868085702281, "grad_norm": 0.11746609210968018, "learning_rate": 3.2825e-05, "loss": 0.7557, "step": 687 }, { "epoch": 1.0276805302712038, "grad_norm": 0.12008384615182877, "learning_rate": 3.2800000000000004e-05, "loss": 0.7923, "step": 688 }, { "epoch": 1.0291742519721794, "grad_norm": 0.12050891667604446, "learning_rate": 3.2775e-05, "loss": 0.711, "step": 689 }, { "epoch": 1.030667973673155, "grad_norm": 0.12483400851488113, "learning_rate": 3.275e-05, "loss": 0.8594, "step": 690 }, { "epoch": 1.0321616953741306, "grad_norm": 0.12449591606855392, "learning_rate": 3.2725e-05, "loss": 0.8707, "step": 691 }, { "epoch": 1.0336554170751062, "grad_norm": 0.11930117011070251, "learning_rate": 3.27e-05, "loss": 0.6629, "step": 692 }, { "epoch": 1.0351491387760818, "grad_norm": 0.10753221064805984, "learning_rate": 3.2675e-05, "loss": 0.6643, "step": 693 }, { "epoch": 1.0366428604770574, "grad_norm": 0.11001087725162506, "learning_rate": 3.265e-05, "loss": 0.7427, "step": 694 }, { "epoch": 1.038136582178033, "grad_norm": 0.1121913269162178, "learning_rate": 3.2625e-05, "loss": 0.8125, "step": 695 }, { "epoch": 1.0396303038790085, "grad_norm": 0.11637987196445465, "learning_rate": 3.26e-05, "loss": 0.8042, "step": 696 }, { "epoch": 1.041124025579984, "grad_norm": 0.13844354450702667, "learning_rate": 3.2575e-05, "loss": 0.9003, "step": 697 }, { "epoch": 1.0426177472809597, "grad_norm": 0.12216726690530777, "learning_rate": 3.2550000000000005e-05, "loss": 0.7845, "step": 698 }, { "epoch": 1.0441114689819353, "grad_norm": 0.12532316148281097, "learning_rate": 3.2525e-05, "loss": 0.8824, "step": 699 }, { "epoch": 1.045605190682911, "grad_norm": 0.12261206656694412, "learning_rate": 3.2500000000000004e-05, "loss": 0.8191, "step": 700 }, { "epoch": 1.0470989123838865, "grad_norm": 0.1161484345793724, "learning_rate": 3.2474999999999997e-05, "loss": 0.8953, "step": 701 }, { "epoch": 1.0485926340848621, "grad_norm": 0.12149600684642792, "learning_rate": 3.245e-05, "loss": 0.8345, "step": 702 }, { "epoch": 1.0500863557858378, "grad_norm": 0.12454210966825485, "learning_rate": 3.2425e-05, "loss": 0.8225, "step": 703 }, { "epoch": 1.0515800774868131, "grad_norm": 0.11556944251060486, "learning_rate": 3.24e-05, "loss": 0.7708, "step": 704 }, { "epoch": 1.0530737991877888, "grad_norm": 0.12368205189704895, "learning_rate": 3.2375e-05, "loss": 0.8936, "step": 705 }, { "epoch": 1.0545675208887644, "grad_norm": 0.13512544333934784, "learning_rate": 3.235e-05, "loss": 0.84, "step": 706 }, { "epoch": 1.05606124258974, "grad_norm": 0.11804860085248947, "learning_rate": 3.2325e-05, "loss": 0.8085, "step": 707 }, { "epoch": 1.0575549642907156, "grad_norm": 0.12429340183734894, "learning_rate": 3.2300000000000006e-05, "loss": 0.8752, "step": 708 }, { "epoch": 1.0590486859916912, "grad_norm": 0.12195641547441483, "learning_rate": 3.2275e-05, "loss": 0.9009, "step": 709 }, { "epoch": 1.0605424076926668, "grad_norm": 0.12330330908298492, "learning_rate": 3.2250000000000005e-05, "loss": 0.8695, "step": 710 }, { "epoch": 1.0620361293936424, "grad_norm": 0.12289360910654068, "learning_rate": 3.2225e-05, "loss": 0.797, "step": 711 }, { "epoch": 1.0635298510946178, "grad_norm": 0.13036511838436127, "learning_rate": 3.2200000000000003e-05, "loss": 0.8851, "step": 712 }, { "epoch": 1.0650235727955935, "grad_norm": 0.11787336319684982, "learning_rate": 3.2175e-05, "loss": 0.8715, "step": 713 }, { "epoch": 1.066517294496569, "grad_norm": 0.1291126161813736, "learning_rate": 3.215e-05, "loss": 0.9772, "step": 714 }, { "epoch": 1.0680110161975447, "grad_norm": 0.12259869277477264, "learning_rate": 3.2125e-05, "loss": 0.8915, "step": 715 }, { "epoch": 1.0695047378985203, "grad_norm": 0.11586686968803406, "learning_rate": 3.21e-05, "loss": 0.7973, "step": 716 }, { "epoch": 1.070998459599496, "grad_norm": 0.12052147090435028, "learning_rate": 3.2075e-05, "loss": 0.9131, "step": 717 }, { "epoch": 1.0724921813004715, "grad_norm": 0.11403396725654602, "learning_rate": 3.205e-05, "loss": 0.7443, "step": 718 }, { "epoch": 1.0739859030014471, "grad_norm": 0.12816496193408966, "learning_rate": 3.2025e-05, "loss": 0.7614, "step": 719 }, { "epoch": 1.0754796247024225, "grad_norm": 0.12989261746406555, "learning_rate": 3.2000000000000005e-05, "loss": 0.8321, "step": 720 }, { "epoch": 1.0769733464033981, "grad_norm": 0.11382956802845001, "learning_rate": 3.1975e-05, "loss": 0.7576, "step": 721 }, { "epoch": 1.0784670681043738, "grad_norm": 0.11391491442918777, "learning_rate": 3.1950000000000004e-05, "loss": 0.769, "step": 722 }, { "epoch": 1.0799607898053494, "grad_norm": 0.10964512825012207, "learning_rate": 3.1925e-05, "loss": 0.7953, "step": 723 }, { "epoch": 1.081454511506325, "grad_norm": 0.14847230911254883, "learning_rate": 3.19e-05, "loss": 0.9569, "step": 724 }, { "epoch": 1.0829482332073006, "grad_norm": 0.11573031544685364, "learning_rate": 3.1875e-05, "loss": 0.7193, "step": 725 }, { "epoch": 1.0844419549082762, "grad_norm": 0.12043916434049606, "learning_rate": 3.185e-05, "loss": 0.8117, "step": 726 }, { "epoch": 1.0859356766092518, "grad_norm": 0.12115609645843506, "learning_rate": 3.1825e-05, "loss": 0.8234, "step": 727 }, { "epoch": 1.0874293983102272, "grad_norm": 0.1124582514166832, "learning_rate": 3.18e-05, "loss": 0.7539, "step": 728 }, { "epoch": 1.0889231200112028, "grad_norm": 0.11602813005447388, "learning_rate": 3.1775e-05, "loss": 0.8078, "step": 729 }, { "epoch": 1.0904168417121785, "grad_norm": 0.12953437864780426, "learning_rate": 3.175e-05, "loss": 0.809, "step": 730 }, { "epoch": 1.091910563413154, "grad_norm": 0.11873941123485565, "learning_rate": 3.1725e-05, "loss": 0.9121, "step": 731 }, { "epoch": 1.0934042851141297, "grad_norm": 0.11871200054883957, "learning_rate": 3.1700000000000005e-05, "loss": 0.8006, "step": 732 }, { "epoch": 1.0948980068151053, "grad_norm": 0.1272260546684265, "learning_rate": 3.1675e-05, "loss": 0.7767, "step": 733 }, { "epoch": 1.096391728516081, "grad_norm": 0.13049134612083435, "learning_rate": 3.1650000000000004e-05, "loss": 0.8032, "step": 734 }, { "epoch": 1.0978854502170565, "grad_norm": 0.12206856906414032, "learning_rate": 3.1624999999999996e-05, "loss": 0.7866, "step": 735 }, { "epoch": 1.0993791719180321, "grad_norm": 0.12864869832992554, "learning_rate": 3.16e-05, "loss": 0.9179, "step": 736 }, { "epoch": 1.1008728936190075, "grad_norm": 0.11986593157052994, "learning_rate": 3.1575e-05, "loss": 0.7402, "step": 737 }, { "epoch": 1.1023666153199831, "grad_norm": 0.11580535769462585, "learning_rate": 3.155e-05, "loss": 0.699, "step": 738 }, { "epoch": 1.1038603370209588, "grad_norm": 0.1255674660205841, "learning_rate": 3.1525e-05, "loss": 0.7978, "step": 739 }, { "epoch": 1.1053540587219344, "grad_norm": 0.14150987565517426, "learning_rate": 3.15e-05, "loss": 0.8792, "step": 740 }, { "epoch": 1.10684778042291, "grad_norm": 0.1173759400844574, "learning_rate": 3.1475e-05, "loss": 0.8375, "step": 741 }, { "epoch": 1.1083415021238856, "grad_norm": 0.11646619439125061, "learning_rate": 3.145e-05, "loss": 0.8806, "step": 742 }, { "epoch": 1.1098352238248612, "grad_norm": 0.12605668604373932, "learning_rate": 3.1425e-05, "loss": 0.813, "step": 743 }, { "epoch": 1.1113289455258366, "grad_norm": 0.11172299832105637, "learning_rate": 3.1400000000000004e-05, "loss": 0.7607, "step": 744 }, { "epoch": 1.1128226672268122, "grad_norm": 0.12353788316249847, "learning_rate": 3.1375e-05, "loss": 0.8128, "step": 745 }, { "epoch": 1.1143163889277878, "grad_norm": 0.12126065045595169, "learning_rate": 3.135e-05, "loss": 0.8038, "step": 746 }, { "epoch": 1.1158101106287635, "grad_norm": 0.12446154654026031, "learning_rate": 3.1324999999999996e-05, "loss": 0.8757, "step": 747 }, { "epoch": 1.117303832329739, "grad_norm": 0.12315195798873901, "learning_rate": 3.13e-05, "loss": 0.9049, "step": 748 }, { "epoch": 1.1187975540307147, "grad_norm": 0.12286493927240372, "learning_rate": 3.1275e-05, "loss": 0.885, "step": 749 }, { "epoch": 1.1202912757316903, "grad_norm": 0.12597855925559998, "learning_rate": 3.125e-05, "loss": 0.8101, "step": 750 }, { "epoch": 1.121784997432666, "grad_norm": 0.11519240587949753, "learning_rate": 3.122500000000001e-05, "loss": 0.7711, "step": 751 }, { "epoch": 1.1232787191336415, "grad_norm": 0.125743567943573, "learning_rate": 3.12e-05, "loss": 0.9022, "step": 752 }, { "epoch": 1.124772440834617, "grad_norm": 0.1366565227508545, "learning_rate": 3.1175000000000006e-05, "loss": 0.9011, "step": 753 }, { "epoch": 1.1262661625355925, "grad_norm": 0.12499212473630905, "learning_rate": 3.115e-05, "loss": 0.8263, "step": 754 }, { "epoch": 1.1277598842365681, "grad_norm": 0.1193777322769165, "learning_rate": 3.1125000000000004e-05, "loss": 0.8799, "step": 755 }, { "epoch": 1.1292536059375438, "grad_norm": 0.1280308961868286, "learning_rate": 3.1100000000000004e-05, "loss": 0.7456, "step": 756 }, { "epoch": 1.1307473276385194, "grad_norm": 0.12463478744029999, "learning_rate": 3.1075e-05, "loss": 0.8987, "step": 757 }, { "epoch": 1.132241049339495, "grad_norm": 0.12268930673599243, "learning_rate": 3.105e-05, "loss": 0.7652, "step": 758 }, { "epoch": 1.1337347710404706, "grad_norm": 0.11514697968959808, "learning_rate": 3.1025e-05, "loss": 0.7736, "step": 759 }, { "epoch": 1.135228492741446, "grad_norm": 0.1284126192331314, "learning_rate": 3.1e-05, "loss": 0.8472, "step": 760 }, { "epoch": 1.1367222144424216, "grad_norm": 0.1332356333732605, "learning_rate": 3.0975e-05, "loss": 0.8292, "step": 761 }, { "epoch": 1.1382159361433972, "grad_norm": 0.12065643817186356, "learning_rate": 3.095e-05, "loss": 0.806, "step": 762 }, { "epoch": 1.1397096578443728, "grad_norm": 0.11629503965377808, "learning_rate": 3.0925000000000006e-05, "loss": 0.8281, "step": 763 }, { "epoch": 1.1412033795453485, "grad_norm": 0.11980944126844406, "learning_rate": 3.09e-05, "loss": 0.7989, "step": 764 }, { "epoch": 1.142697101246324, "grad_norm": 0.12815812230110168, "learning_rate": 3.0875000000000005e-05, "loss": 0.9416, "step": 765 }, { "epoch": 1.1441908229472997, "grad_norm": 0.12114842236042023, "learning_rate": 3.0850000000000004e-05, "loss": 0.848, "step": 766 }, { "epoch": 1.1456845446482753, "grad_norm": 0.11947838962078094, "learning_rate": 3.0825000000000004e-05, "loss": 0.8104, "step": 767 }, { "epoch": 1.147178266349251, "grad_norm": 0.13232921063899994, "learning_rate": 3.08e-05, "loss": 0.8528, "step": 768 }, { "epoch": 1.1486719880502263, "grad_norm": 0.11369500309228897, "learning_rate": 3.0775e-05, "loss": 0.8398, "step": 769 }, { "epoch": 1.150165709751202, "grad_norm": 0.1403481811285019, "learning_rate": 3.075e-05, "loss": 1.0204, "step": 770 }, { "epoch": 1.1516594314521775, "grad_norm": 0.12089207023382187, "learning_rate": 3.0725e-05, "loss": 0.8621, "step": 771 }, { "epoch": 1.1531531531531531, "grad_norm": 0.1267215609550476, "learning_rate": 3.07e-05, "loss": 0.8926, "step": 772 }, { "epoch": 1.1546468748541288, "grad_norm": 0.11243720352649689, "learning_rate": 3.067500000000001e-05, "loss": 0.6893, "step": 773 }, { "epoch": 1.1561405965551044, "grad_norm": 0.13128440082073212, "learning_rate": 3.065e-05, "loss": 0.911, "step": 774 }, { "epoch": 1.15763431825608, "grad_norm": 0.11187858879566193, "learning_rate": 3.0625000000000006e-05, "loss": 0.8151, "step": 775 }, { "epoch": 1.1591280399570554, "grad_norm": 0.12534433603286743, "learning_rate": 3.06e-05, "loss": 0.8513, "step": 776 }, { "epoch": 1.160621761658031, "grad_norm": 0.13090412318706512, "learning_rate": 3.0575000000000005e-05, "loss": 0.9618, "step": 777 }, { "epoch": 1.1621154833590066, "grad_norm": 0.11767179518938065, "learning_rate": 3.0550000000000004e-05, "loss": 0.714, "step": 778 }, { "epoch": 1.1636092050599822, "grad_norm": 0.11768019199371338, "learning_rate": 3.0525e-05, "loss": 0.8205, "step": 779 }, { "epoch": 1.1651029267609578, "grad_norm": 0.12319672107696533, "learning_rate": 3.05e-05, "loss": 0.8609, "step": 780 }, { "epoch": 1.1665966484619334, "grad_norm": 0.12127240002155304, "learning_rate": 3.0475000000000002e-05, "loss": 0.8835, "step": 781 }, { "epoch": 1.168090370162909, "grad_norm": 0.12307175993919373, "learning_rate": 3.045e-05, "loss": 0.8649, "step": 782 }, { "epoch": 1.1695840918638847, "grad_norm": 0.11641137301921844, "learning_rate": 3.0425000000000004e-05, "loss": 0.8259, "step": 783 }, { "epoch": 1.1710778135648603, "grad_norm": 0.12359960377216339, "learning_rate": 3.04e-05, "loss": 0.9358, "step": 784 }, { "epoch": 1.1725715352658357, "grad_norm": 0.1211109459400177, "learning_rate": 3.0375000000000003e-05, "loss": 0.8045, "step": 785 }, { "epoch": 1.1740652569668113, "grad_norm": 0.12040197849273682, "learning_rate": 3.035e-05, "loss": 0.8812, "step": 786 }, { "epoch": 1.175558978667787, "grad_norm": 0.15181660652160645, "learning_rate": 3.0325000000000002e-05, "loss": 0.8263, "step": 787 }, { "epoch": 1.1770527003687625, "grad_norm": 0.11324644833803177, "learning_rate": 3.03e-05, "loss": 0.7949, "step": 788 }, { "epoch": 1.1785464220697381, "grad_norm": 0.12861394882202148, "learning_rate": 3.0275000000000004e-05, "loss": 0.8304, "step": 789 }, { "epoch": 1.1800401437707138, "grad_norm": 0.12461481243371964, "learning_rate": 3.025e-05, "loss": 0.8255, "step": 790 }, { "epoch": 1.1815338654716894, "grad_norm": 0.13339683413505554, "learning_rate": 3.0225000000000003e-05, "loss": 0.7848, "step": 791 }, { "epoch": 1.183027587172665, "grad_norm": 0.1211990937590599, "learning_rate": 3.02e-05, "loss": 0.9068, "step": 792 }, { "epoch": 1.1845213088736406, "grad_norm": 0.1267743855714798, "learning_rate": 3.0175e-05, "loss": 0.8615, "step": 793 }, { "epoch": 1.186015030574616, "grad_norm": 0.1141204759478569, "learning_rate": 3.015e-05, "loss": 0.7774, "step": 794 }, { "epoch": 1.1875087522755916, "grad_norm": 0.11912230402231216, "learning_rate": 3.0125000000000004e-05, "loss": 0.7192, "step": 795 }, { "epoch": 1.1890024739765672, "grad_norm": 0.12390036135911942, "learning_rate": 3.01e-05, "loss": 0.7269, "step": 796 }, { "epoch": 1.1904961956775428, "grad_norm": 0.12176775187253952, "learning_rate": 3.0075000000000003e-05, "loss": 0.8833, "step": 797 }, { "epoch": 1.1919899173785184, "grad_norm": 0.12040335685014725, "learning_rate": 3.0050000000000002e-05, "loss": 0.8719, "step": 798 }, { "epoch": 1.193483639079494, "grad_norm": 0.12452569603919983, "learning_rate": 3.0025000000000005e-05, "loss": 0.8849, "step": 799 }, { "epoch": 1.1949773607804697, "grad_norm": 0.11826182156801224, "learning_rate": 3e-05, "loss": 0.8282, "step": 800 }, { "epoch": 1.196471082481445, "grad_norm": 0.12290486693382263, "learning_rate": 2.9975000000000004e-05, "loss": 0.8323, "step": 801 }, { "epoch": 1.1979648041824207, "grad_norm": 0.13511580228805542, "learning_rate": 2.995e-05, "loss": 0.865, "step": 802 }, { "epoch": 1.1994585258833963, "grad_norm": 0.1284627765417099, "learning_rate": 2.9925000000000002e-05, "loss": 0.7378, "step": 803 }, { "epoch": 1.200952247584372, "grad_norm": 0.11779852211475372, "learning_rate": 2.9900000000000002e-05, "loss": 0.7467, "step": 804 }, { "epoch": 1.2024459692853475, "grad_norm": 0.12605655193328857, "learning_rate": 2.9875000000000004e-05, "loss": 0.792, "step": 805 }, { "epoch": 1.2039396909863231, "grad_norm": 0.1197223886847496, "learning_rate": 2.985e-05, "loss": 0.7439, "step": 806 }, { "epoch": 1.2054334126872988, "grad_norm": 0.12521962821483612, "learning_rate": 2.9825000000000003e-05, "loss": 0.7743, "step": 807 }, { "epoch": 1.2069271343882744, "grad_norm": 0.10667683184146881, "learning_rate": 2.98e-05, "loss": 0.7418, "step": 808 }, { "epoch": 1.20842085608925, "grad_norm": 0.1281447559595108, "learning_rate": 2.9775000000000002e-05, "loss": 0.9072, "step": 809 }, { "epoch": 1.2099145777902254, "grad_norm": 0.13020466268062592, "learning_rate": 2.975e-05, "loss": 0.8492, "step": 810 }, { "epoch": 1.211408299491201, "grad_norm": 0.11372591555118561, "learning_rate": 2.9725000000000004e-05, "loss": 0.7949, "step": 811 }, { "epoch": 1.2129020211921766, "grad_norm": 0.11412885040044785, "learning_rate": 2.97e-05, "loss": 0.7713, "step": 812 }, { "epoch": 1.2143957428931522, "grad_norm": 0.12519080936908722, "learning_rate": 2.9675000000000003e-05, "loss": 0.7604, "step": 813 }, { "epoch": 1.2158894645941278, "grad_norm": 0.12963882088661194, "learning_rate": 2.965e-05, "loss": 0.9017, "step": 814 }, { "epoch": 1.2173831862951034, "grad_norm": 0.12160071730613708, "learning_rate": 2.9625000000000002e-05, "loss": 0.7521, "step": 815 }, { "epoch": 1.218876907996079, "grad_norm": 0.12326373159885406, "learning_rate": 2.96e-05, "loss": 0.8959, "step": 816 }, { "epoch": 1.2203706296970545, "grad_norm": 0.1230199784040451, "learning_rate": 2.9575000000000004e-05, "loss": 0.8495, "step": 817 }, { "epoch": 1.22186435139803, "grad_norm": 0.13004879653453827, "learning_rate": 2.955e-05, "loss": 0.8792, "step": 818 }, { "epoch": 1.2233580730990057, "grad_norm": 0.13560228049755096, "learning_rate": 2.9525000000000003e-05, "loss": 0.9356, "step": 819 }, { "epoch": 1.2248517947999813, "grad_norm": 0.11816021800041199, "learning_rate": 2.95e-05, "loss": 0.7903, "step": 820 }, { "epoch": 1.226345516500957, "grad_norm": 0.1120637059211731, "learning_rate": 2.9475e-05, "loss": 0.7779, "step": 821 }, { "epoch": 1.2278392382019325, "grad_norm": 0.12377166002988815, "learning_rate": 2.945e-05, "loss": 0.8345, "step": 822 }, { "epoch": 1.2293329599029081, "grad_norm": 0.12971679866313934, "learning_rate": 2.9425000000000004e-05, "loss": 0.866, "step": 823 }, { "epoch": 1.2308266816038838, "grad_norm": 0.11805073171854019, "learning_rate": 2.94e-05, "loss": 0.8675, "step": 824 }, { "epoch": 1.2323204033048594, "grad_norm": 0.13392971456050873, "learning_rate": 2.9375000000000003e-05, "loss": 0.7436, "step": 825 }, { "epoch": 1.2338141250058348, "grad_norm": 0.12267670035362244, "learning_rate": 2.935e-05, "loss": 0.808, "step": 826 }, { "epoch": 1.2353078467068104, "grad_norm": 0.11747957766056061, "learning_rate": 2.9325e-05, "loss": 0.8167, "step": 827 }, { "epoch": 1.236801568407786, "grad_norm": 0.12998729944229126, "learning_rate": 2.93e-05, "loss": 0.8897, "step": 828 }, { "epoch": 1.2382952901087616, "grad_norm": 0.12338648736476898, "learning_rate": 2.9275000000000003e-05, "loss": 0.8348, "step": 829 }, { "epoch": 1.2397890118097372, "grad_norm": 0.12606552243232727, "learning_rate": 2.925e-05, "loss": 0.8225, "step": 830 }, { "epoch": 1.2412827335107128, "grad_norm": 0.12477975338697433, "learning_rate": 2.9225000000000002e-05, "loss": 0.9242, "step": 831 }, { "epoch": 1.2427764552116884, "grad_norm": 0.1399284303188324, "learning_rate": 2.9199999999999998e-05, "loss": 0.9143, "step": 832 }, { "epoch": 1.2442701769126638, "grad_norm": 0.11336173862218857, "learning_rate": 2.9175e-05, "loss": 0.7624, "step": 833 }, { "epoch": 1.2457638986136395, "grad_norm": 0.1283206045627594, "learning_rate": 2.915e-05, "loss": 0.8395, "step": 834 }, { "epoch": 1.247257620314615, "grad_norm": 0.12334034591913223, "learning_rate": 2.9125000000000003e-05, "loss": 0.9128, "step": 835 }, { "epoch": 1.2487513420155907, "grad_norm": 0.14862501621246338, "learning_rate": 2.91e-05, "loss": 0.8431, "step": 836 }, { "epoch": 1.2502450637165663, "grad_norm": 0.12373830378055573, "learning_rate": 2.9075000000000002e-05, "loss": 0.8737, "step": 837 }, { "epoch": 1.251738785417542, "grad_norm": 0.1208437830209732, "learning_rate": 2.9049999999999998e-05, "loss": 0.8386, "step": 838 }, { "epoch": 1.2532325071185175, "grad_norm": 0.11459632962942123, "learning_rate": 2.9025e-05, "loss": 0.7379, "step": 839 }, { "epoch": 1.2547262288194931, "grad_norm": 0.11718658357858658, "learning_rate": 2.9e-05, "loss": 0.826, "step": 840 }, { "epoch": 1.2562199505204688, "grad_norm": 0.13426974415779114, "learning_rate": 2.8975000000000003e-05, "loss": 0.8722, "step": 841 }, { "epoch": 1.2577136722214441, "grad_norm": 0.11956042796373367, "learning_rate": 2.895e-05, "loss": 0.7715, "step": 842 }, { "epoch": 1.2592073939224198, "grad_norm": 0.12248075008392334, "learning_rate": 2.8925000000000002e-05, "loss": 0.9034, "step": 843 }, { "epoch": 1.2607011156233954, "grad_norm": 0.1273433119058609, "learning_rate": 2.8899999999999998e-05, "loss": 0.7758, "step": 844 }, { "epoch": 1.262194837324371, "grad_norm": 0.1338386833667755, "learning_rate": 2.8875e-05, "loss": 0.9501, "step": 845 }, { "epoch": 1.2636885590253466, "grad_norm": 0.11684931814670563, "learning_rate": 2.885e-05, "loss": 0.788, "step": 846 }, { "epoch": 1.2651822807263222, "grad_norm": 0.12744741141796112, "learning_rate": 2.8825000000000003e-05, "loss": 0.8533, "step": 847 }, { "epoch": 1.2666760024272978, "grad_norm": 0.1315309703350067, "learning_rate": 2.88e-05, "loss": 0.9569, "step": 848 }, { "epoch": 1.2681697241282732, "grad_norm": 0.1298670768737793, "learning_rate": 2.8775e-05, "loss": 0.9025, "step": 849 }, { "epoch": 1.269663445829249, "grad_norm": 0.1400361806154251, "learning_rate": 2.8749999999999997e-05, "loss": 0.8329, "step": 850 }, { "epoch": 1.2711571675302245, "grad_norm": 0.13071084022521973, "learning_rate": 2.8725e-05, "loss": 0.9038, "step": 851 }, { "epoch": 1.2726508892312, "grad_norm": 0.12083021551370621, "learning_rate": 2.87e-05, "loss": 0.7976, "step": 852 }, { "epoch": 1.2741446109321757, "grad_norm": 0.14189660549163818, "learning_rate": 2.8675000000000002e-05, "loss": 0.9083, "step": 853 }, { "epoch": 1.2756383326331513, "grad_norm": 0.13255542516708374, "learning_rate": 2.865e-05, "loss": 0.9074, "step": 854 }, { "epoch": 1.277132054334127, "grad_norm": 0.12679646909236908, "learning_rate": 2.8625e-05, "loss": 0.9341, "step": 855 }, { "epoch": 1.2786257760351025, "grad_norm": 0.11633843183517456, "learning_rate": 2.86e-05, "loss": 0.7427, "step": 856 }, { "epoch": 1.2801194977360781, "grad_norm": 0.12440744042396545, "learning_rate": 2.8575000000000003e-05, "loss": 0.8483, "step": 857 }, { "epoch": 1.2816132194370535, "grad_norm": 0.12180498242378235, "learning_rate": 2.855e-05, "loss": 0.7353, "step": 858 }, { "epoch": 1.2831069411380291, "grad_norm": 0.11779835820198059, "learning_rate": 2.8525000000000002e-05, "loss": 0.799, "step": 859 }, { "epoch": 1.2846006628390048, "grad_norm": 0.13578537106513977, "learning_rate": 2.8499999999999998e-05, "loss": 0.7484, "step": 860 }, { "epoch": 1.2860943845399804, "grad_norm": 0.12224894016981125, "learning_rate": 2.8475e-05, "loss": 0.8396, "step": 861 }, { "epoch": 1.287588106240956, "grad_norm": 0.12585173547267914, "learning_rate": 2.845e-05, "loss": 0.8337, "step": 862 }, { "epoch": 1.2890818279419316, "grad_norm": 0.11971063911914825, "learning_rate": 2.8425000000000003e-05, "loss": 0.8524, "step": 863 }, { "epoch": 1.2905755496429072, "grad_norm": 0.13245628774166107, "learning_rate": 2.84e-05, "loss": 0.9186, "step": 864 }, { "epoch": 1.2920692713438826, "grad_norm": 0.12556052207946777, "learning_rate": 2.8375000000000002e-05, "loss": 0.8287, "step": 865 }, { "epoch": 1.2935629930448584, "grad_norm": 0.11950189620256424, "learning_rate": 2.8349999999999998e-05, "loss": 0.7928, "step": 866 }, { "epoch": 1.2950567147458338, "grad_norm": 0.12176921963691711, "learning_rate": 2.8325e-05, "loss": 0.8424, "step": 867 }, { "epoch": 1.2965504364468095, "grad_norm": 0.12262732535600662, "learning_rate": 2.83e-05, "loss": 0.7806, "step": 868 }, { "epoch": 1.298044158147785, "grad_norm": 0.13159622251987457, "learning_rate": 2.8275000000000003e-05, "loss": 0.9156, "step": 869 }, { "epoch": 1.2995378798487607, "grad_norm": 0.11610793322324753, "learning_rate": 2.825e-05, "loss": 0.803, "step": 870 }, { "epoch": 1.3010316015497363, "grad_norm": 0.12151205539703369, "learning_rate": 2.8225e-05, "loss": 0.8206, "step": 871 }, { "epoch": 1.302525323250712, "grad_norm": 0.13041004538536072, "learning_rate": 2.8199999999999998e-05, "loss": 0.8986, "step": 872 }, { "epoch": 1.3040190449516875, "grad_norm": 0.1163647398352623, "learning_rate": 2.8175e-05, "loss": 0.7676, "step": 873 }, { "epoch": 1.305512766652663, "grad_norm": 0.13392485678195953, "learning_rate": 2.815e-05, "loss": 0.8614, "step": 874 }, { "epoch": 1.3070064883536388, "grad_norm": 0.14276322722434998, "learning_rate": 2.8125000000000003e-05, "loss": 0.9141, "step": 875 }, { "epoch": 1.3085002100546141, "grad_norm": 0.11519097536802292, "learning_rate": 2.8100000000000005e-05, "loss": 0.8055, "step": 876 }, { "epoch": 1.3099939317555898, "grad_norm": 0.12255548685789108, "learning_rate": 2.8075e-05, "loss": 0.9028, "step": 877 }, { "epoch": 1.3114876534565654, "grad_norm": 0.12805688381195068, "learning_rate": 2.8050000000000004e-05, "loss": 0.9277, "step": 878 }, { "epoch": 1.312981375157541, "grad_norm": 0.1194475069642067, "learning_rate": 2.8025e-05, "loss": 0.6876, "step": 879 }, { "epoch": 1.3144750968585166, "grad_norm": 0.13115958869457245, "learning_rate": 2.8000000000000003e-05, "loss": 0.8271, "step": 880 }, { "epoch": 1.3159688185594922, "grad_norm": 0.11539898067712784, "learning_rate": 2.7975000000000002e-05, "loss": 0.798, "step": 881 }, { "epoch": 1.3174625402604678, "grad_norm": 0.14462490379810333, "learning_rate": 2.7950000000000005e-05, "loss": 0.9912, "step": 882 }, { "epoch": 1.3189562619614432, "grad_norm": 0.13063012063503265, "learning_rate": 2.7925e-05, "loss": 0.8955, "step": 883 }, { "epoch": 1.3204499836624188, "grad_norm": 0.13040393590927124, "learning_rate": 2.7900000000000004e-05, "loss": 0.8437, "step": 884 }, { "epoch": 1.3219437053633945, "grad_norm": 0.12559951841831207, "learning_rate": 2.7875e-05, "loss": 0.8256, "step": 885 }, { "epoch": 1.32343742706437, "grad_norm": 0.12228979915380478, "learning_rate": 2.7850000000000003e-05, "loss": 0.7951, "step": 886 }, { "epoch": 1.3249311487653457, "grad_norm": 0.1279095560312271, "learning_rate": 2.7825000000000002e-05, "loss": 0.8527, "step": 887 }, { "epoch": 1.3264248704663213, "grad_norm": 0.13196344673633575, "learning_rate": 2.7800000000000005e-05, "loss": 0.9143, "step": 888 }, { "epoch": 1.327918592167297, "grad_norm": 0.12417804449796677, "learning_rate": 2.7775e-05, "loss": 0.8301, "step": 889 }, { "epoch": 1.3294123138682723, "grad_norm": 0.11949960887432098, "learning_rate": 2.7750000000000004e-05, "loss": 0.7203, "step": 890 }, { "epoch": 1.3309060355692481, "grad_norm": 0.11880392581224442, "learning_rate": 2.7725e-05, "loss": 0.8152, "step": 891 }, { "epoch": 1.3323997572702235, "grad_norm": 0.12138735502958298, "learning_rate": 2.7700000000000002e-05, "loss": 0.8348, "step": 892 }, { "epoch": 1.3338934789711991, "grad_norm": 0.11640416830778122, "learning_rate": 2.7675000000000002e-05, "loss": 0.7456, "step": 893 }, { "epoch": 1.3353872006721748, "grad_norm": 0.12531030178070068, "learning_rate": 2.7650000000000005e-05, "loss": 0.9137, "step": 894 }, { "epoch": 1.3368809223731504, "grad_norm": 0.126966193318367, "learning_rate": 2.7625e-05, "loss": 0.8638, "step": 895 }, { "epoch": 1.338374644074126, "grad_norm": 0.13121813535690308, "learning_rate": 2.7600000000000003e-05, "loss": 0.8261, "step": 896 }, { "epoch": 1.3398683657751016, "grad_norm": 0.13045121729373932, "learning_rate": 2.7575e-05, "loss": 0.8367, "step": 897 }, { "epoch": 1.3413620874760772, "grad_norm": 0.12211679667234421, "learning_rate": 2.7550000000000002e-05, "loss": 0.8988, "step": 898 }, { "epoch": 1.3428558091770526, "grad_norm": 0.13771837949752808, "learning_rate": 2.7525e-05, "loss": 0.9385, "step": 899 }, { "epoch": 1.3443495308780282, "grad_norm": 0.12631534039974213, "learning_rate": 2.7500000000000004e-05, "loss": 0.9086, "step": 900 }, { "epoch": 1.3458432525790038, "grad_norm": 0.13280881941318512, "learning_rate": 2.7475e-05, "loss": 0.9165, "step": 901 }, { "epoch": 1.3473369742799794, "grad_norm": 0.13106752932071686, "learning_rate": 2.7450000000000003e-05, "loss": 0.8859, "step": 902 }, { "epoch": 1.348830695980955, "grad_norm": 0.12267035245895386, "learning_rate": 2.7425e-05, "loss": 0.8885, "step": 903 }, { "epoch": 1.3503244176819307, "grad_norm": 0.11973594129085541, "learning_rate": 2.7400000000000002e-05, "loss": 0.7927, "step": 904 }, { "epoch": 1.3518181393829063, "grad_norm": 0.12059591710567474, "learning_rate": 2.7375e-05, "loss": 0.7829, "step": 905 }, { "epoch": 1.3533118610838817, "grad_norm": 0.12863373756408691, "learning_rate": 2.7350000000000004e-05, "loss": 0.8312, "step": 906 }, { "epoch": 1.3548055827848575, "grad_norm": 0.14020320773124695, "learning_rate": 2.7325e-05, "loss": 0.8858, "step": 907 }, { "epoch": 1.356299304485833, "grad_norm": 0.12440166622400284, "learning_rate": 2.7300000000000003e-05, "loss": 0.835, "step": 908 }, { "epoch": 1.3577930261868085, "grad_norm": 0.1348244845867157, "learning_rate": 2.7275e-05, "loss": 0.9583, "step": 909 }, { "epoch": 1.3592867478877841, "grad_norm": 0.12796446681022644, "learning_rate": 2.725e-05, "loss": 0.8501, "step": 910 }, { "epoch": 1.3607804695887598, "grad_norm": 0.13511206209659576, "learning_rate": 2.7225e-05, "loss": 0.9567, "step": 911 }, { "epoch": 1.3622741912897354, "grad_norm": 0.12794198095798492, "learning_rate": 2.7200000000000004e-05, "loss": 0.7893, "step": 912 }, { "epoch": 1.363767912990711, "grad_norm": 0.1381368488073349, "learning_rate": 2.7175e-05, "loss": 1.0099, "step": 913 }, { "epoch": 1.3652616346916866, "grad_norm": 0.12837548553943634, "learning_rate": 2.7150000000000003e-05, "loss": 0.9286, "step": 914 }, { "epoch": 1.366755356392662, "grad_norm": 0.1281077265739441, "learning_rate": 2.7125000000000002e-05, "loss": 0.8275, "step": 915 }, { "epoch": 1.3682490780936376, "grad_norm": 0.12404779344797134, "learning_rate": 2.7100000000000005e-05, "loss": 0.7344, "step": 916 }, { "epoch": 1.3697427997946132, "grad_norm": 0.11918750405311584, "learning_rate": 2.7075e-05, "loss": 0.8695, "step": 917 }, { "epoch": 1.3712365214955888, "grad_norm": 0.12467345595359802, "learning_rate": 2.7050000000000004e-05, "loss": 0.8611, "step": 918 }, { "epoch": 1.3727302431965644, "grad_norm": 0.1300368458032608, "learning_rate": 2.7025e-05, "loss": 0.9732, "step": 919 }, { "epoch": 1.37422396489754, "grad_norm": 0.11988348513841629, "learning_rate": 2.7000000000000002e-05, "loss": 0.8673, "step": 920 }, { "epoch": 1.3757176865985157, "grad_norm": 0.11603394895792007, "learning_rate": 2.6975000000000002e-05, "loss": 0.737, "step": 921 }, { "epoch": 1.377211408299491, "grad_norm": 0.12434028834104538, "learning_rate": 2.6950000000000005e-05, "loss": 0.8604, "step": 922 }, { "epoch": 1.378705130000467, "grad_norm": 0.12479966133832932, "learning_rate": 2.6925e-05, "loss": 0.8375, "step": 923 }, { "epoch": 1.3801988517014423, "grad_norm": 0.13643182814121246, "learning_rate": 2.6900000000000003e-05, "loss": 0.903, "step": 924 }, { "epoch": 1.381692573402418, "grad_norm": 0.12003342062234879, "learning_rate": 2.6875e-05, "loss": 0.8771, "step": 925 }, { "epoch": 1.3831862951033935, "grad_norm": 0.13435302674770355, "learning_rate": 2.6850000000000002e-05, "loss": 0.8849, "step": 926 }, { "epoch": 1.3846800168043691, "grad_norm": 0.12296207249164581, "learning_rate": 2.6825e-05, "loss": 0.9567, "step": 927 }, { "epoch": 1.3861737385053448, "grad_norm": 0.1464330554008484, "learning_rate": 2.6800000000000004e-05, "loss": 0.9514, "step": 928 }, { "epoch": 1.3876674602063204, "grad_norm": 0.11934661120176315, "learning_rate": 2.6775e-05, "loss": 0.7414, "step": 929 }, { "epoch": 1.389161181907296, "grad_norm": 0.12199968099594116, "learning_rate": 2.6750000000000003e-05, "loss": 0.8483, "step": 930 }, { "epoch": 1.3906549036082714, "grad_norm": 0.13648547232151031, "learning_rate": 2.6725e-05, "loss": 0.8901, "step": 931 }, { "epoch": 1.392148625309247, "grad_norm": 0.12130739539861679, "learning_rate": 2.6700000000000002e-05, "loss": 0.8099, "step": 932 }, { "epoch": 1.3936423470102226, "grad_norm": 0.12446040660142899, "learning_rate": 2.6675e-05, "loss": 0.8796, "step": 933 }, { "epoch": 1.3951360687111982, "grad_norm": 0.12485441565513611, "learning_rate": 2.6650000000000004e-05, "loss": 0.84, "step": 934 }, { "epoch": 1.3966297904121738, "grad_norm": 0.12332414835691452, "learning_rate": 2.6625e-05, "loss": 0.826, "step": 935 }, { "epoch": 1.3981235121131494, "grad_norm": 0.12359558790922165, "learning_rate": 2.6600000000000003e-05, "loss": 0.8295, "step": 936 }, { "epoch": 1.399617233814125, "grad_norm": 0.11534640192985535, "learning_rate": 2.6575e-05, "loss": 0.8177, "step": 937 }, { "epoch": 1.4011109555151007, "grad_norm": 0.13748447597026825, "learning_rate": 2.655e-05, "loss": 0.9605, "step": 938 }, { "epoch": 1.4026046772160763, "grad_norm": 0.12550769746303558, "learning_rate": 2.6525e-05, "loss": 0.8996, "step": 939 }, { "epoch": 1.4040983989170517, "grad_norm": 0.12319596111774445, "learning_rate": 2.6500000000000004e-05, "loss": 0.8337, "step": 940 }, { "epoch": 1.4055921206180273, "grad_norm": 0.12512342631816864, "learning_rate": 2.6475e-05, "loss": 0.8082, "step": 941 }, { "epoch": 1.407085842319003, "grad_norm": 0.12207676470279694, "learning_rate": 2.6450000000000003e-05, "loss": 0.8288, "step": 942 }, { "epoch": 1.4085795640199785, "grad_norm": 0.12230820953845978, "learning_rate": 2.6425e-05, "loss": 0.7466, "step": 943 }, { "epoch": 1.4100732857209541, "grad_norm": 0.13272836804389954, "learning_rate": 2.64e-05, "loss": 0.7746, "step": 944 }, { "epoch": 1.4115670074219298, "grad_norm": 0.13499213755130768, "learning_rate": 2.6375e-05, "loss": 0.9153, "step": 945 }, { "epoch": 1.4130607291229054, "grad_norm": 0.12010452151298523, "learning_rate": 2.6350000000000004e-05, "loss": 0.927, "step": 946 }, { "epoch": 1.4145544508238808, "grad_norm": 0.11952769756317139, "learning_rate": 2.6325e-05, "loss": 0.7817, "step": 947 }, { "epoch": 1.4160481725248566, "grad_norm": 0.12218941748142242, "learning_rate": 2.6300000000000002e-05, "loss": 0.8634, "step": 948 }, { "epoch": 1.417541894225832, "grad_norm": 0.13754992187023163, "learning_rate": 2.6275e-05, "loss": 0.9884, "step": 949 }, { "epoch": 1.4190356159268076, "grad_norm": 0.12959443032741547, "learning_rate": 2.625e-05, "loss": 0.8879, "step": 950 }, { "epoch": 1.4205293376277832, "grad_norm": 0.12210264056921005, "learning_rate": 2.6225e-05, "loss": 0.9146, "step": 951 }, { "epoch": 1.4220230593287588, "grad_norm": 0.14675554633140564, "learning_rate": 2.6200000000000003e-05, "loss": 0.9384, "step": 952 }, { "epoch": 1.4235167810297344, "grad_norm": 0.11978898197412491, "learning_rate": 2.6175e-05, "loss": 0.7991, "step": 953 }, { "epoch": 1.42501050273071, "grad_norm": 0.1318391114473343, "learning_rate": 2.6150000000000002e-05, "loss": 0.917, "step": 954 }, { "epoch": 1.4265042244316857, "grad_norm": 0.13110221922397614, "learning_rate": 2.6124999999999998e-05, "loss": 0.8382, "step": 955 }, { "epoch": 1.427997946132661, "grad_norm": 0.13638551533222198, "learning_rate": 2.61e-05, "loss": 0.7796, "step": 956 }, { "epoch": 1.4294916678336367, "grad_norm": 0.12785199284553528, "learning_rate": 2.6075e-05, "loss": 0.823, "step": 957 }, { "epoch": 1.4309853895346123, "grad_norm": 0.11785779148340225, "learning_rate": 2.6050000000000003e-05, "loss": 0.7295, "step": 958 }, { "epoch": 1.432479111235588, "grad_norm": 0.12728644907474518, "learning_rate": 2.6025e-05, "loss": 0.8086, "step": 959 }, { "epoch": 1.4339728329365635, "grad_norm": 0.12209265679121017, "learning_rate": 2.6000000000000002e-05, "loss": 0.8307, "step": 960 }, { "epoch": 1.4354665546375391, "grad_norm": 0.13694138824939728, "learning_rate": 2.5974999999999998e-05, "loss": 0.8834, "step": 961 }, { "epoch": 1.4369602763385148, "grad_norm": 0.12597723305225372, "learning_rate": 2.595e-05, "loss": 0.8595, "step": 962 }, { "epoch": 1.4384539980394901, "grad_norm": 0.12776942551136017, "learning_rate": 2.5925e-05, "loss": 0.7502, "step": 963 }, { "epoch": 1.439947719740466, "grad_norm": 0.13947486877441406, "learning_rate": 2.5900000000000003e-05, "loss": 0.8891, "step": 964 }, { "epoch": 1.4414414414414414, "grad_norm": 0.12271355837583542, "learning_rate": 2.5875e-05, "loss": 0.8202, "step": 965 }, { "epoch": 1.442935163142417, "grad_norm": 0.12153571844100952, "learning_rate": 2.585e-05, "loss": 0.8329, "step": 966 }, { "epoch": 1.4444288848433926, "grad_norm": 0.13112302124500275, "learning_rate": 2.5824999999999998e-05, "loss": 0.9614, "step": 967 }, { "epoch": 1.4459226065443682, "grad_norm": 0.12499723583459854, "learning_rate": 2.58e-05, "loss": 0.8169, "step": 968 }, { "epoch": 1.4474163282453438, "grad_norm": 0.12849260866641998, "learning_rate": 2.5775e-05, "loss": 0.837, "step": 969 }, { "epoch": 1.4489100499463194, "grad_norm": 0.11404875665903091, "learning_rate": 2.5750000000000002e-05, "loss": 0.7686, "step": 970 }, { "epoch": 1.450403771647295, "grad_norm": 0.1250537782907486, "learning_rate": 2.5725e-05, "loss": 0.8073, "step": 971 }, { "epoch": 1.4518974933482705, "grad_norm": 0.1266302913427353, "learning_rate": 2.57e-05, "loss": 0.8158, "step": 972 }, { "epoch": 1.453391215049246, "grad_norm": 0.11971023678779602, "learning_rate": 2.5675e-05, "loss": 0.827, "step": 973 }, { "epoch": 1.4548849367502217, "grad_norm": 0.1280481368303299, "learning_rate": 2.5650000000000003e-05, "loss": 0.786, "step": 974 }, { "epoch": 1.4563786584511973, "grad_norm": 0.1304454654455185, "learning_rate": 2.5625e-05, "loss": 0.9278, "step": 975 }, { "epoch": 1.457872380152173, "grad_norm": 0.12165829539299011, "learning_rate": 2.5600000000000002e-05, "loss": 0.8367, "step": 976 }, { "epoch": 1.4593661018531485, "grad_norm": 0.11573071777820587, "learning_rate": 2.5574999999999998e-05, "loss": 0.7258, "step": 977 }, { "epoch": 1.4608598235541241, "grad_norm": 0.13096196949481964, "learning_rate": 2.555e-05, "loss": 0.8842, "step": 978 }, { "epoch": 1.4623535452550995, "grad_norm": 0.12128228694200516, "learning_rate": 2.5525e-05, "loss": 0.7492, "step": 979 }, { "epoch": 1.4638472669560754, "grad_norm": 0.1297605186700821, "learning_rate": 2.5500000000000003e-05, "loss": 0.8792, "step": 980 }, { "epoch": 1.4653409886570508, "grad_norm": 0.12053804844617844, "learning_rate": 2.5475e-05, "loss": 0.8779, "step": 981 }, { "epoch": 1.4668347103580264, "grad_norm": 0.12664265930652618, "learning_rate": 2.5450000000000002e-05, "loss": 0.8875, "step": 982 }, { "epoch": 1.468328432059002, "grad_norm": 0.12238920480012894, "learning_rate": 2.5424999999999998e-05, "loss": 0.7827, "step": 983 }, { "epoch": 1.4698221537599776, "grad_norm": 0.12383371591567993, "learning_rate": 2.54e-05, "loss": 0.8701, "step": 984 }, { "epoch": 1.4713158754609532, "grad_norm": 0.1226775050163269, "learning_rate": 2.5375e-05, "loss": 0.6839, "step": 985 }, { "epoch": 1.4728095971619288, "grad_norm": 0.13554878532886505, "learning_rate": 2.5350000000000003e-05, "loss": 0.8657, "step": 986 }, { "epoch": 1.4743033188629044, "grad_norm": 0.14233556389808655, "learning_rate": 2.5325e-05, "loss": 0.8269, "step": 987 }, { "epoch": 1.4757970405638798, "grad_norm": 0.12661395967006683, "learning_rate": 2.5300000000000002e-05, "loss": 0.8333, "step": 988 }, { "epoch": 1.4772907622648555, "grad_norm": 0.1260916143655777, "learning_rate": 2.5274999999999998e-05, "loss": 0.8046, "step": 989 }, { "epoch": 1.478784483965831, "grad_norm": 0.13094903528690338, "learning_rate": 2.525e-05, "loss": 0.7928, "step": 990 }, { "epoch": 1.4802782056668067, "grad_norm": 0.11546465754508972, "learning_rate": 2.5225e-05, "loss": 0.7943, "step": 991 }, { "epoch": 1.4817719273677823, "grad_norm": 0.12880420684814453, "learning_rate": 2.5200000000000003e-05, "loss": 0.8855, "step": 992 }, { "epoch": 1.483265649068758, "grad_norm": 0.1356886625289917, "learning_rate": 2.5175e-05, "loss": 0.8226, "step": 993 }, { "epoch": 1.4847593707697335, "grad_norm": 0.12081035226583481, "learning_rate": 2.515e-05, "loss": 0.8109, "step": 994 }, { "epoch": 1.486253092470709, "grad_norm": 0.12199196964502335, "learning_rate": 2.5124999999999997e-05, "loss": 0.7699, "step": 995 }, { "epoch": 1.4877468141716848, "grad_norm": 0.12417642772197723, "learning_rate": 2.51e-05, "loss": 0.8482, "step": 996 }, { "epoch": 1.4892405358726601, "grad_norm": 0.14093239605426788, "learning_rate": 2.5075e-05, "loss": 0.8532, "step": 997 }, { "epoch": 1.4907342575736358, "grad_norm": 0.14215964078903198, "learning_rate": 2.5050000000000002e-05, "loss": 0.7965, "step": 998 }, { "epoch": 1.4922279792746114, "grad_norm": 0.12761370837688446, "learning_rate": 2.5025e-05, "loss": 0.7744, "step": 999 }, { "epoch": 1.493721700975587, "grad_norm": 0.13330742716789246, "learning_rate": 2.5e-05, "loss": 0.9491, "step": 1000 }, { "epoch": 1.4952154226765626, "grad_norm": 0.13185036182403564, "learning_rate": 2.4975e-05, "loss": 0.7916, "step": 1001 }, { "epoch": 1.4967091443775382, "grad_norm": 0.1290684938430786, "learning_rate": 2.495e-05, "loss": 0.933, "step": 1002 }, { "epoch": 1.4982028660785138, "grad_norm": 0.12649308145046234, "learning_rate": 2.4925000000000003e-05, "loss": 0.8864, "step": 1003 }, { "epoch": 1.4996965877794892, "grad_norm": 0.12478235363960266, "learning_rate": 2.4900000000000002e-05, "loss": 0.8163, "step": 1004 }, { "epoch": 1.501190309480465, "grad_norm": 0.1294005811214447, "learning_rate": 2.4875e-05, "loss": 0.7958, "step": 1005 }, { "epoch": 1.5026840311814404, "grad_norm": 0.1187678873538971, "learning_rate": 2.485e-05, "loss": 0.7575, "step": 1006 }, { "epoch": 1.504177752882416, "grad_norm": 0.128489151597023, "learning_rate": 2.4825e-05, "loss": 0.9051, "step": 1007 }, { "epoch": 1.5056714745833917, "grad_norm": 0.12688180804252625, "learning_rate": 2.48e-05, "loss": 0.8974, "step": 1008 }, { "epoch": 1.5071651962843673, "grad_norm": 0.1222631111741066, "learning_rate": 2.4775000000000003e-05, "loss": 0.7752, "step": 1009 }, { "epoch": 1.508658917985343, "grad_norm": 0.10836508125066757, "learning_rate": 2.4750000000000002e-05, "loss": 0.6646, "step": 1010 }, { "epoch": 1.5101526396863183, "grad_norm": 0.12008467316627502, "learning_rate": 2.4725e-05, "loss": 0.803, "step": 1011 }, { "epoch": 1.5116463613872941, "grad_norm": 0.14022228121757507, "learning_rate": 2.47e-05, "loss": 1.0052, "step": 1012 }, { "epoch": 1.5131400830882695, "grad_norm": 0.11962029337882996, "learning_rate": 2.4675e-05, "loss": 0.7985, "step": 1013 }, { "epoch": 1.5146338047892454, "grad_norm": 0.1218658909201622, "learning_rate": 2.465e-05, "loss": 0.8412, "step": 1014 }, { "epoch": 1.5161275264902208, "grad_norm": 0.1279427707195282, "learning_rate": 2.4625000000000002e-05, "loss": 0.886, "step": 1015 }, { "epoch": 1.5176212481911964, "grad_norm": 0.13322357833385468, "learning_rate": 2.46e-05, "loss": 0.8481, "step": 1016 }, { "epoch": 1.519114969892172, "grad_norm": 0.12052313983440399, "learning_rate": 2.4575e-05, "loss": 0.8474, "step": 1017 }, { "epoch": 1.5206086915931476, "grad_norm": 0.12180794775485992, "learning_rate": 2.455e-05, "loss": 0.9066, "step": 1018 }, { "epoch": 1.5221024132941232, "grad_norm": 0.12572136521339417, "learning_rate": 2.4525e-05, "loss": 0.8689, "step": 1019 }, { "epoch": 1.5235961349950986, "grad_norm": 0.13132348656654358, "learning_rate": 2.45e-05, "loss": 0.7871, "step": 1020 }, { "epoch": 1.5250898566960744, "grad_norm": 0.12071798741817474, "learning_rate": 2.4475000000000002e-05, "loss": 0.8731, "step": 1021 }, { "epoch": 1.5265835783970498, "grad_norm": 0.130793496966362, "learning_rate": 2.445e-05, "loss": 0.7973, "step": 1022 }, { "epoch": 1.5280773000980254, "grad_norm": 0.14586828649044037, "learning_rate": 2.4425e-05, "loss": 0.9808, "step": 1023 }, { "epoch": 1.529571021799001, "grad_norm": 0.11951061338186264, "learning_rate": 2.44e-05, "loss": 0.8272, "step": 1024 }, { "epoch": 1.5310647434999767, "grad_norm": 0.13476024568080902, "learning_rate": 2.4375e-05, "loss": 0.9313, "step": 1025 }, { "epoch": 1.5325584652009523, "grad_norm": 0.12136327475309372, "learning_rate": 2.435e-05, "loss": 0.8379, "step": 1026 }, { "epoch": 1.5340521869019277, "grad_norm": 0.12678933143615723, "learning_rate": 2.4325000000000002e-05, "loss": 0.7785, "step": 1027 }, { "epoch": 1.5355459086029035, "grad_norm": 0.1269274204969406, "learning_rate": 2.43e-05, "loss": 0.8939, "step": 1028 }, { "epoch": 1.537039630303879, "grad_norm": 0.1390867680311203, "learning_rate": 2.4275e-05, "loss": 0.992, "step": 1029 }, { "epoch": 1.5385333520048547, "grad_norm": 0.12505748867988586, "learning_rate": 2.425e-05, "loss": 0.7551, "step": 1030 }, { "epoch": 1.5400270737058301, "grad_norm": 0.1416258066892624, "learning_rate": 2.4225e-05, "loss": 0.782, "step": 1031 }, { "epoch": 1.5415207954068058, "grad_norm": 0.14086759090423584, "learning_rate": 2.4200000000000002e-05, "loss": 0.807, "step": 1032 }, { "epoch": 1.5430145171077814, "grad_norm": 0.1328718662261963, "learning_rate": 2.4175e-05, "loss": 0.9267, "step": 1033 }, { "epoch": 1.544508238808757, "grad_norm": 0.11550353467464447, "learning_rate": 2.415e-05, "loss": 0.8191, "step": 1034 }, { "epoch": 1.5460019605097326, "grad_norm": 0.16076602041721344, "learning_rate": 2.4125e-05, "loss": 0.8731, "step": 1035 }, { "epoch": 1.547495682210708, "grad_norm": 0.11952052265405655, "learning_rate": 2.41e-05, "loss": 0.8221, "step": 1036 }, { "epoch": 1.5489894039116838, "grad_norm": 0.11696983128786087, "learning_rate": 2.4075e-05, "loss": 0.7988, "step": 1037 }, { "epoch": 1.5504831256126592, "grad_norm": 0.13011442124843597, "learning_rate": 2.4050000000000002e-05, "loss": 0.9779, "step": 1038 }, { "epoch": 1.5519768473136348, "grad_norm": 0.12896400690078735, "learning_rate": 2.4025e-05, "loss": 0.7919, "step": 1039 }, { "epoch": 1.5534705690146104, "grad_norm": 0.13519851863384247, "learning_rate": 2.4e-05, "loss": 0.8699, "step": 1040 }, { "epoch": 1.554964290715586, "grad_norm": 0.12381348758935928, "learning_rate": 2.3975e-05, "loss": 0.8152, "step": 1041 }, { "epoch": 1.5564580124165617, "grad_norm": 0.12363633513450623, "learning_rate": 2.395e-05, "loss": 0.7882, "step": 1042 }, { "epoch": 1.557951734117537, "grad_norm": 0.12120139598846436, "learning_rate": 2.3925e-05, "loss": 0.793, "step": 1043 }, { "epoch": 1.559445455818513, "grad_norm": 0.12351545691490173, "learning_rate": 2.39e-05, "loss": 0.7557, "step": 1044 }, { "epoch": 1.5609391775194883, "grad_norm": 0.1266602724790573, "learning_rate": 2.3875e-05, "loss": 0.8476, "step": 1045 }, { "epoch": 1.5624328992204641, "grad_norm": 0.12460999935865402, "learning_rate": 2.385e-05, "loss": 0.8916, "step": 1046 }, { "epoch": 1.5639266209214395, "grad_norm": 0.11646848171949387, "learning_rate": 2.3825e-05, "loss": 0.7767, "step": 1047 }, { "epoch": 1.5654203426224151, "grad_norm": 0.12342283874750137, "learning_rate": 2.38e-05, "loss": 0.8387, "step": 1048 }, { "epoch": 1.5669140643233908, "grad_norm": 0.1228499487042427, "learning_rate": 2.3775e-05, "loss": 0.7643, "step": 1049 }, { "epoch": 1.5684077860243664, "grad_norm": 0.1286567598581314, "learning_rate": 2.375e-05, "loss": 0.841, "step": 1050 }, { "epoch": 1.569901507725342, "grad_norm": 0.13491226732730865, "learning_rate": 2.3725e-05, "loss": 0.8816, "step": 1051 }, { "epoch": 1.5713952294263174, "grad_norm": 0.1259843111038208, "learning_rate": 2.37e-05, "loss": 0.7514, "step": 1052 }, { "epoch": 1.5728889511272932, "grad_norm": 0.1325303167104721, "learning_rate": 2.3675e-05, "loss": 0.8464, "step": 1053 }, { "epoch": 1.5743826728282686, "grad_norm": 0.1264677494764328, "learning_rate": 2.365e-05, "loss": 0.9355, "step": 1054 }, { "epoch": 1.5758763945292442, "grad_norm": 0.12532635033130646, "learning_rate": 2.3624999999999998e-05, "loss": 0.8343, "step": 1055 }, { "epoch": 1.5773701162302198, "grad_norm": 0.12255837023258209, "learning_rate": 2.36e-05, "loss": 0.7952, "step": 1056 }, { "epoch": 1.5788638379311954, "grad_norm": 0.12196286767721176, "learning_rate": 2.3575e-05, "loss": 0.8314, "step": 1057 }, { "epoch": 1.580357559632171, "grad_norm": 0.1294473111629486, "learning_rate": 2.355e-05, "loss": 0.9653, "step": 1058 }, { "epoch": 1.5818512813331465, "grad_norm": 0.1203024610877037, "learning_rate": 2.3525e-05, "loss": 0.779, "step": 1059 }, { "epoch": 1.5833450030341223, "grad_norm": 0.15506958961486816, "learning_rate": 2.35e-05, "loss": 0.7833, "step": 1060 }, { "epoch": 1.5848387247350977, "grad_norm": 0.13594777882099152, "learning_rate": 2.3475e-05, "loss": 0.951, "step": 1061 }, { "epoch": 1.5863324464360735, "grad_norm": 0.12968482077121735, "learning_rate": 2.345e-05, "loss": 0.7111, "step": 1062 }, { "epoch": 1.587826168137049, "grad_norm": 0.14295588433742523, "learning_rate": 2.3425000000000004e-05, "loss": 0.9279, "step": 1063 }, { "epoch": 1.5893198898380245, "grad_norm": 0.12550316751003265, "learning_rate": 2.3400000000000003e-05, "loss": 0.8566, "step": 1064 }, { "epoch": 1.5908136115390001, "grad_norm": 0.12191827595233917, "learning_rate": 2.3375000000000002e-05, "loss": 0.7627, "step": 1065 }, { "epoch": 1.5923073332399758, "grad_norm": 0.1281816065311432, "learning_rate": 2.3350000000000002e-05, "loss": 0.839, "step": 1066 }, { "epoch": 1.5938010549409514, "grad_norm": 0.12854556739330292, "learning_rate": 2.3325e-05, "loss": 0.7396, "step": 1067 }, { "epoch": 1.5952947766419268, "grad_norm": 0.1363290399312973, "learning_rate": 2.3300000000000004e-05, "loss": 0.8931, "step": 1068 }, { "epoch": 1.5967884983429026, "grad_norm": 0.1250740885734558, "learning_rate": 2.3275000000000003e-05, "loss": 0.8534, "step": 1069 }, { "epoch": 1.598282220043878, "grad_norm": 0.1116446927189827, "learning_rate": 2.3250000000000003e-05, "loss": 0.7845, "step": 1070 }, { "epoch": 1.5997759417448538, "grad_norm": 0.13615292310714722, "learning_rate": 2.3225000000000002e-05, "loss": 0.7288, "step": 1071 }, { "epoch": 1.6012696634458292, "grad_norm": 0.14057093858718872, "learning_rate": 2.32e-05, "loss": 0.9525, "step": 1072 }, { "epoch": 1.6027633851468048, "grad_norm": 0.11963190883398056, "learning_rate": 2.3175e-05, "loss": 0.817, "step": 1073 }, { "epoch": 1.6042571068477804, "grad_norm": 0.13400131464004517, "learning_rate": 2.3150000000000004e-05, "loss": 0.8082, "step": 1074 }, { "epoch": 1.605750828548756, "grad_norm": 0.12595248222351074, "learning_rate": 2.3125000000000003e-05, "loss": 0.9227, "step": 1075 }, { "epoch": 1.6072445502497317, "grad_norm": 0.14488327503204346, "learning_rate": 2.3100000000000002e-05, "loss": 0.9529, "step": 1076 }, { "epoch": 1.608738271950707, "grad_norm": 0.14026322960853577, "learning_rate": 2.3075000000000002e-05, "loss": 0.8044, "step": 1077 }, { "epoch": 1.610231993651683, "grad_norm": 0.11714571714401245, "learning_rate": 2.305e-05, "loss": 0.7205, "step": 1078 }, { "epoch": 1.6117257153526583, "grad_norm": 0.12584763765335083, "learning_rate": 2.3025e-05, "loss": 0.7909, "step": 1079 }, { "epoch": 1.613219437053634, "grad_norm": 0.13368681073188782, "learning_rate": 2.3000000000000003e-05, "loss": 0.8649, "step": 1080 }, { "epoch": 1.6147131587546095, "grad_norm": 0.12409917265176773, "learning_rate": 2.2975000000000003e-05, "loss": 0.803, "step": 1081 }, { "epoch": 1.6162068804555851, "grad_norm": 0.12905006110668182, "learning_rate": 2.2950000000000002e-05, "loss": 0.8754, "step": 1082 }, { "epoch": 1.6177006021565608, "grad_norm": 0.12740354239940643, "learning_rate": 2.2925e-05, "loss": 0.9401, "step": 1083 }, { "epoch": 1.6191943238575361, "grad_norm": 0.12721090018749237, "learning_rate": 2.29e-05, "loss": 0.8111, "step": 1084 }, { "epoch": 1.620688045558512, "grad_norm": 0.11902303993701935, "learning_rate": 2.2875e-05, "loss": 0.7479, "step": 1085 }, { "epoch": 1.6221817672594874, "grad_norm": 0.13494817912578583, "learning_rate": 2.2850000000000003e-05, "loss": 0.8254, "step": 1086 }, { "epoch": 1.6236754889604632, "grad_norm": 0.12923003733158112, "learning_rate": 2.2825000000000003e-05, "loss": 0.8387, "step": 1087 }, { "epoch": 1.6251692106614386, "grad_norm": 0.12958462536334991, "learning_rate": 2.2800000000000002e-05, "loss": 0.9042, "step": 1088 }, { "epoch": 1.6266629323624142, "grad_norm": 0.12674763798713684, "learning_rate": 2.2775e-05, "loss": 0.8553, "step": 1089 }, { "epoch": 1.6281566540633898, "grad_norm": 0.13015075027942657, "learning_rate": 2.275e-05, "loss": 0.947, "step": 1090 }, { "epoch": 1.6296503757643654, "grad_norm": 0.12725356221199036, "learning_rate": 2.2725000000000003e-05, "loss": 0.7025, "step": 1091 }, { "epoch": 1.631144097465341, "grad_norm": 0.12174873799085617, "learning_rate": 2.2700000000000003e-05, "loss": 0.7605, "step": 1092 }, { "epoch": 1.6326378191663165, "grad_norm": 0.11882270872592926, "learning_rate": 2.2675000000000002e-05, "loss": 0.8418, "step": 1093 }, { "epoch": 1.6341315408672923, "grad_norm": 0.13565146923065186, "learning_rate": 2.265e-05, "loss": 0.8872, "step": 1094 }, { "epoch": 1.6356252625682677, "grad_norm": 0.12812140583992004, "learning_rate": 2.2625e-05, "loss": 0.9577, "step": 1095 }, { "epoch": 1.6371189842692433, "grad_norm": 0.1290404349565506, "learning_rate": 2.26e-05, "loss": 0.839, "step": 1096 }, { "epoch": 1.638612705970219, "grad_norm": 0.1419786959886551, "learning_rate": 2.2575000000000003e-05, "loss": 0.9028, "step": 1097 }, { "epoch": 1.6401064276711945, "grad_norm": 0.1193147525191307, "learning_rate": 2.2550000000000003e-05, "loss": 0.7727, "step": 1098 }, { "epoch": 1.6416001493721701, "grad_norm": 0.11725646257400513, "learning_rate": 2.2525000000000002e-05, "loss": 0.7183, "step": 1099 }, { "epoch": 1.6430938710731455, "grad_norm": 0.12226573377847672, "learning_rate": 2.25e-05, "loss": 0.8881, "step": 1100 }, { "epoch": 1.6445875927741214, "grad_norm": 0.14430628716945648, "learning_rate": 2.2475e-05, "loss": 0.8116, "step": 1101 }, { "epoch": 1.6460813144750968, "grad_norm": 0.12325803935527802, "learning_rate": 2.245e-05, "loss": 0.6795, "step": 1102 }, { "epoch": 1.6475750361760726, "grad_norm": 0.11961191147565842, "learning_rate": 2.2425000000000003e-05, "loss": 0.7287, "step": 1103 }, { "epoch": 1.649068757877048, "grad_norm": 0.129215806722641, "learning_rate": 2.2400000000000002e-05, "loss": 0.9068, "step": 1104 }, { "epoch": 1.6505624795780236, "grad_norm": 0.13367676734924316, "learning_rate": 2.2375000000000002e-05, "loss": 0.824, "step": 1105 }, { "epoch": 1.6520562012789992, "grad_norm": 0.1311062127351761, "learning_rate": 2.235e-05, "loss": 0.7867, "step": 1106 }, { "epoch": 1.6535499229799748, "grad_norm": 0.1289982944726944, "learning_rate": 2.2325e-05, "loss": 0.8264, "step": 1107 }, { "epoch": 1.6550436446809504, "grad_norm": 0.12069728225469589, "learning_rate": 2.23e-05, "loss": 0.7406, "step": 1108 }, { "epoch": 1.6565373663819258, "grad_norm": 0.1251276731491089, "learning_rate": 2.2275000000000003e-05, "loss": 0.8677, "step": 1109 }, { "epoch": 1.6580310880829017, "grad_norm": 0.1214185431599617, "learning_rate": 2.2250000000000002e-05, "loss": 0.8583, "step": 1110 }, { "epoch": 1.659524809783877, "grad_norm": 0.12126150727272034, "learning_rate": 2.2225e-05, "loss": 0.8956, "step": 1111 }, { "epoch": 1.6610185314848527, "grad_norm": 0.1429385542869568, "learning_rate": 2.22e-05, "loss": 0.9363, "step": 1112 }, { "epoch": 1.6625122531858283, "grad_norm": 0.12325936555862427, "learning_rate": 2.2175e-05, "loss": 0.8037, "step": 1113 }, { "epoch": 1.664005974886804, "grad_norm": 0.13724032044410706, "learning_rate": 2.215e-05, "loss": 0.9367, "step": 1114 }, { "epoch": 1.6654996965877795, "grad_norm": 0.12364470958709717, "learning_rate": 2.2125000000000002e-05, "loss": 0.8676, "step": 1115 }, { "epoch": 1.666993418288755, "grad_norm": 0.12396510690450668, "learning_rate": 2.2100000000000002e-05, "loss": 0.7797, "step": 1116 }, { "epoch": 1.6684871399897307, "grad_norm": 0.12917356193065643, "learning_rate": 2.2075e-05, "loss": 0.8185, "step": 1117 }, { "epoch": 1.6699808616907061, "grad_norm": 0.13536900281906128, "learning_rate": 2.205e-05, "loss": 0.9247, "step": 1118 }, { "epoch": 1.671474583391682, "grad_norm": 0.13616593182086945, "learning_rate": 2.2025e-05, "loss": 0.8216, "step": 1119 }, { "epoch": 1.6729683050926574, "grad_norm": 0.13057054579257965, "learning_rate": 2.2000000000000003e-05, "loss": 0.9149, "step": 1120 }, { "epoch": 1.674462026793633, "grad_norm": 0.12441746890544891, "learning_rate": 2.1975000000000002e-05, "loss": 0.7005, "step": 1121 }, { "epoch": 1.6759557484946086, "grad_norm": 0.1285722404718399, "learning_rate": 2.195e-05, "loss": 0.8602, "step": 1122 }, { "epoch": 1.6774494701955842, "grad_norm": 0.1396295428276062, "learning_rate": 2.1925e-05, "loss": 0.8155, "step": 1123 }, { "epoch": 1.6789431918965598, "grad_norm": 0.13131260871887207, "learning_rate": 2.19e-05, "loss": 0.884, "step": 1124 }, { "epoch": 1.6804369135975352, "grad_norm": 0.11417725682258606, "learning_rate": 2.1875e-05, "loss": 0.7587, "step": 1125 }, { "epoch": 1.681930635298511, "grad_norm": 0.12894614040851593, "learning_rate": 2.1850000000000003e-05, "loss": 0.9013, "step": 1126 }, { "epoch": 1.6834243569994864, "grad_norm": 0.12214215099811554, "learning_rate": 2.1825000000000002e-05, "loss": 0.8116, "step": 1127 }, { "epoch": 1.6849180787004623, "grad_norm": 0.12073803693056107, "learning_rate": 2.18e-05, "loss": 0.7701, "step": 1128 }, { "epoch": 1.6864118004014377, "grad_norm": 0.11767350137233734, "learning_rate": 2.1775e-05, "loss": 0.7489, "step": 1129 }, { "epoch": 1.6879055221024133, "grad_norm": 0.12384257465600967, "learning_rate": 2.175e-05, "loss": 0.8186, "step": 1130 }, { "epoch": 1.689399243803389, "grad_norm": 0.11816847324371338, "learning_rate": 2.1725e-05, "loss": 0.7377, "step": 1131 }, { "epoch": 1.6908929655043645, "grad_norm": 0.1298341602087021, "learning_rate": 2.1700000000000002e-05, "loss": 0.9338, "step": 1132 }, { "epoch": 1.6923866872053401, "grad_norm": 0.14173538982868195, "learning_rate": 2.1675e-05, "loss": 0.9595, "step": 1133 }, { "epoch": 1.6938804089063155, "grad_norm": 0.12180199474096298, "learning_rate": 2.165e-05, "loss": 0.8129, "step": 1134 }, { "epoch": 1.6953741306072914, "grad_norm": 0.1295206993818283, "learning_rate": 2.1625e-05, "loss": 0.8159, "step": 1135 }, { "epoch": 1.6968678523082668, "grad_norm": 0.12932823598384857, "learning_rate": 2.16e-05, "loss": 0.8724, "step": 1136 }, { "epoch": 1.6983615740092424, "grad_norm": 0.1295505166053772, "learning_rate": 2.1575e-05, "loss": 0.9864, "step": 1137 }, { "epoch": 1.699855295710218, "grad_norm": 0.12608982622623444, "learning_rate": 2.1550000000000002e-05, "loss": 0.8251, "step": 1138 }, { "epoch": 1.7013490174111936, "grad_norm": 0.14585590362548828, "learning_rate": 2.1525e-05, "loss": 0.8387, "step": 1139 }, { "epoch": 1.7028427391121692, "grad_norm": 0.16308699548244476, "learning_rate": 2.15e-05, "loss": 0.9894, "step": 1140 }, { "epoch": 1.7043364608131446, "grad_norm": 0.13090486824512482, "learning_rate": 2.1475e-05, "loss": 0.8089, "step": 1141 }, { "epoch": 1.7058301825141204, "grad_norm": 0.1305518001317978, "learning_rate": 2.145e-05, "loss": 0.916, "step": 1142 }, { "epoch": 1.7073239042150958, "grad_norm": 0.13046926259994507, "learning_rate": 2.1425e-05, "loss": 0.8277, "step": 1143 }, { "epoch": 1.7088176259160717, "grad_norm": 0.1327645182609558, "learning_rate": 2.1400000000000002e-05, "loss": 0.8857, "step": 1144 }, { "epoch": 1.710311347617047, "grad_norm": 0.12704814970493317, "learning_rate": 2.1375e-05, "loss": 0.7664, "step": 1145 }, { "epoch": 1.7118050693180227, "grad_norm": 0.12340005487203598, "learning_rate": 2.135e-05, "loss": 0.836, "step": 1146 }, { "epoch": 1.7132987910189983, "grad_norm": 0.12090782821178436, "learning_rate": 2.1325e-05, "loss": 0.7404, "step": 1147 }, { "epoch": 1.714792512719974, "grad_norm": 0.1308443397283554, "learning_rate": 2.13e-05, "loss": 0.8825, "step": 1148 }, { "epoch": 1.7162862344209495, "grad_norm": 0.12266965210437775, "learning_rate": 2.1275000000000002e-05, "loss": 0.818, "step": 1149 }, { "epoch": 1.717779956121925, "grad_norm": 0.12834343314170837, "learning_rate": 2.125e-05, "loss": 0.8594, "step": 1150 }, { "epoch": 1.7192736778229007, "grad_norm": 0.12989541888237, "learning_rate": 2.1225e-05, "loss": 0.8732, "step": 1151 }, { "epoch": 1.7207673995238761, "grad_norm": 0.12120307981967926, "learning_rate": 2.12e-05, "loss": 0.8155, "step": 1152 }, { "epoch": 1.7222611212248518, "grad_norm": 0.12815384566783905, "learning_rate": 2.1175e-05, "loss": 0.7908, "step": 1153 }, { "epoch": 1.7237548429258274, "grad_norm": 0.12307754904031754, "learning_rate": 2.115e-05, "loss": 0.7974, "step": 1154 }, { "epoch": 1.725248564626803, "grad_norm": 0.12445150315761566, "learning_rate": 2.1125000000000002e-05, "loss": 0.7797, "step": 1155 }, { "epoch": 1.7267422863277786, "grad_norm": 0.12120848894119263, "learning_rate": 2.11e-05, "loss": 0.8036, "step": 1156 }, { "epoch": 1.728236008028754, "grad_norm": 0.12162711471319199, "learning_rate": 2.1075e-05, "loss": 0.7252, "step": 1157 }, { "epoch": 1.7297297297297298, "grad_norm": 0.1269288957118988, "learning_rate": 2.105e-05, "loss": 0.9103, "step": 1158 }, { "epoch": 1.7312234514307052, "grad_norm": 0.11370481550693512, "learning_rate": 2.1025e-05, "loss": 0.7138, "step": 1159 }, { "epoch": 1.732717173131681, "grad_norm": 0.11330307275056839, "learning_rate": 2.1e-05, "loss": 0.7202, "step": 1160 }, { "epoch": 1.7342108948326564, "grad_norm": 0.1348200887441635, "learning_rate": 2.0975e-05, "loss": 0.8631, "step": 1161 }, { "epoch": 1.735704616533632, "grad_norm": 0.13190488517284393, "learning_rate": 2.095e-05, "loss": 0.8599, "step": 1162 }, { "epoch": 1.7371983382346077, "grad_norm": 0.12874913215637207, "learning_rate": 2.0925e-05, "loss": 0.9167, "step": 1163 }, { "epoch": 1.7386920599355833, "grad_norm": 0.12878409028053284, "learning_rate": 2.09e-05, "loss": 0.7977, "step": 1164 }, { "epoch": 1.740185781636559, "grad_norm": 0.13331151008605957, "learning_rate": 2.0875e-05, "loss": 0.8645, "step": 1165 }, { "epoch": 1.7416795033375343, "grad_norm": 0.11371038109064102, "learning_rate": 2.085e-05, "loss": 0.7806, "step": 1166 }, { "epoch": 1.7431732250385101, "grad_norm": 0.11937269568443298, "learning_rate": 2.0825e-05, "loss": 0.8051, "step": 1167 }, { "epoch": 1.7446669467394855, "grad_norm": 0.14610852301120758, "learning_rate": 2.08e-05, "loss": 0.9335, "step": 1168 }, { "epoch": 1.7461606684404611, "grad_norm": 0.12858544290065765, "learning_rate": 2.0775e-05, "loss": 0.8973, "step": 1169 }, { "epoch": 1.7476543901414368, "grad_norm": 0.13014452159404755, "learning_rate": 2.075e-05, "loss": 0.9514, "step": 1170 }, { "epoch": 1.7491481118424124, "grad_norm": 0.12502023577690125, "learning_rate": 2.0725e-05, "loss": 0.7432, "step": 1171 }, { "epoch": 1.750641833543388, "grad_norm": 0.13443705439567566, "learning_rate": 2.07e-05, "loss": 0.8794, "step": 1172 }, { "epoch": 1.7521355552443634, "grad_norm": 0.1333446353673935, "learning_rate": 2.0675e-05, "loss": 0.9908, "step": 1173 }, { "epoch": 1.7536292769453392, "grad_norm": 0.12675704061985016, "learning_rate": 2.065e-05, "loss": 0.915, "step": 1174 }, { "epoch": 1.7551229986463146, "grad_norm": 0.12597693502902985, "learning_rate": 2.0625e-05, "loss": 0.8235, "step": 1175 }, { "epoch": 1.7566167203472904, "grad_norm": 0.13093382120132446, "learning_rate": 2.06e-05, "loss": 0.8052, "step": 1176 }, { "epoch": 1.7581104420482658, "grad_norm": 0.12378624081611633, "learning_rate": 2.0575e-05, "loss": 0.7771, "step": 1177 }, { "epoch": 1.7596041637492414, "grad_norm": 0.1284242421388626, "learning_rate": 2.055e-05, "loss": 0.909, "step": 1178 }, { "epoch": 1.761097885450217, "grad_norm": 0.11909797787666321, "learning_rate": 2.0525e-05, "loss": 0.8171, "step": 1179 }, { "epoch": 1.7625916071511927, "grad_norm": 0.15220597386360168, "learning_rate": 2.05e-05, "loss": 0.789, "step": 1180 }, { "epoch": 1.7640853288521683, "grad_norm": 0.1184302493929863, "learning_rate": 2.0475e-05, "loss": 0.7734, "step": 1181 }, { "epoch": 1.7655790505531437, "grad_norm": 0.12755298614501953, "learning_rate": 2.045e-05, "loss": 0.8345, "step": 1182 }, { "epoch": 1.7670727722541195, "grad_norm": 0.12822335958480835, "learning_rate": 2.0425e-05, "loss": 0.9336, "step": 1183 }, { "epoch": 1.768566493955095, "grad_norm": 0.1274261772632599, "learning_rate": 2.04e-05, "loss": 0.758, "step": 1184 }, { "epoch": 1.7700602156560705, "grad_norm": 0.12130683660507202, "learning_rate": 2.0375e-05, "loss": 0.8574, "step": 1185 }, { "epoch": 1.7715539373570461, "grad_norm": 0.13471463322639465, "learning_rate": 2.035e-05, "loss": 0.8962, "step": 1186 }, { "epoch": 1.7730476590580218, "grad_norm": 0.13367508351802826, "learning_rate": 2.0325e-05, "loss": 0.8896, "step": 1187 }, { "epoch": 1.7745413807589974, "grad_norm": 0.14295263588428497, "learning_rate": 2.0300000000000002e-05, "loss": 0.9754, "step": 1188 }, { "epoch": 1.776035102459973, "grad_norm": 0.12447523325681686, "learning_rate": 2.0275e-05, "loss": 0.7829, "step": 1189 }, { "epoch": 1.7775288241609486, "grad_norm": 0.13588370382785797, "learning_rate": 2.025e-05, "loss": 0.8526, "step": 1190 }, { "epoch": 1.779022545861924, "grad_norm": 0.1171405240893364, "learning_rate": 2.0225000000000004e-05, "loss": 0.7856, "step": 1191 }, { "epoch": 1.7805162675628998, "grad_norm": 0.13177570700645447, "learning_rate": 2.0200000000000003e-05, "loss": 0.8382, "step": 1192 }, { "epoch": 1.7820099892638752, "grad_norm": 0.13416346907615662, "learning_rate": 2.0175000000000003e-05, "loss": 0.7645, "step": 1193 }, { "epoch": 1.7835037109648508, "grad_norm": 0.13866715133190155, "learning_rate": 2.0150000000000002e-05, "loss": 0.8831, "step": 1194 }, { "epoch": 1.7849974326658264, "grad_norm": 0.11391008645296097, "learning_rate": 2.0125e-05, "loss": 0.768, "step": 1195 }, { "epoch": 1.786491154366802, "grad_norm": 0.1376311182975769, "learning_rate": 2.01e-05, "loss": 0.7846, "step": 1196 }, { "epoch": 1.7879848760677777, "grad_norm": 0.12423084676265717, "learning_rate": 2.0075000000000003e-05, "loss": 0.7955, "step": 1197 }, { "epoch": 1.789478597768753, "grad_norm": 0.12957480549812317, "learning_rate": 2.0050000000000003e-05, "loss": 0.8933, "step": 1198 }, { "epoch": 1.790972319469729, "grad_norm": 0.12400379776954651, "learning_rate": 2.0025000000000002e-05, "loss": 0.8491, "step": 1199 }, { "epoch": 1.7924660411707043, "grad_norm": 0.13244979083538055, "learning_rate": 2e-05, "loss": 0.8268, "step": 1200 }, { "epoch": 1.7939597628716801, "grad_norm": 0.14426936209201813, "learning_rate": 1.9975e-05, "loss": 0.8622, "step": 1201 }, { "epoch": 1.7954534845726555, "grad_norm": 0.12692514061927795, "learning_rate": 1.995e-05, "loss": 0.8728, "step": 1202 }, { "epoch": 1.7969472062736311, "grad_norm": 0.12716281414031982, "learning_rate": 1.9925000000000003e-05, "loss": 0.8502, "step": 1203 }, { "epoch": 1.7984409279746068, "grad_norm": 0.13064870238304138, "learning_rate": 1.9900000000000003e-05, "loss": 0.8453, "step": 1204 }, { "epoch": 1.7999346496755824, "grad_norm": 0.12724879384040833, "learning_rate": 1.9875000000000002e-05, "loss": 0.8479, "step": 1205 }, { "epoch": 1.801428371376558, "grad_norm": 0.13362351059913635, "learning_rate": 1.985e-05, "loss": 0.8504, "step": 1206 }, { "epoch": 1.8029220930775334, "grad_norm": 0.13249877095222473, "learning_rate": 1.9825e-05, "loss": 0.9421, "step": 1207 }, { "epoch": 1.8044158147785092, "grad_norm": 0.12667708098888397, "learning_rate": 1.9800000000000004e-05, "loss": 0.7644, "step": 1208 }, { "epoch": 1.8059095364794846, "grad_norm": 0.1281357705593109, "learning_rate": 1.9775000000000003e-05, "loss": 0.92, "step": 1209 }, { "epoch": 1.8074032581804602, "grad_norm": 0.11832661926746368, "learning_rate": 1.9750000000000002e-05, "loss": 0.7861, "step": 1210 }, { "epoch": 1.8088969798814358, "grad_norm": 0.1320231556892395, "learning_rate": 1.9725000000000002e-05, "loss": 0.8106, "step": 1211 }, { "epoch": 1.8103907015824114, "grad_norm": 0.1332932561635971, "learning_rate": 1.97e-05, "loss": 0.8792, "step": 1212 }, { "epoch": 1.811884423283387, "grad_norm": 0.12139624357223511, "learning_rate": 1.9675e-05, "loss": 0.8468, "step": 1213 }, { "epoch": 1.8133781449843625, "grad_norm": 0.12055651843547821, "learning_rate": 1.9650000000000003e-05, "loss": 0.6502, "step": 1214 }, { "epoch": 1.8148718666853383, "grad_norm": 0.1206275150179863, "learning_rate": 1.9625000000000003e-05, "loss": 0.8227, "step": 1215 }, { "epoch": 1.8163655883863137, "grad_norm": 0.13419689238071442, "learning_rate": 1.9600000000000002e-05, "loss": 0.8581, "step": 1216 }, { "epoch": 1.8178593100872895, "grad_norm": 0.1295374482870102, "learning_rate": 1.9575e-05, "loss": 0.8665, "step": 1217 }, { "epoch": 1.819353031788265, "grad_norm": 0.13123850524425507, "learning_rate": 1.955e-05, "loss": 0.9257, "step": 1218 }, { "epoch": 1.8208467534892405, "grad_norm": 0.12637512385845184, "learning_rate": 1.9525e-05, "loss": 0.776, "step": 1219 }, { "epoch": 1.8223404751902161, "grad_norm": 0.12860271334648132, "learning_rate": 1.9500000000000003e-05, "loss": 0.7945, "step": 1220 }, { "epoch": 1.8238341968911918, "grad_norm": 0.12743626534938812, "learning_rate": 1.9475000000000002e-05, "loss": 0.8724, "step": 1221 }, { "epoch": 1.8253279185921674, "grad_norm": 0.13996891677379608, "learning_rate": 1.9450000000000002e-05, "loss": 0.8899, "step": 1222 }, { "epoch": 1.8268216402931428, "grad_norm": 0.13088734447956085, "learning_rate": 1.9425e-05, "loss": 0.8936, "step": 1223 }, { "epoch": 1.8283153619941186, "grad_norm": 0.13675321638584137, "learning_rate": 1.94e-05, "loss": 0.9291, "step": 1224 }, { "epoch": 1.829809083695094, "grad_norm": 0.13203264772891998, "learning_rate": 1.9375e-05, "loss": 0.9199, "step": 1225 }, { "epoch": 1.8313028053960696, "grad_norm": 0.12601108849048615, "learning_rate": 1.9350000000000003e-05, "loss": 0.783, "step": 1226 }, { "epoch": 1.8327965270970452, "grad_norm": 0.1380903273820877, "learning_rate": 1.9325000000000002e-05, "loss": 0.8334, "step": 1227 }, { "epoch": 1.8342902487980208, "grad_norm": 0.12493746727705002, "learning_rate": 1.93e-05, "loss": 0.8611, "step": 1228 }, { "epoch": 1.8357839704989964, "grad_norm": 0.15787345170974731, "learning_rate": 1.9275e-05, "loss": 0.8092, "step": 1229 }, { "epoch": 1.8372776921999718, "grad_norm": 0.12091052532196045, "learning_rate": 1.925e-05, "loss": 0.8115, "step": 1230 }, { "epoch": 1.8387714139009477, "grad_norm": 0.1378464251756668, "learning_rate": 1.9225e-05, "loss": 0.7892, "step": 1231 }, { "epoch": 1.840265135601923, "grad_norm": 0.1262321174144745, "learning_rate": 1.9200000000000003e-05, "loss": 0.8579, "step": 1232 }, { "epoch": 1.841758857302899, "grad_norm": 0.12837159633636475, "learning_rate": 1.9175000000000002e-05, "loss": 0.8849, "step": 1233 }, { "epoch": 1.8432525790038743, "grad_norm": 0.14167827367782593, "learning_rate": 1.915e-05, "loss": 0.8411, "step": 1234 }, { "epoch": 1.84474630070485, "grad_norm": 0.13262982666492462, "learning_rate": 1.9125e-05, "loss": 0.816, "step": 1235 }, { "epoch": 1.8462400224058255, "grad_norm": 0.13891661167144775, "learning_rate": 1.91e-05, "loss": 0.8475, "step": 1236 }, { "epoch": 1.8477337441068011, "grad_norm": 0.13125349581241608, "learning_rate": 1.9075000000000003e-05, "loss": 0.7635, "step": 1237 }, { "epoch": 1.8492274658077767, "grad_norm": 0.12550696730613708, "learning_rate": 1.9050000000000002e-05, "loss": 0.7812, "step": 1238 }, { "epoch": 1.8507211875087521, "grad_norm": 0.12594923377037048, "learning_rate": 1.9025e-05, "loss": 0.9463, "step": 1239 }, { "epoch": 1.852214909209728, "grad_norm": 0.13107465207576752, "learning_rate": 1.9e-05, "loss": 0.769, "step": 1240 }, { "epoch": 1.8537086309107034, "grad_norm": 0.14669524133205414, "learning_rate": 1.8975e-05, "loss": 0.8958, "step": 1241 }, { "epoch": 1.855202352611679, "grad_norm": 0.12869606912136078, "learning_rate": 1.895e-05, "loss": 0.8392, "step": 1242 }, { "epoch": 1.8566960743126546, "grad_norm": 0.13752463459968567, "learning_rate": 1.8925000000000003e-05, "loss": 0.9072, "step": 1243 }, { "epoch": 1.8581897960136302, "grad_norm": 0.1378937065601349, "learning_rate": 1.8900000000000002e-05, "loss": 0.9398, "step": 1244 }, { "epoch": 1.8596835177146058, "grad_norm": 0.13100892305374146, "learning_rate": 1.8875e-05, "loss": 0.8363, "step": 1245 }, { "epoch": 1.8611772394155812, "grad_norm": 0.1251470148563385, "learning_rate": 1.885e-05, "loss": 0.8459, "step": 1246 }, { "epoch": 1.862670961116557, "grad_norm": 0.12104335427284241, "learning_rate": 1.8825e-05, "loss": 0.7877, "step": 1247 }, { "epoch": 1.8641646828175324, "grad_norm": 0.14913339912891388, "learning_rate": 1.88e-05, "loss": 0.8598, "step": 1248 }, { "epoch": 1.8656584045185083, "grad_norm": 0.13479259610176086, "learning_rate": 1.8775000000000002e-05, "loss": 0.8283, "step": 1249 }, { "epoch": 1.8671521262194837, "grad_norm": 0.12756747007369995, "learning_rate": 1.8750000000000002e-05, "loss": 0.8591, "step": 1250 }, { "epoch": 1.8686458479204593, "grad_norm": 0.13393226265907288, "learning_rate": 1.8725e-05, "loss": 0.8603, "step": 1251 }, { "epoch": 1.870139569621435, "grad_norm": 0.13049785792827606, "learning_rate": 1.87e-05, "loss": 0.8235, "step": 1252 }, { "epoch": 1.8716332913224105, "grad_norm": 0.1224079355597496, "learning_rate": 1.8675e-05, "loss": 0.7867, "step": 1253 }, { "epoch": 1.8731270130233861, "grad_norm": 0.14483274519443512, "learning_rate": 1.865e-05, "loss": 0.9689, "step": 1254 }, { "epoch": 1.8746207347243615, "grad_norm": 0.13204032182693481, "learning_rate": 1.8625000000000002e-05, "loss": 0.8252, "step": 1255 }, { "epoch": 1.8761144564253374, "grad_norm": 0.1305558830499649, "learning_rate": 1.86e-05, "loss": 0.8059, "step": 1256 }, { "epoch": 1.8776081781263128, "grad_norm": 0.12201261520385742, "learning_rate": 1.8575e-05, "loss": 0.7538, "step": 1257 }, { "epoch": 1.8791018998272886, "grad_norm": 0.11993113160133362, "learning_rate": 1.855e-05, "loss": 0.7197, "step": 1258 }, { "epoch": 1.880595621528264, "grad_norm": 0.12333643436431885, "learning_rate": 1.8525e-05, "loss": 0.8161, "step": 1259 }, { "epoch": 1.8820893432292396, "grad_norm": 0.12143362313508987, "learning_rate": 1.85e-05, "loss": 0.7411, "step": 1260 }, { "epoch": 1.8835830649302152, "grad_norm": 0.14514648914337158, "learning_rate": 1.8475000000000002e-05, "loss": 0.7964, "step": 1261 }, { "epoch": 1.8850767866311908, "grad_norm": 0.12302449345588684, "learning_rate": 1.845e-05, "loss": 0.79, "step": 1262 }, { "epoch": 1.8865705083321664, "grad_norm": 0.12880870699882507, "learning_rate": 1.8425e-05, "loss": 0.9252, "step": 1263 }, { "epoch": 1.8880642300331418, "grad_norm": 0.12553147971630096, "learning_rate": 1.84e-05, "loss": 0.8222, "step": 1264 }, { "epoch": 1.8895579517341177, "grad_norm": 0.1323452889919281, "learning_rate": 1.8375e-05, "loss": 0.9565, "step": 1265 }, { "epoch": 1.891051673435093, "grad_norm": 0.13709118962287903, "learning_rate": 1.8350000000000002e-05, "loss": 0.8999, "step": 1266 }, { "epoch": 1.8925453951360687, "grad_norm": 0.12807294726371765, "learning_rate": 1.8325e-05, "loss": 0.8209, "step": 1267 }, { "epoch": 1.8940391168370443, "grad_norm": 0.13357622921466827, "learning_rate": 1.83e-05, "loss": 0.7477, "step": 1268 }, { "epoch": 1.89553283853802, "grad_norm": 0.14053070545196533, "learning_rate": 1.8275e-05, "loss": 0.8269, "step": 1269 }, { "epoch": 1.8970265602389955, "grad_norm": 0.12722478806972504, "learning_rate": 1.825e-05, "loss": 0.8324, "step": 1270 }, { "epoch": 1.898520281939971, "grad_norm": 0.1320943832397461, "learning_rate": 1.8225e-05, "loss": 0.8514, "step": 1271 }, { "epoch": 1.9000140036409467, "grad_norm": 0.13922828435897827, "learning_rate": 1.8200000000000002e-05, "loss": 0.999, "step": 1272 }, { "epoch": 1.9015077253419221, "grad_norm": 0.1268698275089264, "learning_rate": 1.8175e-05, "loss": 0.8753, "step": 1273 }, { "epoch": 1.903001447042898, "grad_norm": 0.12937690317630768, "learning_rate": 1.815e-05, "loss": 0.9049, "step": 1274 }, { "epoch": 1.9044951687438734, "grad_norm": 0.13577449321746826, "learning_rate": 1.8125e-05, "loss": 0.9066, "step": 1275 }, { "epoch": 1.905988890444849, "grad_norm": 0.12619349360466003, "learning_rate": 1.81e-05, "loss": 0.7736, "step": 1276 }, { "epoch": 1.9074826121458246, "grad_norm": 0.12751753628253937, "learning_rate": 1.8075e-05, "loss": 0.8451, "step": 1277 }, { "epoch": 1.9089763338468002, "grad_norm": 0.14080756902694702, "learning_rate": 1.805e-05, "loss": 0.9272, "step": 1278 }, { "epoch": 1.9104700555477758, "grad_norm": 0.13200397789478302, "learning_rate": 1.8025e-05, "loss": 0.8013, "step": 1279 }, { "epoch": 1.9119637772487512, "grad_norm": 0.13423481583595276, "learning_rate": 1.8e-05, "loss": 0.8364, "step": 1280 }, { "epoch": 1.913457498949727, "grad_norm": 0.14036668837070465, "learning_rate": 1.7975e-05, "loss": 0.8824, "step": 1281 }, { "epoch": 1.9149512206507024, "grad_norm": 0.14129145443439484, "learning_rate": 1.795e-05, "loss": 1.03, "step": 1282 }, { "epoch": 1.916444942351678, "grad_norm": 0.12899865210056305, "learning_rate": 1.7925e-05, "loss": 0.8036, "step": 1283 }, { "epoch": 1.9179386640526537, "grad_norm": 0.14570587873458862, "learning_rate": 1.79e-05, "loss": 0.752, "step": 1284 }, { "epoch": 1.9194323857536293, "grad_norm": 0.12746211886405945, "learning_rate": 1.7875e-05, "loss": 0.798, "step": 1285 }, { "epoch": 1.920926107454605, "grad_norm": 0.12048125267028809, "learning_rate": 1.785e-05, "loss": 0.7966, "step": 1286 }, { "epoch": 1.9224198291555803, "grad_norm": 0.12655916810035706, "learning_rate": 1.7825e-05, "loss": 0.7867, "step": 1287 }, { "epoch": 1.9239135508565561, "grad_norm": 0.12403115630149841, "learning_rate": 1.78e-05, "loss": 0.8397, "step": 1288 }, { "epoch": 1.9254072725575315, "grad_norm": 0.13430100679397583, "learning_rate": 1.7775e-05, "loss": 0.9348, "step": 1289 }, { "epoch": 1.9269009942585074, "grad_norm": 0.13335439562797546, "learning_rate": 1.775e-05, "loss": 0.8142, "step": 1290 }, { "epoch": 1.9283947159594828, "grad_norm": 0.12833663821220398, "learning_rate": 1.7725e-05, "loss": 0.8129, "step": 1291 }, { "epoch": 1.9298884376604584, "grad_norm": 0.1298838108778, "learning_rate": 1.77e-05, "loss": 0.8239, "step": 1292 }, { "epoch": 1.931382159361434, "grad_norm": 0.1253083199262619, "learning_rate": 1.7675e-05, "loss": 0.854, "step": 1293 }, { "epoch": 1.9328758810624096, "grad_norm": 0.11974871158599854, "learning_rate": 1.765e-05, "loss": 0.7493, "step": 1294 }, { "epoch": 1.9343696027633852, "grad_norm": 0.13346506655216217, "learning_rate": 1.7625e-05, "loss": 0.9484, "step": 1295 }, { "epoch": 1.9358633244643606, "grad_norm": 0.14413511753082275, "learning_rate": 1.76e-05, "loss": 0.9882, "step": 1296 }, { "epoch": 1.9373570461653364, "grad_norm": 0.12227410823106766, "learning_rate": 1.7575e-05, "loss": 0.8078, "step": 1297 }, { "epoch": 1.9388507678663118, "grad_norm": 0.12427707761526108, "learning_rate": 1.755e-05, "loss": 0.7403, "step": 1298 }, { "epoch": 1.9403444895672874, "grad_norm": 0.10972325503826141, "learning_rate": 1.7525e-05, "loss": 0.6862, "step": 1299 }, { "epoch": 1.941838211268263, "grad_norm": 0.1306079924106598, "learning_rate": 1.75e-05, "loss": 0.917, "step": 1300 }, { "epoch": 1.9433319329692387, "grad_norm": 0.13232000172138214, "learning_rate": 1.7475e-05, "loss": 0.848, "step": 1301 }, { "epoch": 1.9448256546702143, "grad_norm": 0.1238214448094368, "learning_rate": 1.745e-05, "loss": 0.919, "step": 1302 }, { "epoch": 1.9463193763711897, "grad_norm": 0.1390426605939865, "learning_rate": 1.7425e-05, "loss": 0.862, "step": 1303 }, { "epoch": 1.9478130980721655, "grad_norm": 0.12475966662168503, "learning_rate": 1.74e-05, "loss": 0.9148, "step": 1304 }, { "epoch": 1.949306819773141, "grad_norm": 0.13397116959095, "learning_rate": 1.7375e-05, "loss": 0.8144, "step": 1305 }, { "epoch": 1.9508005414741167, "grad_norm": 0.13606944680213928, "learning_rate": 1.7349999999999998e-05, "loss": 0.8133, "step": 1306 }, { "epoch": 1.9522942631750921, "grad_norm": 0.13705767691135406, "learning_rate": 1.7325e-05, "loss": 0.9466, "step": 1307 }, { "epoch": 1.9537879848760678, "grad_norm": 0.11950363218784332, "learning_rate": 1.73e-05, "loss": 0.7916, "step": 1308 }, { "epoch": 1.9552817065770434, "grad_norm": 0.13733291625976562, "learning_rate": 1.7275e-05, "loss": 0.8201, "step": 1309 }, { "epoch": 1.956775428278019, "grad_norm": 0.13214121758937836, "learning_rate": 1.725e-05, "loss": 0.8942, "step": 1310 }, { "epoch": 1.9582691499789946, "grad_norm": 0.13020627200603485, "learning_rate": 1.7225e-05, "loss": 0.7843, "step": 1311 }, { "epoch": 1.95976287167997, "grad_norm": 0.1347920000553131, "learning_rate": 1.7199999999999998e-05, "loss": 0.7652, "step": 1312 }, { "epoch": 1.9612565933809458, "grad_norm": 0.12419667094945908, "learning_rate": 1.7175e-05, "loss": 0.8739, "step": 1313 }, { "epoch": 1.9627503150819212, "grad_norm": 0.1298210769891739, "learning_rate": 1.7150000000000004e-05, "loss": 0.8814, "step": 1314 }, { "epoch": 1.964244036782897, "grad_norm": 0.12508539855480194, "learning_rate": 1.7125000000000003e-05, "loss": 0.7508, "step": 1315 }, { "epoch": 1.9657377584838724, "grad_norm": 0.13005280494689941, "learning_rate": 1.7100000000000002e-05, "loss": 0.7596, "step": 1316 }, { "epoch": 1.967231480184848, "grad_norm": 0.14853957295417786, "learning_rate": 1.7075e-05, "loss": 0.9094, "step": 1317 }, { "epoch": 1.9687252018858237, "grad_norm": 0.13223786652088165, "learning_rate": 1.705e-05, "loss": 0.902, "step": 1318 }, { "epoch": 1.9702189235867993, "grad_norm": 0.1233852431178093, "learning_rate": 1.7025e-05, "loss": 0.7794, "step": 1319 }, { "epoch": 1.971712645287775, "grad_norm": 0.1299366056919098, "learning_rate": 1.7000000000000003e-05, "loss": 0.783, "step": 1320 }, { "epoch": 1.9732063669887503, "grad_norm": 0.1469392627477646, "learning_rate": 1.6975000000000003e-05, "loss": 0.9647, "step": 1321 }, { "epoch": 1.9747000886897261, "grad_norm": 0.11656398326158524, "learning_rate": 1.6950000000000002e-05, "loss": 0.6668, "step": 1322 }, { "epoch": 1.9761938103907015, "grad_norm": 0.11369689553976059, "learning_rate": 1.6925e-05, "loss": 0.6626, "step": 1323 }, { "epoch": 1.9776875320916771, "grad_norm": 0.11943326145410538, "learning_rate": 1.69e-05, "loss": 0.7569, "step": 1324 }, { "epoch": 1.9791812537926528, "grad_norm": 0.13392581045627594, "learning_rate": 1.6875000000000004e-05, "loss": 0.8174, "step": 1325 }, { "epoch": 1.9806749754936284, "grad_norm": 0.14107809960842133, "learning_rate": 1.6850000000000003e-05, "loss": 0.9278, "step": 1326 }, { "epoch": 1.982168697194604, "grad_norm": 0.13062167167663574, "learning_rate": 1.6825000000000002e-05, "loss": 0.774, "step": 1327 }, { "epoch": 1.9836624188955794, "grad_norm": 0.1293383091688156, "learning_rate": 1.6800000000000002e-05, "loss": 0.694, "step": 1328 }, { "epoch": 1.9851561405965552, "grad_norm": 0.12562714517116547, "learning_rate": 1.6775e-05, "loss": 0.8929, "step": 1329 }, { "epoch": 1.9866498622975306, "grad_norm": 0.11631745845079422, "learning_rate": 1.675e-05, "loss": 0.8082, "step": 1330 }, { "epoch": 1.9881435839985064, "grad_norm": 0.13414128124713898, "learning_rate": 1.6725000000000003e-05, "loss": 0.8558, "step": 1331 }, { "epoch": 1.9896373056994818, "grad_norm": 0.13765303790569305, "learning_rate": 1.6700000000000003e-05, "loss": 0.832, "step": 1332 }, { "epoch": 1.9911310274004574, "grad_norm": 0.14016294479370117, "learning_rate": 1.6675000000000002e-05, "loss": 0.8601, "step": 1333 }, { "epoch": 1.992624749101433, "grad_norm": 0.13692569732666016, "learning_rate": 1.665e-05, "loss": 0.8842, "step": 1334 }, { "epoch": 1.9941184708024087, "grad_norm": 0.12565594911575317, "learning_rate": 1.6625e-05, "loss": 0.7945, "step": 1335 }, { "epoch": 1.9956121925033843, "grad_norm": 0.13419552147388458, "learning_rate": 1.66e-05, "loss": 0.883, "step": 1336 }, { "epoch": 1.9971059142043597, "grad_norm": 0.1379178762435913, "learning_rate": 1.6575000000000003e-05, "loss": 0.874, "step": 1337 }, { "epoch": 1.9985996359053355, "grad_norm": 0.13608182966709137, "learning_rate": 1.6550000000000002e-05, "loss": 0.9373, "step": 1338 }, { "epoch": 2.000093357606311, "grad_norm": 0.11911476403474808, "learning_rate": 1.6525000000000002e-05, "loss": 0.7373, "step": 1339 }, { "epoch": 2.0015870793072867, "grad_norm": 0.12470046430826187, "learning_rate": 1.65e-05, "loss": 0.7498, "step": 1340 }, { "epoch": 2.003080801008262, "grad_norm": 0.1347867250442505, "learning_rate": 1.6475e-05, "loss": 0.8324, "step": 1341 }, { "epoch": 2.0045745227092375, "grad_norm": 0.12354327738285065, "learning_rate": 1.645e-05, "loss": 0.8282, "step": 1342 }, { "epoch": 2.0060682444102134, "grad_norm": 0.11837272346019745, "learning_rate": 1.6425000000000003e-05, "loss": 0.7383, "step": 1343 }, { "epoch": 2.0075619661111888, "grad_norm": 0.13040128350257874, "learning_rate": 1.6400000000000002e-05, "loss": 0.916, "step": 1344 }, { "epoch": 2.0090556878121646, "grad_norm": 0.12235967069864273, "learning_rate": 1.6375e-05, "loss": 0.8215, "step": 1345 }, { "epoch": 2.01054940951314, "grad_norm": 0.1411534696817398, "learning_rate": 1.635e-05, "loss": 0.8102, "step": 1346 }, { "epoch": 2.012043131214116, "grad_norm": 0.12864133715629578, "learning_rate": 1.6325e-05, "loss": 0.7554, "step": 1347 }, { "epoch": 2.013536852915091, "grad_norm": 0.13392312824726105, "learning_rate": 1.63e-05, "loss": 0.7465, "step": 1348 }, { "epoch": 2.015030574616067, "grad_norm": 0.1329779028892517, "learning_rate": 1.6275000000000003e-05, "loss": 0.8962, "step": 1349 }, { "epoch": 2.0165242963170424, "grad_norm": 0.1418597549200058, "learning_rate": 1.6250000000000002e-05, "loss": 0.9835, "step": 1350 }, { "epoch": 2.018018018018018, "grad_norm": 0.1308550238609314, "learning_rate": 1.6225e-05, "loss": 0.8747, "step": 1351 }, { "epoch": 2.0195117397189937, "grad_norm": 0.12701186537742615, "learning_rate": 1.62e-05, "loss": 0.8062, "step": 1352 }, { "epoch": 2.021005461419969, "grad_norm": 0.1336052417755127, "learning_rate": 1.6175e-05, "loss": 0.7573, "step": 1353 }, { "epoch": 2.022499183120945, "grad_norm": 0.12314220517873764, "learning_rate": 1.6150000000000003e-05, "loss": 0.8322, "step": 1354 }, { "epoch": 2.0239929048219203, "grad_norm": 0.12207911908626556, "learning_rate": 1.6125000000000002e-05, "loss": 0.8581, "step": 1355 }, { "epoch": 2.025486626522896, "grad_norm": 0.1417931318283081, "learning_rate": 1.6100000000000002e-05, "loss": 0.8845, "step": 1356 }, { "epoch": 2.0269803482238715, "grad_norm": 0.12503816187381744, "learning_rate": 1.6075e-05, "loss": 0.8118, "step": 1357 }, { "epoch": 2.028474069924847, "grad_norm": 0.12786728143692017, "learning_rate": 1.605e-05, "loss": 0.8816, "step": 1358 }, { "epoch": 2.0299677916258227, "grad_norm": 0.11776399612426758, "learning_rate": 1.6025e-05, "loss": 0.6982, "step": 1359 }, { "epoch": 2.031461513326798, "grad_norm": 0.1279592514038086, "learning_rate": 1.6000000000000003e-05, "loss": 0.7332, "step": 1360 }, { "epoch": 2.032955235027774, "grad_norm": 0.12639985978603363, "learning_rate": 1.5975000000000002e-05, "loss": 0.8453, "step": 1361 }, { "epoch": 2.0344489567287494, "grad_norm": 0.1336001455783844, "learning_rate": 1.595e-05, "loss": 0.8203, "step": 1362 }, { "epoch": 2.035942678429725, "grad_norm": 0.13058388233184814, "learning_rate": 1.5925e-05, "loss": 0.9265, "step": 1363 }, { "epoch": 2.0374364001307006, "grad_norm": 0.13588199019432068, "learning_rate": 1.59e-05, "loss": 0.9287, "step": 1364 }, { "epoch": 2.0389301218316764, "grad_norm": 0.12712161242961884, "learning_rate": 1.5875e-05, "loss": 0.8634, "step": 1365 }, { "epoch": 2.040423843532652, "grad_norm": 0.12903352081775665, "learning_rate": 1.5850000000000002e-05, "loss": 0.8376, "step": 1366 }, { "epoch": 2.041917565233627, "grad_norm": 0.13086992502212524, "learning_rate": 1.5825000000000002e-05, "loss": 0.7775, "step": 1367 }, { "epoch": 2.043411286934603, "grad_norm": 0.13522988557815552, "learning_rate": 1.58e-05, "loss": 0.8644, "step": 1368 }, { "epoch": 2.0449050086355784, "grad_norm": 0.13410396873950958, "learning_rate": 1.5775e-05, "loss": 0.886, "step": 1369 }, { "epoch": 2.0463987303365543, "grad_norm": 0.12724365293979645, "learning_rate": 1.575e-05, "loss": 0.7781, "step": 1370 }, { "epoch": 2.0478924520375297, "grad_norm": 0.12271001935005188, "learning_rate": 1.5725e-05, "loss": 0.8206, "step": 1371 }, { "epoch": 2.0493861737385055, "grad_norm": 0.12714345753192902, "learning_rate": 1.5700000000000002e-05, "loss": 0.8121, "step": 1372 }, { "epoch": 2.050879895439481, "grad_norm": 0.14135147631168365, "learning_rate": 1.5675e-05, "loss": 0.8281, "step": 1373 }, { "epoch": 2.0523736171404563, "grad_norm": 0.1263141632080078, "learning_rate": 1.565e-05, "loss": 0.7824, "step": 1374 }, { "epoch": 2.053867338841432, "grad_norm": 0.11844862252473831, "learning_rate": 1.5625e-05, "loss": 0.7982, "step": 1375 }, { "epoch": 2.0553610605424075, "grad_norm": 0.12898267805576324, "learning_rate": 1.56e-05, "loss": 0.8482, "step": 1376 }, { "epoch": 2.0568547822433834, "grad_norm": 0.12399061769247055, "learning_rate": 1.5575e-05, "loss": 0.8564, "step": 1377 }, { "epoch": 2.0583485039443588, "grad_norm": 0.12906433641910553, "learning_rate": 1.5550000000000002e-05, "loss": 0.7325, "step": 1378 }, { "epoch": 2.0598422256453346, "grad_norm": 0.13466913998126984, "learning_rate": 1.5525e-05, "loss": 0.9284, "step": 1379 }, { "epoch": 2.06133594734631, "grad_norm": 0.13662073016166687, "learning_rate": 1.55e-05, "loss": 0.8348, "step": 1380 }, { "epoch": 2.062829669047286, "grad_norm": 0.12854935228824615, "learning_rate": 1.5475e-05, "loss": 0.8749, "step": 1381 }, { "epoch": 2.064323390748261, "grad_norm": 0.1422199159860611, "learning_rate": 1.545e-05, "loss": 1.0178, "step": 1382 }, { "epoch": 2.0658171124492366, "grad_norm": 0.12989145517349243, "learning_rate": 1.5425000000000002e-05, "loss": 0.8947, "step": 1383 }, { "epoch": 2.0673108341502124, "grad_norm": 0.12174545228481293, "learning_rate": 1.54e-05, "loss": 0.7956, "step": 1384 }, { "epoch": 2.068804555851188, "grad_norm": 0.12398344278335571, "learning_rate": 1.5375e-05, "loss": 0.8156, "step": 1385 }, { "epoch": 2.0702982775521637, "grad_norm": 0.13565067946910858, "learning_rate": 1.535e-05, "loss": 0.8707, "step": 1386 }, { "epoch": 2.071791999253139, "grad_norm": 0.12309794872999191, "learning_rate": 1.5325e-05, "loss": 0.821, "step": 1387 }, { "epoch": 2.073285720954115, "grad_norm": 0.14446188509464264, "learning_rate": 1.53e-05, "loss": 0.9476, "step": 1388 }, { "epoch": 2.0747794426550903, "grad_norm": 0.14990466833114624, "learning_rate": 1.5275000000000002e-05, "loss": 0.8043, "step": 1389 }, { "epoch": 2.076273164356066, "grad_norm": 0.12200348824262619, "learning_rate": 1.525e-05, "loss": 0.7749, "step": 1390 }, { "epoch": 2.0777668860570415, "grad_norm": 0.12738126516342163, "learning_rate": 1.5225e-05, "loss": 0.8116, "step": 1391 }, { "epoch": 2.079260607758017, "grad_norm": 0.1419604867696762, "learning_rate": 1.52e-05, "loss": 0.8918, "step": 1392 }, { "epoch": 2.0807543294589927, "grad_norm": 0.12958431243896484, "learning_rate": 1.5175e-05, "loss": 0.8097, "step": 1393 }, { "epoch": 2.082248051159968, "grad_norm": 0.13025572896003723, "learning_rate": 1.515e-05, "loss": 0.8866, "step": 1394 }, { "epoch": 2.083741772860944, "grad_norm": 0.12287663668394089, "learning_rate": 1.5125e-05, "loss": 0.7733, "step": 1395 }, { "epoch": 2.0852354945619194, "grad_norm": 0.1355210542678833, "learning_rate": 1.51e-05, "loss": 0.8116, "step": 1396 }, { "epoch": 2.086729216262895, "grad_norm": 0.1378578096628189, "learning_rate": 1.5075e-05, "loss": 0.8556, "step": 1397 }, { "epoch": 2.0882229379638706, "grad_norm": 0.12839314341545105, "learning_rate": 1.505e-05, "loss": 0.7969, "step": 1398 }, { "epoch": 2.089716659664846, "grad_norm": 0.1289265751838684, "learning_rate": 1.5025000000000001e-05, "loss": 0.7984, "step": 1399 }, { "epoch": 2.091210381365822, "grad_norm": 0.1375155746936798, "learning_rate": 1.5e-05, "loss": 0.8051, "step": 1400 }, { "epoch": 2.092704103066797, "grad_norm": 0.12847323715686798, "learning_rate": 1.4975e-05, "loss": 0.7894, "step": 1401 }, { "epoch": 2.094197824767773, "grad_norm": 0.13754858076572418, "learning_rate": 1.4950000000000001e-05, "loss": 0.8072, "step": 1402 }, { "epoch": 2.0956915464687484, "grad_norm": 0.13762550055980682, "learning_rate": 1.4925e-05, "loss": 0.9103, "step": 1403 }, { "epoch": 2.0971852681697243, "grad_norm": 0.14499983191490173, "learning_rate": 1.49e-05, "loss": 0.8145, "step": 1404 }, { "epoch": 2.0986789898706997, "grad_norm": 0.13695576786994934, "learning_rate": 1.4875e-05, "loss": 0.8757, "step": 1405 }, { "epoch": 2.1001727115716755, "grad_norm": 0.12869825959205627, "learning_rate": 1.485e-05, "loss": 0.8387, "step": 1406 }, { "epoch": 2.101666433272651, "grad_norm": 0.13788387179374695, "learning_rate": 1.4825e-05, "loss": 0.827, "step": 1407 }, { "epoch": 2.1031601549736263, "grad_norm": 0.1289745718240738, "learning_rate": 1.48e-05, "loss": 0.7971, "step": 1408 }, { "epoch": 2.104653876674602, "grad_norm": 0.11975058913230896, "learning_rate": 1.4775e-05, "loss": 0.7306, "step": 1409 }, { "epoch": 2.1061475983755775, "grad_norm": 0.12088936567306519, "learning_rate": 1.475e-05, "loss": 0.709, "step": 1410 }, { "epoch": 2.1076413200765534, "grad_norm": 0.12805478274822235, "learning_rate": 1.4725e-05, "loss": 0.8209, "step": 1411 }, { "epoch": 2.1091350417775288, "grad_norm": 0.12449615448713303, "learning_rate": 1.47e-05, "loss": 0.7232, "step": 1412 }, { "epoch": 2.1106287634785046, "grad_norm": 0.11938970535993576, "learning_rate": 1.4675e-05, "loss": 0.7462, "step": 1413 }, { "epoch": 2.11212248517948, "grad_norm": 0.13330097496509552, "learning_rate": 1.465e-05, "loss": 0.9104, "step": 1414 }, { "epoch": 2.1136162068804554, "grad_norm": 0.12861578166484833, "learning_rate": 1.4625e-05, "loss": 0.8621, "step": 1415 }, { "epoch": 2.115109928581431, "grad_norm": 0.13145603239536285, "learning_rate": 1.4599999999999999e-05, "loss": 0.7701, "step": 1416 }, { "epoch": 2.1166036502824066, "grad_norm": 0.1292240172624588, "learning_rate": 1.4575e-05, "loss": 0.7475, "step": 1417 }, { "epoch": 2.1180973719833824, "grad_norm": 0.12755291163921356, "learning_rate": 1.455e-05, "loss": 0.9183, "step": 1418 }, { "epoch": 2.119591093684358, "grad_norm": 0.13652198016643524, "learning_rate": 1.4524999999999999e-05, "loss": 0.712, "step": 1419 }, { "epoch": 2.1210848153853337, "grad_norm": 0.1325538456439972, "learning_rate": 1.45e-05, "loss": 0.8302, "step": 1420 }, { "epoch": 2.122578537086309, "grad_norm": 0.12699578702449799, "learning_rate": 1.4475e-05, "loss": 0.8448, "step": 1421 }, { "epoch": 2.124072258787285, "grad_norm": 0.13657280802726746, "learning_rate": 1.4449999999999999e-05, "loss": 0.9318, "step": 1422 }, { "epoch": 2.1255659804882603, "grad_norm": 0.1333833485841751, "learning_rate": 1.4425e-05, "loss": 0.8297, "step": 1423 }, { "epoch": 2.1270597021892357, "grad_norm": 0.12829788029193878, "learning_rate": 1.44e-05, "loss": 0.8125, "step": 1424 }, { "epoch": 2.1285534238902115, "grad_norm": 0.14383582770824432, "learning_rate": 1.4374999999999999e-05, "loss": 0.8649, "step": 1425 }, { "epoch": 2.130047145591187, "grad_norm": 0.13525085151195526, "learning_rate": 1.435e-05, "loss": 0.8231, "step": 1426 }, { "epoch": 2.1315408672921627, "grad_norm": 0.1254875212907791, "learning_rate": 1.4325e-05, "loss": 0.8024, "step": 1427 }, { "epoch": 2.133034588993138, "grad_norm": 0.1222248375415802, "learning_rate": 1.43e-05, "loss": 0.8979, "step": 1428 }, { "epoch": 2.134528310694114, "grad_norm": 0.1383056342601776, "learning_rate": 1.4275e-05, "loss": 0.8083, "step": 1429 }, { "epoch": 2.1360220323950894, "grad_norm": 0.13105610013008118, "learning_rate": 1.4249999999999999e-05, "loss": 0.8093, "step": 1430 }, { "epoch": 2.137515754096065, "grad_norm": 0.13794785737991333, "learning_rate": 1.4225e-05, "loss": 0.9126, "step": 1431 }, { "epoch": 2.1390094757970406, "grad_norm": 0.12733937799930573, "learning_rate": 1.42e-05, "loss": 0.7543, "step": 1432 }, { "epoch": 2.140503197498016, "grad_norm": 0.13362152874469757, "learning_rate": 1.4174999999999999e-05, "loss": 0.8706, "step": 1433 }, { "epoch": 2.141996919198992, "grad_norm": 0.14220747351646423, "learning_rate": 1.415e-05, "loss": 0.931, "step": 1434 }, { "epoch": 2.143490640899967, "grad_norm": 0.12921415269374847, "learning_rate": 1.4125e-05, "loss": 0.8556, "step": 1435 }, { "epoch": 2.144984362600943, "grad_norm": 0.1378594934940338, "learning_rate": 1.4099999999999999e-05, "loss": 0.8925, "step": 1436 }, { "epoch": 2.1464780843019184, "grad_norm": 0.14052025973796844, "learning_rate": 1.4075e-05, "loss": 0.8033, "step": 1437 }, { "epoch": 2.1479718060028943, "grad_norm": 0.13060006499290466, "learning_rate": 1.4050000000000003e-05, "loss": 0.9185, "step": 1438 }, { "epoch": 2.1494655277038697, "grad_norm": 0.1500641405582428, "learning_rate": 1.4025000000000002e-05, "loss": 0.7899, "step": 1439 }, { "epoch": 2.150959249404845, "grad_norm": 0.13628552854061127, "learning_rate": 1.4000000000000001e-05, "loss": 0.8667, "step": 1440 }, { "epoch": 2.152452971105821, "grad_norm": 0.12043400853872299, "learning_rate": 1.3975000000000003e-05, "loss": 0.7907, "step": 1441 }, { "epoch": 2.1539466928067963, "grad_norm": 0.13282616436481476, "learning_rate": 1.3950000000000002e-05, "loss": 0.9235, "step": 1442 }, { "epoch": 2.155440414507772, "grad_norm": 0.13222989439964294, "learning_rate": 1.3925000000000001e-05, "loss": 0.6996, "step": 1443 }, { "epoch": 2.1569341362087475, "grad_norm": 0.13287708163261414, "learning_rate": 1.3900000000000002e-05, "loss": 0.8012, "step": 1444 }, { "epoch": 2.1584278579097234, "grad_norm": 0.14189693331718445, "learning_rate": 1.3875000000000002e-05, "loss": 0.8597, "step": 1445 }, { "epoch": 2.1599215796106987, "grad_norm": 0.1403769552707672, "learning_rate": 1.3850000000000001e-05, "loss": 0.9287, "step": 1446 }, { "epoch": 2.161415301311674, "grad_norm": 0.1361783742904663, "learning_rate": 1.3825000000000002e-05, "loss": 0.7086, "step": 1447 }, { "epoch": 2.16290902301265, "grad_norm": 0.1325969249010086, "learning_rate": 1.3800000000000002e-05, "loss": 0.8662, "step": 1448 }, { "epoch": 2.1644027447136254, "grad_norm": 0.14067314565181732, "learning_rate": 1.3775000000000001e-05, "loss": 0.857, "step": 1449 }, { "epoch": 2.165896466414601, "grad_norm": 0.13178247213363647, "learning_rate": 1.3750000000000002e-05, "loss": 0.8393, "step": 1450 }, { "epoch": 2.1673901881155766, "grad_norm": 0.13023096323013306, "learning_rate": 1.3725000000000002e-05, "loss": 0.8393, "step": 1451 }, { "epoch": 2.1688839098165524, "grad_norm": 0.12503521144390106, "learning_rate": 1.3700000000000001e-05, "loss": 0.7808, "step": 1452 }, { "epoch": 2.170377631517528, "grad_norm": 0.1315786987543106, "learning_rate": 1.3675000000000002e-05, "loss": 0.905, "step": 1453 }, { "epoch": 2.1718713532185037, "grad_norm": 0.13432063162326813, "learning_rate": 1.3650000000000001e-05, "loss": 0.9221, "step": 1454 }, { "epoch": 2.173365074919479, "grad_norm": 0.12739317119121552, "learning_rate": 1.3625e-05, "loss": 0.8364, "step": 1455 }, { "epoch": 2.1748587966204544, "grad_norm": 0.1363159716129303, "learning_rate": 1.3600000000000002e-05, "loss": 0.8688, "step": 1456 }, { "epoch": 2.1763525183214303, "grad_norm": 0.13407284021377563, "learning_rate": 1.3575000000000001e-05, "loss": 0.8484, "step": 1457 }, { "epoch": 2.1778462400224057, "grad_norm": 0.13221803307533264, "learning_rate": 1.3550000000000002e-05, "loss": 0.8769, "step": 1458 }, { "epoch": 2.1793399617233815, "grad_norm": 0.13646693527698517, "learning_rate": 1.3525000000000002e-05, "loss": 0.7731, "step": 1459 }, { "epoch": 2.180833683424357, "grad_norm": 0.15357060730457306, "learning_rate": 1.3500000000000001e-05, "loss": 0.9013, "step": 1460 }, { "epoch": 2.1823274051253327, "grad_norm": 0.13165098428726196, "learning_rate": 1.3475000000000002e-05, "loss": 0.7911, "step": 1461 }, { "epoch": 2.183821126826308, "grad_norm": 0.14789406955242157, "learning_rate": 1.3450000000000002e-05, "loss": 0.8847, "step": 1462 }, { "epoch": 2.185314848527284, "grad_norm": 0.13142547011375427, "learning_rate": 1.3425000000000001e-05, "loss": 0.8257, "step": 1463 }, { "epoch": 2.1868085702282594, "grad_norm": 0.12578803300857544, "learning_rate": 1.3400000000000002e-05, "loss": 0.8378, "step": 1464 }, { "epoch": 2.1883022919292348, "grad_norm": 0.12787596881389618, "learning_rate": 1.3375000000000002e-05, "loss": 0.7567, "step": 1465 }, { "epoch": 2.1897960136302106, "grad_norm": 0.1288217008113861, "learning_rate": 1.3350000000000001e-05, "loss": 0.8652, "step": 1466 }, { "epoch": 2.191289735331186, "grad_norm": 0.1354709267616272, "learning_rate": 1.3325000000000002e-05, "loss": 0.7819, "step": 1467 }, { "epoch": 2.192783457032162, "grad_norm": 0.11878442019224167, "learning_rate": 1.3300000000000001e-05, "loss": 0.6858, "step": 1468 }, { "epoch": 2.194277178733137, "grad_norm": 0.1263943463563919, "learning_rate": 1.3275e-05, "loss": 0.78, "step": 1469 }, { "epoch": 2.195770900434113, "grad_norm": 0.15008960664272308, "learning_rate": 1.3250000000000002e-05, "loss": 0.8798, "step": 1470 }, { "epoch": 2.1972646221350884, "grad_norm": 0.13995765149593353, "learning_rate": 1.3225000000000001e-05, "loss": 0.9317, "step": 1471 }, { "epoch": 2.1987583438360643, "grad_norm": 0.12223789095878601, "learning_rate": 1.32e-05, "loss": 0.6894, "step": 1472 }, { "epoch": 2.2002520655370397, "grad_norm": 0.12036357820034027, "learning_rate": 1.3175000000000002e-05, "loss": 0.863, "step": 1473 }, { "epoch": 2.201745787238015, "grad_norm": 0.13175898790359497, "learning_rate": 1.3150000000000001e-05, "loss": 0.8377, "step": 1474 }, { "epoch": 2.203239508938991, "grad_norm": 0.13795016705989838, "learning_rate": 1.3125e-05, "loss": 0.8036, "step": 1475 }, { "epoch": 2.2047332306399663, "grad_norm": 0.12804925441741943, "learning_rate": 1.3100000000000002e-05, "loss": 0.7926, "step": 1476 }, { "epoch": 2.206226952340942, "grad_norm": 0.12702885270118713, "learning_rate": 1.3075000000000001e-05, "loss": 0.8037, "step": 1477 }, { "epoch": 2.2077206740419175, "grad_norm": 0.12805096805095673, "learning_rate": 1.305e-05, "loss": 0.8074, "step": 1478 }, { "epoch": 2.2092143957428934, "grad_norm": 0.1372733861207962, "learning_rate": 1.3025000000000002e-05, "loss": 0.803, "step": 1479 }, { "epoch": 2.2107081174438687, "grad_norm": 0.1263812780380249, "learning_rate": 1.3000000000000001e-05, "loss": 0.8167, "step": 1480 }, { "epoch": 2.212201839144844, "grad_norm": 0.13209514319896698, "learning_rate": 1.2975e-05, "loss": 0.8588, "step": 1481 }, { "epoch": 2.21369556084582, "grad_norm": 0.14626124501228333, "learning_rate": 1.2950000000000001e-05, "loss": 0.9021, "step": 1482 }, { "epoch": 2.2151892825467954, "grad_norm": 0.13836370408535004, "learning_rate": 1.2925e-05, "loss": 0.8521, "step": 1483 }, { "epoch": 2.216683004247771, "grad_norm": 0.13727393746376038, "learning_rate": 1.29e-05, "loss": 0.7701, "step": 1484 }, { "epoch": 2.2181767259487466, "grad_norm": 0.13722606003284454, "learning_rate": 1.2875000000000001e-05, "loss": 0.8687, "step": 1485 }, { "epoch": 2.2196704476497224, "grad_norm": 0.13781701028347015, "learning_rate": 1.285e-05, "loss": 0.832, "step": 1486 }, { "epoch": 2.221164169350698, "grad_norm": 0.13844557106494904, "learning_rate": 1.2825000000000002e-05, "loss": 0.855, "step": 1487 }, { "epoch": 2.222657891051673, "grad_norm": 0.13126686215400696, "learning_rate": 1.2800000000000001e-05, "loss": 0.8757, "step": 1488 }, { "epoch": 2.224151612752649, "grad_norm": 0.1271667331457138, "learning_rate": 1.2775e-05, "loss": 0.7929, "step": 1489 }, { "epoch": 2.2256453344536244, "grad_norm": 0.12975314259529114, "learning_rate": 1.2750000000000002e-05, "loss": 0.8664, "step": 1490 }, { "epoch": 2.2271390561546003, "grad_norm": 0.1469721794128418, "learning_rate": 1.2725000000000001e-05, "loss": 0.7985, "step": 1491 }, { "epoch": 2.2286327778555757, "grad_norm": 0.13200722634792328, "learning_rate": 1.27e-05, "loss": 0.8971, "step": 1492 }, { "epoch": 2.2301264995565515, "grad_norm": 0.1337095946073532, "learning_rate": 1.2675000000000001e-05, "loss": 0.9615, "step": 1493 }, { "epoch": 2.231620221257527, "grad_norm": 0.14528469741344452, "learning_rate": 1.2650000000000001e-05, "loss": 0.9174, "step": 1494 }, { "epoch": 2.2331139429585027, "grad_norm": 0.1322767287492752, "learning_rate": 1.2625e-05, "loss": 0.8103, "step": 1495 }, { "epoch": 2.234607664659478, "grad_norm": 0.12214113771915436, "learning_rate": 1.2600000000000001e-05, "loss": 0.8592, "step": 1496 }, { "epoch": 2.2361013863604535, "grad_norm": 0.12879252433776855, "learning_rate": 1.2575e-05, "loss": 0.8493, "step": 1497 }, { "epoch": 2.2375951080614294, "grad_norm": 0.13059349358081818, "learning_rate": 1.255e-05, "loss": 0.833, "step": 1498 }, { "epoch": 2.2390888297624048, "grad_norm": 0.12225184589624405, "learning_rate": 1.2525000000000001e-05, "loss": 0.8357, "step": 1499 }, { "epoch": 2.2405825514633806, "grad_norm": 0.15062668919563293, "learning_rate": 1.25e-05, "loss": 0.8132, "step": 1500 }, { "epoch": 2.242076273164356, "grad_norm": 0.125452920794487, "learning_rate": 1.2475e-05, "loss": 0.8055, "step": 1501 }, { "epoch": 2.243569994865332, "grad_norm": 0.12242195010185242, "learning_rate": 1.2450000000000001e-05, "loss": 0.7315, "step": 1502 }, { "epoch": 2.245063716566307, "grad_norm": 0.1343228965997696, "learning_rate": 1.2425e-05, "loss": 0.8425, "step": 1503 }, { "epoch": 2.246557438267283, "grad_norm": 0.12163127213716507, "learning_rate": 1.24e-05, "loss": 0.7227, "step": 1504 }, { "epoch": 2.2480511599682584, "grad_norm": 0.1247539222240448, "learning_rate": 1.2375000000000001e-05, "loss": 0.7534, "step": 1505 }, { "epoch": 2.249544881669234, "grad_norm": 0.14668796956539154, "learning_rate": 1.235e-05, "loss": 0.7948, "step": 1506 }, { "epoch": 2.2510386033702097, "grad_norm": 0.14860163629055023, "learning_rate": 1.2325e-05, "loss": 0.8072, "step": 1507 }, { "epoch": 2.252532325071185, "grad_norm": 0.12285487353801727, "learning_rate": 1.23e-05, "loss": 0.774, "step": 1508 }, { "epoch": 2.254026046772161, "grad_norm": 0.12565334141254425, "learning_rate": 1.2275e-05, "loss": 0.7623, "step": 1509 }, { "epoch": 2.2555197684731363, "grad_norm": 0.1210622787475586, "learning_rate": 1.225e-05, "loss": 0.8316, "step": 1510 }, { "epoch": 2.257013490174112, "grad_norm": 0.12317316234111786, "learning_rate": 1.2225e-05, "loss": 0.74, "step": 1511 }, { "epoch": 2.2585072118750875, "grad_norm": 0.1293085664510727, "learning_rate": 1.22e-05, "loss": 0.8303, "step": 1512 }, { "epoch": 2.2600009335760634, "grad_norm": 0.1413058191537857, "learning_rate": 1.2175e-05, "loss": 0.8567, "step": 1513 }, { "epoch": 2.2614946552770387, "grad_norm": 0.12485679984092712, "learning_rate": 1.215e-05, "loss": 0.8409, "step": 1514 }, { "epoch": 2.262988376978014, "grad_norm": 0.11647962033748627, "learning_rate": 1.2125e-05, "loss": 0.7703, "step": 1515 }, { "epoch": 2.26448209867899, "grad_norm": 0.1256875842809677, "learning_rate": 1.2100000000000001e-05, "loss": 0.7425, "step": 1516 }, { "epoch": 2.2659758203799654, "grad_norm": 0.1279749721288681, "learning_rate": 1.2075e-05, "loss": 0.8077, "step": 1517 }, { "epoch": 2.267469542080941, "grad_norm": 0.14736074209213257, "learning_rate": 1.205e-05, "loss": 1.0191, "step": 1518 }, { "epoch": 2.2689632637819166, "grad_norm": 0.13851991295814514, "learning_rate": 1.2025000000000001e-05, "loss": 0.7886, "step": 1519 }, { "epoch": 2.270456985482892, "grad_norm": 0.13012689352035522, "learning_rate": 1.2e-05, "loss": 0.8249, "step": 1520 }, { "epoch": 2.271950707183868, "grad_norm": 0.134495809674263, "learning_rate": 1.1975e-05, "loss": 0.8388, "step": 1521 }, { "epoch": 2.273444428884843, "grad_norm": 0.12851493060588837, "learning_rate": 1.195e-05, "loss": 0.8027, "step": 1522 }, { "epoch": 2.274938150585819, "grad_norm": 0.1296277493238449, "learning_rate": 1.1925e-05, "loss": 0.8694, "step": 1523 }, { "epoch": 2.2764318722867944, "grad_norm": 0.14748579263687134, "learning_rate": 1.19e-05, "loss": 0.8419, "step": 1524 }, { "epoch": 2.2779255939877703, "grad_norm": 0.1285569965839386, "learning_rate": 1.1875e-05, "loss": 0.7648, "step": 1525 }, { "epoch": 2.2794193156887457, "grad_norm": 0.13195540010929108, "learning_rate": 1.185e-05, "loss": 0.8324, "step": 1526 }, { "epoch": 2.2809130373897215, "grad_norm": 0.134907066822052, "learning_rate": 1.1825e-05, "loss": 0.8484, "step": 1527 }, { "epoch": 2.282406759090697, "grad_norm": 0.12359859049320221, "learning_rate": 1.18e-05, "loss": 0.8093, "step": 1528 }, { "epoch": 2.2839004807916723, "grad_norm": 0.13489587604999542, "learning_rate": 1.1775e-05, "loss": 0.7688, "step": 1529 }, { "epoch": 2.285394202492648, "grad_norm": 0.13822411000728607, "learning_rate": 1.175e-05, "loss": 0.8448, "step": 1530 }, { "epoch": 2.2868879241936235, "grad_norm": 0.12085796892642975, "learning_rate": 1.1725e-05, "loss": 0.7608, "step": 1531 }, { "epoch": 2.2883816458945994, "grad_norm": 0.13494689762592316, "learning_rate": 1.1700000000000001e-05, "loss": 0.7933, "step": 1532 }, { "epoch": 2.2898753675955748, "grad_norm": 0.14290502667427063, "learning_rate": 1.1675000000000001e-05, "loss": 0.8919, "step": 1533 }, { "epoch": 2.2913690892965506, "grad_norm": 0.12424629926681519, "learning_rate": 1.1650000000000002e-05, "loss": 0.7339, "step": 1534 }, { "epoch": 2.292862810997526, "grad_norm": 0.13355375826358795, "learning_rate": 1.1625000000000001e-05, "loss": 0.8326, "step": 1535 }, { "epoch": 2.294356532698502, "grad_norm": 0.12734705209732056, "learning_rate": 1.16e-05, "loss": 0.8711, "step": 1536 }, { "epoch": 2.295850254399477, "grad_norm": 0.1327841728925705, "learning_rate": 1.1575000000000002e-05, "loss": 0.7971, "step": 1537 }, { "epoch": 2.2973439761004526, "grad_norm": 0.13338157534599304, "learning_rate": 1.1550000000000001e-05, "loss": 0.8664, "step": 1538 }, { "epoch": 2.2988376978014284, "grad_norm": 0.12820352613925934, "learning_rate": 1.1525e-05, "loss": 0.8371, "step": 1539 }, { "epoch": 2.300331419502404, "grad_norm": 0.14329390227794647, "learning_rate": 1.1500000000000002e-05, "loss": 0.9221, "step": 1540 }, { "epoch": 2.3018251412033797, "grad_norm": 0.1253558248281479, "learning_rate": 1.1475000000000001e-05, "loss": 0.73, "step": 1541 }, { "epoch": 2.303318862904355, "grad_norm": 0.1411030888557434, "learning_rate": 1.145e-05, "loss": 0.8466, "step": 1542 }, { "epoch": 2.304812584605331, "grad_norm": 0.1364670991897583, "learning_rate": 1.1425000000000002e-05, "loss": 0.9492, "step": 1543 }, { "epoch": 2.3063063063063063, "grad_norm": 0.13084156811237335, "learning_rate": 1.1400000000000001e-05, "loss": 0.7134, "step": 1544 }, { "epoch": 2.307800028007282, "grad_norm": 0.1440039724111557, "learning_rate": 1.1375e-05, "loss": 0.8756, "step": 1545 }, { "epoch": 2.3092937497082575, "grad_norm": 0.1258358210325241, "learning_rate": 1.1350000000000001e-05, "loss": 0.7444, "step": 1546 }, { "epoch": 2.310787471409233, "grad_norm": 0.13377144932746887, "learning_rate": 1.1325e-05, "loss": 0.7829, "step": 1547 }, { "epoch": 2.3122811931102087, "grad_norm": 0.13118520379066467, "learning_rate": 1.13e-05, "loss": 0.7469, "step": 1548 }, { "epoch": 2.313774914811184, "grad_norm": 0.1317671835422516, "learning_rate": 1.1275000000000001e-05, "loss": 0.8361, "step": 1549 }, { "epoch": 2.31526863651216, "grad_norm": 0.12405332922935486, "learning_rate": 1.125e-05, "loss": 0.803, "step": 1550 }, { "epoch": 2.3167623582131354, "grad_norm": 0.12751910090446472, "learning_rate": 1.1225e-05, "loss": 0.8473, "step": 1551 }, { "epoch": 2.3182560799141108, "grad_norm": 0.1287851333618164, "learning_rate": 1.1200000000000001e-05, "loss": 0.8075, "step": 1552 }, { "epoch": 2.3197498016150866, "grad_norm": 0.12435499578714371, "learning_rate": 1.1175e-05, "loss": 0.722, "step": 1553 }, { "epoch": 2.321243523316062, "grad_norm": 0.1304754912853241, "learning_rate": 1.115e-05, "loss": 0.8742, "step": 1554 }, { "epoch": 2.322737245017038, "grad_norm": 0.1294432133436203, "learning_rate": 1.1125000000000001e-05, "loss": 0.8873, "step": 1555 }, { "epoch": 2.324230966718013, "grad_norm": 0.13354115188121796, "learning_rate": 1.11e-05, "loss": 0.8142, "step": 1556 }, { "epoch": 2.325724688418989, "grad_norm": 0.13450966775417328, "learning_rate": 1.1075e-05, "loss": 0.8196, "step": 1557 }, { "epoch": 2.3272184101199644, "grad_norm": 0.1310683786869049, "learning_rate": 1.1050000000000001e-05, "loss": 0.7981, "step": 1558 }, { "epoch": 2.3287121318209403, "grad_norm": 0.13599038124084473, "learning_rate": 1.1025e-05, "loss": 0.8211, "step": 1559 }, { "epoch": 2.3302058535219157, "grad_norm": 0.13422448933124542, "learning_rate": 1.1000000000000001e-05, "loss": 0.9288, "step": 1560 }, { "epoch": 2.331699575222891, "grad_norm": 0.15591545403003693, "learning_rate": 1.0975e-05, "loss": 0.8726, "step": 1561 }, { "epoch": 2.333193296923867, "grad_norm": 0.1375562995672226, "learning_rate": 1.095e-05, "loss": 0.7755, "step": 1562 }, { "epoch": 2.3346870186248423, "grad_norm": 0.12302491068840027, "learning_rate": 1.0925000000000001e-05, "loss": 0.8471, "step": 1563 }, { "epoch": 2.336180740325818, "grad_norm": 0.14004065096378326, "learning_rate": 1.09e-05, "loss": 0.9542, "step": 1564 }, { "epoch": 2.3376744620267935, "grad_norm": 0.12543515861034393, "learning_rate": 1.0875e-05, "loss": 0.8742, "step": 1565 }, { "epoch": 2.3391681837277694, "grad_norm": 0.13544298708438873, "learning_rate": 1.0850000000000001e-05, "loss": 0.9459, "step": 1566 }, { "epoch": 2.3406619054287447, "grad_norm": 0.12574240565299988, "learning_rate": 1.0825e-05, "loss": 0.7478, "step": 1567 }, { "epoch": 2.3421556271297206, "grad_norm": 0.12070362269878387, "learning_rate": 1.08e-05, "loss": 0.7969, "step": 1568 }, { "epoch": 2.343649348830696, "grad_norm": 0.13169358670711517, "learning_rate": 1.0775000000000001e-05, "loss": 0.8411, "step": 1569 }, { "epoch": 2.3451430705316714, "grad_norm": 0.12987551093101501, "learning_rate": 1.075e-05, "loss": 0.7038, "step": 1570 }, { "epoch": 2.346636792232647, "grad_norm": 0.12777556478977203, "learning_rate": 1.0725e-05, "loss": 0.8424, "step": 1571 }, { "epoch": 2.3481305139336226, "grad_norm": 0.1309777796268463, "learning_rate": 1.0700000000000001e-05, "loss": 0.7709, "step": 1572 }, { "epoch": 2.3496242356345984, "grad_norm": 0.12878122925758362, "learning_rate": 1.0675e-05, "loss": 0.8038, "step": 1573 }, { "epoch": 2.351117957335574, "grad_norm": 0.13598176836967468, "learning_rate": 1.065e-05, "loss": 0.8192, "step": 1574 }, { "epoch": 2.3526116790365497, "grad_norm": 0.1476617306470871, "learning_rate": 1.0625e-05, "loss": 0.9551, "step": 1575 }, { "epoch": 2.354105400737525, "grad_norm": 0.1347997635602951, "learning_rate": 1.06e-05, "loss": 0.7775, "step": 1576 }, { "epoch": 2.355599122438501, "grad_norm": 0.13012950122356415, "learning_rate": 1.0575e-05, "loss": 0.8029, "step": 1577 }, { "epoch": 2.3570928441394763, "grad_norm": 0.14514939486980438, "learning_rate": 1.055e-05, "loss": 0.9066, "step": 1578 }, { "epoch": 2.3585865658404517, "grad_norm": 0.14387516677379608, "learning_rate": 1.0525e-05, "loss": 0.9254, "step": 1579 }, { "epoch": 2.3600802875414275, "grad_norm": 0.14368614554405212, "learning_rate": 1.05e-05, "loss": 0.8399, "step": 1580 }, { "epoch": 2.361574009242403, "grad_norm": 0.14390935003757477, "learning_rate": 1.0475e-05, "loss": 0.9815, "step": 1581 }, { "epoch": 2.3630677309433787, "grad_norm": 0.12406496703624725, "learning_rate": 1.045e-05, "loss": 0.7964, "step": 1582 }, { "epoch": 2.364561452644354, "grad_norm": 0.13632139563560486, "learning_rate": 1.0425e-05, "loss": 0.9199, "step": 1583 }, { "epoch": 2.36605517434533, "grad_norm": 0.1363624632358551, "learning_rate": 1.04e-05, "loss": 0.8061, "step": 1584 }, { "epoch": 2.3675488960463054, "grad_norm": 0.12627658247947693, "learning_rate": 1.0375e-05, "loss": 0.8601, "step": 1585 }, { "epoch": 2.369042617747281, "grad_norm": 0.13970385491847992, "learning_rate": 1.035e-05, "loss": 0.7839, "step": 1586 }, { "epoch": 2.3705363394482566, "grad_norm": 0.1380336433649063, "learning_rate": 1.0325e-05, "loss": 0.835, "step": 1587 }, { "epoch": 2.372030061149232, "grad_norm": 0.15103888511657715, "learning_rate": 1.03e-05, "loss": 0.9189, "step": 1588 }, { "epoch": 2.373523782850208, "grad_norm": 0.14437958598136902, "learning_rate": 1.0275e-05, "loss": 0.8294, "step": 1589 }, { "epoch": 2.375017504551183, "grad_norm": 0.12611033022403717, "learning_rate": 1.025e-05, "loss": 0.7672, "step": 1590 }, { "epoch": 2.376511226252159, "grad_norm": 0.12859007716178894, "learning_rate": 1.0225e-05, "loss": 0.8372, "step": 1591 }, { "epoch": 2.3780049479531344, "grad_norm": 0.13344968855381012, "learning_rate": 1.02e-05, "loss": 0.7956, "step": 1592 }, { "epoch": 2.37949866965411, "grad_norm": 0.12313632667064667, "learning_rate": 1.0175e-05, "loss": 0.8611, "step": 1593 }, { "epoch": 2.3809923913550857, "grad_norm": 0.13681364059448242, "learning_rate": 1.0150000000000001e-05, "loss": 0.755, "step": 1594 }, { "epoch": 2.382486113056061, "grad_norm": 0.12807931005954742, "learning_rate": 1.0125e-05, "loss": 0.8039, "step": 1595 }, { "epoch": 2.383979834757037, "grad_norm": 0.13871797919273376, "learning_rate": 1.0100000000000002e-05, "loss": 0.8937, "step": 1596 }, { "epoch": 2.3854735564580123, "grad_norm": 0.13878598809242249, "learning_rate": 1.0075000000000001e-05, "loss": 0.8795, "step": 1597 }, { "epoch": 2.386967278158988, "grad_norm": 0.14163492619991302, "learning_rate": 1.005e-05, "loss": 0.8052, "step": 1598 }, { "epoch": 2.3884609998599635, "grad_norm": 0.13997888565063477, "learning_rate": 1.0025000000000001e-05, "loss": 0.7738, "step": 1599 }, { "epoch": 2.3899547215609394, "grad_norm": 0.13275523483753204, "learning_rate": 1e-05, "loss": 0.9174, "step": 1600 }, { "epoch": 2.3914484432619147, "grad_norm": 0.14010214805603027, "learning_rate": 9.975e-06, "loss": 0.9595, "step": 1601 }, { "epoch": 2.39294216496289, "grad_norm": 0.12675809860229492, "learning_rate": 9.950000000000001e-06, "loss": 0.6569, "step": 1602 }, { "epoch": 2.394435886663866, "grad_norm": 0.13883498311042786, "learning_rate": 9.925e-06, "loss": 0.9202, "step": 1603 }, { "epoch": 2.3959296083648414, "grad_norm": 0.13949346542358398, "learning_rate": 9.900000000000002e-06, "loss": 0.8668, "step": 1604 }, { "epoch": 2.397423330065817, "grad_norm": 0.1440535932779312, "learning_rate": 9.875000000000001e-06, "loss": 0.9574, "step": 1605 }, { "epoch": 2.3989170517667926, "grad_norm": 0.14349329471588135, "learning_rate": 9.85e-06, "loss": 0.9312, "step": 1606 }, { "epoch": 2.4004107734677684, "grad_norm": 0.12184840440750122, "learning_rate": 9.825000000000002e-06, "loss": 0.6692, "step": 1607 }, { "epoch": 2.401904495168744, "grad_norm": 0.1365250200033188, "learning_rate": 9.800000000000001e-06, "loss": 0.7819, "step": 1608 }, { "epoch": 2.4033982168697197, "grad_norm": 0.12792789936065674, "learning_rate": 9.775e-06, "loss": 0.7816, "step": 1609 }, { "epoch": 2.404891938570695, "grad_norm": 0.1326299011707306, "learning_rate": 9.750000000000002e-06, "loss": 0.801, "step": 1610 }, { "epoch": 2.4063856602716704, "grad_norm": 0.13100428879261017, "learning_rate": 9.725000000000001e-06, "loss": 0.8571, "step": 1611 }, { "epoch": 2.4078793819726463, "grad_norm": 0.13497088849544525, "learning_rate": 9.7e-06, "loss": 0.8218, "step": 1612 }, { "epoch": 2.4093731036736217, "grad_norm": 0.1401243507862091, "learning_rate": 9.675000000000001e-06, "loss": 0.891, "step": 1613 }, { "epoch": 2.4108668253745975, "grad_norm": 0.14140330255031586, "learning_rate": 9.65e-06, "loss": 0.8898, "step": 1614 }, { "epoch": 2.412360547075573, "grad_norm": 0.13791531324386597, "learning_rate": 9.625e-06, "loss": 0.8135, "step": 1615 }, { "epoch": 2.4138542687765487, "grad_norm": 0.13639304041862488, "learning_rate": 9.600000000000001e-06, "loss": 0.8703, "step": 1616 }, { "epoch": 2.415347990477524, "grad_norm": 0.13722801208496094, "learning_rate": 9.575e-06, "loss": 0.8213, "step": 1617 }, { "epoch": 2.4168417121785, "grad_norm": 0.1370147317647934, "learning_rate": 9.55e-06, "loss": 0.9454, "step": 1618 }, { "epoch": 2.4183354338794754, "grad_norm": 0.13090604543685913, "learning_rate": 9.525000000000001e-06, "loss": 0.7894, "step": 1619 }, { "epoch": 2.4198291555804508, "grad_norm": 0.1484290510416031, "learning_rate": 9.5e-06, "loss": 0.998, "step": 1620 }, { "epoch": 2.4213228772814266, "grad_norm": 0.1358598917722702, "learning_rate": 9.475e-06, "loss": 0.7302, "step": 1621 }, { "epoch": 2.422816598982402, "grad_norm": 0.1421786993741989, "learning_rate": 9.450000000000001e-06, "loss": 0.7857, "step": 1622 }, { "epoch": 2.424310320683378, "grad_norm": 0.12079962342977524, "learning_rate": 9.425e-06, "loss": 0.7308, "step": 1623 }, { "epoch": 2.425804042384353, "grad_norm": 0.14012330770492554, "learning_rate": 9.4e-06, "loss": 0.8219, "step": 1624 }, { "epoch": 2.427297764085329, "grad_norm": 0.12991274893283844, "learning_rate": 9.375000000000001e-06, "loss": 0.8416, "step": 1625 }, { "epoch": 2.4287914857863044, "grad_norm": 0.12285028398036957, "learning_rate": 9.35e-06, "loss": 0.8451, "step": 1626 }, { "epoch": 2.4302852074872803, "grad_norm": 0.13463440537452698, "learning_rate": 9.325e-06, "loss": 0.8289, "step": 1627 }, { "epoch": 2.4317789291882557, "grad_norm": 0.1391773521900177, "learning_rate": 9.3e-06, "loss": 0.7759, "step": 1628 }, { "epoch": 2.433272650889231, "grad_norm": 0.12777116894721985, "learning_rate": 9.275e-06, "loss": 0.7987, "step": 1629 }, { "epoch": 2.434766372590207, "grad_norm": 0.13660910725593567, "learning_rate": 9.25e-06, "loss": 0.8515, "step": 1630 }, { "epoch": 2.4362600942911823, "grad_norm": 0.12964710593223572, "learning_rate": 9.225e-06, "loss": 0.7574, "step": 1631 }, { "epoch": 2.437753815992158, "grad_norm": 0.12837424874305725, "learning_rate": 9.2e-06, "loss": 0.8747, "step": 1632 }, { "epoch": 2.4392475376931335, "grad_norm": 0.1291448026895523, "learning_rate": 9.175000000000001e-06, "loss": 0.8271, "step": 1633 }, { "epoch": 2.440741259394109, "grad_norm": 0.1409589648246765, "learning_rate": 9.15e-06, "loss": 0.9421, "step": 1634 }, { "epoch": 2.4422349810950847, "grad_norm": 0.13819751143455505, "learning_rate": 9.125e-06, "loss": 0.7814, "step": 1635 }, { "epoch": 2.44372870279606, "grad_norm": 0.13469266891479492, "learning_rate": 9.100000000000001e-06, "loss": 0.8556, "step": 1636 }, { "epoch": 2.445222424497036, "grad_norm": 0.12626972794532776, "learning_rate": 9.075e-06, "loss": 0.7909, "step": 1637 }, { "epoch": 2.4467161461980114, "grad_norm": 0.12042160332202911, "learning_rate": 9.05e-06, "loss": 0.7127, "step": 1638 }, { "epoch": 2.448209867898987, "grad_norm": 0.13929767906665802, "learning_rate": 9.025e-06, "loss": 0.662, "step": 1639 }, { "epoch": 2.4497035895999626, "grad_norm": 0.12142099440097809, "learning_rate": 9e-06, "loss": 0.7227, "step": 1640 }, { "epoch": 2.4511973113009384, "grad_norm": 0.12549759447574615, "learning_rate": 8.975e-06, "loss": 0.7692, "step": 1641 }, { "epoch": 2.452691033001914, "grad_norm": 0.130094513297081, "learning_rate": 8.95e-06, "loss": 0.8099, "step": 1642 }, { "epoch": 2.454184754702889, "grad_norm": 0.14137783646583557, "learning_rate": 8.925e-06, "loss": 0.8656, "step": 1643 }, { "epoch": 2.455678476403865, "grad_norm": 0.1304675042629242, "learning_rate": 8.9e-06, "loss": 0.709, "step": 1644 }, { "epoch": 2.4571721981048404, "grad_norm": 0.1303737759590149, "learning_rate": 8.875e-06, "loss": 0.8724, "step": 1645 }, { "epoch": 2.4586659198058163, "grad_norm": 0.12621229887008667, "learning_rate": 8.85e-06, "loss": 0.7967, "step": 1646 }, { "epoch": 2.4601596415067917, "grad_norm": 0.14707183837890625, "learning_rate": 8.825e-06, "loss": 0.9009, "step": 1647 }, { "epoch": 2.4616533632077675, "grad_norm": 0.13802896440029144, "learning_rate": 8.8e-06, "loss": 0.8925, "step": 1648 }, { "epoch": 2.463147084908743, "grad_norm": 0.14698544144630432, "learning_rate": 8.775e-06, "loss": 0.9778, "step": 1649 }, { "epoch": 2.4646408066097187, "grad_norm": 0.12345429509878159, "learning_rate": 8.75e-06, "loss": 0.7952, "step": 1650 }, { "epoch": 2.466134528310694, "grad_norm": 0.143492192029953, "learning_rate": 8.725e-06, "loss": 0.8299, "step": 1651 }, { "epoch": 2.4676282500116695, "grad_norm": 0.11622001230716705, "learning_rate": 8.7e-06, "loss": 0.6952, "step": 1652 }, { "epoch": 2.4691219717126454, "grad_norm": 0.13288253545761108, "learning_rate": 8.674999999999999e-06, "loss": 0.8428, "step": 1653 }, { "epoch": 2.4706156934136208, "grad_norm": 0.12536628544330597, "learning_rate": 8.65e-06, "loss": 0.7772, "step": 1654 }, { "epoch": 2.4721094151145966, "grad_norm": 0.14896896481513977, "learning_rate": 8.625e-06, "loss": 0.9448, "step": 1655 }, { "epoch": 2.473603136815572, "grad_norm": 0.13216160237789154, "learning_rate": 8.599999999999999e-06, "loss": 0.6263, "step": 1656 }, { "epoch": 2.475096858516548, "grad_norm": 0.13372622430324554, "learning_rate": 8.575000000000002e-06, "loss": 0.8791, "step": 1657 }, { "epoch": 2.476590580217523, "grad_norm": 0.13719454407691956, "learning_rate": 8.550000000000001e-06, "loss": 0.914, "step": 1658 }, { "epoch": 2.478084301918499, "grad_norm": 0.13118299841880798, "learning_rate": 8.525e-06, "loss": 0.8274, "step": 1659 }, { "epoch": 2.4795780236194744, "grad_norm": 0.1369413584470749, "learning_rate": 8.500000000000002e-06, "loss": 0.8394, "step": 1660 }, { "epoch": 2.48107174532045, "grad_norm": 0.14312155544757843, "learning_rate": 8.475000000000001e-06, "loss": 0.7449, "step": 1661 }, { "epoch": 2.4825654670214257, "grad_norm": 0.1280394345521927, "learning_rate": 8.45e-06, "loss": 0.8357, "step": 1662 }, { "epoch": 2.484059188722401, "grad_norm": 0.13753177225589752, "learning_rate": 8.425000000000001e-06, "loss": 0.769, "step": 1663 }, { "epoch": 2.485552910423377, "grad_norm": 0.13042978942394257, "learning_rate": 8.400000000000001e-06, "loss": 0.7611, "step": 1664 }, { "epoch": 2.4870466321243523, "grad_norm": 0.13273215293884277, "learning_rate": 8.375e-06, "loss": 0.8468, "step": 1665 }, { "epoch": 2.4885403538253277, "grad_norm": 0.14755016565322876, "learning_rate": 8.350000000000001e-06, "loss": 0.8742, "step": 1666 }, { "epoch": 2.4900340755263035, "grad_norm": 0.13268254697322845, "learning_rate": 8.325e-06, "loss": 0.8521, "step": 1667 }, { "epoch": 2.491527797227279, "grad_norm": 0.134341761469841, "learning_rate": 8.3e-06, "loss": 0.8061, "step": 1668 }, { "epoch": 2.4930215189282547, "grad_norm": 0.13497136533260345, "learning_rate": 8.275000000000001e-06, "loss": 0.8899, "step": 1669 }, { "epoch": 2.49451524062923, "grad_norm": 0.1420523077249527, "learning_rate": 8.25e-06, "loss": 0.9462, "step": 1670 }, { "epoch": 2.496008962330206, "grad_norm": 0.139367014169693, "learning_rate": 8.225e-06, "loss": 0.8961, "step": 1671 }, { "epoch": 2.4975026840311814, "grad_norm": 0.13517650961875916, "learning_rate": 8.200000000000001e-06, "loss": 0.8924, "step": 1672 }, { "epoch": 2.498996405732157, "grad_norm": 0.135527104139328, "learning_rate": 8.175e-06, "loss": 0.9396, "step": 1673 }, { "epoch": 2.5004901274331326, "grad_norm": 0.13519176840782166, "learning_rate": 8.15e-06, "loss": 0.8864, "step": 1674 }, { "epoch": 2.501983849134108, "grad_norm": 0.12942421436309814, "learning_rate": 8.125000000000001e-06, "loss": 0.8642, "step": 1675 }, { "epoch": 2.503477570835084, "grad_norm": 0.131195530295372, "learning_rate": 8.1e-06, "loss": 0.8517, "step": 1676 }, { "epoch": 2.504971292536059, "grad_norm": 0.12647759914398193, "learning_rate": 8.075000000000001e-06, "loss": 0.8562, "step": 1677 }, { "epoch": 2.506465014237035, "grad_norm": 0.12854419648647308, "learning_rate": 8.050000000000001e-06, "loss": 0.7768, "step": 1678 }, { "epoch": 2.5079587359380104, "grad_norm": 0.1449342519044876, "learning_rate": 8.025e-06, "loss": 0.9247, "step": 1679 }, { "epoch": 2.5094524576389863, "grad_norm": 0.14190371334552765, "learning_rate": 8.000000000000001e-06, "loss": 0.9746, "step": 1680 }, { "epoch": 2.5109461793399617, "grad_norm": 0.1388547271490097, "learning_rate": 7.975e-06, "loss": 0.953, "step": 1681 }, { "epoch": 2.5124399010409375, "grad_norm": 0.1378171592950821, "learning_rate": 7.95e-06, "loss": 0.8855, "step": 1682 }, { "epoch": 2.513933622741913, "grad_norm": 0.13029062747955322, "learning_rate": 7.925000000000001e-06, "loss": 0.7702, "step": 1683 }, { "epoch": 2.5154273444428883, "grad_norm": 0.12645050883293152, "learning_rate": 7.9e-06, "loss": 0.8024, "step": 1684 }, { "epoch": 2.516921066143864, "grad_norm": 0.1293843537569046, "learning_rate": 7.875e-06, "loss": 0.835, "step": 1685 }, { "epoch": 2.5184147878448395, "grad_norm": 0.13059695065021515, "learning_rate": 7.850000000000001e-06, "loss": 0.8229, "step": 1686 }, { "epoch": 2.5199085095458154, "grad_norm": 0.12498169392347336, "learning_rate": 7.825e-06, "loss": 0.8576, "step": 1687 }, { "epoch": 2.5214022312467907, "grad_norm": 0.13808417320251465, "learning_rate": 7.8e-06, "loss": 0.7646, "step": 1688 }, { "epoch": 2.5228959529477666, "grad_norm": 0.14361457526683807, "learning_rate": 7.775000000000001e-06, "loss": 0.8825, "step": 1689 }, { "epoch": 2.524389674648742, "grad_norm": 0.13564848899841309, "learning_rate": 7.75e-06, "loss": 0.8869, "step": 1690 }, { "epoch": 2.525883396349718, "grad_norm": 0.15011367201805115, "learning_rate": 7.725e-06, "loss": 0.8305, "step": 1691 }, { "epoch": 2.527377118050693, "grad_norm": 0.13280567526817322, "learning_rate": 7.7e-06, "loss": 0.8411, "step": 1692 }, { "epoch": 2.5288708397516686, "grad_norm": 0.1329461634159088, "learning_rate": 7.675e-06, "loss": 0.937, "step": 1693 }, { "epoch": 2.5303645614526444, "grad_norm": 0.12553295493125916, "learning_rate": 7.65e-06, "loss": 0.6977, "step": 1694 }, { "epoch": 2.53185828315362, "grad_norm": 0.1183817908167839, "learning_rate": 7.625e-06, "loss": 0.8277, "step": 1695 }, { "epoch": 2.5333520048545957, "grad_norm": 0.13988672196865082, "learning_rate": 7.6e-06, "loss": 0.9121, "step": 1696 }, { "epoch": 2.534845726555571, "grad_norm": 0.1428712159395218, "learning_rate": 7.575e-06, "loss": 0.8491, "step": 1697 }, { "epoch": 2.5363394482565464, "grad_norm": 0.13072678446769714, "learning_rate": 7.55e-06, "loss": 0.8261, "step": 1698 }, { "epoch": 2.5378331699575223, "grad_norm": 0.12075422704219818, "learning_rate": 7.525e-06, "loss": 0.7251, "step": 1699 }, { "epoch": 2.539326891658498, "grad_norm": 0.13287901878356934, "learning_rate": 7.5e-06, "loss": 0.7844, "step": 1700 }, { "epoch": 2.5408206133594735, "grad_norm": 0.1310594528913498, "learning_rate": 7.4750000000000004e-06, "loss": 0.7479, "step": 1701 }, { "epoch": 2.542314335060449, "grad_norm": 0.14920195937156677, "learning_rate": 7.45e-06, "loss": 0.9002, "step": 1702 }, { "epoch": 2.5438080567614247, "grad_norm": 0.12811461091041565, "learning_rate": 7.425e-06, "loss": 0.8388, "step": 1703 }, { "epoch": 2.5453017784624, "grad_norm": 0.13412749767303467, "learning_rate": 7.4e-06, "loss": 0.8406, "step": 1704 }, { "epoch": 2.546795500163376, "grad_norm": 0.1403205394744873, "learning_rate": 7.375e-06, "loss": 0.802, "step": 1705 }, { "epoch": 2.5482892218643514, "grad_norm": 0.12584206461906433, "learning_rate": 7.35e-06, "loss": 0.7792, "step": 1706 }, { "epoch": 2.5497829435653268, "grad_norm": 0.13781550526618958, "learning_rate": 7.325e-06, "loss": 0.7875, "step": 1707 }, { "epoch": 2.5512766652663026, "grad_norm": 0.13393403589725494, "learning_rate": 7.2999999999999996e-06, "loss": 0.822, "step": 1708 }, { "epoch": 2.5527703869672784, "grad_norm": 0.12788349390029907, "learning_rate": 7.275e-06, "loss": 0.708, "step": 1709 }, { "epoch": 2.554264108668254, "grad_norm": 0.12759651243686676, "learning_rate": 7.25e-06, "loss": 0.8469, "step": 1710 }, { "epoch": 2.555757830369229, "grad_norm": 0.13573463261127472, "learning_rate": 7.2249999999999994e-06, "loss": 0.7561, "step": 1711 }, { "epoch": 2.557251552070205, "grad_norm": 0.13102664053440094, "learning_rate": 7.2e-06, "loss": 0.7455, "step": 1712 }, { "epoch": 2.5587452737711804, "grad_norm": 0.13038717210292816, "learning_rate": 7.175e-06, "loss": 0.8092, "step": 1713 }, { "epoch": 2.5602389954721563, "grad_norm": 0.14736557006835938, "learning_rate": 7.15e-06, "loss": 0.8627, "step": 1714 }, { "epoch": 2.5617327171731317, "grad_norm": 0.13350047171115875, "learning_rate": 7.1249999999999995e-06, "loss": 0.9436, "step": 1715 }, { "epoch": 2.563226438874107, "grad_norm": 0.1417786031961441, "learning_rate": 7.1e-06, "loss": 0.8362, "step": 1716 }, { "epoch": 2.564720160575083, "grad_norm": 0.1260395497083664, "learning_rate": 7.075e-06, "loss": 0.81, "step": 1717 }, { "epoch": 2.5662138822760583, "grad_norm": 0.13158594071865082, "learning_rate": 7.049999999999999e-06, "loss": 0.841, "step": 1718 }, { "epoch": 2.567707603977034, "grad_norm": 0.12901322543621063, "learning_rate": 7.025000000000001e-06, "loss": 0.78, "step": 1719 }, { "epoch": 2.5692013256780095, "grad_norm": 0.1355431079864502, "learning_rate": 7.000000000000001e-06, "loss": 0.839, "step": 1720 }, { "epoch": 2.5706950473789854, "grad_norm": 0.1427827924489975, "learning_rate": 6.975000000000001e-06, "loss": 0.8659, "step": 1721 }, { "epoch": 2.5721887690799607, "grad_norm": 0.14304494857788086, "learning_rate": 6.950000000000001e-06, "loss": 0.8723, "step": 1722 }, { "epoch": 2.5736824907809366, "grad_norm": 0.1485297828912735, "learning_rate": 6.925000000000001e-06, "loss": 0.9697, "step": 1723 }, { "epoch": 2.575176212481912, "grad_norm": 0.13630133867263794, "learning_rate": 6.900000000000001e-06, "loss": 0.8009, "step": 1724 }, { "epoch": 2.5766699341828874, "grad_norm": 0.14172403514385223, "learning_rate": 6.875000000000001e-06, "loss": 0.8572, "step": 1725 }, { "epoch": 2.578163655883863, "grad_norm": 0.13798026740550995, "learning_rate": 6.8500000000000005e-06, "loss": 0.8649, "step": 1726 }, { "epoch": 2.5796573775848386, "grad_norm": 0.12402907013893127, "learning_rate": 6.825000000000001e-06, "loss": 0.6778, "step": 1727 }, { "epoch": 2.5811510992858144, "grad_norm": 0.15164606273174286, "learning_rate": 6.800000000000001e-06, "loss": 0.9138, "step": 1728 }, { "epoch": 2.58264482098679, "grad_norm": 0.13195770978927612, "learning_rate": 6.775000000000001e-06, "loss": 0.8944, "step": 1729 }, { "epoch": 2.584138542687765, "grad_norm": 0.14242012798786163, "learning_rate": 6.750000000000001e-06, "loss": 0.8314, "step": 1730 }, { "epoch": 2.585632264388741, "grad_norm": 0.13847272098064423, "learning_rate": 6.725000000000001e-06, "loss": 0.9559, "step": 1731 }, { "epoch": 2.587125986089717, "grad_norm": 0.14341412484645844, "learning_rate": 6.700000000000001e-06, "loss": 0.9169, "step": 1732 }, { "epoch": 2.5886197077906923, "grad_norm": 0.12358768284320831, "learning_rate": 6.6750000000000005e-06, "loss": 0.7339, "step": 1733 }, { "epoch": 2.5901134294916677, "grad_norm": 0.12449809908866882, "learning_rate": 6.650000000000001e-06, "loss": 0.7669, "step": 1734 }, { "epoch": 2.5916071511926435, "grad_norm": 0.13108479976654053, "learning_rate": 6.625000000000001e-06, "loss": 0.8894, "step": 1735 }, { "epoch": 2.593100872893619, "grad_norm": 0.13441932201385498, "learning_rate": 6.6e-06, "loss": 0.9135, "step": 1736 }, { "epoch": 2.5945945945945947, "grad_norm": 0.13525666296482086, "learning_rate": 6.5750000000000006e-06, "loss": 0.7843, "step": 1737 }, { "epoch": 2.59608831629557, "grad_norm": 0.14019419252872467, "learning_rate": 6.550000000000001e-06, "loss": 0.9218, "step": 1738 }, { "epoch": 2.5975820379965455, "grad_norm": 0.1342540830373764, "learning_rate": 6.525e-06, "loss": 0.8612, "step": 1739 }, { "epoch": 2.5990757596975214, "grad_norm": 0.13906580209732056, "learning_rate": 6.5000000000000004e-06, "loss": 0.9276, "step": 1740 }, { "epoch": 2.600569481398497, "grad_norm": 0.12673482298851013, "learning_rate": 6.475000000000001e-06, "loss": 0.7992, "step": 1741 }, { "epoch": 2.6020632030994726, "grad_norm": 0.1329709142446518, "learning_rate": 6.45e-06, "loss": 0.755, "step": 1742 }, { "epoch": 2.603556924800448, "grad_norm": 0.12654058635234833, "learning_rate": 6.425e-06, "loss": 0.7582, "step": 1743 }, { "epoch": 2.605050646501424, "grad_norm": 0.15160240232944489, "learning_rate": 6.4000000000000006e-06, "loss": 0.9387, "step": 1744 }, { "epoch": 2.606544368202399, "grad_norm": 0.12987355887889862, "learning_rate": 6.375000000000001e-06, "loss": 0.8233, "step": 1745 }, { "epoch": 2.608038089903375, "grad_norm": 0.1493702232837677, "learning_rate": 6.35e-06, "loss": 0.8015, "step": 1746 }, { "epoch": 2.6095318116043504, "grad_norm": 0.13637439906597137, "learning_rate": 6.3250000000000004e-06, "loss": 0.8958, "step": 1747 }, { "epoch": 2.611025533305326, "grad_norm": 0.13375301659107208, "learning_rate": 6.300000000000001e-06, "loss": 0.8976, "step": 1748 }, { "epoch": 2.6125192550063017, "grad_norm": 0.13253824412822723, "learning_rate": 6.275e-06, "loss": 0.9442, "step": 1749 }, { "epoch": 2.6140129767072775, "grad_norm": 0.13268591463565826, "learning_rate": 6.25e-06, "loss": 0.8652, "step": 1750 }, { "epoch": 2.615506698408253, "grad_norm": 0.12633271515369415, "learning_rate": 6.2250000000000005e-06, "loss": 0.7085, "step": 1751 }, { "epoch": 2.6170004201092283, "grad_norm": 0.1329997181892395, "learning_rate": 6.2e-06, "loss": 0.8471, "step": 1752 }, { "epoch": 2.618494141810204, "grad_norm": 0.13599537312984467, "learning_rate": 6.175e-06, "loss": 0.8263, "step": 1753 }, { "epoch": 2.6199878635111795, "grad_norm": 0.1285291165113449, "learning_rate": 6.15e-06, "loss": 0.7685, "step": 1754 }, { "epoch": 2.6214815852121554, "grad_norm": 0.1368568390607834, "learning_rate": 6.125e-06, "loss": 0.903, "step": 1755 }, { "epoch": 2.6229753069131307, "grad_norm": 0.1353880912065506, "learning_rate": 6.1e-06, "loss": 0.8548, "step": 1756 }, { "epoch": 2.624469028614106, "grad_norm": 0.1320316046476364, "learning_rate": 6.075e-06, "loss": 0.8185, "step": 1757 }, { "epoch": 2.625962750315082, "grad_norm": 0.13284337520599365, "learning_rate": 6.0500000000000005e-06, "loss": 0.7286, "step": 1758 }, { "epoch": 2.6274564720160574, "grad_norm": 0.13002096116542816, "learning_rate": 6.025e-06, "loss": 0.8219, "step": 1759 }, { "epoch": 2.628950193717033, "grad_norm": 0.14487500488758087, "learning_rate": 6e-06, "loss": 0.9014, "step": 1760 }, { "epoch": 2.6304439154180086, "grad_norm": 0.14321301877498627, "learning_rate": 5.975e-06, "loss": 0.8515, "step": 1761 }, { "epoch": 2.6319376371189844, "grad_norm": 0.1415593922138214, "learning_rate": 5.95e-06, "loss": 0.9285, "step": 1762 }, { "epoch": 2.63343135881996, "grad_norm": 0.14804531633853912, "learning_rate": 5.925e-06, "loss": 0.9141, "step": 1763 }, { "epoch": 2.6349250805209357, "grad_norm": 0.1371275782585144, "learning_rate": 5.9e-06, "loss": 0.7887, "step": 1764 }, { "epoch": 2.636418802221911, "grad_norm": 0.12895362079143524, "learning_rate": 5.875e-06, "loss": 0.8227, "step": 1765 }, { "epoch": 2.6379125239228864, "grad_norm": 0.14142970740795135, "learning_rate": 5.850000000000001e-06, "loss": 0.9123, "step": 1766 }, { "epoch": 2.6394062456238623, "grad_norm": 0.1487559825181961, "learning_rate": 5.825000000000001e-06, "loss": 0.8071, "step": 1767 }, { "epoch": 2.6408999673248377, "grad_norm": 0.12963686883449554, "learning_rate": 5.8e-06, "loss": 0.7834, "step": 1768 }, { "epoch": 2.6423936890258135, "grad_norm": 0.14351999759674072, "learning_rate": 5.775000000000001e-06, "loss": 0.6836, "step": 1769 }, { "epoch": 2.643887410726789, "grad_norm": 0.13228961825370789, "learning_rate": 5.750000000000001e-06, "loss": 0.8056, "step": 1770 }, { "epoch": 2.6453811324277643, "grad_norm": 0.12551921606063843, "learning_rate": 5.725e-06, "loss": 0.8065, "step": 1771 }, { "epoch": 2.64687485412874, "grad_norm": 0.13224585354328156, "learning_rate": 5.7000000000000005e-06, "loss": 0.788, "step": 1772 }, { "epoch": 2.648368575829716, "grad_norm": 0.16082735359668732, "learning_rate": 5.675000000000001e-06, "loss": 0.8534, "step": 1773 }, { "epoch": 2.6498622975306914, "grad_norm": 0.1378648430109024, "learning_rate": 5.65e-06, "loss": 0.8252, "step": 1774 }, { "epoch": 2.6513560192316667, "grad_norm": 0.13810893893241882, "learning_rate": 5.625e-06, "loss": 0.8488, "step": 1775 }, { "epoch": 2.6528497409326426, "grad_norm": 0.12280350923538208, "learning_rate": 5.600000000000001e-06, "loss": 0.7969, "step": 1776 }, { "epoch": 2.654343462633618, "grad_norm": 0.12407539784908295, "learning_rate": 5.575e-06, "loss": 0.8173, "step": 1777 }, { "epoch": 2.655837184334594, "grad_norm": 0.14103572070598602, "learning_rate": 5.55e-06, "loss": 0.8975, "step": 1778 }, { "epoch": 2.657330906035569, "grad_norm": 0.1294793039560318, "learning_rate": 5.5250000000000005e-06, "loss": 0.8291, "step": 1779 }, { "epoch": 2.6588246277365446, "grad_norm": 0.13135084509849548, "learning_rate": 5.500000000000001e-06, "loss": 0.9005, "step": 1780 }, { "epoch": 2.6603183494375204, "grad_norm": 0.12542477250099182, "learning_rate": 5.475e-06, "loss": 0.7856, "step": 1781 }, { "epoch": 2.6618120711384963, "grad_norm": 0.13214416801929474, "learning_rate": 5.45e-06, "loss": 0.7726, "step": 1782 }, { "epoch": 2.6633057928394717, "grad_norm": 0.16382290422916412, "learning_rate": 5.4250000000000006e-06, "loss": 0.9307, "step": 1783 }, { "epoch": 2.664799514540447, "grad_norm": 0.12743014097213745, "learning_rate": 5.4e-06, "loss": 0.7938, "step": 1784 }, { "epoch": 2.666293236241423, "grad_norm": 0.134892076253891, "learning_rate": 5.375e-06, "loss": 0.8037, "step": 1785 }, { "epoch": 2.6677869579423983, "grad_norm": 0.1286378651857376, "learning_rate": 5.3500000000000004e-06, "loss": 0.8705, "step": 1786 }, { "epoch": 2.669280679643374, "grad_norm": 0.1305590718984604, "learning_rate": 5.325e-06, "loss": 0.7629, "step": 1787 }, { "epoch": 2.6707744013443495, "grad_norm": 0.13347311317920685, "learning_rate": 5.3e-06, "loss": 0.8426, "step": 1788 }, { "epoch": 2.672268123045325, "grad_norm": 0.13337348401546478, "learning_rate": 5.275e-06, "loss": 0.7885, "step": 1789 }, { "epoch": 2.6737618447463007, "grad_norm": 0.1362558752298355, "learning_rate": 5.25e-06, "loss": 0.7939, "step": 1790 }, { "epoch": 2.675255566447276, "grad_norm": 0.14057612419128418, "learning_rate": 5.225e-06, "loss": 0.925, "step": 1791 }, { "epoch": 2.676749288148252, "grad_norm": 0.1408485621213913, "learning_rate": 5.2e-06, "loss": 0.8119, "step": 1792 }, { "epoch": 2.6782430098492274, "grad_norm": 0.12870322167873383, "learning_rate": 5.175e-06, "loss": 0.7873, "step": 1793 }, { "epoch": 2.679736731550203, "grad_norm": 0.13328923285007477, "learning_rate": 5.15e-06, "loss": 0.8716, "step": 1794 }, { "epoch": 2.6812304532511786, "grad_norm": 0.1331106424331665, "learning_rate": 5.125e-06, "loss": 0.9096, "step": 1795 }, { "epoch": 2.6827241749521544, "grad_norm": 0.12948597967624664, "learning_rate": 5.1e-06, "loss": 0.7791, "step": 1796 }, { "epoch": 2.68421789665313, "grad_norm": 0.1256321668624878, "learning_rate": 5.0750000000000005e-06, "loss": 0.7796, "step": 1797 }, { "epoch": 2.685711618354105, "grad_norm": 0.1406431943178177, "learning_rate": 5.050000000000001e-06, "loss": 0.7643, "step": 1798 }, { "epoch": 2.687205340055081, "grad_norm": 0.13756687939167023, "learning_rate": 5.025e-06, "loss": 0.8564, "step": 1799 }, { "epoch": 2.6886990617560564, "grad_norm": 0.14057467877864838, "learning_rate": 5e-06, "loss": 0.8774, "step": 1800 }, { "epoch": 2.6901927834570323, "grad_norm": 0.12428513169288635, "learning_rate": 4.975000000000001e-06, "loss": 0.7608, "step": 1801 }, { "epoch": 2.6916865051580077, "grad_norm": 0.12221617996692657, "learning_rate": 4.950000000000001e-06, "loss": 0.6653, "step": 1802 }, { "epoch": 2.693180226858983, "grad_norm": 0.13842907547950745, "learning_rate": 4.925e-06, "loss": 0.7913, "step": 1803 }, { "epoch": 2.694673948559959, "grad_norm": 0.12830296158790588, "learning_rate": 4.9000000000000005e-06, "loss": 0.6992, "step": 1804 }, { "epoch": 2.6961676702609347, "grad_norm": 0.13095708191394806, "learning_rate": 4.875000000000001e-06, "loss": 0.8443, "step": 1805 }, { "epoch": 2.69766139196191, "grad_norm": 0.1387687623500824, "learning_rate": 4.85e-06, "loss": 0.7664, "step": 1806 }, { "epoch": 2.6991551136628855, "grad_norm": 0.1322169452905655, "learning_rate": 4.825e-06, "loss": 0.8103, "step": 1807 }, { "epoch": 2.7006488353638614, "grad_norm": 0.12232271581888199, "learning_rate": 4.800000000000001e-06, "loss": 0.6559, "step": 1808 }, { "epoch": 2.7021425570648367, "grad_norm": 0.14776401221752167, "learning_rate": 4.775e-06, "loss": 0.8919, "step": 1809 }, { "epoch": 2.7036362787658126, "grad_norm": 0.1448414921760559, "learning_rate": 4.75e-06, "loss": 0.8053, "step": 1810 }, { "epoch": 2.705130000466788, "grad_norm": 0.13454516232013702, "learning_rate": 4.7250000000000005e-06, "loss": 0.9528, "step": 1811 }, { "epoch": 2.7066237221677634, "grad_norm": 0.12588457763195038, "learning_rate": 4.7e-06, "loss": 0.7691, "step": 1812 }, { "epoch": 2.708117443868739, "grad_norm": 0.1592012345790863, "learning_rate": 4.675e-06, "loss": 0.9547, "step": 1813 }, { "epoch": 2.709611165569715, "grad_norm": 0.12324593216180801, "learning_rate": 4.65e-06, "loss": 0.7282, "step": 1814 }, { "epoch": 2.7111048872706904, "grad_norm": 0.13930857181549072, "learning_rate": 4.625e-06, "loss": 0.9444, "step": 1815 }, { "epoch": 2.712598608971666, "grad_norm": 0.13926108181476593, "learning_rate": 4.6e-06, "loss": 0.8136, "step": 1816 }, { "epoch": 2.7140923306726417, "grad_norm": 0.1290612369775772, "learning_rate": 4.575e-06, "loss": 0.6821, "step": 1817 }, { "epoch": 2.715586052373617, "grad_norm": 0.13707637786865234, "learning_rate": 4.5500000000000005e-06, "loss": 0.8348, "step": 1818 }, { "epoch": 2.717079774074593, "grad_norm": 0.13621026277542114, "learning_rate": 4.525e-06, "loss": 0.7954, "step": 1819 }, { "epoch": 2.7185734957755683, "grad_norm": 0.137715145945549, "learning_rate": 4.5e-06, "loss": 0.7873, "step": 1820 }, { "epoch": 2.7200672174765437, "grad_norm": 0.13955800235271454, "learning_rate": 4.475e-06, "loss": 1.0121, "step": 1821 }, { "epoch": 2.7215609391775195, "grad_norm": 0.14451739192008972, "learning_rate": 4.45e-06, "loss": 0.8116, "step": 1822 }, { "epoch": 2.7230546608784953, "grad_norm": 0.12721778452396393, "learning_rate": 4.425e-06, "loss": 0.7922, "step": 1823 }, { "epoch": 2.7245483825794707, "grad_norm": 0.13402287662029266, "learning_rate": 4.4e-06, "loss": 0.8029, "step": 1824 }, { "epoch": 2.726042104280446, "grad_norm": 0.1325477957725525, "learning_rate": 4.375e-06, "loss": 0.8315, "step": 1825 }, { "epoch": 2.727535825981422, "grad_norm": 0.13294009864330292, "learning_rate": 4.35e-06, "loss": 0.8104, "step": 1826 }, { "epoch": 2.7290295476823974, "grad_norm": 0.12543267011642456, "learning_rate": 4.325e-06, "loss": 0.8102, "step": 1827 }, { "epoch": 2.730523269383373, "grad_norm": 0.13892562687397003, "learning_rate": 4.2999999999999995e-06, "loss": 0.8258, "step": 1828 }, { "epoch": 2.7320169910843486, "grad_norm": 0.1353042721748352, "learning_rate": 4.2750000000000006e-06, "loss": 0.824, "step": 1829 }, { "epoch": 2.733510712785324, "grad_norm": 0.14181531965732574, "learning_rate": 4.250000000000001e-06, "loss": 0.8472, "step": 1830 }, { "epoch": 2.7350044344863, "grad_norm": 0.12989366054534912, "learning_rate": 4.225e-06, "loss": 0.7487, "step": 1831 }, { "epoch": 2.736498156187275, "grad_norm": 0.14948944747447968, "learning_rate": 4.2000000000000004e-06, "loss": 0.8817, "step": 1832 }, { "epoch": 2.737991877888251, "grad_norm": 0.1324087530374527, "learning_rate": 4.175000000000001e-06, "loss": 0.7819, "step": 1833 }, { "epoch": 2.7394855995892264, "grad_norm": 0.13186846673488617, "learning_rate": 4.15e-06, "loss": 0.8095, "step": 1834 }, { "epoch": 2.7409793212902023, "grad_norm": 0.13826900720596313, "learning_rate": 4.125e-06, "loss": 0.9313, "step": 1835 }, { "epoch": 2.7424730429911777, "grad_norm": 0.13519445061683655, "learning_rate": 4.1000000000000006e-06, "loss": 0.8621, "step": 1836 }, { "epoch": 2.7439667646921535, "grad_norm": 0.13476042449474335, "learning_rate": 4.075e-06, "loss": 0.8959, "step": 1837 }, { "epoch": 2.745460486393129, "grad_norm": 0.13646842539310455, "learning_rate": 4.05e-06, "loss": 0.9588, "step": 1838 }, { "epoch": 2.7469542080941043, "grad_norm": 0.1318429410457611, "learning_rate": 4.0250000000000004e-06, "loss": 0.8553, "step": 1839 }, { "epoch": 2.74844792979508, "grad_norm": 0.13644909858703613, "learning_rate": 4.000000000000001e-06, "loss": 0.9192, "step": 1840 }, { "epoch": 2.7499416514960555, "grad_norm": 0.1210256963968277, "learning_rate": 3.975e-06, "loss": 0.6603, "step": 1841 }, { "epoch": 2.7514353731970314, "grad_norm": 0.13633497059345245, "learning_rate": 3.95e-06, "loss": 0.824, "step": 1842 }, { "epoch": 2.7529290948980067, "grad_norm": 0.12807141244411469, "learning_rate": 3.9250000000000005e-06, "loss": 0.8327, "step": 1843 }, { "epoch": 2.754422816598982, "grad_norm": 0.13438399136066437, "learning_rate": 3.9e-06, "loss": 0.8967, "step": 1844 }, { "epoch": 2.755916538299958, "grad_norm": 0.13785484433174133, "learning_rate": 3.875e-06, "loss": 0.8375, "step": 1845 }, { "epoch": 2.757410260000934, "grad_norm": 0.13231568038463593, "learning_rate": 3.85e-06, "loss": 0.8437, "step": 1846 }, { "epoch": 2.758903981701909, "grad_norm": 0.1270337849855423, "learning_rate": 3.825e-06, "loss": 0.6876, "step": 1847 }, { "epoch": 2.7603977034028846, "grad_norm": 0.14716912806034088, "learning_rate": 3.8e-06, "loss": 0.7744, "step": 1848 }, { "epoch": 2.7618914251038604, "grad_norm": 0.14222261309623718, "learning_rate": 3.775e-06, "loss": 0.7405, "step": 1849 }, { "epoch": 2.763385146804836, "grad_norm": 0.13402339816093445, "learning_rate": 3.75e-06, "loss": 0.8852, "step": 1850 }, { "epoch": 2.7648788685058117, "grad_norm": 0.13895094394683838, "learning_rate": 3.725e-06, "loss": 0.8011, "step": 1851 }, { "epoch": 2.766372590206787, "grad_norm": 0.14416490495204926, "learning_rate": 3.7e-06, "loss": 0.8168, "step": 1852 }, { "epoch": 2.7678663119077624, "grad_norm": 0.12278428673744202, "learning_rate": 3.675e-06, "loss": 0.7503, "step": 1853 }, { "epoch": 2.7693600336087383, "grad_norm": 0.13074032962322235, "learning_rate": 3.6499999999999998e-06, "loss": 0.8374, "step": 1854 }, { "epoch": 2.770853755309714, "grad_norm": 0.13166756927967072, "learning_rate": 3.625e-06, "loss": 0.8697, "step": 1855 }, { "epoch": 2.7723474770106895, "grad_norm": 0.13269732892513275, "learning_rate": 3.6e-06, "loss": 0.8634, "step": 1856 }, { "epoch": 2.773841198711665, "grad_norm": 0.1259765475988388, "learning_rate": 3.575e-06, "loss": 0.7638, "step": 1857 }, { "epoch": 2.7753349204126407, "grad_norm": 0.13202543556690216, "learning_rate": 3.55e-06, "loss": 0.8539, "step": 1858 }, { "epoch": 2.776828642113616, "grad_norm": 0.12907275557518005, "learning_rate": 3.5249999999999997e-06, "loss": 0.7191, "step": 1859 }, { "epoch": 2.778322363814592, "grad_norm": 0.14748698472976685, "learning_rate": 3.5000000000000004e-06, "loss": 0.9987, "step": 1860 }, { "epoch": 2.7798160855155674, "grad_norm": 0.1488581746816635, "learning_rate": 3.4750000000000006e-06, "loss": 0.8912, "step": 1861 }, { "epoch": 2.7813098072165428, "grad_norm": 0.13347996771335602, "learning_rate": 3.4500000000000004e-06, "loss": 0.8231, "step": 1862 }, { "epoch": 2.7828035289175186, "grad_norm": 0.13739247620105743, "learning_rate": 3.4250000000000002e-06, "loss": 0.8701, "step": 1863 }, { "epoch": 2.784297250618494, "grad_norm": 0.12814565002918243, "learning_rate": 3.4000000000000005e-06, "loss": 0.8631, "step": 1864 }, { "epoch": 2.78579097231947, "grad_norm": 0.14584243297576904, "learning_rate": 3.3750000000000003e-06, "loss": 0.8322, "step": 1865 }, { "epoch": 2.787284694020445, "grad_norm": 0.12359193712472916, "learning_rate": 3.3500000000000005e-06, "loss": 0.7266, "step": 1866 }, { "epoch": 2.788778415721421, "grad_norm": 0.13280299305915833, "learning_rate": 3.3250000000000004e-06, "loss": 0.8232, "step": 1867 }, { "epoch": 2.7902721374223964, "grad_norm": 0.12858431041240692, "learning_rate": 3.3e-06, "loss": 0.776, "step": 1868 }, { "epoch": 2.7917658591233723, "grad_norm": 0.14832574129104614, "learning_rate": 3.2750000000000004e-06, "loss": 0.8475, "step": 1869 }, { "epoch": 2.7932595808243477, "grad_norm": 0.12968012690544128, "learning_rate": 3.2500000000000002e-06, "loss": 0.7524, "step": 1870 }, { "epoch": 2.794753302525323, "grad_norm": 0.13714861869812012, "learning_rate": 3.225e-06, "loss": 0.7884, "step": 1871 }, { "epoch": 2.796247024226299, "grad_norm": 0.1276467889547348, "learning_rate": 3.2000000000000003e-06, "loss": 0.8054, "step": 1872 }, { "epoch": 2.7977407459272743, "grad_norm": 0.1376863569021225, "learning_rate": 3.175e-06, "loss": 0.9044, "step": 1873 }, { "epoch": 2.79923446762825, "grad_norm": 0.14112500846385956, "learning_rate": 3.1500000000000003e-06, "loss": 0.8894, "step": 1874 }, { "epoch": 2.8007281893292255, "grad_norm": 0.1406603455543518, "learning_rate": 3.125e-06, "loss": 0.9462, "step": 1875 }, { "epoch": 2.8022219110302014, "grad_norm": 0.13707482814788818, "learning_rate": 3.1e-06, "loss": 0.8635, "step": 1876 }, { "epoch": 2.8037156327311767, "grad_norm": 0.14117224514484406, "learning_rate": 3.075e-06, "loss": 0.9941, "step": 1877 }, { "epoch": 2.8052093544321526, "grad_norm": 0.12427777051925659, "learning_rate": 3.05e-06, "loss": 0.6789, "step": 1878 }, { "epoch": 2.806703076133128, "grad_norm": 0.14062422513961792, "learning_rate": 3.0250000000000003e-06, "loss": 0.8065, "step": 1879 }, { "epoch": 2.8081967978341034, "grad_norm": 0.12801432609558105, "learning_rate": 3e-06, "loss": 0.8097, "step": 1880 }, { "epoch": 2.809690519535079, "grad_norm": 0.14395985007286072, "learning_rate": 2.975e-06, "loss": 0.976, "step": 1881 }, { "epoch": 2.8111842412360546, "grad_norm": 0.12981171905994415, "learning_rate": 2.95e-06, "loss": 0.8086, "step": 1882 }, { "epoch": 2.8126779629370304, "grad_norm": 0.12972977757453918, "learning_rate": 2.9250000000000004e-06, "loss": 0.7668, "step": 1883 }, { "epoch": 2.814171684638006, "grad_norm": 0.13736984133720398, "learning_rate": 2.9e-06, "loss": 0.8503, "step": 1884 }, { "epoch": 2.815665406338981, "grad_norm": 0.15451616048812866, "learning_rate": 2.8750000000000004e-06, "loss": 0.8456, "step": 1885 }, { "epoch": 2.817159128039957, "grad_norm": 0.12091310322284698, "learning_rate": 2.8500000000000002e-06, "loss": 0.7112, "step": 1886 }, { "epoch": 2.818652849740933, "grad_norm": 0.1309812217950821, "learning_rate": 2.825e-06, "loss": 0.8776, "step": 1887 }, { "epoch": 2.8201465714419083, "grad_norm": 0.1265869140625, "learning_rate": 2.8000000000000003e-06, "loss": 0.7902, "step": 1888 }, { "epoch": 2.8216402931428837, "grad_norm": 0.1340566873550415, "learning_rate": 2.775e-06, "loss": 0.8696, "step": 1889 }, { "epoch": 2.8231340148438595, "grad_norm": 0.13236786425113678, "learning_rate": 2.7500000000000004e-06, "loss": 0.8118, "step": 1890 }, { "epoch": 2.824627736544835, "grad_norm": 0.12906421720981598, "learning_rate": 2.725e-06, "loss": 0.8332, "step": 1891 }, { "epoch": 2.8261214582458107, "grad_norm": 0.14878374338150024, "learning_rate": 2.7e-06, "loss": 0.8829, "step": 1892 }, { "epoch": 2.827615179946786, "grad_norm": 0.1303490847349167, "learning_rate": 2.6750000000000002e-06, "loss": 0.8239, "step": 1893 }, { "epoch": 2.8291089016477615, "grad_norm": 0.13555185496807098, "learning_rate": 2.65e-06, "loss": 0.7708, "step": 1894 }, { "epoch": 2.8306026233487374, "grad_norm": 0.14033333957195282, "learning_rate": 2.625e-06, "loss": 0.8795, "step": 1895 }, { "epoch": 2.832096345049713, "grad_norm": 0.14355766773223877, "learning_rate": 2.6e-06, "loss": 0.9749, "step": 1896 }, { "epoch": 2.8335900667506886, "grad_norm": 0.12297162413597107, "learning_rate": 2.575e-06, "loss": 0.8227, "step": 1897 }, { "epoch": 2.835083788451664, "grad_norm": 0.1683724969625473, "learning_rate": 2.55e-06, "loss": 0.9988, "step": 1898 }, { "epoch": 2.83657751015264, "grad_norm": 0.13489475846290588, "learning_rate": 2.5250000000000004e-06, "loss": 0.7643, "step": 1899 }, { "epoch": 2.838071231853615, "grad_norm": 0.12544168531894684, "learning_rate": 2.5e-06, "loss": 0.7465, "step": 1900 }, { "epoch": 2.839564953554591, "grad_norm": 0.13414187729358673, "learning_rate": 2.4750000000000004e-06, "loss": 0.7038, "step": 1901 }, { "epoch": 2.8410586752555664, "grad_norm": 0.13224484026432037, "learning_rate": 2.4500000000000003e-06, "loss": 0.8226, "step": 1902 }, { "epoch": 2.842552396956542, "grad_norm": 0.18595905601978302, "learning_rate": 2.425e-06, "loss": 0.8574, "step": 1903 }, { "epoch": 2.8440461186575177, "grad_norm": 0.14025714993476868, "learning_rate": 2.4000000000000003e-06, "loss": 0.8657, "step": 1904 }, { "epoch": 2.845539840358493, "grad_norm": 0.13900399208068848, "learning_rate": 2.375e-06, "loss": 0.9354, "step": 1905 }, { "epoch": 2.847033562059469, "grad_norm": 0.13567961752414703, "learning_rate": 2.35e-06, "loss": 0.949, "step": 1906 }, { "epoch": 2.8485272837604443, "grad_norm": 0.13972541689872742, "learning_rate": 2.325e-06, "loss": 0.8588, "step": 1907 }, { "epoch": 2.85002100546142, "grad_norm": 0.13053269684314728, "learning_rate": 2.3e-06, "loss": 0.8081, "step": 1908 }, { "epoch": 2.8515147271623955, "grad_norm": 0.13018196821212769, "learning_rate": 2.2750000000000002e-06, "loss": 0.8285, "step": 1909 }, { "epoch": 2.8530084488633713, "grad_norm": 0.13185670971870422, "learning_rate": 2.25e-06, "loss": 0.8178, "step": 1910 }, { "epoch": 2.8545021705643467, "grad_norm": 0.1283489167690277, "learning_rate": 2.225e-06, "loss": 0.7469, "step": 1911 }, { "epoch": 2.855995892265322, "grad_norm": 0.14950460195541382, "learning_rate": 2.2e-06, "loss": 0.7991, "step": 1912 }, { "epoch": 2.857489613966298, "grad_norm": 0.13155660033226013, "learning_rate": 2.175e-06, "loss": 0.7538, "step": 1913 }, { "epoch": 2.8589833356672734, "grad_norm": 0.13807563483715057, "learning_rate": 2.1499999999999997e-06, "loss": 0.8601, "step": 1914 }, { "epoch": 2.860477057368249, "grad_norm": 0.12953098118305206, "learning_rate": 2.1250000000000004e-06, "loss": 0.8206, "step": 1915 }, { "epoch": 2.8619707790692246, "grad_norm": 0.11969473958015442, "learning_rate": 2.1000000000000002e-06, "loss": 0.7081, "step": 1916 }, { "epoch": 2.8634645007702, "grad_norm": 0.1373208612203598, "learning_rate": 2.075e-06, "loss": 0.8703, "step": 1917 }, { "epoch": 2.864958222471176, "grad_norm": 0.1290508508682251, "learning_rate": 2.0500000000000003e-06, "loss": 0.6939, "step": 1918 }, { "epoch": 2.8664519441721517, "grad_norm": 0.13451719284057617, "learning_rate": 2.025e-06, "loss": 0.8793, "step": 1919 }, { "epoch": 2.867945665873127, "grad_norm": 0.12745928764343262, "learning_rate": 2.0000000000000003e-06, "loss": 0.7261, "step": 1920 }, { "epoch": 2.8694393875741024, "grad_norm": 0.1275077760219574, "learning_rate": 1.975e-06, "loss": 0.9061, "step": 1921 }, { "epoch": 2.8709331092750783, "grad_norm": 0.1491306871175766, "learning_rate": 1.95e-06, "loss": 0.9045, "step": 1922 }, { "epoch": 2.8724268309760537, "grad_norm": 0.1316901445388794, "learning_rate": 1.925e-06, "loss": 0.8316, "step": 1923 }, { "epoch": 2.8739205526770295, "grad_norm": 0.13724473118782043, "learning_rate": 1.9e-06, "loss": 0.8155, "step": 1924 }, { "epoch": 2.875414274378005, "grad_norm": 0.1299271285533905, "learning_rate": 1.875e-06, "loss": 0.8039, "step": 1925 }, { "epoch": 2.8769079960789803, "grad_norm": 0.13260062038898468, "learning_rate": 1.85e-06, "loss": 0.8966, "step": 1926 }, { "epoch": 2.878401717779956, "grad_norm": 0.13353827595710754, "learning_rate": 1.8249999999999999e-06, "loss": 0.8693, "step": 1927 }, { "epoch": 2.879895439480932, "grad_norm": 0.13441555202007294, "learning_rate": 1.8e-06, "loss": 0.8609, "step": 1928 }, { "epoch": 2.8813891611819074, "grad_norm": 0.12084376066923141, "learning_rate": 1.775e-06, "loss": 0.7374, "step": 1929 }, { "epoch": 2.8828828828828827, "grad_norm": 0.1256219893693924, "learning_rate": 1.7500000000000002e-06, "loss": 0.7947, "step": 1930 }, { "epoch": 2.8843766045838586, "grad_norm": 0.13652807474136353, "learning_rate": 1.7250000000000002e-06, "loss": 0.8834, "step": 1931 }, { "epoch": 2.885870326284834, "grad_norm": 0.13753530383110046, "learning_rate": 1.7000000000000002e-06, "loss": 0.8572, "step": 1932 }, { "epoch": 2.88736404798581, "grad_norm": 0.13067521154880524, "learning_rate": 1.6750000000000003e-06, "loss": 0.7689, "step": 1933 }, { "epoch": 2.888857769686785, "grad_norm": 0.143594890832901, "learning_rate": 1.65e-06, "loss": 0.7923, "step": 1934 }, { "epoch": 2.8903514913877606, "grad_norm": 0.13341201841831207, "learning_rate": 1.6250000000000001e-06, "loss": 0.8152, "step": 1935 }, { "epoch": 2.8918452130887364, "grad_norm": 0.13183757662773132, "learning_rate": 1.6000000000000001e-06, "loss": 0.8945, "step": 1936 }, { "epoch": 2.8933389347897123, "grad_norm": 0.12819354236125946, "learning_rate": 1.5750000000000002e-06, "loss": 0.7591, "step": 1937 }, { "epoch": 2.8948326564906877, "grad_norm": 0.1375562697649002, "learning_rate": 1.55e-06, "loss": 0.8998, "step": 1938 }, { "epoch": 2.896326378191663, "grad_norm": 0.1300017088651657, "learning_rate": 1.525e-06, "loss": 0.906, "step": 1939 }, { "epoch": 2.897820099892639, "grad_norm": 0.13214221596717834, "learning_rate": 1.5e-06, "loss": 0.815, "step": 1940 }, { "epoch": 2.8993138215936143, "grad_norm": 0.13736654818058014, "learning_rate": 1.475e-06, "loss": 0.94, "step": 1941 }, { "epoch": 2.90080754329459, "grad_norm": 0.1382712572813034, "learning_rate": 1.45e-06, "loss": 0.8826, "step": 1942 }, { "epoch": 2.9023012649955655, "grad_norm": 0.1277526319026947, "learning_rate": 1.4250000000000001e-06, "loss": 0.7826, "step": 1943 }, { "epoch": 2.903794986696541, "grad_norm": 0.1370660364627838, "learning_rate": 1.4000000000000001e-06, "loss": 0.7901, "step": 1944 }, { "epoch": 2.9052887083975167, "grad_norm": 0.13495205342769623, "learning_rate": 1.3750000000000002e-06, "loss": 0.8788, "step": 1945 }, { "epoch": 2.906782430098492, "grad_norm": 0.1350393146276474, "learning_rate": 1.35e-06, "loss": 0.7678, "step": 1946 }, { "epoch": 2.908276151799468, "grad_norm": 0.14372332394123077, "learning_rate": 1.325e-06, "loss": 0.9522, "step": 1947 }, { "epoch": 2.9097698735004434, "grad_norm": 0.1417144387960434, "learning_rate": 1.3e-06, "loss": 0.8265, "step": 1948 }, { "epoch": 2.911263595201419, "grad_norm": 0.13423344492912292, "learning_rate": 1.275e-06, "loss": 0.9221, "step": 1949 }, { "epoch": 2.9127573169023946, "grad_norm": 0.1304454803466797, "learning_rate": 1.25e-06, "loss": 0.7722, "step": 1950 }, { "epoch": 2.9142510386033704, "grad_norm": 0.1410652995109558, "learning_rate": 1.2250000000000001e-06, "loss": 0.7883, "step": 1951 }, { "epoch": 2.915744760304346, "grad_norm": 0.1329158991575241, "learning_rate": 1.2000000000000002e-06, "loss": 0.801, "step": 1952 }, { "epoch": 2.917238482005321, "grad_norm": 0.13874948024749756, "learning_rate": 1.175e-06, "loss": 0.914, "step": 1953 }, { "epoch": 2.918732203706297, "grad_norm": 0.1357342004776001, "learning_rate": 1.15e-06, "loss": 0.9164, "step": 1954 }, { "epoch": 2.9202259254072724, "grad_norm": 0.1369502693414688, "learning_rate": 1.125e-06, "loss": 0.8376, "step": 1955 }, { "epoch": 2.9217196471082483, "grad_norm": 0.1390984058380127, "learning_rate": 1.1e-06, "loss": 0.7947, "step": 1956 }, { "epoch": 2.9232133688092237, "grad_norm": 0.13058212399482727, "learning_rate": 1.0749999999999999e-06, "loss": 0.7117, "step": 1957 }, { "epoch": 2.924707090510199, "grad_norm": 0.1306363195180893, "learning_rate": 1.0500000000000001e-06, "loss": 0.8822, "step": 1958 }, { "epoch": 2.926200812211175, "grad_norm": 0.1476043164730072, "learning_rate": 1.0250000000000001e-06, "loss": 0.8378, "step": 1959 }, { "epoch": 2.9276945339121507, "grad_norm": 0.12513180077075958, "learning_rate": 1.0000000000000002e-06, "loss": 0.722, "step": 1960 }, { "epoch": 2.929188255613126, "grad_norm": 0.14964450895786285, "learning_rate": 9.75e-07, "loss": 0.8483, "step": 1961 }, { "epoch": 2.9306819773141015, "grad_norm": 0.1434432715177536, "learning_rate": 9.5e-07, "loss": 0.8425, "step": 1962 }, { "epoch": 2.9321756990150774, "grad_norm": 0.14688511192798615, "learning_rate": 9.25e-07, "loss": 0.8818, "step": 1963 }, { "epoch": 2.9336694207160527, "grad_norm": 0.1257922500371933, "learning_rate": 9e-07, "loss": 0.6783, "step": 1964 }, { "epoch": 2.9351631424170286, "grad_norm": 0.14518772065639496, "learning_rate": 8.750000000000001e-07, "loss": 0.9043, "step": 1965 }, { "epoch": 2.936656864118004, "grad_norm": 0.13337141275405884, "learning_rate": 8.500000000000001e-07, "loss": 0.8792, "step": 1966 }, { "epoch": 2.9381505858189794, "grad_norm": 0.13176392018795013, "learning_rate": 8.25e-07, "loss": 0.8364, "step": 1967 }, { "epoch": 2.939644307519955, "grad_norm": 0.13096357882022858, "learning_rate": 8.000000000000001e-07, "loss": 0.84, "step": 1968 }, { "epoch": 2.941138029220931, "grad_norm": 0.14396648108959198, "learning_rate": 7.75e-07, "loss": 0.9241, "step": 1969 }, { "epoch": 2.9426317509219064, "grad_norm": 0.12752515077590942, "learning_rate": 7.5e-07, "loss": 0.8947, "step": 1970 }, { "epoch": 2.944125472622882, "grad_norm": 0.12772531807422638, "learning_rate": 7.25e-07, "loss": 0.7739, "step": 1971 }, { "epoch": 2.9456191943238577, "grad_norm": 0.14180846512317657, "learning_rate": 7.000000000000001e-07, "loss": 0.8859, "step": 1972 }, { "epoch": 2.947112916024833, "grad_norm": 0.1298796534538269, "learning_rate": 6.75e-07, "loss": 0.7306, "step": 1973 }, { "epoch": 2.948606637725809, "grad_norm": 0.13846975564956665, "learning_rate": 6.5e-07, "loss": 0.9478, "step": 1974 }, { "epoch": 2.9501003594267843, "grad_norm": 0.1275867223739624, "learning_rate": 6.25e-07, "loss": 0.7665, "step": 1975 }, { "epoch": 2.9515940811277597, "grad_norm": 0.13771145045757294, "learning_rate": 6.000000000000001e-07, "loss": 0.7644, "step": 1976 }, { "epoch": 2.9530878028287355, "grad_norm": 0.14976158738136292, "learning_rate": 5.75e-07, "loss": 1.0802, "step": 1977 }, { "epoch": 2.954581524529711, "grad_norm": 0.11821278184652328, "learning_rate": 5.5e-07, "loss": 0.7612, "step": 1978 }, { "epoch": 2.9560752462306867, "grad_norm": 0.12651163339614868, "learning_rate": 5.250000000000001e-07, "loss": 0.7035, "step": 1979 }, { "epoch": 2.957568967931662, "grad_norm": 0.13467690348625183, "learning_rate": 5.000000000000001e-07, "loss": 0.8437, "step": 1980 }, { "epoch": 2.959062689632638, "grad_norm": 0.1531185507774353, "learning_rate": 4.75e-07, "loss": 0.8258, "step": 1981 }, { "epoch": 2.9605564113336134, "grad_norm": 0.14234229922294617, "learning_rate": 4.5e-07, "loss": 0.7503, "step": 1982 }, { "epoch": 2.962050133034589, "grad_norm": 0.12664321064949036, "learning_rate": 4.2500000000000006e-07, "loss": 0.8703, "step": 1983 }, { "epoch": 2.9635438547355646, "grad_norm": 0.1282534897327423, "learning_rate": 4.0000000000000003e-07, "loss": 0.7372, "step": 1984 }, { "epoch": 2.96503757643654, "grad_norm": 0.15417100489139557, "learning_rate": 3.75e-07, "loss": 0.9149, "step": 1985 }, { "epoch": 2.966531298137516, "grad_norm": 0.13329479098320007, "learning_rate": 3.5000000000000004e-07, "loss": 0.8692, "step": 1986 }, { "epoch": 2.968025019838491, "grad_norm": 0.13036604225635529, "learning_rate": 3.25e-07, "loss": 0.8263, "step": 1987 }, { "epoch": 2.969518741539467, "grad_norm": 0.13623115420341492, "learning_rate": 3.0000000000000004e-07, "loss": 0.8249, "step": 1988 }, { "epoch": 2.9710124632404424, "grad_norm": 0.14026089012622833, "learning_rate": 2.75e-07, "loss": 0.9329, "step": 1989 }, { "epoch": 2.972506184941418, "grad_norm": 0.13027694821357727, "learning_rate": 2.5000000000000004e-07, "loss": 0.8672, "step": 1990 }, { "epoch": 2.9739999066423937, "grad_norm": 0.1526614874601364, "learning_rate": 2.25e-07, "loss": 0.9916, "step": 1991 }, { "epoch": 2.9754936283433695, "grad_norm": 0.13207581639289856, "learning_rate": 2.0000000000000002e-07, "loss": 0.8637, "step": 1992 }, { "epoch": 2.976987350044345, "grad_norm": 0.1457149088382721, "learning_rate": 1.7500000000000002e-07, "loss": 0.8649, "step": 1993 }, { "epoch": 2.9784810717453203, "grad_norm": 0.1365957111120224, "learning_rate": 1.5000000000000002e-07, "loss": 0.9008, "step": 1994 }, { "epoch": 2.979974793446296, "grad_norm": 0.1299731433391571, "learning_rate": 1.2500000000000002e-07, "loss": 0.7634, "step": 1995 }, { "epoch": 2.9814685151472715, "grad_norm": 0.13118845224380493, "learning_rate": 1.0000000000000001e-07, "loss": 0.7495, "step": 1996 }, { "epoch": 2.9829622368482474, "grad_norm": 0.13413678109645844, "learning_rate": 7.500000000000001e-08, "loss": 0.9108, "step": 1997 }, { "epoch": 2.9844559585492227, "grad_norm": 0.12679436802864075, "learning_rate": 5.0000000000000004e-08, "loss": 0.81, "step": 1998 }, { "epoch": 2.985949680250198, "grad_norm": 0.14571337401866913, "learning_rate": 2.5000000000000002e-08, "loss": 0.9572, "step": 1999 }, { "epoch": 2.987443401951174, "grad_norm": 0.12721623480319977, "learning_rate": 0.0, "loss": 0.7663, "step": 2000 }, { "epoch": 2.987443401951174, "step": 2000, "total_flos": 1.04520375926784e+18, "train_loss": 0.8730267834961414, "train_runtime": 69929.3152, "train_samples_per_second": 0.915, "train_steps_per_second": 0.029 } ], "logging_steps": 1.0, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.04520375926784e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }