diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.10006054268259122, + "epoch": 0.15009081402388685, "eval_steps": 500, - "global_step": 1818, + "global_step": 2727, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -12733,6 +12733,6369 @@ "learning_rate": 9.94464527995885e-06, "loss": 0.8529, "step": 1818 + }, + { + "epoch": 0.10011558148494688, + "grad_norm": 0.9318063259124756, + "learning_rate": 9.944580939660501e-06, + "loss": 0.8978, + "step": 1819 + }, + { + "epoch": 0.10017062028730254, + "grad_norm": 0.847023069858551, + "learning_rate": 9.944516562200004e-06, + "loss": 0.8007, + "step": 1820 + }, + { + "epoch": 0.1002256590896582, + "grad_norm": 0.8817011117935181, + "learning_rate": 9.944452147577844e-06, + "loss": 0.8819, + "step": 1821 + }, + { + "epoch": 0.10028069789201387, + "grad_norm": 0.8560144901275635, + "learning_rate": 9.944387695794505e-06, + "loss": 0.8219, + "step": 1822 + }, + { + "epoch": 0.10033573669436953, + "grad_norm": 0.9358342885971069, + "learning_rate": 9.944323206850472e-06, + "loss": 0.8533, + "step": 1823 + }, + { + "epoch": 0.10039077549672519, + "grad_norm": 0.8327087163925171, + "learning_rate": 9.94425868074623e-06, + "loss": 0.8359, + "step": 1824 + }, + { + "epoch": 0.10044581429908085, + "grad_norm": 1.0590367317199707, + "learning_rate": 9.944194117482263e-06, + "loss": 0.9659, + "step": 1825 + }, + { + "epoch": 0.10050085310143651, + "grad_norm": 0.8739829063415527, + "learning_rate": 9.944129517059055e-06, + "loss": 0.7868, + "step": 1826 + }, + { + "epoch": 0.10055589190379217, + "grad_norm": 0.8465235233306885, + "learning_rate": 9.944064879477093e-06, + "loss": 0.8554, + "step": 1827 + }, + { + "epoch": 0.10061093070614784, + "grad_norm": 0.9068321585655212, + "learning_rate": 9.944000204736864e-06, + "loss": 0.8648, + "step": 1828 + }, + { + "epoch": 0.1006659695085035, + "grad_norm": 0.8308066725730896, + "learning_rate": 9.943935492838853e-06, + "loss": 0.8471, + "step": 1829 + }, + { + "epoch": 0.10072100831085916, + "grad_norm": 0.9973901510238647, + "learning_rate": 9.943870743783545e-06, + "loss": 0.9398, + "step": 1830 + }, + { + "epoch": 0.10077604711321482, + "grad_norm": 0.8532593250274658, + "learning_rate": 9.94380595757143e-06, + "loss": 0.9001, + "step": 1831 + }, + { + "epoch": 0.10083108591557048, + "grad_norm": 0.8571139574050903, + "learning_rate": 9.94374113420299e-06, + "loss": 0.85, + "step": 1832 + }, + { + "epoch": 0.10088612471792614, + "grad_norm": 0.905624508857727, + "learning_rate": 9.943676273678717e-06, + "loss": 0.9587, + "step": 1833 + }, + { + "epoch": 0.1009411635202818, + "grad_norm": 1.0224663019180298, + "learning_rate": 9.943611375999097e-06, + "loss": 0.8236, + "step": 1834 + }, + { + "epoch": 0.10099620232263747, + "grad_norm": 0.8900588154792786, + "learning_rate": 9.943546441164615e-06, + "loss": 0.877, + "step": 1835 + }, + { + "epoch": 0.10105124112499313, + "grad_norm": 0.8852938413619995, + "learning_rate": 9.943481469175765e-06, + "loss": 0.9521, + "step": 1836 + }, + { + "epoch": 0.10110627992734877, + "grad_norm": 0.9249371290206909, + "learning_rate": 9.943416460033027e-06, + "loss": 0.8541, + "step": 1837 + }, + { + "epoch": 0.10116131872970444, + "grad_norm": 0.8533583283424377, + "learning_rate": 9.943351413736897e-06, + "loss": 0.8571, + "step": 1838 + }, + { + "epoch": 0.1012163575320601, + "grad_norm": 0.743800699710846, + "learning_rate": 9.94328633028786e-06, + "loss": 0.749, + "step": 1839 + }, + { + "epoch": 0.10127139633441576, + "grad_norm": 0.7836641669273376, + "learning_rate": 9.943221209686407e-06, + "loss": 0.8237, + "step": 1840 + }, + { + "epoch": 0.10132643513677142, + "grad_norm": 0.800782322883606, + "learning_rate": 9.943156051933024e-06, + "loss": 0.8323, + "step": 1841 + }, + { + "epoch": 0.10138147393912708, + "grad_norm": 0.7531478404998779, + "learning_rate": 9.943090857028206e-06, + "loss": 0.8041, + "step": 1842 + }, + { + "epoch": 0.10143651274148274, + "grad_norm": 0.9837996959686279, + "learning_rate": 9.94302562497244e-06, + "loss": 0.8084, + "step": 1843 + }, + { + "epoch": 0.1014915515438384, + "grad_norm": 0.8038331866264343, + "learning_rate": 9.942960355766216e-06, + "loss": 0.8454, + "step": 1844 + }, + { + "epoch": 0.10154659034619407, + "grad_norm": 0.7822145819664001, + "learning_rate": 9.942895049410024e-06, + "loss": 0.8137, + "step": 1845 + }, + { + "epoch": 0.10160162914854973, + "grad_norm": 0.8222663998603821, + "learning_rate": 9.942829705904358e-06, + "loss": 0.8981, + "step": 1846 + }, + { + "epoch": 0.10165666795090539, + "grad_norm": 1.0095717906951904, + "learning_rate": 9.942764325249707e-06, + "loss": 0.9159, + "step": 1847 + }, + { + "epoch": 0.10171170675326105, + "grad_norm": 0.8264054656028748, + "learning_rate": 9.942698907446561e-06, + "loss": 0.9233, + "step": 1848 + }, + { + "epoch": 0.10176674555561671, + "grad_norm": 0.8244288563728333, + "learning_rate": 9.942633452495414e-06, + "loss": 0.8507, + "step": 1849 + }, + { + "epoch": 0.10182178435797237, + "grad_norm": 0.8457715511322021, + "learning_rate": 9.942567960396755e-06, + "loss": 0.7897, + "step": 1850 + }, + { + "epoch": 0.10187682316032803, + "grad_norm": 0.8356698155403137, + "learning_rate": 9.94250243115108e-06, + "loss": 0.7927, + "step": 1851 + }, + { + "epoch": 0.1019318619626837, + "grad_norm": 0.8251230716705322, + "learning_rate": 9.94243686475888e-06, + "loss": 0.8977, + "step": 1852 + }, + { + "epoch": 0.10198690076503936, + "grad_norm": 0.8370125889778137, + "learning_rate": 9.942371261220647e-06, + "loss": 0.8204, + "step": 1853 + }, + { + "epoch": 0.10204193956739502, + "grad_norm": 1.6722066402435303, + "learning_rate": 9.942305620536876e-06, + "loss": 0.9284, + "step": 1854 + }, + { + "epoch": 0.10209697836975068, + "grad_norm": 0.8424906730651855, + "learning_rate": 9.942239942708057e-06, + "loss": 0.833, + "step": 1855 + }, + { + "epoch": 0.10215201717210634, + "grad_norm": 0.7475115656852722, + "learning_rate": 9.942174227734686e-06, + "loss": 0.6158, + "step": 1856 + }, + { + "epoch": 0.102207055974462, + "grad_norm": 0.8652095198631287, + "learning_rate": 9.942108475617256e-06, + "loss": 0.8781, + "step": 1857 + }, + { + "epoch": 0.10226209477681765, + "grad_norm": 1.0621691942214966, + "learning_rate": 9.942042686356263e-06, + "loss": 1.0276, + "step": 1858 + }, + { + "epoch": 0.10231713357917331, + "grad_norm": 1.113357424736023, + "learning_rate": 9.941976859952199e-06, + "loss": 0.8799, + "step": 1859 + }, + { + "epoch": 0.10237217238152897, + "grad_norm": 0.9153568148612976, + "learning_rate": 9.94191099640556e-06, + "loss": 0.7988, + "step": 1860 + }, + { + "epoch": 0.10242721118388463, + "grad_norm": 0.9217341542243958, + "learning_rate": 9.941845095716842e-06, + "loss": 0.7785, + "step": 1861 + }, + { + "epoch": 0.1024822499862403, + "grad_norm": 0.8702190518379211, + "learning_rate": 9.941779157886538e-06, + "loss": 0.7648, + "step": 1862 + }, + { + "epoch": 0.10253728878859596, + "grad_norm": 0.8609822988510132, + "learning_rate": 9.941713182915144e-06, + "loss": 0.9095, + "step": 1863 + }, + { + "epoch": 0.10259232759095162, + "grad_norm": 0.7766719460487366, + "learning_rate": 9.941647170803157e-06, + "loss": 0.6984, + "step": 1864 + }, + { + "epoch": 0.10264736639330728, + "grad_norm": 0.8497375249862671, + "learning_rate": 9.941581121551074e-06, + "loss": 0.9161, + "step": 1865 + }, + { + "epoch": 0.10270240519566294, + "grad_norm": 0.8007600903511047, + "learning_rate": 9.941515035159388e-06, + "loss": 0.8099, + "step": 1866 + }, + { + "epoch": 0.1027574439980186, + "grad_norm": 0.7932959794998169, + "learning_rate": 9.941448911628599e-06, + "loss": 0.8049, + "step": 1867 + }, + { + "epoch": 0.10281248280037426, + "grad_norm": 1.3169244527816772, + "learning_rate": 9.941382750959203e-06, + "loss": 0.8601, + "step": 1868 + }, + { + "epoch": 0.10286752160272992, + "grad_norm": 0.8011140823364258, + "learning_rate": 9.941316553151696e-06, + "loss": 0.8397, + "step": 1869 + }, + { + "epoch": 0.10292256040508559, + "grad_norm": 0.811210572719574, + "learning_rate": 9.941250318206577e-06, + "loss": 0.7863, + "step": 1870 + }, + { + "epoch": 0.10297759920744125, + "grad_norm": 0.8172751665115356, + "learning_rate": 9.941184046124342e-06, + "loss": 0.8114, + "step": 1871 + }, + { + "epoch": 0.10303263800979691, + "grad_norm": 0.8072887063026428, + "learning_rate": 9.941117736905493e-06, + "loss": 0.8928, + "step": 1872 + }, + { + "epoch": 0.10308767681215257, + "grad_norm": 0.9111380577087402, + "learning_rate": 9.941051390550524e-06, + "loss": 0.866, + "step": 1873 + }, + { + "epoch": 0.10314271561450823, + "grad_norm": 0.8158383369445801, + "learning_rate": 9.940985007059936e-06, + "loss": 0.7805, + "step": 1874 + }, + { + "epoch": 0.1031977544168639, + "grad_norm": 0.8858961462974548, + "learning_rate": 9.940918586434226e-06, + "loss": 0.8424, + "step": 1875 + }, + { + "epoch": 0.10325279321921955, + "grad_norm": 0.8835182189941406, + "learning_rate": 9.940852128673895e-06, + "loss": 0.7816, + "step": 1876 + }, + { + "epoch": 0.10330783202157522, + "grad_norm": 1.044227123260498, + "learning_rate": 9.940785633779444e-06, + "loss": 0.8952, + "step": 1877 + }, + { + "epoch": 0.10336287082393088, + "grad_norm": 0.8255050778388977, + "learning_rate": 9.940719101751367e-06, + "loss": 0.8215, + "step": 1878 + }, + { + "epoch": 0.10341790962628654, + "grad_norm": 0.8561689257621765, + "learning_rate": 9.940652532590172e-06, + "loss": 0.9686, + "step": 1879 + }, + { + "epoch": 0.10347294842864219, + "grad_norm": 0.8798959255218506, + "learning_rate": 9.94058592629635e-06, + "loss": 0.8993, + "step": 1880 + }, + { + "epoch": 0.10352798723099785, + "grad_norm": 0.9292098879814148, + "learning_rate": 9.940519282870411e-06, + "loss": 0.8536, + "step": 1881 + }, + { + "epoch": 0.10358302603335351, + "grad_norm": 0.8865400552749634, + "learning_rate": 9.940452602312851e-06, + "loss": 0.8024, + "step": 1882 + }, + { + "epoch": 0.10363806483570917, + "grad_norm": 0.8985510468482971, + "learning_rate": 9.94038588462417e-06, + "loss": 0.7748, + "step": 1883 + }, + { + "epoch": 0.10369310363806483, + "grad_norm": 0.9973617196083069, + "learning_rate": 9.940319129804872e-06, + "loss": 0.875, + "step": 1884 + }, + { + "epoch": 0.1037481424404205, + "grad_norm": 0.8615350723266602, + "learning_rate": 9.940252337855458e-06, + "loss": 0.904, + "step": 1885 + }, + { + "epoch": 0.10380318124277615, + "grad_norm": 0.8752412796020508, + "learning_rate": 9.940185508776429e-06, + "loss": 0.8735, + "step": 1886 + }, + { + "epoch": 0.10385822004513182, + "grad_norm": 0.8639446496963501, + "learning_rate": 9.94011864256829e-06, + "loss": 0.7952, + "step": 1887 + }, + { + "epoch": 0.10391325884748748, + "grad_norm": 0.7932116389274597, + "learning_rate": 9.94005173923154e-06, + "loss": 0.8721, + "step": 1888 + }, + { + "epoch": 0.10396829764984314, + "grad_norm": 0.8573791980743408, + "learning_rate": 9.939984798766685e-06, + "loss": 0.9271, + "step": 1889 + }, + { + "epoch": 0.1040233364521988, + "grad_norm": 0.9080122113227844, + "learning_rate": 9.939917821174225e-06, + "loss": 0.8991, + "step": 1890 + }, + { + "epoch": 0.10407837525455446, + "grad_norm": 0.7883808612823486, + "learning_rate": 9.939850806454664e-06, + "loss": 0.6895, + "step": 1891 + }, + { + "epoch": 0.10413341405691012, + "grad_norm": 0.8067768216133118, + "learning_rate": 9.93978375460851e-06, + "loss": 0.835, + "step": 1892 + }, + { + "epoch": 0.10418845285926578, + "grad_norm": 0.8756459951400757, + "learning_rate": 9.939716665636262e-06, + "loss": 0.8144, + "step": 1893 + }, + { + "epoch": 0.10424349166162145, + "grad_norm": 0.8056700825691223, + "learning_rate": 9.939649539538425e-06, + "loss": 0.7454, + "step": 1894 + }, + { + "epoch": 0.10429853046397711, + "grad_norm": 1.0756300687789917, + "learning_rate": 9.939582376315505e-06, + "loss": 0.8096, + "step": 1895 + }, + { + "epoch": 0.10435356926633277, + "grad_norm": 0.8938102126121521, + "learning_rate": 9.939515175968006e-06, + "loss": 0.7496, + "step": 1896 + }, + { + "epoch": 0.10440860806868843, + "grad_norm": 0.9371656775474548, + "learning_rate": 9.939447938496434e-06, + "loss": 0.9817, + "step": 1897 + }, + { + "epoch": 0.10446364687104409, + "grad_norm": 1.0216082334518433, + "learning_rate": 9.939380663901292e-06, + "loss": 0.8804, + "step": 1898 + }, + { + "epoch": 0.10451868567339975, + "grad_norm": 0.8791126012802124, + "learning_rate": 9.939313352183088e-06, + "loss": 0.7811, + "step": 1899 + }, + { + "epoch": 0.10457372447575541, + "grad_norm": 0.9925445914268494, + "learning_rate": 9.939246003342326e-06, + "loss": 0.8892, + "step": 1900 + }, + { + "epoch": 0.10462876327811106, + "grad_norm": 1.0459916591644287, + "learning_rate": 9.939178617379514e-06, + "loss": 0.7938, + "step": 1901 + }, + { + "epoch": 0.10468380208046672, + "grad_norm": 0.9103816747665405, + "learning_rate": 9.93911119429516e-06, + "loss": 0.8282, + "step": 1902 + }, + { + "epoch": 0.10473884088282238, + "grad_norm": 0.9602296352386475, + "learning_rate": 9.939043734089764e-06, + "loss": 0.919, + "step": 1903 + }, + { + "epoch": 0.10479387968517805, + "grad_norm": 0.9529246687889099, + "learning_rate": 9.93897623676384e-06, + "loss": 0.9469, + "step": 1904 + }, + { + "epoch": 0.10484891848753371, + "grad_norm": 0.9619705080986023, + "learning_rate": 9.938908702317893e-06, + "loss": 0.9371, + "step": 1905 + }, + { + "epoch": 0.10490395728988937, + "grad_norm": 1.0106935501098633, + "learning_rate": 9.938841130752428e-06, + "loss": 0.7502, + "step": 1906 + }, + { + "epoch": 0.10495899609224503, + "grad_norm": 0.913985013961792, + "learning_rate": 9.938773522067957e-06, + "loss": 0.8172, + "step": 1907 + }, + { + "epoch": 0.10501403489460069, + "grad_norm": 0.9474983215332031, + "learning_rate": 9.938705876264985e-06, + "loss": 0.8999, + "step": 1908 + }, + { + "epoch": 0.10506907369695635, + "grad_norm": 0.9185097813606262, + "learning_rate": 9.938638193344024e-06, + "loss": 0.8976, + "step": 1909 + }, + { + "epoch": 0.10512411249931201, + "grad_norm": 0.7633675932884216, + "learning_rate": 9.938570473305578e-06, + "loss": 0.7777, + "step": 1910 + }, + { + "epoch": 0.10517915130166768, + "grad_norm": 0.9547691345214844, + "learning_rate": 9.938502716150159e-06, + "loss": 0.8154, + "step": 1911 + }, + { + "epoch": 0.10523419010402334, + "grad_norm": 0.8556191921234131, + "learning_rate": 9.938434921878275e-06, + "loss": 0.828, + "step": 1912 + }, + { + "epoch": 0.105289228906379, + "grad_norm": 0.9826140999794006, + "learning_rate": 9.938367090490437e-06, + "loss": 0.8085, + "step": 1913 + }, + { + "epoch": 0.10534426770873466, + "grad_norm": 0.8610432744026184, + "learning_rate": 9.938299221987154e-06, + "loss": 0.9103, + "step": 1914 + }, + { + "epoch": 0.10539930651109032, + "grad_norm": 0.8383543491363525, + "learning_rate": 9.938231316368934e-06, + "loss": 0.8182, + "step": 1915 + }, + { + "epoch": 0.10545434531344598, + "grad_norm": 0.8552964925765991, + "learning_rate": 9.93816337363629e-06, + "loss": 0.8024, + "step": 1916 + }, + { + "epoch": 0.10550938411580164, + "grad_norm": 0.9255730509757996, + "learning_rate": 9.938095393789732e-06, + "loss": 0.8566, + "step": 1917 + }, + { + "epoch": 0.1055644229181573, + "grad_norm": 0.9882987141609192, + "learning_rate": 9.938027376829774e-06, + "loss": 0.7119, + "step": 1918 + }, + { + "epoch": 0.10561946172051297, + "grad_norm": 1.139404535293579, + "learning_rate": 9.93795932275692e-06, + "loss": 0.8839, + "step": 1919 + }, + { + "epoch": 0.10567450052286863, + "grad_norm": 1.004782795906067, + "learning_rate": 9.937891231571686e-06, + "loss": 0.904, + "step": 1920 + }, + { + "epoch": 0.10572953932522429, + "grad_norm": 0.8437260389328003, + "learning_rate": 9.937823103274585e-06, + "loss": 0.7942, + "step": 1921 + }, + { + "epoch": 0.10578457812757995, + "grad_norm": 1.1388722658157349, + "learning_rate": 9.937754937866127e-06, + "loss": 0.9491, + "step": 1922 + }, + { + "epoch": 0.1058396169299356, + "grad_norm": 0.9266740083694458, + "learning_rate": 9.937686735346823e-06, + "loss": 0.9067, + "step": 1923 + }, + { + "epoch": 0.10589465573229126, + "grad_norm": 0.7536123991012573, + "learning_rate": 9.93761849571719e-06, + "loss": 0.6533, + "step": 1924 + }, + { + "epoch": 0.10594969453464692, + "grad_norm": 0.8781737089157104, + "learning_rate": 9.937550218977737e-06, + "loss": 0.8319, + "step": 1925 + }, + { + "epoch": 0.10600473333700258, + "grad_norm": 0.8577924966812134, + "learning_rate": 9.937481905128976e-06, + "loss": 0.8604, + "step": 1926 + }, + { + "epoch": 0.10605977213935824, + "grad_norm": 0.8351713418960571, + "learning_rate": 9.937413554171424e-06, + "loss": 0.946, + "step": 1927 + }, + { + "epoch": 0.1061148109417139, + "grad_norm": 0.971491813659668, + "learning_rate": 9.937345166105594e-06, + "loss": 0.7383, + "step": 1928 + }, + { + "epoch": 0.10616984974406957, + "grad_norm": 0.8020079731941223, + "learning_rate": 9.937276740932001e-06, + "loss": 0.7468, + "step": 1929 + }, + { + "epoch": 0.10622488854642523, + "grad_norm": 0.9057347178459167, + "learning_rate": 9.937208278651153e-06, + "loss": 0.8223, + "step": 1930 + }, + { + "epoch": 0.10627992734878089, + "grad_norm": 0.8384734392166138, + "learning_rate": 9.937139779263574e-06, + "loss": 0.8773, + "step": 1931 + }, + { + "epoch": 0.10633496615113655, + "grad_norm": 0.8732065558433533, + "learning_rate": 9.93707124276977e-06, + "loss": 0.8265, + "step": 1932 + }, + { + "epoch": 0.10639000495349221, + "grad_norm": 0.8744868040084839, + "learning_rate": 9.937002669170264e-06, + "loss": 0.8497, + "step": 1933 + }, + { + "epoch": 0.10644504375584787, + "grad_norm": 0.8589879870414734, + "learning_rate": 9.936934058465564e-06, + "loss": 0.8116, + "step": 1934 + }, + { + "epoch": 0.10650008255820353, + "grad_norm": 0.8614563941955566, + "learning_rate": 9.936865410656192e-06, + "loss": 0.7823, + "step": 1935 + }, + { + "epoch": 0.1065551213605592, + "grad_norm": 0.8381434082984924, + "learning_rate": 9.93679672574266e-06, + "loss": 0.7889, + "step": 1936 + }, + { + "epoch": 0.10661016016291486, + "grad_norm": 0.9834293127059937, + "learning_rate": 9.936728003725484e-06, + "loss": 0.8358, + "step": 1937 + }, + { + "epoch": 0.10666519896527052, + "grad_norm": 0.8461851477622986, + "learning_rate": 9.936659244605184e-06, + "loss": 0.8408, + "step": 1938 + }, + { + "epoch": 0.10672023776762618, + "grad_norm": 1.0186371803283691, + "learning_rate": 9.936590448382273e-06, + "loss": 0.8118, + "step": 1939 + }, + { + "epoch": 0.10677527656998184, + "grad_norm": 0.866321325302124, + "learning_rate": 9.93652161505727e-06, + "loss": 0.8696, + "step": 1940 + }, + { + "epoch": 0.1068303153723375, + "grad_norm": 0.9179622530937195, + "learning_rate": 9.936452744630692e-06, + "loss": 0.8419, + "step": 1941 + }, + { + "epoch": 0.10688535417469316, + "grad_norm": 0.8250496983528137, + "learning_rate": 9.936383837103057e-06, + "loss": 0.8511, + "step": 1942 + }, + { + "epoch": 0.10694039297704883, + "grad_norm": 0.8475700616836548, + "learning_rate": 9.936314892474883e-06, + "loss": 0.8404, + "step": 1943 + }, + { + "epoch": 0.10699543177940447, + "grad_norm": 0.774334192276001, + "learning_rate": 9.936245910746684e-06, + "loss": 0.7461, + "step": 1944 + }, + { + "epoch": 0.10705047058176013, + "grad_norm": 0.9313948154449463, + "learning_rate": 9.936176891918986e-06, + "loss": 0.8486, + "step": 1945 + }, + { + "epoch": 0.1071055093841158, + "grad_norm": 0.8784124255180359, + "learning_rate": 9.936107835992304e-06, + "loss": 0.84, + "step": 1946 + }, + { + "epoch": 0.10716054818647146, + "grad_norm": 0.9087465405464172, + "learning_rate": 9.936038742967154e-06, + "loss": 0.9012, + "step": 1947 + }, + { + "epoch": 0.10721558698882712, + "grad_norm": 0.8462012410163879, + "learning_rate": 9.93596961284406e-06, + "loss": 0.9193, + "step": 1948 + }, + { + "epoch": 0.10727062579118278, + "grad_norm": 0.8984553813934326, + "learning_rate": 9.935900445623538e-06, + "loss": 0.781, + "step": 1949 + }, + { + "epoch": 0.10732566459353844, + "grad_norm": 0.9197295308113098, + "learning_rate": 9.935831241306111e-06, + "loss": 0.8861, + "step": 1950 + }, + { + "epoch": 0.1073807033958941, + "grad_norm": 0.8452801704406738, + "learning_rate": 9.935761999892296e-06, + "loss": 0.8649, + "step": 1951 + }, + { + "epoch": 0.10743574219824976, + "grad_norm": 0.8047192096710205, + "learning_rate": 9.935692721382618e-06, + "loss": 0.8704, + "step": 1952 + }, + { + "epoch": 0.10749078100060543, + "grad_norm": 0.9536359906196594, + "learning_rate": 9.935623405777593e-06, + "loss": 0.7803, + "step": 1953 + }, + { + "epoch": 0.10754581980296109, + "grad_norm": 0.8215291500091553, + "learning_rate": 9.935554053077744e-06, + "loss": 0.8247, + "step": 1954 + }, + { + "epoch": 0.10760085860531675, + "grad_norm": 0.9261930584907532, + "learning_rate": 9.93548466328359e-06, + "loss": 0.8594, + "step": 1955 + }, + { + "epoch": 0.10765589740767241, + "grad_norm": 0.7973492741584778, + "learning_rate": 9.935415236395656e-06, + "loss": 0.7464, + "step": 1956 + }, + { + "epoch": 0.10771093621002807, + "grad_norm": 0.9328988790512085, + "learning_rate": 9.935345772414463e-06, + "loss": 0.8472, + "step": 1957 + }, + { + "epoch": 0.10776597501238373, + "grad_norm": 0.9490759968757629, + "learning_rate": 9.935276271340532e-06, + "loss": 0.806, + "step": 1958 + }, + { + "epoch": 0.1078210138147394, + "grad_norm": 0.9149925112724304, + "learning_rate": 9.935206733174385e-06, + "loss": 0.8741, + "step": 1959 + }, + { + "epoch": 0.10787605261709506, + "grad_norm": 1.0074039697647095, + "learning_rate": 9.935137157916546e-06, + "loss": 0.8493, + "step": 1960 + }, + { + "epoch": 0.10793109141945072, + "grad_norm": 0.8783678412437439, + "learning_rate": 9.935067545567535e-06, + "loss": 0.8132, + "step": 1961 + }, + { + "epoch": 0.10798613022180638, + "grad_norm": 0.8273885250091553, + "learning_rate": 9.934997896127879e-06, + "loss": 0.7448, + "step": 1962 + }, + { + "epoch": 0.10804116902416204, + "grad_norm": 0.761947512626648, + "learning_rate": 9.9349282095981e-06, + "loss": 0.7933, + "step": 1963 + }, + { + "epoch": 0.1080962078265177, + "grad_norm": 0.814809262752533, + "learning_rate": 9.934858485978722e-06, + "loss": 0.7551, + "step": 1964 + }, + { + "epoch": 0.10815124662887336, + "grad_norm": 0.8108895421028137, + "learning_rate": 9.934788725270266e-06, + "loss": 0.6787, + "step": 1965 + }, + { + "epoch": 0.10820628543122901, + "grad_norm": 0.8669139742851257, + "learning_rate": 9.934718927473262e-06, + "loss": 0.8395, + "step": 1966 + }, + { + "epoch": 0.10826132423358467, + "grad_norm": 0.9093756079673767, + "learning_rate": 9.93464909258823e-06, + "loss": 0.8341, + "step": 1967 + }, + { + "epoch": 0.10831636303594033, + "grad_norm": 0.8923841714859009, + "learning_rate": 9.934579220615697e-06, + "loss": 0.9422, + "step": 1968 + }, + { + "epoch": 0.108371401838296, + "grad_norm": 0.850429117679596, + "learning_rate": 9.934509311556186e-06, + "loss": 0.8446, + "step": 1969 + }, + { + "epoch": 0.10842644064065166, + "grad_norm": 0.8762460350990295, + "learning_rate": 9.934439365410224e-06, + "loss": 0.7788, + "step": 1970 + }, + { + "epoch": 0.10848147944300732, + "grad_norm": 0.9700387716293335, + "learning_rate": 9.934369382178338e-06, + "loss": 0.8455, + "step": 1971 + }, + { + "epoch": 0.10853651824536298, + "grad_norm": 0.8003185987472534, + "learning_rate": 9.934299361861053e-06, + "loss": 0.8026, + "step": 1972 + }, + { + "epoch": 0.10859155704771864, + "grad_norm": 0.9626984596252441, + "learning_rate": 9.934229304458893e-06, + "loss": 0.8219, + "step": 1973 + }, + { + "epoch": 0.1086465958500743, + "grad_norm": 0.8722280859947205, + "learning_rate": 9.934159209972386e-06, + "loss": 0.8866, + "step": 1974 + }, + { + "epoch": 0.10870163465242996, + "grad_norm": 0.838736355304718, + "learning_rate": 9.934089078402061e-06, + "loss": 0.7723, + "step": 1975 + }, + { + "epoch": 0.10875667345478562, + "grad_norm": 0.8373032808303833, + "learning_rate": 9.934018909748443e-06, + "loss": 0.9003, + "step": 1976 + }, + { + "epoch": 0.10881171225714129, + "grad_norm": 0.8704653978347778, + "learning_rate": 9.93394870401206e-06, + "loss": 0.8926, + "step": 1977 + }, + { + "epoch": 0.10886675105949695, + "grad_norm": 0.8088163733482361, + "learning_rate": 9.933878461193437e-06, + "loss": 0.8059, + "step": 1978 + }, + { + "epoch": 0.10892178986185261, + "grad_norm": 0.856421947479248, + "learning_rate": 9.933808181293108e-06, + "loss": 0.8447, + "step": 1979 + }, + { + "epoch": 0.10897682866420827, + "grad_norm": 0.9676237106323242, + "learning_rate": 9.933737864311595e-06, + "loss": 0.9009, + "step": 1980 + }, + { + "epoch": 0.10903186746656393, + "grad_norm": 0.7955103516578674, + "learning_rate": 9.933667510249428e-06, + "loss": 0.881, + "step": 1981 + }, + { + "epoch": 0.10908690626891959, + "grad_norm": 0.7935854196548462, + "learning_rate": 9.933597119107136e-06, + "loss": 0.8773, + "step": 1982 + }, + { + "epoch": 0.10914194507127525, + "grad_norm": 0.7726008296012878, + "learning_rate": 9.933526690885251e-06, + "loss": 0.8133, + "step": 1983 + }, + { + "epoch": 0.10919698387363092, + "grad_norm": 0.8577712178230286, + "learning_rate": 9.9334562255843e-06, + "loss": 0.7455, + "step": 1984 + }, + { + "epoch": 0.10925202267598658, + "grad_norm": 0.9996447563171387, + "learning_rate": 9.933385723204812e-06, + "loss": 0.7312, + "step": 1985 + }, + { + "epoch": 0.10930706147834224, + "grad_norm": 0.9600629806518555, + "learning_rate": 9.933315183747318e-06, + "loss": 0.8792, + "step": 1986 + }, + { + "epoch": 0.10936210028069789, + "grad_norm": 0.9126206636428833, + "learning_rate": 9.933244607212347e-06, + "loss": 1.0023, + "step": 1987 + }, + { + "epoch": 0.10941713908305355, + "grad_norm": 0.774153470993042, + "learning_rate": 9.93317399360043e-06, + "loss": 0.7877, + "step": 1988 + }, + { + "epoch": 0.10947217788540921, + "grad_norm": 0.848495364189148, + "learning_rate": 9.933103342912096e-06, + "loss": 0.8825, + "step": 1989 + }, + { + "epoch": 0.10952721668776487, + "grad_norm": 0.806408166885376, + "learning_rate": 9.933032655147881e-06, + "loss": 0.7389, + "step": 1990 + }, + { + "epoch": 0.10958225549012053, + "grad_norm": 0.8579222559928894, + "learning_rate": 9.932961930308312e-06, + "loss": 0.8283, + "step": 1991 + }, + { + "epoch": 0.10963729429247619, + "grad_norm": 0.7548109292984009, + "learning_rate": 9.93289116839392e-06, + "loss": 0.7971, + "step": 1992 + }, + { + "epoch": 0.10969233309483185, + "grad_norm": 0.7954711318016052, + "learning_rate": 9.93282036940524e-06, + "loss": 0.849, + "step": 1993 + }, + { + "epoch": 0.10974737189718752, + "grad_norm": 0.7911425232887268, + "learning_rate": 9.932749533342802e-06, + "loss": 0.86, + "step": 1994 + }, + { + "epoch": 0.10980241069954318, + "grad_norm": 0.8505094051361084, + "learning_rate": 9.932678660207141e-06, + "loss": 0.7871, + "step": 1995 + }, + { + "epoch": 0.10985744950189884, + "grad_norm": 0.809612512588501, + "learning_rate": 9.932607749998784e-06, + "loss": 0.8337, + "step": 1996 + }, + { + "epoch": 0.1099124883042545, + "grad_norm": 0.738523006439209, + "learning_rate": 9.93253680271827e-06, + "loss": 0.7634, + "step": 1997 + }, + { + "epoch": 0.10996752710661016, + "grad_norm": 0.8434372544288635, + "learning_rate": 9.932465818366128e-06, + "loss": 0.7987, + "step": 1998 + }, + { + "epoch": 0.11002256590896582, + "grad_norm": 0.8068081140518188, + "learning_rate": 9.932394796942895e-06, + "loss": 0.9496, + "step": 1999 + }, + { + "epoch": 0.11007760471132148, + "grad_norm": 0.754342794418335, + "learning_rate": 9.932323738449103e-06, + "loss": 0.7355, + "step": 2000 + }, + { + "epoch": 0.11013264351367714, + "grad_norm": 0.8830806612968445, + "learning_rate": 9.932252642885285e-06, + "loss": 0.8458, + "step": 2001 + }, + { + "epoch": 0.1101876823160328, + "grad_norm": 0.9915485978126526, + "learning_rate": 9.932181510251977e-06, + "loss": 0.8116, + "step": 2002 + }, + { + "epoch": 0.11024272111838847, + "grad_norm": 0.858368992805481, + "learning_rate": 9.932110340549712e-06, + "loss": 0.8354, + "step": 2003 + }, + { + "epoch": 0.11029775992074413, + "grad_norm": 0.8591521382331848, + "learning_rate": 9.932039133779028e-06, + "loss": 0.8316, + "step": 2004 + }, + { + "epoch": 0.11035279872309979, + "grad_norm": 0.8714838624000549, + "learning_rate": 9.931967889940455e-06, + "loss": 0.8106, + "step": 2005 + }, + { + "epoch": 0.11040783752545545, + "grad_norm": 0.8082797527313232, + "learning_rate": 9.931896609034534e-06, + "loss": 0.7762, + "step": 2006 + }, + { + "epoch": 0.11046287632781111, + "grad_norm": 0.9226199984550476, + "learning_rate": 9.931825291061797e-06, + "loss": 0.8641, + "step": 2007 + }, + { + "epoch": 0.11051791513016677, + "grad_norm": 0.8883050680160522, + "learning_rate": 9.931753936022783e-06, + "loss": 0.9014, + "step": 2008 + }, + { + "epoch": 0.11057295393252242, + "grad_norm": 0.9024807810783386, + "learning_rate": 9.931682543918024e-06, + "loss": 0.9085, + "step": 2009 + }, + { + "epoch": 0.11062799273487808, + "grad_norm": 0.8381460905075073, + "learning_rate": 9.931611114748062e-06, + "loss": 0.8043, + "step": 2010 + }, + { + "epoch": 0.11068303153723374, + "grad_norm": 1.1222339868545532, + "learning_rate": 9.931539648513429e-06, + "loss": 0.8388, + "step": 2011 + }, + { + "epoch": 0.1107380703395894, + "grad_norm": 0.9710868000984192, + "learning_rate": 9.931468145214665e-06, + "loss": 0.8934, + "step": 2012 + }, + { + "epoch": 0.11079310914194507, + "grad_norm": 0.9821141958236694, + "learning_rate": 9.931396604852304e-06, + "loss": 0.931, + "step": 2013 + }, + { + "epoch": 0.11084814794430073, + "grad_norm": 1.0658717155456543, + "learning_rate": 9.931325027426889e-06, + "loss": 0.9032, + "step": 2014 + }, + { + "epoch": 0.11090318674665639, + "grad_norm": 0.8836946487426758, + "learning_rate": 9.931253412938956e-06, + "loss": 0.9131, + "step": 2015 + }, + { + "epoch": 0.11095822554901205, + "grad_norm": 0.8438361883163452, + "learning_rate": 9.93118176138904e-06, + "loss": 0.8674, + "step": 2016 + }, + { + "epoch": 0.11101326435136771, + "grad_norm": 0.928142786026001, + "learning_rate": 9.93111007277768e-06, + "loss": 0.8882, + "step": 2017 + }, + { + "epoch": 0.11106830315372337, + "grad_norm": 0.9176276922225952, + "learning_rate": 9.93103834710542e-06, + "loss": 0.8904, + "step": 2018 + }, + { + "epoch": 0.11112334195607904, + "grad_norm": 1.0462889671325684, + "learning_rate": 9.930966584372795e-06, + "loss": 0.8029, + "step": 2019 + }, + { + "epoch": 0.1111783807584347, + "grad_norm": 0.7627375721931458, + "learning_rate": 9.930894784580344e-06, + "loss": 0.8474, + "step": 2020 + }, + { + "epoch": 0.11123341956079036, + "grad_norm": 1.0545588731765747, + "learning_rate": 9.93082294772861e-06, + "loss": 0.7985, + "step": 2021 + }, + { + "epoch": 0.11128845836314602, + "grad_norm": 0.9752298593521118, + "learning_rate": 9.93075107381813e-06, + "loss": 0.8725, + "step": 2022 + }, + { + "epoch": 0.11134349716550168, + "grad_norm": 0.8403159379959106, + "learning_rate": 9.930679162849444e-06, + "loss": 0.8854, + "step": 2023 + }, + { + "epoch": 0.11139853596785734, + "grad_norm": 0.8879380226135254, + "learning_rate": 9.930607214823094e-06, + "loss": 0.7269, + "step": 2024 + }, + { + "epoch": 0.111453574770213, + "grad_norm": 0.907256543636322, + "learning_rate": 9.930535229739618e-06, + "loss": 0.8145, + "step": 2025 + }, + { + "epoch": 0.11150861357256867, + "grad_norm": 1.1066968441009521, + "learning_rate": 9.93046320759956e-06, + "loss": 0.9281, + "step": 2026 + }, + { + "epoch": 0.11156365237492433, + "grad_norm": 0.9226258397102356, + "learning_rate": 9.930391148403462e-06, + "loss": 0.9048, + "step": 2027 + }, + { + "epoch": 0.11161869117727999, + "grad_norm": 0.9652156829833984, + "learning_rate": 9.930319052151862e-06, + "loss": 0.9321, + "step": 2028 + }, + { + "epoch": 0.11167372997963565, + "grad_norm": 0.9102638363838196, + "learning_rate": 9.930246918845305e-06, + "loss": 0.8169, + "step": 2029 + }, + { + "epoch": 0.1117287687819913, + "grad_norm": 0.7765716314315796, + "learning_rate": 9.93017474848433e-06, + "loss": 0.7691, + "step": 2030 + }, + { + "epoch": 0.11178380758434696, + "grad_norm": 0.9053775072097778, + "learning_rate": 9.930102541069484e-06, + "loss": 0.782, + "step": 2031 + }, + { + "epoch": 0.11183884638670262, + "grad_norm": 0.8892827033996582, + "learning_rate": 9.930030296601306e-06, + "loss": 0.8575, + "step": 2032 + }, + { + "epoch": 0.11189388518905828, + "grad_norm": 0.8947604894638062, + "learning_rate": 9.929958015080339e-06, + "loss": 0.8607, + "step": 2033 + }, + { + "epoch": 0.11194892399141394, + "grad_norm": 0.8936871290206909, + "learning_rate": 9.929885696507127e-06, + "loss": 0.8111, + "step": 2034 + }, + { + "epoch": 0.1120039627937696, + "grad_norm": 0.9579165577888489, + "learning_rate": 9.929813340882214e-06, + "loss": 0.911, + "step": 2035 + }, + { + "epoch": 0.11205900159612527, + "grad_norm": 0.7885386347770691, + "learning_rate": 9.929740948206146e-06, + "loss": 0.8074, + "step": 2036 + }, + { + "epoch": 0.11211404039848093, + "grad_norm": 0.817939281463623, + "learning_rate": 9.929668518479462e-06, + "loss": 0.8451, + "step": 2037 + }, + { + "epoch": 0.11216907920083659, + "grad_norm": 0.8695761561393738, + "learning_rate": 9.92959605170271e-06, + "loss": 0.7158, + "step": 2038 + }, + { + "epoch": 0.11222411800319225, + "grad_norm": 0.8569639325141907, + "learning_rate": 9.929523547876433e-06, + "loss": 0.8568, + "step": 2039 + }, + { + "epoch": 0.11227915680554791, + "grad_norm": 0.8569897413253784, + "learning_rate": 9.929451007001176e-06, + "loss": 0.8971, + "step": 2040 + }, + { + "epoch": 0.11233419560790357, + "grad_norm": 0.8520069718360901, + "learning_rate": 9.929378429077487e-06, + "loss": 0.9027, + "step": 2041 + }, + { + "epoch": 0.11238923441025923, + "grad_norm": 0.9338961839675903, + "learning_rate": 9.929305814105907e-06, + "loss": 0.8646, + "step": 2042 + }, + { + "epoch": 0.1124442732126149, + "grad_norm": 0.8497192859649658, + "learning_rate": 9.929233162086985e-06, + "loss": 0.9068, + "step": 2043 + }, + { + "epoch": 0.11249931201497056, + "grad_norm": 0.8570863008499146, + "learning_rate": 9.929160473021267e-06, + "loss": 0.962, + "step": 2044 + }, + { + "epoch": 0.11255435081732622, + "grad_norm": 0.9072359800338745, + "learning_rate": 9.929087746909296e-06, + "loss": 0.8454, + "step": 2045 + }, + { + "epoch": 0.11260938961968188, + "grad_norm": 0.7920698523521423, + "learning_rate": 9.929014983751623e-06, + "loss": 0.8031, + "step": 2046 + }, + { + "epoch": 0.11266442842203754, + "grad_norm": 1.0180169343948364, + "learning_rate": 9.928942183548791e-06, + "loss": 0.7759, + "step": 2047 + }, + { + "epoch": 0.1127194672243932, + "grad_norm": 0.8746892809867859, + "learning_rate": 9.928869346301351e-06, + "loss": 0.9038, + "step": 2048 + }, + { + "epoch": 0.11277450602674886, + "grad_norm": 0.8283438086509705, + "learning_rate": 9.928796472009846e-06, + "loss": 0.8883, + "step": 2049 + }, + { + "epoch": 0.11282954482910452, + "grad_norm": 1.321917176246643, + "learning_rate": 9.928723560674828e-06, + "loss": 0.835, + "step": 2050 + }, + { + "epoch": 0.11288458363146017, + "grad_norm": 0.9356202483177185, + "learning_rate": 9.928650612296841e-06, + "loss": 0.8077, + "step": 2051 + }, + { + "epoch": 0.11293962243381583, + "grad_norm": 0.8493767380714417, + "learning_rate": 9.928577626876439e-06, + "loss": 0.8295, + "step": 2052 + }, + { + "epoch": 0.1129946612361715, + "grad_norm": 0.784818708896637, + "learning_rate": 9.928504604414164e-06, + "loss": 0.8322, + "step": 2053 + }, + { + "epoch": 0.11304970003852716, + "grad_norm": 0.9095364809036255, + "learning_rate": 9.928431544910567e-06, + "loss": 0.8757, + "step": 2054 + }, + { + "epoch": 0.11310473884088282, + "grad_norm": 0.8889689445495605, + "learning_rate": 9.9283584483662e-06, + "loss": 0.8583, + "step": 2055 + }, + { + "epoch": 0.11315977764323848, + "grad_norm": 0.8702652454376221, + "learning_rate": 9.928285314781607e-06, + "loss": 0.8414, + "step": 2056 + }, + { + "epoch": 0.11321481644559414, + "grad_norm": 0.8531168699264526, + "learning_rate": 9.928212144157342e-06, + "loss": 0.7844, + "step": 2057 + }, + { + "epoch": 0.1132698552479498, + "grad_norm": 1.0250271558761597, + "learning_rate": 9.928138936493956e-06, + "loss": 0.8766, + "step": 2058 + }, + { + "epoch": 0.11332489405030546, + "grad_norm": 0.7963449358940125, + "learning_rate": 9.928065691791996e-06, + "loss": 0.8166, + "step": 2059 + }, + { + "epoch": 0.11337993285266112, + "grad_norm": 1.1033011674880981, + "learning_rate": 9.927992410052013e-06, + "loss": 0.8748, + "step": 2060 + }, + { + "epoch": 0.11343497165501679, + "grad_norm": 0.8760959506034851, + "learning_rate": 9.927919091274558e-06, + "loss": 0.8623, + "step": 2061 + }, + { + "epoch": 0.11349001045737245, + "grad_norm": 1.1783028841018677, + "learning_rate": 9.927845735460182e-06, + "loss": 0.9144, + "step": 2062 + }, + { + "epoch": 0.11354504925972811, + "grad_norm": 0.8868625164031982, + "learning_rate": 9.927772342609437e-06, + "loss": 0.8614, + "step": 2063 + }, + { + "epoch": 0.11360008806208377, + "grad_norm": 0.8784704804420471, + "learning_rate": 9.927698912722874e-06, + "loss": 0.7802, + "step": 2064 + }, + { + "epoch": 0.11365512686443943, + "grad_norm": 1.0090643167495728, + "learning_rate": 9.927625445801046e-06, + "loss": 0.8876, + "step": 2065 + }, + { + "epoch": 0.1137101656667951, + "grad_norm": 0.7624390721321106, + "learning_rate": 9.927551941844502e-06, + "loss": 0.794, + "step": 2066 + }, + { + "epoch": 0.11376520446915075, + "grad_norm": 0.7814189791679382, + "learning_rate": 9.927478400853798e-06, + "loss": 0.8176, + "step": 2067 + }, + { + "epoch": 0.11382024327150642, + "grad_norm": 0.876338541507721, + "learning_rate": 9.927404822829486e-06, + "loss": 0.8634, + "step": 2068 + }, + { + "epoch": 0.11387528207386208, + "grad_norm": 0.7931430339813232, + "learning_rate": 9.927331207772117e-06, + "loss": 0.8012, + "step": 2069 + }, + { + "epoch": 0.11393032087621774, + "grad_norm": 1.0064504146575928, + "learning_rate": 9.927257555682246e-06, + "loss": 0.8321, + "step": 2070 + }, + { + "epoch": 0.1139853596785734, + "grad_norm": 0.8233053684234619, + "learning_rate": 9.927183866560425e-06, + "loss": 0.8004, + "step": 2071 + }, + { + "epoch": 0.11404039848092906, + "grad_norm": 1.0106632709503174, + "learning_rate": 9.927110140407211e-06, + "loss": 0.8627, + "step": 2072 + }, + { + "epoch": 0.11409543728328471, + "grad_norm": 0.8262843489646912, + "learning_rate": 9.927036377223155e-06, + "loss": 0.737, + "step": 2073 + }, + { + "epoch": 0.11415047608564037, + "grad_norm": 0.9349029660224915, + "learning_rate": 9.926962577008813e-06, + "loss": 0.9049, + "step": 2074 + }, + { + "epoch": 0.11420551488799603, + "grad_norm": 0.8689929842948914, + "learning_rate": 9.926888739764739e-06, + "loss": 0.7858, + "step": 2075 + }, + { + "epoch": 0.1142605536903517, + "grad_norm": 0.8442347645759583, + "learning_rate": 9.926814865491487e-06, + "loss": 0.8145, + "step": 2076 + }, + { + "epoch": 0.11431559249270735, + "grad_norm": 0.9143397212028503, + "learning_rate": 9.926740954189615e-06, + "loss": 0.8025, + "step": 2077 + }, + { + "epoch": 0.11437063129506302, + "grad_norm": 1.293251395225525, + "learning_rate": 9.926667005859676e-06, + "loss": 1.0256, + "step": 2078 + }, + { + "epoch": 0.11442567009741868, + "grad_norm": 0.9661351442337036, + "learning_rate": 9.926593020502226e-06, + "loss": 0.991, + "step": 2079 + }, + { + "epoch": 0.11448070889977434, + "grad_norm": 0.8110861778259277, + "learning_rate": 9.926518998117823e-06, + "loss": 0.7129, + "step": 2080 + }, + { + "epoch": 0.11453574770213, + "grad_norm": 0.8351119160652161, + "learning_rate": 9.92644493870702e-06, + "loss": 0.8894, + "step": 2081 + }, + { + "epoch": 0.11459078650448566, + "grad_norm": 0.8492733240127563, + "learning_rate": 9.926370842270377e-06, + "loss": 0.8039, + "step": 2082 + }, + { + "epoch": 0.11464582530684132, + "grad_norm": 0.895353376865387, + "learning_rate": 9.92629670880845e-06, + "loss": 0.8743, + "step": 2083 + }, + { + "epoch": 0.11470086410919698, + "grad_norm": 0.7871271967887878, + "learning_rate": 9.926222538321795e-06, + "loss": 0.8426, + "step": 2084 + }, + { + "epoch": 0.11475590291155265, + "grad_norm": 0.8904643058776855, + "learning_rate": 9.92614833081097e-06, + "loss": 0.8454, + "step": 2085 + }, + { + "epoch": 0.11481094171390831, + "grad_norm": 0.9166308641433716, + "learning_rate": 9.926074086276532e-06, + "loss": 0.9162, + "step": 2086 + }, + { + "epoch": 0.11486598051626397, + "grad_norm": 0.8730728626251221, + "learning_rate": 9.92599980471904e-06, + "loss": 0.8524, + "step": 2087 + }, + { + "epoch": 0.11492101931861963, + "grad_norm": 0.7932829260826111, + "learning_rate": 9.925925486139052e-06, + "loss": 0.7838, + "step": 2088 + }, + { + "epoch": 0.11497605812097529, + "grad_norm": 1.0033760070800781, + "learning_rate": 9.925851130537127e-06, + "loss": 0.8746, + "step": 2089 + }, + { + "epoch": 0.11503109692333095, + "grad_norm": 0.7783192992210388, + "learning_rate": 9.925776737913823e-06, + "loss": 0.7308, + "step": 2090 + }, + { + "epoch": 0.11508613572568661, + "grad_norm": 0.8441587686538696, + "learning_rate": 9.925702308269702e-06, + "loss": 0.7933, + "step": 2091 + }, + { + "epoch": 0.11514117452804228, + "grad_norm": 0.9433023929595947, + "learning_rate": 9.925627841605319e-06, + "loss": 0.7857, + "step": 2092 + }, + { + "epoch": 0.11519621333039794, + "grad_norm": 0.8958256244659424, + "learning_rate": 9.925553337921235e-06, + "loss": 0.9116, + "step": 2093 + }, + { + "epoch": 0.11525125213275358, + "grad_norm": 0.7610845565795898, + "learning_rate": 9.925478797218011e-06, + "loss": 0.8006, + "step": 2094 + }, + { + "epoch": 0.11530629093510925, + "grad_norm": 0.7977023720741272, + "learning_rate": 9.925404219496207e-06, + "loss": 0.8068, + "step": 2095 + }, + { + "epoch": 0.11536132973746491, + "grad_norm": 0.8087283372879028, + "learning_rate": 9.925329604756383e-06, + "loss": 0.7968, + "step": 2096 + }, + { + "epoch": 0.11541636853982057, + "grad_norm": 1.1066477298736572, + "learning_rate": 9.925254952999102e-06, + "loss": 0.8167, + "step": 2097 + }, + { + "epoch": 0.11547140734217623, + "grad_norm": 0.7806832194328308, + "learning_rate": 9.925180264224921e-06, + "loss": 0.8069, + "step": 2098 + }, + { + "epoch": 0.11552644614453189, + "grad_norm": 0.7745190858840942, + "learning_rate": 9.925105538434406e-06, + "loss": 0.7968, + "step": 2099 + }, + { + "epoch": 0.11558148494688755, + "grad_norm": 0.9045543074607849, + "learning_rate": 9.925030775628113e-06, + "loss": 0.8417, + "step": 2100 + }, + { + "epoch": 0.11563652374924321, + "grad_norm": 1.2962623834609985, + "learning_rate": 9.924955975806608e-06, + "loss": 0.8162, + "step": 2101 + }, + { + "epoch": 0.11569156255159888, + "grad_norm": 0.8571485877037048, + "learning_rate": 9.924881138970453e-06, + "loss": 0.8581, + "step": 2102 + }, + { + "epoch": 0.11574660135395454, + "grad_norm": 0.8326650857925415, + "learning_rate": 9.92480626512021e-06, + "loss": 0.8438, + "step": 2103 + }, + { + "epoch": 0.1158016401563102, + "grad_norm": 0.7973701357841492, + "learning_rate": 9.924731354256441e-06, + "loss": 0.8337, + "step": 2104 + }, + { + "epoch": 0.11585667895866586, + "grad_norm": 0.8614075779914856, + "learning_rate": 9.924656406379708e-06, + "loss": 0.8275, + "step": 2105 + }, + { + "epoch": 0.11591171776102152, + "grad_norm": 0.7911350131034851, + "learning_rate": 9.924581421490577e-06, + "loss": 0.8032, + "step": 2106 + }, + { + "epoch": 0.11596675656337718, + "grad_norm": 0.8763116598129272, + "learning_rate": 9.92450639958961e-06, + "loss": 0.8725, + "step": 2107 + }, + { + "epoch": 0.11602179536573284, + "grad_norm": 0.9754133224487305, + "learning_rate": 9.92443134067737e-06, + "loss": 0.9115, + "step": 2108 + }, + { + "epoch": 0.1160768341680885, + "grad_norm": 0.7783731818199158, + "learning_rate": 9.924356244754425e-06, + "loss": 0.8223, + "step": 2109 + }, + { + "epoch": 0.11613187297044417, + "grad_norm": 0.865301787853241, + "learning_rate": 9.924281111821335e-06, + "loss": 0.8053, + "step": 2110 + }, + { + "epoch": 0.11618691177279983, + "grad_norm": 0.8654297590255737, + "learning_rate": 9.924205941878666e-06, + "loss": 0.716, + "step": 2111 + }, + { + "epoch": 0.11624195057515549, + "grad_norm": 0.7646550536155701, + "learning_rate": 9.924130734926982e-06, + "loss": 0.8027, + "step": 2112 + }, + { + "epoch": 0.11629698937751115, + "grad_norm": 0.810587465763092, + "learning_rate": 9.924055490966851e-06, + "loss": 0.7416, + "step": 2113 + }, + { + "epoch": 0.11635202817986681, + "grad_norm": 0.8610082268714905, + "learning_rate": 9.923980209998838e-06, + "loss": 0.8527, + "step": 2114 + }, + { + "epoch": 0.11640706698222247, + "grad_norm": 0.8409233689308167, + "learning_rate": 9.923904892023506e-06, + "loss": 0.8169, + "step": 2115 + }, + { + "epoch": 0.11646210578457812, + "grad_norm": 0.7786587476730347, + "learning_rate": 9.923829537041425e-06, + "loss": 0.6897, + "step": 2116 + }, + { + "epoch": 0.11651714458693378, + "grad_norm": 0.852908730506897, + "learning_rate": 9.923754145053158e-06, + "loss": 0.7821, + "step": 2117 + }, + { + "epoch": 0.11657218338928944, + "grad_norm": 0.9130391478538513, + "learning_rate": 9.923678716059273e-06, + "loss": 1.0377, + "step": 2118 + }, + { + "epoch": 0.1166272221916451, + "grad_norm": 0.8371701240539551, + "learning_rate": 9.923603250060336e-06, + "loss": 0.8312, + "step": 2119 + }, + { + "epoch": 0.11668226099400077, + "grad_norm": 0.8045756220817566, + "learning_rate": 9.923527747056916e-06, + "loss": 0.7971, + "step": 2120 + }, + { + "epoch": 0.11673729979635643, + "grad_norm": 0.8832160234451294, + "learning_rate": 9.923452207049577e-06, + "loss": 0.7362, + "step": 2121 + }, + { + "epoch": 0.11679233859871209, + "grad_norm": 0.8253088593482971, + "learning_rate": 9.923376630038893e-06, + "loss": 0.8177, + "step": 2122 + }, + { + "epoch": 0.11684737740106775, + "grad_norm": 0.7953168749809265, + "learning_rate": 9.923301016025424e-06, + "loss": 0.7053, + "step": 2123 + }, + { + "epoch": 0.11690241620342341, + "grad_norm": 0.7256457805633545, + "learning_rate": 9.923225365009745e-06, + "loss": 0.7554, + "step": 2124 + }, + { + "epoch": 0.11695745500577907, + "grad_norm": 0.9896693229675293, + "learning_rate": 9.923149676992424e-06, + "loss": 0.8285, + "step": 2125 + }, + { + "epoch": 0.11701249380813473, + "grad_norm": 0.7846312522888184, + "learning_rate": 9.923073951974023e-06, + "loss": 0.7527, + "step": 2126 + }, + { + "epoch": 0.1170675326104904, + "grad_norm": 0.8949825167655945, + "learning_rate": 9.92299818995512e-06, + "loss": 0.8545, + "step": 2127 + }, + { + "epoch": 0.11712257141284606, + "grad_norm": 1.0023548603057861, + "learning_rate": 9.922922390936278e-06, + "loss": 0.7668, + "step": 2128 + }, + { + "epoch": 0.11717761021520172, + "grad_norm": 0.8663881421089172, + "learning_rate": 9.92284655491807e-06, + "loss": 0.8073, + "step": 2129 + }, + { + "epoch": 0.11723264901755738, + "grad_norm": 0.8274385929107666, + "learning_rate": 9.922770681901064e-06, + "loss": 0.9002, + "step": 2130 + }, + { + "epoch": 0.11728768781991304, + "grad_norm": 0.8508959412574768, + "learning_rate": 9.922694771885832e-06, + "loss": 0.9325, + "step": 2131 + }, + { + "epoch": 0.1173427266222687, + "grad_norm": 0.8176792860031128, + "learning_rate": 9.922618824872946e-06, + "loss": 0.8415, + "step": 2132 + }, + { + "epoch": 0.11739776542462436, + "grad_norm": 0.770951509475708, + "learning_rate": 9.922542840862971e-06, + "loss": 0.8051, + "step": 2133 + }, + { + "epoch": 0.11745280422698003, + "grad_norm": 0.8558167219161987, + "learning_rate": 9.922466819856484e-06, + "loss": 0.85, + "step": 2134 + }, + { + "epoch": 0.11750784302933569, + "grad_norm": 0.8288151025772095, + "learning_rate": 9.922390761854053e-06, + "loss": 0.8141, + "step": 2135 + }, + { + "epoch": 0.11756288183169135, + "grad_norm": 0.8220882415771484, + "learning_rate": 9.922314666856252e-06, + "loss": 0.8109, + "step": 2136 + }, + { + "epoch": 0.117617920634047, + "grad_norm": 0.7875000238418579, + "learning_rate": 9.92223853486365e-06, + "loss": 0.9085, + "step": 2137 + }, + { + "epoch": 0.11767295943640266, + "grad_norm": 0.8052374124526978, + "learning_rate": 9.922162365876822e-06, + "loss": 0.8785, + "step": 2138 + }, + { + "epoch": 0.11772799823875832, + "grad_norm": 1.0311180353164673, + "learning_rate": 9.922086159896338e-06, + "loss": 0.9112, + "step": 2139 + }, + { + "epoch": 0.11778303704111398, + "grad_norm": 0.943911075592041, + "learning_rate": 9.922009916922773e-06, + "loss": 0.8332, + "step": 2140 + }, + { + "epoch": 0.11783807584346964, + "grad_norm": 0.8156648278236389, + "learning_rate": 9.921933636956697e-06, + "loss": 0.8837, + "step": 2141 + }, + { + "epoch": 0.1178931146458253, + "grad_norm": 0.860292375087738, + "learning_rate": 9.921857319998688e-06, + "loss": 0.7963, + "step": 2142 + }, + { + "epoch": 0.11794815344818096, + "grad_norm": 0.8861456513404846, + "learning_rate": 9.921780966049315e-06, + "loss": 0.8335, + "step": 2143 + }, + { + "epoch": 0.11800319225053663, + "grad_norm": 0.793533205986023, + "learning_rate": 9.921704575109155e-06, + "loss": 0.7881, + "step": 2144 + }, + { + "epoch": 0.11805823105289229, + "grad_norm": 0.8039320111274719, + "learning_rate": 9.921628147178781e-06, + "loss": 0.8369, + "step": 2145 + }, + { + "epoch": 0.11811326985524795, + "grad_norm": 0.8785450458526611, + "learning_rate": 9.921551682258765e-06, + "loss": 0.7981, + "step": 2146 + }, + { + "epoch": 0.11816830865760361, + "grad_norm": 0.810251772403717, + "learning_rate": 9.921475180349687e-06, + "loss": 0.7926, + "step": 2147 + }, + { + "epoch": 0.11822334745995927, + "grad_norm": 0.8470801115036011, + "learning_rate": 9.921398641452117e-06, + "loss": 0.8061, + "step": 2148 + }, + { + "epoch": 0.11827838626231493, + "grad_norm": 0.8147469162940979, + "learning_rate": 9.921322065566633e-06, + "loss": 0.7906, + "step": 2149 + }, + { + "epoch": 0.1183334250646706, + "grad_norm": 0.8792327046394348, + "learning_rate": 9.92124545269381e-06, + "loss": 0.9025, + "step": 2150 + }, + { + "epoch": 0.11838846386702626, + "grad_norm": 0.794607400894165, + "learning_rate": 9.921168802834223e-06, + "loss": 0.8284, + "step": 2151 + }, + { + "epoch": 0.11844350266938192, + "grad_norm": 0.8601556420326233, + "learning_rate": 9.921092115988447e-06, + "loss": 0.8196, + "step": 2152 + }, + { + "epoch": 0.11849854147173758, + "grad_norm": 0.786967933177948, + "learning_rate": 9.921015392157062e-06, + "loss": 0.8744, + "step": 2153 + }, + { + "epoch": 0.11855358027409324, + "grad_norm": 0.8481432199478149, + "learning_rate": 9.920938631340641e-06, + "loss": 0.7206, + "step": 2154 + }, + { + "epoch": 0.1186086190764489, + "grad_norm": 0.8025142550468445, + "learning_rate": 9.920861833539765e-06, + "loss": 0.8126, + "step": 2155 + }, + { + "epoch": 0.11866365787880456, + "grad_norm": 0.9853057265281677, + "learning_rate": 9.920784998755006e-06, + "loss": 0.8883, + "step": 2156 + }, + { + "epoch": 0.11871869668116022, + "grad_norm": 1.0008476972579956, + "learning_rate": 9.920708126986947e-06, + "loss": 0.9326, + "step": 2157 + }, + { + "epoch": 0.11877373548351589, + "grad_norm": 0.837347686290741, + "learning_rate": 9.920631218236161e-06, + "loss": 0.9002, + "step": 2158 + }, + { + "epoch": 0.11882877428587153, + "grad_norm": 0.7866735458374023, + "learning_rate": 9.920554272503227e-06, + "loss": 0.765, + "step": 2159 + }, + { + "epoch": 0.1188838130882272, + "grad_norm": 0.8714935779571533, + "learning_rate": 9.920477289788726e-06, + "loss": 1.0294, + "step": 2160 + }, + { + "epoch": 0.11893885189058286, + "grad_norm": 1.0671826601028442, + "learning_rate": 9.920400270093234e-06, + "loss": 0.8341, + "step": 2161 + }, + { + "epoch": 0.11899389069293852, + "grad_norm": 0.8594604134559631, + "learning_rate": 9.92032321341733e-06, + "loss": 0.8731, + "step": 2162 + }, + { + "epoch": 0.11904892949529418, + "grad_norm": 0.8387738466262817, + "learning_rate": 9.920246119761597e-06, + "loss": 0.7898, + "step": 2163 + }, + { + "epoch": 0.11910396829764984, + "grad_norm": 0.8957195281982422, + "learning_rate": 9.920168989126608e-06, + "loss": 0.8475, + "step": 2164 + }, + { + "epoch": 0.1191590071000055, + "grad_norm": 0.8224207162857056, + "learning_rate": 9.920091821512948e-06, + "loss": 0.7944, + "step": 2165 + }, + { + "epoch": 0.11921404590236116, + "grad_norm": 1.0309031009674072, + "learning_rate": 9.920014616921192e-06, + "loss": 0.8992, + "step": 2166 + }, + { + "epoch": 0.11926908470471682, + "grad_norm": 0.7300832271575928, + "learning_rate": 9.919937375351925e-06, + "loss": 0.7016, + "step": 2167 + }, + { + "epoch": 0.11932412350707249, + "grad_norm": 0.7565537691116333, + "learning_rate": 9.919860096805724e-06, + "loss": 0.8113, + "step": 2168 + }, + { + "epoch": 0.11937916230942815, + "grad_norm": 1.0101505517959595, + "learning_rate": 9.919782781283174e-06, + "loss": 0.8765, + "step": 2169 + }, + { + "epoch": 0.11943420111178381, + "grad_norm": 0.8369461894035339, + "learning_rate": 9.919705428784852e-06, + "loss": 0.8248, + "step": 2170 + }, + { + "epoch": 0.11948923991413947, + "grad_norm": 0.8106105327606201, + "learning_rate": 9.919628039311342e-06, + "loss": 0.8585, + "step": 2171 + }, + { + "epoch": 0.11954427871649513, + "grad_norm": 0.7863745093345642, + "learning_rate": 9.919550612863224e-06, + "loss": 0.8393, + "step": 2172 + }, + { + "epoch": 0.11959931751885079, + "grad_norm": 0.8664719462394714, + "learning_rate": 9.919473149441081e-06, + "loss": 0.8882, + "step": 2173 + }, + { + "epoch": 0.11965435632120645, + "grad_norm": 0.6977574825286865, + "learning_rate": 9.919395649045494e-06, + "loss": 0.7264, + "step": 2174 + }, + { + "epoch": 0.11970939512356212, + "grad_norm": 0.8000102639198303, + "learning_rate": 9.919318111677045e-06, + "loss": 0.7828, + "step": 2175 + }, + { + "epoch": 0.11976443392591778, + "grad_norm": 0.868228018283844, + "learning_rate": 9.91924053733632e-06, + "loss": 0.7904, + "step": 2176 + }, + { + "epoch": 0.11981947272827344, + "grad_norm": 0.839080274105072, + "learning_rate": 9.9191629260239e-06, + "loss": 0.7663, + "step": 2177 + }, + { + "epoch": 0.1198745115306291, + "grad_norm": 0.8222747445106506, + "learning_rate": 9.919085277740366e-06, + "loss": 0.7208, + "step": 2178 + }, + { + "epoch": 0.11992955033298476, + "grad_norm": 1.4550986289978027, + "learning_rate": 9.919007592486304e-06, + "loss": 0.8154, + "step": 2179 + }, + { + "epoch": 0.11998458913534041, + "grad_norm": 0.9110257625579834, + "learning_rate": 9.9189298702623e-06, + "loss": 0.8134, + "step": 2180 + }, + { + "epoch": 0.12003962793769607, + "grad_norm": 0.84796142578125, + "learning_rate": 9.918852111068935e-06, + "loss": 0.8074, + "step": 2181 + }, + { + "epoch": 0.12009466674005173, + "grad_norm": 0.8134179711341858, + "learning_rate": 9.918774314906793e-06, + "loss": 0.6335, + "step": 2182 + }, + { + "epoch": 0.12014970554240739, + "grad_norm": 0.8481448888778687, + "learning_rate": 9.918696481776461e-06, + "loss": 0.8804, + "step": 2183 + }, + { + "epoch": 0.12020474434476305, + "grad_norm": 0.88057941198349, + "learning_rate": 9.918618611678523e-06, + "loss": 0.9326, + "step": 2184 + }, + { + "epoch": 0.12025978314711872, + "grad_norm": 0.8435977697372437, + "learning_rate": 9.918540704613564e-06, + "loss": 0.8141, + "step": 2185 + }, + { + "epoch": 0.12031482194947438, + "grad_norm": 0.8186982870101929, + "learning_rate": 9.918462760582169e-06, + "loss": 0.837, + "step": 2186 + }, + { + "epoch": 0.12036986075183004, + "grad_norm": 0.887783944606781, + "learning_rate": 9.918384779584924e-06, + "loss": 0.8062, + "step": 2187 + }, + { + "epoch": 0.1204248995541857, + "grad_norm": 0.9368415474891663, + "learning_rate": 9.918306761622417e-06, + "loss": 1.0098, + "step": 2188 + }, + { + "epoch": 0.12047993835654136, + "grad_norm": 0.8443986773490906, + "learning_rate": 9.918228706695232e-06, + "loss": 0.8178, + "step": 2189 + }, + { + "epoch": 0.12053497715889702, + "grad_norm": 0.7897284626960754, + "learning_rate": 9.918150614803956e-06, + "loss": 0.8013, + "step": 2190 + }, + { + "epoch": 0.12059001596125268, + "grad_norm": 0.886012077331543, + "learning_rate": 9.91807248594918e-06, + "loss": 0.8141, + "step": 2191 + }, + { + "epoch": 0.12064505476360834, + "grad_norm": 0.8585757613182068, + "learning_rate": 9.917994320131484e-06, + "loss": 0.8381, + "step": 2192 + }, + { + "epoch": 0.120700093565964, + "grad_norm": 1.6192269325256348, + "learning_rate": 9.917916117351459e-06, + "loss": 0.9082, + "step": 2193 + }, + { + "epoch": 0.12075513236831967, + "grad_norm": 1.160414457321167, + "learning_rate": 9.917837877609695e-06, + "loss": 0.8673, + "step": 2194 + }, + { + "epoch": 0.12081017117067533, + "grad_norm": 0.8363412022590637, + "learning_rate": 9.917759600906775e-06, + "loss": 0.816, + "step": 2195 + }, + { + "epoch": 0.12086520997303099, + "grad_norm": 0.8344097137451172, + "learning_rate": 9.917681287243292e-06, + "loss": 0.8629, + "step": 2196 + }, + { + "epoch": 0.12092024877538665, + "grad_norm": 0.9817582368850708, + "learning_rate": 9.917602936619834e-06, + "loss": 0.8106, + "step": 2197 + }, + { + "epoch": 0.12097528757774231, + "grad_norm": 0.8828088641166687, + "learning_rate": 9.917524549036987e-06, + "loss": 0.8465, + "step": 2198 + }, + { + "epoch": 0.12103032638009797, + "grad_norm": 0.8428277969360352, + "learning_rate": 9.917446124495344e-06, + "loss": 0.7721, + "step": 2199 + }, + { + "epoch": 0.12108536518245364, + "grad_norm": 0.8748664855957031, + "learning_rate": 9.917367662995489e-06, + "loss": 0.8679, + "step": 2200 + }, + { + "epoch": 0.1211404039848093, + "grad_norm": 0.8652347922325134, + "learning_rate": 9.917289164538018e-06, + "loss": 0.8906, + "step": 2201 + }, + { + "epoch": 0.12119544278716494, + "grad_norm": 1.157142162322998, + "learning_rate": 9.917210629123518e-06, + "loss": 0.9046, + "step": 2202 + }, + { + "epoch": 0.1212504815895206, + "grad_norm": 0.8186333179473877, + "learning_rate": 9.917132056752576e-06, + "loss": 0.8494, + "step": 2203 + }, + { + "epoch": 0.12130552039187627, + "grad_norm": 0.7769078612327576, + "learning_rate": 9.917053447425788e-06, + "loss": 0.8018, + "step": 2204 + }, + { + "epoch": 0.12136055919423193, + "grad_norm": 0.9190469980239868, + "learning_rate": 9.916974801143742e-06, + "loss": 0.8206, + "step": 2205 + }, + { + "epoch": 0.12141559799658759, + "grad_norm": 1.2200725078582764, + "learning_rate": 9.91689611790703e-06, + "loss": 0.9109, + "step": 2206 + }, + { + "epoch": 0.12147063679894325, + "grad_norm": 0.7902093529701233, + "learning_rate": 9.916817397716243e-06, + "loss": 0.8314, + "step": 2207 + }, + { + "epoch": 0.12152567560129891, + "grad_norm": 0.8160610198974609, + "learning_rate": 9.91673864057197e-06, + "loss": 0.8605, + "step": 2208 + }, + { + "epoch": 0.12158071440365457, + "grad_norm": 0.833163857460022, + "learning_rate": 9.916659846474807e-06, + "loss": 0.8125, + "step": 2209 + }, + { + "epoch": 0.12163575320601024, + "grad_norm": 0.776314377784729, + "learning_rate": 9.916581015425346e-06, + "loss": 0.8137, + "step": 2210 + }, + { + "epoch": 0.1216907920083659, + "grad_norm": 0.8525915145874023, + "learning_rate": 9.916502147424178e-06, + "loss": 0.8703, + "step": 2211 + }, + { + "epoch": 0.12174583081072156, + "grad_norm": 0.8268684148788452, + "learning_rate": 9.916423242471895e-06, + "loss": 0.7775, + "step": 2212 + }, + { + "epoch": 0.12180086961307722, + "grad_norm": 0.8717706799507141, + "learning_rate": 9.916344300569091e-06, + "loss": 0.8002, + "step": 2213 + }, + { + "epoch": 0.12185590841543288, + "grad_norm": 0.9499961137771606, + "learning_rate": 9.91626532171636e-06, + "loss": 0.8861, + "step": 2214 + }, + { + "epoch": 0.12191094721778854, + "grad_norm": 0.9521885514259338, + "learning_rate": 9.916186305914296e-06, + "loss": 0.7602, + "step": 2215 + }, + { + "epoch": 0.1219659860201442, + "grad_norm": 0.8945447206497192, + "learning_rate": 9.916107253163488e-06, + "loss": 0.8603, + "step": 2216 + }, + { + "epoch": 0.12202102482249987, + "grad_norm": 0.8232392072677612, + "learning_rate": 9.916028163464536e-06, + "loss": 0.8419, + "step": 2217 + }, + { + "epoch": 0.12207606362485553, + "grad_norm": 0.8183467984199524, + "learning_rate": 9.915949036818032e-06, + "loss": 0.9038, + "step": 2218 + }, + { + "epoch": 0.12213110242721119, + "grad_norm": 0.7805467247962952, + "learning_rate": 9.915869873224571e-06, + "loss": 0.7313, + "step": 2219 + }, + { + "epoch": 0.12218614122956685, + "grad_norm": 0.838101327419281, + "learning_rate": 9.915790672684749e-06, + "loss": 0.7973, + "step": 2220 + }, + { + "epoch": 0.12224118003192251, + "grad_norm": 0.7795171141624451, + "learning_rate": 9.915711435199158e-06, + "loss": 0.7796, + "step": 2221 + }, + { + "epoch": 0.12229621883427817, + "grad_norm": 0.7971234917640686, + "learning_rate": 9.915632160768398e-06, + "loss": 0.8309, + "step": 2222 + }, + { + "epoch": 0.12235125763663382, + "grad_norm": 0.8543851375579834, + "learning_rate": 9.915552849393061e-06, + "loss": 0.7826, + "step": 2223 + }, + { + "epoch": 0.12240629643898948, + "grad_norm": 0.9315086007118225, + "learning_rate": 9.915473501073744e-06, + "loss": 0.9294, + "step": 2224 + }, + { + "epoch": 0.12246133524134514, + "grad_norm": 0.8794427514076233, + "learning_rate": 9.915394115811046e-06, + "loss": 0.8968, + "step": 2225 + }, + { + "epoch": 0.1225163740437008, + "grad_norm": 0.9499204754829407, + "learning_rate": 9.91531469360556e-06, + "loss": 0.9841, + "step": 2226 + }, + { + "epoch": 0.12257141284605647, + "grad_norm": 0.9233788251876831, + "learning_rate": 9.915235234457885e-06, + "loss": 0.7794, + "step": 2227 + }, + { + "epoch": 0.12262645164841213, + "grad_norm": 0.8971870541572571, + "learning_rate": 9.915155738368618e-06, + "loss": 0.919, + "step": 2228 + }, + { + "epoch": 0.12268149045076779, + "grad_norm": 0.8122105002403259, + "learning_rate": 9.915076205338356e-06, + "loss": 0.8227, + "step": 2229 + }, + { + "epoch": 0.12273652925312345, + "grad_norm": 0.7878004908561707, + "learning_rate": 9.914996635367696e-06, + "loss": 0.7622, + "step": 2230 + }, + { + "epoch": 0.12279156805547911, + "grad_norm": 0.8229606747627258, + "learning_rate": 9.914917028457238e-06, + "loss": 0.8265, + "step": 2231 + }, + { + "epoch": 0.12284660685783477, + "grad_norm": 0.8972312808036804, + "learning_rate": 9.914837384607578e-06, + "loss": 0.8914, + "step": 2232 + }, + { + "epoch": 0.12290164566019043, + "grad_norm": 0.762922465801239, + "learning_rate": 9.914757703819318e-06, + "loss": 0.6853, + "step": 2233 + }, + { + "epoch": 0.1229566844625461, + "grad_norm": 0.8949442505836487, + "learning_rate": 9.914677986093054e-06, + "loss": 0.8303, + "step": 2234 + }, + { + "epoch": 0.12301172326490176, + "grad_norm": 1.0220820903778076, + "learning_rate": 9.914598231429384e-06, + "loss": 1.0027, + "step": 2235 + }, + { + "epoch": 0.12306676206725742, + "grad_norm": 0.8265436887741089, + "learning_rate": 9.914518439828911e-06, + "loss": 0.8317, + "step": 2236 + }, + { + "epoch": 0.12312180086961308, + "grad_norm": 0.780444324016571, + "learning_rate": 9.914438611292231e-06, + "loss": 0.756, + "step": 2237 + }, + { + "epoch": 0.12317683967196874, + "grad_norm": 0.8569482564926147, + "learning_rate": 9.914358745819948e-06, + "loss": 0.8126, + "step": 2238 + }, + { + "epoch": 0.1232318784743244, + "grad_norm": 0.8167145848274231, + "learning_rate": 9.91427884341266e-06, + "loss": 0.8345, + "step": 2239 + }, + { + "epoch": 0.12328691727668006, + "grad_norm": 0.7915990948677063, + "learning_rate": 9.914198904070967e-06, + "loss": 0.7416, + "step": 2240 + }, + { + "epoch": 0.12334195607903573, + "grad_norm": 0.8568083047866821, + "learning_rate": 9.91411892779547e-06, + "loss": 0.8329, + "step": 2241 + }, + { + "epoch": 0.12339699488139139, + "grad_norm": 1.1727303266525269, + "learning_rate": 9.914038914586772e-06, + "loss": 0.8421, + "step": 2242 + }, + { + "epoch": 0.12345203368374705, + "grad_norm": 0.8706398010253906, + "learning_rate": 9.913958864445472e-06, + "loss": 0.9013, + "step": 2243 + }, + { + "epoch": 0.12350707248610271, + "grad_norm": 0.8376144170761108, + "learning_rate": 9.913878777372173e-06, + "loss": 0.8456, + "step": 2244 + }, + { + "epoch": 0.12356211128845836, + "grad_norm": 0.8388974070549011, + "learning_rate": 9.913798653367478e-06, + "loss": 0.787, + "step": 2245 + }, + { + "epoch": 0.12361715009081402, + "grad_norm": 0.8625446557998657, + "learning_rate": 9.913718492431984e-06, + "loss": 0.7758, + "step": 2246 + }, + { + "epoch": 0.12367218889316968, + "grad_norm": 0.8805570006370544, + "learning_rate": 9.913638294566299e-06, + "loss": 0.8755, + "step": 2247 + }, + { + "epoch": 0.12372722769552534, + "grad_norm": 0.8102611899375916, + "learning_rate": 9.913558059771025e-06, + "loss": 0.8495, + "step": 2248 + }, + { + "epoch": 0.123782266497881, + "grad_norm": 0.8506311774253845, + "learning_rate": 9.913477788046762e-06, + "loss": 0.7413, + "step": 2249 + }, + { + "epoch": 0.12383730530023666, + "grad_norm": 1.0789196491241455, + "learning_rate": 9.913397479394116e-06, + "loss": 0.8993, + "step": 2250 + }, + { + "epoch": 0.12389234410259232, + "grad_norm": 1.5664849281311035, + "learning_rate": 9.91331713381369e-06, + "loss": 0.8322, + "step": 2251 + }, + { + "epoch": 0.12394738290494799, + "grad_norm": 1.1347390413284302, + "learning_rate": 9.913236751306085e-06, + "loss": 0.8756, + "step": 2252 + }, + { + "epoch": 0.12400242170730365, + "grad_norm": 0.8111063241958618, + "learning_rate": 9.913156331871911e-06, + "loss": 0.831, + "step": 2253 + }, + { + "epoch": 0.12405746050965931, + "grad_norm": 0.817812979221344, + "learning_rate": 9.913075875511769e-06, + "loss": 0.8531, + "step": 2254 + }, + { + "epoch": 0.12411249931201497, + "grad_norm": 0.7678318619728088, + "learning_rate": 9.912995382226263e-06, + "loss": 0.8028, + "step": 2255 + }, + { + "epoch": 0.12416753811437063, + "grad_norm": 0.8207805156707764, + "learning_rate": 9.912914852015998e-06, + "loss": 0.8856, + "step": 2256 + }, + { + "epoch": 0.1242225769167263, + "grad_norm": 0.978484570980072, + "learning_rate": 9.912834284881582e-06, + "loss": 0.933, + "step": 2257 + }, + { + "epoch": 0.12427761571908195, + "grad_norm": 0.9215858578681946, + "learning_rate": 9.912753680823617e-06, + "loss": 0.7771, + "step": 2258 + }, + { + "epoch": 0.12433265452143762, + "grad_norm": 0.8542179465293884, + "learning_rate": 9.91267303984271e-06, + "loss": 0.8652, + "step": 2259 + }, + { + "epoch": 0.12438769332379328, + "grad_norm": 0.7985575199127197, + "learning_rate": 9.912592361939469e-06, + "loss": 0.7011, + "step": 2260 + }, + { + "epoch": 0.12444273212614894, + "grad_norm": 0.8868670463562012, + "learning_rate": 9.912511647114498e-06, + "loss": 0.8222, + "step": 2261 + }, + { + "epoch": 0.1244977709285046, + "grad_norm": 0.7966209650039673, + "learning_rate": 9.912430895368405e-06, + "loss": 0.776, + "step": 2262 + }, + { + "epoch": 0.12455280973086026, + "grad_norm": 0.7844830751419067, + "learning_rate": 9.912350106701796e-06, + "loss": 0.7513, + "step": 2263 + }, + { + "epoch": 0.12460784853321592, + "grad_norm": 0.7788559794425964, + "learning_rate": 9.912269281115278e-06, + "loss": 0.8517, + "step": 2264 + }, + { + "epoch": 0.12466288733557158, + "grad_norm": 0.778225839138031, + "learning_rate": 9.912188418609461e-06, + "loss": 0.7504, + "step": 2265 + }, + { + "epoch": 0.12471792613792723, + "grad_norm": 0.7955968976020813, + "learning_rate": 9.912107519184947e-06, + "loss": 0.8152, + "step": 2266 + }, + { + "epoch": 0.1247729649402829, + "grad_norm": 1.1202566623687744, + "learning_rate": 9.912026582842352e-06, + "loss": 0.9325, + "step": 2267 + }, + { + "epoch": 0.12482800374263855, + "grad_norm": 0.9762749671936035, + "learning_rate": 9.911945609582279e-06, + "loss": 0.9027, + "step": 2268 + }, + { + "epoch": 0.12488304254499422, + "grad_norm": 0.8311051726341248, + "learning_rate": 9.911864599405336e-06, + "loss": 0.838, + "step": 2269 + }, + { + "epoch": 0.12493808134734988, + "grad_norm": 1.0136815309524536, + "learning_rate": 9.911783552312134e-06, + "loss": 0.9288, + "step": 2270 + }, + { + "epoch": 0.12499312014970554, + "grad_norm": 0.7960494160652161, + "learning_rate": 9.911702468303282e-06, + "loss": 0.8007, + "step": 2271 + }, + { + "epoch": 0.1250481589520612, + "grad_norm": 0.9980880618095398, + "learning_rate": 9.911621347379388e-06, + "loss": 0.8613, + "step": 2272 + }, + { + "epoch": 0.12510319775441686, + "grad_norm": 0.8916807770729065, + "learning_rate": 9.911540189541065e-06, + "loss": 0.8783, + "step": 2273 + }, + { + "epoch": 0.12515823655677252, + "grad_norm": 0.9455892443656921, + "learning_rate": 9.911458994788919e-06, + "loss": 0.8676, + "step": 2274 + }, + { + "epoch": 0.12521327535912818, + "grad_norm": 0.7649906277656555, + "learning_rate": 9.911377763123561e-06, + "loss": 0.7763, + "step": 2275 + }, + { + "epoch": 0.12526831416148385, + "grad_norm": 0.8971202373504639, + "learning_rate": 9.911296494545604e-06, + "loss": 0.9022, + "step": 2276 + }, + { + "epoch": 0.1253233529638395, + "grad_norm": 0.833678126335144, + "learning_rate": 9.911215189055657e-06, + "loss": 0.8401, + "step": 2277 + }, + { + "epoch": 0.12537839176619517, + "grad_norm": 0.8967958688735962, + "learning_rate": 9.911133846654331e-06, + "loss": 0.8678, + "step": 2278 + }, + { + "epoch": 0.12543343056855083, + "grad_norm": 0.8195546865463257, + "learning_rate": 9.911052467342239e-06, + "loss": 0.842, + "step": 2279 + }, + { + "epoch": 0.1254884693709065, + "grad_norm": 1.095815896987915, + "learning_rate": 9.910971051119988e-06, + "loss": 0.845, + "step": 2280 + }, + { + "epoch": 0.12554350817326215, + "grad_norm": 0.9452629685401917, + "learning_rate": 9.910889597988197e-06, + "loss": 0.8971, + "step": 2281 + }, + { + "epoch": 0.12559854697561781, + "grad_norm": 0.9872332215309143, + "learning_rate": 9.910808107947471e-06, + "loss": 0.7994, + "step": 2282 + }, + { + "epoch": 0.12565358577797348, + "grad_norm": 0.7761966586112976, + "learning_rate": 9.910726580998427e-06, + "loss": 0.7791, + "step": 2283 + }, + { + "epoch": 0.12570862458032914, + "grad_norm": 0.8950315713882446, + "learning_rate": 9.910645017141678e-06, + "loss": 0.8499, + "step": 2284 + }, + { + "epoch": 0.1257636633826848, + "grad_norm": 0.8796371221542358, + "learning_rate": 9.910563416377834e-06, + "loss": 0.8587, + "step": 2285 + }, + { + "epoch": 0.12581870218504046, + "grad_norm": 0.8291982412338257, + "learning_rate": 9.91048177870751e-06, + "loss": 0.9166, + "step": 2286 + }, + { + "epoch": 0.12587374098739612, + "grad_norm": 0.758369505405426, + "learning_rate": 9.91040010413132e-06, + "loss": 0.8305, + "step": 2287 + }, + { + "epoch": 0.12592877978975178, + "grad_norm": 0.8775640726089478, + "learning_rate": 9.910318392649876e-06, + "loss": 0.8513, + "step": 2288 + }, + { + "epoch": 0.12598381859210744, + "grad_norm": 0.8581671118736267, + "learning_rate": 9.910236644263796e-06, + "loss": 0.8134, + "step": 2289 + }, + { + "epoch": 0.1260388573944631, + "grad_norm": 0.8570736050605774, + "learning_rate": 9.910154858973689e-06, + "loss": 0.826, + "step": 2290 + }, + { + "epoch": 0.12609389619681877, + "grad_norm": 0.8712487816810608, + "learning_rate": 9.910073036780173e-06, + "loss": 0.8042, + "step": 2291 + }, + { + "epoch": 0.12614893499917443, + "grad_norm": 0.7584837675094604, + "learning_rate": 9.909991177683862e-06, + "loss": 0.7715, + "step": 2292 + }, + { + "epoch": 0.1262039738015301, + "grad_norm": 0.8618917465209961, + "learning_rate": 9.909909281685373e-06, + "loss": 0.8755, + "step": 2293 + }, + { + "epoch": 0.12625901260388575, + "grad_norm": 0.9530277848243713, + "learning_rate": 9.90982734878532e-06, + "loss": 0.8538, + "step": 2294 + }, + { + "epoch": 0.1263140514062414, + "grad_norm": 0.8394436836242676, + "learning_rate": 9.909745378984319e-06, + "loss": 0.8401, + "step": 2295 + }, + { + "epoch": 0.12636909020859707, + "grad_norm": 0.8224034309387207, + "learning_rate": 9.909663372282984e-06, + "loss": 0.7201, + "step": 2296 + }, + { + "epoch": 0.12642412901095273, + "grad_norm": 0.8215349912643433, + "learning_rate": 9.909581328681934e-06, + "loss": 0.8824, + "step": 2297 + }, + { + "epoch": 0.12647916781330837, + "grad_norm": 0.839389443397522, + "learning_rate": 9.909499248181786e-06, + "loss": 0.8056, + "step": 2298 + }, + { + "epoch": 0.12653420661566403, + "grad_norm": 0.9440048933029175, + "learning_rate": 9.909417130783156e-06, + "loss": 0.908, + "step": 2299 + }, + { + "epoch": 0.1265892454180197, + "grad_norm": 0.8336486220359802, + "learning_rate": 9.90933497648666e-06, + "loss": 0.8382, + "step": 2300 + }, + { + "epoch": 0.12664428422037535, + "grad_norm": 1.1541366577148438, + "learning_rate": 9.909252785292918e-06, + "loss": 0.8782, + "step": 2301 + }, + { + "epoch": 0.12669932302273101, + "grad_norm": 0.8730320334434509, + "learning_rate": 9.909170557202545e-06, + "loss": 0.7687, + "step": 2302 + }, + { + "epoch": 0.12675436182508668, + "grad_norm": 0.9927527904510498, + "learning_rate": 9.90908829221616e-06, + "loss": 0.8134, + "step": 2303 + }, + { + "epoch": 0.12680940062744234, + "grad_norm": 0.9521791338920593, + "learning_rate": 9.909005990334381e-06, + "loss": 0.9187, + "step": 2304 + }, + { + "epoch": 0.126864439429798, + "grad_norm": 0.8012455701828003, + "learning_rate": 9.908923651557828e-06, + "loss": 0.8581, + "step": 2305 + }, + { + "epoch": 0.12691947823215366, + "grad_norm": 0.8882689476013184, + "learning_rate": 9.90884127588712e-06, + "loss": 0.9317, + "step": 2306 + }, + { + "epoch": 0.12697451703450932, + "grad_norm": 0.8408340215682983, + "learning_rate": 9.908758863322872e-06, + "loss": 0.8444, + "step": 2307 + }, + { + "epoch": 0.12702955583686498, + "grad_norm": 0.7856307029724121, + "learning_rate": 9.908676413865709e-06, + "loss": 0.8457, + "step": 2308 + }, + { + "epoch": 0.12708459463922064, + "grad_norm": 0.9459167718887329, + "learning_rate": 9.908593927516247e-06, + "loss": 0.8153, + "step": 2309 + }, + { + "epoch": 0.1271396334415763, + "grad_norm": 0.8629655838012695, + "learning_rate": 9.908511404275107e-06, + "loss": 0.8279, + "step": 2310 + }, + { + "epoch": 0.12719467224393197, + "grad_norm": 1.2012875080108643, + "learning_rate": 9.90842884414291e-06, + "loss": 1.4388, + "step": 2311 + }, + { + "epoch": 0.12724971104628763, + "grad_norm": 1.20725417137146, + "learning_rate": 9.908346247120274e-06, + "loss": 0.8704, + "step": 2312 + }, + { + "epoch": 0.1273047498486433, + "grad_norm": 0.8152929544448853, + "learning_rate": 9.908263613207822e-06, + "loss": 0.8618, + "step": 2313 + }, + { + "epoch": 0.12735978865099895, + "grad_norm": 0.8400965332984924, + "learning_rate": 9.908180942406175e-06, + "loss": 0.7881, + "step": 2314 + }, + { + "epoch": 0.1274148274533546, + "grad_norm": 0.8856974840164185, + "learning_rate": 9.908098234715956e-06, + "loss": 0.9073, + "step": 2315 + }, + { + "epoch": 0.12746986625571027, + "grad_norm": 0.8708439469337463, + "learning_rate": 9.908015490137782e-06, + "loss": 0.8099, + "step": 2316 + }, + { + "epoch": 0.12752490505806593, + "grad_norm": 0.8632444143295288, + "learning_rate": 9.907932708672277e-06, + "loss": 0.8472, + "step": 2317 + }, + { + "epoch": 0.1275799438604216, + "grad_norm": 0.8977149128913879, + "learning_rate": 9.907849890320062e-06, + "loss": 0.8878, + "step": 2318 + }, + { + "epoch": 0.12763498266277726, + "grad_norm": 0.8589425086975098, + "learning_rate": 9.907767035081765e-06, + "loss": 0.7905, + "step": 2319 + }, + { + "epoch": 0.12769002146513292, + "grad_norm": 0.9873501062393188, + "learning_rate": 9.907684142958002e-06, + "loss": 0.9002, + "step": 2320 + }, + { + "epoch": 0.12774506026748858, + "grad_norm": 0.8963840007781982, + "learning_rate": 9.9076012139494e-06, + "loss": 0.92, + "step": 2321 + }, + { + "epoch": 0.12780009906984424, + "grad_norm": 0.7933574318885803, + "learning_rate": 9.90751824805658e-06, + "loss": 0.7664, + "step": 2322 + }, + { + "epoch": 0.1278551378721999, + "grad_norm": 0.9660933017730713, + "learning_rate": 9.907435245280167e-06, + "loss": 0.9162, + "step": 2323 + }, + { + "epoch": 0.12791017667455556, + "grad_norm": 0.8698949217796326, + "learning_rate": 9.907352205620783e-06, + "loss": 0.7988, + "step": 2324 + }, + { + "epoch": 0.12796521547691123, + "grad_norm": 0.9077615141868591, + "learning_rate": 9.907269129079055e-06, + "loss": 0.8581, + "step": 2325 + }, + { + "epoch": 0.1280202542792669, + "grad_norm": 0.9128179550170898, + "learning_rate": 9.907186015655607e-06, + "loss": 0.8552, + "step": 2326 + }, + { + "epoch": 0.12807529308162255, + "grad_norm": 0.9321265816688538, + "learning_rate": 9.907102865351062e-06, + "loss": 0.889, + "step": 2327 + }, + { + "epoch": 0.1281303318839782, + "grad_norm": 0.9687464833259583, + "learning_rate": 9.907019678166044e-06, + "loss": 0.7944, + "step": 2328 + }, + { + "epoch": 0.12818537068633387, + "grad_norm": 0.862223207950592, + "learning_rate": 9.90693645410118e-06, + "loss": 0.7699, + "step": 2329 + }, + { + "epoch": 0.12824040948868953, + "grad_norm": 0.9662127494812012, + "learning_rate": 9.906853193157095e-06, + "loss": 0.7818, + "step": 2330 + }, + { + "epoch": 0.1282954482910452, + "grad_norm": 0.8008295297622681, + "learning_rate": 9.906769895334413e-06, + "loss": 0.8443, + "step": 2331 + }, + { + "epoch": 0.12835048709340086, + "grad_norm": 0.8638464212417603, + "learning_rate": 9.906686560633765e-06, + "loss": 0.8438, + "step": 2332 + }, + { + "epoch": 0.12840552589575652, + "grad_norm": 0.9215866327285767, + "learning_rate": 9.906603189055773e-06, + "loss": 0.7481, + "step": 2333 + }, + { + "epoch": 0.12846056469811218, + "grad_norm": 0.7926739454269409, + "learning_rate": 9.906519780601066e-06, + "loss": 0.7404, + "step": 2334 + }, + { + "epoch": 0.12851560350046784, + "grad_norm": 0.9590242505073547, + "learning_rate": 9.906436335270268e-06, + "loss": 0.8319, + "step": 2335 + }, + { + "epoch": 0.1285706423028235, + "grad_norm": 1.0300076007843018, + "learning_rate": 9.906352853064009e-06, + "loss": 0.8635, + "step": 2336 + }, + { + "epoch": 0.12862568110517916, + "grad_norm": 0.8401443958282471, + "learning_rate": 9.906269333982915e-06, + "loss": 0.9584, + "step": 2337 + }, + { + "epoch": 0.12868071990753482, + "grad_norm": 0.8144069910049438, + "learning_rate": 9.906185778027613e-06, + "loss": 0.7375, + "step": 2338 + }, + { + "epoch": 0.12873575870989049, + "grad_norm": 0.8513948917388916, + "learning_rate": 9.906102185198733e-06, + "loss": 0.8353, + "step": 2339 + }, + { + "epoch": 0.12879079751224615, + "grad_norm": 0.8243077397346497, + "learning_rate": 9.906018555496903e-06, + "loss": 0.8665, + "step": 2340 + }, + { + "epoch": 0.12884583631460178, + "grad_norm": 0.8699066042900085, + "learning_rate": 9.905934888922749e-06, + "loss": 0.8537, + "step": 2341 + }, + { + "epoch": 0.12890087511695744, + "grad_norm": 1.0980210304260254, + "learning_rate": 9.905851185476902e-06, + "loss": 0.8887, + "step": 2342 + }, + { + "epoch": 0.1289559139193131, + "grad_norm": 0.8189190030097961, + "learning_rate": 9.905767445159992e-06, + "loss": 0.8467, + "step": 2343 + }, + { + "epoch": 0.12901095272166876, + "grad_norm": 0.8273541331291199, + "learning_rate": 9.905683667972645e-06, + "loss": 0.8701, + "step": 2344 + }, + { + "epoch": 0.12906599152402443, + "grad_norm": 0.8987969160079956, + "learning_rate": 9.905599853915496e-06, + "loss": 0.909, + "step": 2345 + }, + { + "epoch": 0.1291210303263801, + "grad_norm": 0.818268895149231, + "learning_rate": 9.905516002989168e-06, + "loss": 0.7946, + "step": 2346 + }, + { + "epoch": 0.12917606912873575, + "grad_norm": 0.7401725053787231, + "learning_rate": 9.905432115194296e-06, + "loss": 0.7006, + "step": 2347 + }, + { + "epoch": 0.1292311079310914, + "grad_norm": 0.8263179659843445, + "learning_rate": 9.905348190531511e-06, + "loss": 0.7768, + "step": 2348 + }, + { + "epoch": 0.12928614673344707, + "grad_norm": 0.9241918921470642, + "learning_rate": 9.90526422900144e-06, + "loss": 0.8593, + "step": 2349 + }, + { + "epoch": 0.12934118553580273, + "grad_norm": 0.7804501056671143, + "learning_rate": 9.905180230604718e-06, + "loss": 0.7607, + "step": 2350 + }, + { + "epoch": 0.1293962243381584, + "grad_norm": 0.9408491253852844, + "learning_rate": 9.905096195341973e-06, + "loss": 0.8906, + "step": 2351 + }, + { + "epoch": 0.12945126314051406, + "grad_norm": 1.0356301069259644, + "learning_rate": 9.905012123213838e-06, + "loss": 0.8051, + "step": 2352 + }, + { + "epoch": 0.12950630194286972, + "grad_norm": 0.8546886444091797, + "learning_rate": 9.904928014220945e-06, + "loss": 0.7543, + "step": 2353 + }, + { + "epoch": 0.12956134074522538, + "grad_norm": 0.9229897856712341, + "learning_rate": 9.904843868363927e-06, + "loss": 0.8823, + "step": 2354 + }, + { + "epoch": 0.12961637954758104, + "grad_norm": 0.8364199995994568, + "learning_rate": 9.904759685643414e-06, + "loss": 0.8825, + "step": 2355 + }, + { + "epoch": 0.1296714183499367, + "grad_norm": 0.9092077016830444, + "learning_rate": 9.90467546606004e-06, + "loss": 0.8721, + "step": 2356 + }, + { + "epoch": 0.12972645715229236, + "grad_norm": 1.042973518371582, + "learning_rate": 9.904591209614441e-06, + "loss": 0.7984, + "step": 2357 + }, + { + "epoch": 0.12978149595464802, + "grad_norm": 0.7262618541717529, + "learning_rate": 9.904506916307243e-06, + "loss": 0.6721, + "step": 2358 + }, + { + "epoch": 0.12983653475700369, + "grad_norm": 0.7562826871871948, + "learning_rate": 9.904422586139086e-06, + "loss": 0.7702, + "step": 2359 + }, + { + "epoch": 0.12989157355935935, + "grad_norm": 0.8821595907211304, + "learning_rate": 9.904338219110603e-06, + "loss": 0.8555, + "step": 2360 + }, + { + "epoch": 0.129946612361715, + "grad_norm": 1.0340098142623901, + "learning_rate": 9.904253815222424e-06, + "loss": 0.9004, + "step": 2361 + }, + { + "epoch": 0.13000165116407067, + "grad_norm": 0.8533693552017212, + "learning_rate": 9.904169374475188e-06, + "loss": 0.836, + "step": 2362 + }, + { + "epoch": 0.13005668996642633, + "grad_norm": 0.8564199805259705, + "learning_rate": 9.904084896869528e-06, + "loss": 0.9281, + "step": 2363 + }, + { + "epoch": 0.130111728768782, + "grad_norm": 0.7817538976669312, + "learning_rate": 9.904000382406079e-06, + "loss": 0.7444, + "step": 2364 + }, + { + "epoch": 0.13016676757113765, + "grad_norm": 1.1420893669128418, + "learning_rate": 9.903915831085473e-06, + "loss": 0.9116, + "step": 2365 + }, + { + "epoch": 0.13022180637349332, + "grad_norm": 0.9671920537948608, + "learning_rate": 9.903831242908351e-06, + "loss": 0.899, + "step": 2366 + }, + { + "epoch": 0.13027684517584898, + "grad_norm": 0.8528717756271362, + "learning_rate": 9.903746617875345e-06, + "loss": 0.7231, + "step": 2367 + }, + { + "epoch": 0.13033188397820464, + "grad_norm": 0.786960244178772, + "learning_rate": 9.903661955987091e-06, + "loss": 0.7997, + "step": 2368 + }, + { + "epoch": 0.1303869227805603, + "grad_norm": 0.941683292388916, + "learning_rate": 9.903577257244228e-06, + "loss": 0.9127, + "step": 2369 + }, + { + "epoch": 0.13044196158291596, + "grad_norm": 0.886900007724762, + "learning_rate": 9.903492521647391e-06, + "loss": 0.9086, + "step": 2370 + }, + { + "epoch": 0.13049700038527162, + "grad_norm": 0.9924801588058472, + "learning_rate": 9.903407749197216e-06, + "loss": 0.9055, + "step": 2371 + }, + { + "epoch": 0.13055203918762728, + "grad_norm": 0.6998724341392517, + "learning_rate": 9.903322939894342e-06, + "loss": 0.6972, + "step": 2372 + }, + { + "epoch": 0.13060707798998294, + "grad_norm": 0.8448702096939087, + "learning_rate": 9.903238093739404e-06, + "loss": 0.7862, + "step": 2373 + }, + { + "epoch": 0.1306621167923386, + "grad_norm": 0.8557441830635071, + "learning_rate": 9.90315321073304e-06, + "loss": 0.8364, + "step": 2374 + }, + { + "epoch": 0.13071715559469427, + "grad_norm": 0.7978441119194031, + "learning_rate": 9.903068290875892e-06, + "loss": 0.7671, + "step": 2375 + }, + { + "epoch": 0.13077219439704993, + "grad_norm": 0.781315803527832, + "learning_rate": 9.902983334168594e-06, + "loss": 0.7963, + "step": 2376 + }, + { + "epoch": 0.1308272331994056, + "grad_norm": 0.7326155304908752, + "learning_rate": 9.902898340611785e-06, + "loss": 0.8, + "step": 2377 + }, + { + "epoch": 0.13088227200176125, + "grad_norm": 0.7693139314651489, + "learning_rate": 9.902813310206105e-06, + "loss": 0.8459, + "step": 2378 + }, + { + "epoch": 0.1309373108041169, + "grad_norm": 0.9441308975219727, + "learning_rate": 9.902728242952191e-06, + "loss": 0.8519, + "step": 2379 + }, + { + "epoch": 0.13099234960647257, + "grad_norm": 0.8350616693496704, + "learning_rate": 9.902643138850686e-06, + "loss": 0.876, + "step": 2380 + }, + { + "epoch": 0.13104738840882824, + "grad_norm": 0.8675554394721985, + "learning_rate": 9.902557997902227e-06, + "loss": 0.8172, + "step": 2381 + }, + { + "epoch": 0.1311024272111839, + "grad_norm": 0.9618930220603943, + "learning_rate": 9.902472820107454e-06, + "loss": 0.8852, + "step": 2382 + }, + { + "epoch": 0.13115746601353956, + "grad_norm": 0.862341046333313, + "learning_rate": 9.902387605467007e-06, + "loss": 0.9256, + "step": 2383 + }, + { + "epoch": 0.1312125048158952, + "grad_norm": 0.8749859929084778, + "learning_rate": 9.902302353981527e-06, + "loss": 0.8809, + "step": 2384 + }, + { + "epoch": 0.13126754361825085, + "grad_norm": 0.9061958193778992, + "learning_rate": 9.902217065651657e-06, + "loss": 0.779, + "step": 2385 + }, + { + "epoch": 0.13132258242060652, + "grad_norm": 0.8909298777580261, + "learning_rate": 9.902131740478033e-06, + "loss": 0.8203, + "step": 2386 + }, + { + "epoch": 0.13137762122296218, + "grad_norm": 0.8507269024848938, + "learning_rate": 9.902046378461302e-06, + "loss": 0.776, + "step": 2387 + }, + { + "epoch": 0.13143266002531784, + "grad_norm": 0.9577299356460571, + "learning_rate": 9.901960979602101e-06, + "loss": 0.8104, + "step": 2388 + }, + { + "epoch": 0.1314876988276735, + "grad_norm": 0.9244948625564575, + "learning_rate": 9.901875543901074e-06, + "loss": 0.9035, + "step": 2389 + }, + { + "epoch": 0.13154273763002916, + "grad_norm": 0.7534334063529968, + "learning_rate": 9.901790071358861e-06, + "loss": 0.7262, + "step": 2390 + }, + { + "epoch": 0.13159777643238482, + "grad_norm": 0.8920090198516846, + "learning_rate": 9.901704561976106e-06, + "loss": 0.932, + "step": 2391 + }, + { + "epoch": 0.13165281523474048, + "grad_norm": 0.8524243235588074, + "learning_rate": 9.901619015753455e-06, + "loss": 0.8107, + "step": 2392 + }, + { + "epoch": 0.13170785403709614, + "grad_norm": 0.8170381784439087, + "learning_rate": 9.901533432691543e-06, + "loss": 0.8814, + "step": 2393 + }, + { + "epoch": 0.1317628928394518, + "grad_norm": 0.8281697034835815, + "learning_rate": 9.90144781279102e-06, + "loss": 0.8221, + "step": 2394 + }, + { + "epoch": 0.13181793164180747, + "grad_norm": 0.9283351302146912, + "learning_rate": 9.901362156052528e-06, + "loss": 0.8346, + "step": 2395 + }, + { + "epoch": 0.13187297044416313, + "grad_norm": 0.8331275582313538, + "learning_rate": 9.901276462476708e-06, + "loss": 0.7498, + "step": 2396 + }, + { + "epoch": 0.1319280092465188, + "grad_norm": 0.8427191972732544, + "learning_rate": 9.901190732064207e-06, + "loss": 0.8265, + "step": 2397 + }, + { + "epoch": 0.13198304804887445, + "grad_norm": 0.8510351777076721, + "learning_rate": 9.901104964815669e-06, + "loss": 0.8369, + "step": 2398 + }, + { + "epoch": 0.1320380868512301, + "grad_norm": 0.8468914031982422, + "learning_rate": 9.901019160731738e-06, + "loss": 0.8585, + "step": 2399 + }, + { + "epoch": 0.13209312565358577, + "grad_norm": 0.8302182555198669, + "learning_rate": 9.900933319813058e-06, + "loss": 0.8611, + "step": 2400 + }, + { + "epoch": 0.13214816445594144, + "grad_norm": 0.8527448773384094, + "learning_rate": 9.900847442060277e-06, + "loss": 0.899, + "step": 2401 + }, + { + "epoch": 0.1322032032582971, + "grad_norm": 0.8354688286781311, + "learning_rate": 9.900761527474037e-06, + "loss": 0.8083, + "step": 2402 + }, + { + "epoch": 0.13225824206065276, + "grad_norm": 0.8612173795700073, + "learning_rate": 9.900675576054986e-06, + "loss": 0.8124, + "step": 2403 + }, + { + "epoch": 0.13231328086300842, + "grad_norm": 0.7424876689910889, + "learning_rate": 9.900589587803767e-06, + "loss": 0.6884, + "step": 2404 + }, + { + "epoch": 0.13236831966536408, + "grad_norm": 0.8431115746498108, + "learning_rate": 9.90050356272103e-06, + "loss": 0.9575, + "step": 2405 + }, + { + "epoch": 0.13242335846771974, + "grad_norm": 0.7958092093467712, + "learning_rate": 9.90041750080742e-06, + "loss": 0.7608, + "step": 2406 + }, + { + "epoch": 0.1324783972700754, + "grad_norm": 0.926258385181427, + "learning_rate": 9.900331402063583e-06, + "loss": 0.9072, + "step": 2407 + }, + { + "epoch": 0.13253343607243107, + "grad_norm": 0.7952526807785034, + "learning_rate": 9.900245266490169e-06, + "loss": 0.8001, + "step": 2408 + }, + { + "epoch": 0.13258847487478673, + "grad_norm": 0.8309933543205261, + "learning_rate": 9.900159094087822e-06, + "loss": 0.9154, + "step": 2409 + }, + { + "epoch": 0.1326435136771424, + "grad_norm": 0.858007550239563, + "learning_rate": 9.90007288485719e-06, + "loss": 0.855, + "step": 2410 + }, + { + "epoch": 0.13269855247949805, + "grad_norm": 0.9513822197914124, + "learning_rate": 9.899986638798923e-06, + "loss": 0.8162, + "step": 2411 + }, + { + "epoch": 0.1327535912818537, + "grad_norm": 0.8387427926063538, + "learning_rate": 9.899900355913668e-06, + "loss": 0.8955, + "step": 2412 + }, + { + "epoch": 0.13280863008420937, + "grad_norm": 0.7727940678596497, + "learning_rate": 9.899814036202073e-06, + "loss": 0.6765, + "step": 2413 + }, + { + "epoch": 0.13286366888656503, + "grad_norm": 0.7760928869247437, + "learning_rate": 9.899727679664788e-06, + "loss": 0.7179, + "step": 2414 + }, + { + "epoch": 0.1329187076889207, + "grad_norm": 0.7798073887825012, + "learning_rate": 9.899641286302462e-06, + "loss": 0.8541, + "step": 2415 + }, + { + "epoch": 0.13297374649127636, + "grad_norm": 0.8302769660949707, + "learning_rate": 9.899554856115743e-06, + "loss": 0.8925, + "step": 2416 + }, + { + "epoch": 0.13302878529363202, + "grad_norm": 0.8300751447677612, + "learning_rate": 9.89946838910528e-06, + "loss": 0.7489, + "step": 2417 + }, + { + "epoch": 0.13308382409598768, + "grad_norm": 0.8032094240188599, + "learning_rate": 9.899381885271725e-06, + "loss": 0.811, + "step": 2418 + }, + { + "epoch": 0.13313886289834334, + "grad_norm": 5.237870216369629, + "learning_rate": 9.899295344615727e-06, + "loss": 0.7609, + "step": 2419 + }, + { + "epoch": 0.133193901700699, + "grad_norm": 0.8145740628242493, + "learning_rate": 9.899208767137935e-06, + "loss": 0.8435, + "step": 2420 + }, + { + "epoch": 0.13324894050305466, + "grad_norm": 0.9716018438339233, + "learning_rate": 9.899122152839004e-06, + "loss": 0.7924, + "step": 2421 + }, + { + "epoch": 0.13330397930541033, + "grad_norm": 0.7846183776855469, + "learning_rate": 9.899035501719582e-06, + "loss": 0.8941, + "step": 2422 + }, + { + "epoch": 0.133359018107766, + "grad_norm": 0.7653689980506897, + "learning_rate": 9.89894881378032e-06, + "loss": 0.811, + "step": 2423 + }, + { + "epoch": 0.13341405691012165, + "grad_norm": 0.8221875429153442, + "learning_rate": 9.89886208902187e-06, + "loss": 0.8131, + "step": 2424 + }, + { + "epoch": 0.1334690957124773, + "grad_norm": 0.7422335147857666, + "learning_rate": 9.898775327444885e-06, + "loss": 0.6366, + "step": 2425 + }, + { + "epoch": 0.13352413451483297, + "grad_norm": 0.8072695136070251, + "learning_rate": 9.898688529050014e-06, + "loss": 0.7989, + "step": 2426 + }, + { + "epoch": 0.1335791733171886, + "grad_norm": 0.7717600464820862, + "learning_rate": 9.898601693837911e-06, + "loss": 0.7524, + "step": 2427 + }, + { + "epoch": 0.13363421211954427, + "grad_norm": 0.8070919513702393, + "learning_rate": 9.898514821809231e-06, + "loss": 0.7724, + "step": 2428 + }, + { + "epoch": 0.13368925092189993, + "grad_norm": 0.8184726238250732, + "learning_rate": 9.898427912964624e-06, + "loss": 0.845, + "step": 2429 + }, + { + "epoch": 0.1337442897242556, + "grad_norm": 0.8168759346008301, + "learning_rate": 9.898340967304744e-06, + "loss": 0.8377, + "step": 2430 + }, + { + "epoch": 0.13379932852661125, + "grad_norm": 0.8701872825622559, + "learning_rate": 9.898253984830244e-06, + "loss": 0.908, + "step": 2431 + }, + { + "epoch": 0.1338543673289669, + "grad_norm": 0.8092133402824402, + "learning_rate": 9.898166965541779e-06, + "loss": 0.866, + "step": 2432 + }, + { + "epoch": 0.13390940613132257, + "grad_norm": 0.8337095975875854, + "learning_rate": 9.898079909440002e-06, + "loss": 0.8622, + "step": 2433 + }, + { + "epoch": 0.13396444493367823, + "grad_norm": 1.1016209125518799, + "learning_rate": 9.897992816525567e-06, + "loss": 0.8486, + "step": 2434 + }, + { + "epoch": 0.1340194837360339, + "grad_norm": 0.8136518597602844, + "learning_rate": 9.89790568679913e-06, + "loss": 0.8681, + "step": 2435 + }, + { + "epoch": 0.13407452253838956, + "grad_norm": 0.8202341794967651, + "learning_rate": 9.897818520261344e-06, + "loss": 0.9144, + "step": 2436 + }, + { + "epoch": 0.13412956134074522, + "grad_norm": 0.8836861848831177, + "learning_rate": 9.897731316912866e-06, + "loss": 0.8643, + "step": 2437 + }, + { + "epoch": 0.13418460014310088, + "grad_norm": 0.9040210247039795, + "learning_rate": 9.89764407675435e-06, + "loss": 0.7681, + "step": 2438 + }, + { + "epoch": 0.13423963894545654, + "grad_norm": 0.8762359619140625, + "learning_rate": 9.897556799786452e-06, + "loss": 0.8765, + "step": 2439 + }, + { + "epoch": 0.1342946777478122, + "grad_norm": 0.8859462738037109, + "learning_rate": 9.897469486009827e-06, + "loss": 0.9051, + "step": 2440 + }, + { + "epoch": 0.13434971655016786, + "grad_norm": 0.7727539539337158, + "learning_rate": 9.897382135425134e-06, + "loss": 0.7397, + "step": 2441 + }, + { + "epoch": 0.13440475535252353, + "grad_norm": 0.9018967151641846, + "learning_rate": 9.897294748033028e-06, + "loss": 0.8542, + "step": 2442 + }, + { + "epoch": 0.1344597941548792, + "grad_norm": 0.8228337168693542, + "learning_rate": 9.897207323834165e-06, + "loss": 0.7585, + "step": 2443 + }, + { + "epoch": 0.13451483295723485, + "grad_norm": 0.7509974241256714, + "learning_rate": 9.897119862829203e-06, + "loss": 0.7285, + "step": 2444 + }, + { + "epoch": 0.1345698717595905, + "grad_norm": 0.9225835800170898, + "learning_rate": 9.897032365018797e-06, + "loss": 0.8352, + "step": 2445 + }, + { + "epoch": 0.13462491056194617, + "grad_norm": 0.800981879234314, + "learning_rate": 9.896944830403609e-06, + "loss": 0.7352, + "step": 2446 + }, + { + "epoch": 0.13467994936430183, + "grad_norm": 0.8263673186302185, + "learning_rate": 9.896857258984294e-06, + "loss": 0.8426, + "step": 2447 + }, + { + "epoch": 0.1347349881666575, + "grad_norm": 0.8857110738754272, + "learning_rate": 9.89676965076151e-06, + "loss": 0.8078, + "step": 2448 + }, + { + "epoch": 0.13479002696901315, + "grad_norm": 0.8637158274650574, + "learning_rate": 9.896682005735916e-06, + "loss": 0.8688, + "step": 2449 + }, + { + "epoch": 0.13484506577136882, + "grad_norm": 0.9050095081329346, + "learning_rate": 9.89659432390817e-06, + "loss": 0.831, + "step": 2450 + }, + { + "epoch": 0.13490010457372448, + "grad_norm": 0.829757034778595, + "learning_rate": 9.896506605278933e-06, + "loss": 0.8095, + "step": 2451 + }, + { + "epoch": 0.13495514337608014, + "grad_norm": 0.8910449743270874, + "learning_rate": 9.896418849848864e-06, + "loss": 0.9134, + "step": 2452 + }, + { + "epoch": 0.1350101821784358, + "grad_norm": 0.8856307864189148, + "learning_rate": 9.89633105761862e-06, + "loss": 0.8171, + "step": 2453 + }, + { + "epoch": 0.13506522098079146, + "grad_norm": 0.8159938454627991, + "learning_rate": 9.896243228588864e-06, + "loss": 0.8205, + "step": 2454 + }, + { + "epoch": 0.13512025978314712, + "grad_norm": 0.8200929760932922, + "learning_rate": 9.896155362760254e-06, + "loss": 0.7529, + "step": 2455 + }, + { + "epoch": 0.13517529858550278, + "grad_norm": 0.7591279149055481, + "learning_rate": 9.89606746013345e-06, + "loss": 0.8205, + "step": 2456 + }, + { + "epoch": 0.13523033738785845, + "grad_norm": 0.8598676323890686, + "learning_rate": 9.895979520709114e-06, + "loss": 0.8212, + "step": 2457 + }, + { + "epoch": 0.1352853761902141, + "grad_norm": 0.7290365099906921, + "learning_rate": 9.895891544487905e-06, + "loss": 0.7893, + "step": 2458 + }, + { + "epoch": 0.13534041499256977, + "grad_norm": 0.8040594458580017, + "learning_rate": 9.895803531470487e-06, + "loss": 0.8358, + "step": 2459 + }, + { + "epoch": 0.13539545379492543, + "grad_norm": 0.9286525249481201, + "learning_rate": 9.895715481657522e-06, + "loss": 0.8104, + "step": 2460 + }, + { + "epoch": 0.1354504925972811, + "grad_norm": 0.843054473400116, + "learning_rate": 9.895627395049668e-06, + "loss": 0.7872, + "step": 2461 + }, + { + "epoch": 0.13550553139963675, + "grad_norm": 0.7894387245178223, + "learning_rate": 9.895539271647588e-06, + "loss": 0.8615, + "step": 2462 + }, + { + "epoch": 0.13556057020199241, + "grad_norm": 0.9185294508934021, + "learning_rate": 9.895451111451948e-06, + "loss": 0.8732, + "step": 2463 + }, + { + "epoch": 0.13561560900434808, + "grad_norm": 0.8586474657058716, + "learning_rate": 9.895362914463405e-06, + "loss": 0.9658, + "step": 2464 + }, + { + "epoch": 0.13567064780670374, + "grad_norm": 0.8810474276542664, + "learning_rate": 9.895274680682628e-06, + "loss": 0.8622, + "step": 2465 + }, + { + "epoch": 0.1357256866090594, + "grad_norm": 0.8862990736961365, + "learning_rate": 9.895186410110273e-06, + "loss": 0.916, + "step": 2466 + }, + { + "epoch": 0.13578072541141506, + "grad_norm": 0.7916743159294128, + "learning_rate": 9.89509810274701e-06, + "loss": 0.837, + "step": 2467 + }, + { + "epoch": 0.13583576421377072, + "grad_norm": 0.9063515663146973, + "learning_rate": 9.8950097585935e-06, + "loss": 0.8065, + "step": 2468 + }, + { + "epoch": 0.13589080301612638, + "grad_norm": 0.7656043767929077, + "learning_rate": 9.894921377650405e-06, + "loss": 0.7064, + "step": 2469 + }, + { + "epoch": 0.13594584181848202, + "grad_norm": 1.0630278587341309, + "learning_rate": 9.894832959918392e-06, + "loss": 0.8168, + "step": 2470 + }, + { + "epoch": 0.13600088062083768, + "grad_norm": 0.9118956923484802, + "learning_rate": 9.894744505398126e-06, + "loss": 0.8972, + "step": 2471 + }, + { + "epoch": 0.13605591942319334, + "grad_norm": 0.8989213705062866, + "learning_rate": 9.89465601409027e-06, + "loss": 0.8374, + "step": 2472 + }, + { + "epoch": 0.136110958225549, + "grad_norm": 0.9398229718208313, + "learning_rate": 9.894567485995489e-06, + "loss": 0.8956, + "step": 2473 + }, + { + "epoch": 0.13616599702790466, + "grad_norm": 0.7980280518531799, + "learning_rate": 9.894478921114449e-06, + "loss": 0.8055, + "step": 2474 + }, + { + "epoch": 0.13622103583026032, + "grad_norm": 0.8910034894943237, + "learning_rate": 9.894390319447816e-06, + "loss": 0.8371, + "step": 2475 + }, + { + "epoch": 0.13627607463261598, + "grad_norm": 0.7848070859909058, + "learning_rate": 9.894301680996255e-06, + "loss": 0.8024, + "step": 2476 + }, + { + "epoch": 0.13633111343497165, + "grad_norm": 0.8538175821304321, + "learning_rate": 9.894213005760434e-06, + "loss": 0.8819, + "step": 2477 + }, + { + "epoch": 0.1363861522373273, + "grad_norm": 0.7885367274284363, + "learning_rate": 9.894124293741017e-06, + "loss": 0.7916, + "step": 2478 + }, + { + "epoch": 0.13644119103968297, + "grad_norm": 0.8555673956871033, + "learning_rate": 9.894035544938672e-06, + "loss": 0.8521, + "step": 2479 + }, + { + "epoch": 0.13649622984203863, + "grad_norm": 0.8104771971702576, + "learning_rate": 9.893946759354066e-06, + "loss": 0.8437, + "step": 2480 + }, + { + "epoch": 0.1365512686443943, + "grad_norm": 0.9131864309310913, + "learning_rate": 9.893857936987866e-06, + "loss": 0.8123, + "step": 2481 + }, + { + "epoch": 0.13660630744674995, + "grad_norm": 0.9414293766021729, + "learning_rate": 9.893769077840739e-06, + "loss": 0.7897, + "step": 2482 + }, + { + "epoch": 0.13666134624910561, + "grad_norm": 0.823265016078949, + "learning_rate": 9.893680181913355e-06, + "loss": 0.847, + "step": 2483 + }, + { + "epoch": 0.13671638505146128, + "grad_norm": 0.82098788022995, + "learning_rate": 9.89359124920638e-06, + "loss": 0.7823, + "step": 2484 + }, + { + "epoch": 0.13677142385381694, + "grad_norm": 0.817551851272583, + "learning_rate": 9.893502279720483e-06, + "loss": 0.8084, + "step": 2485 + }, + { + "epoch": 0.1368264626561726, + "grad_norm": 1.0722150802612305, + "learning_rate": 9.893413273456333e-06, + "loss": 0.7394, + "step": 2486 + }, + { + "epoch": 0.13688150145852826, + "grad_norm": 0.8045433759689331, + "learning_rate": 9.893324230414598e-06, + "loss": 0.7528, + "step": 2487 + }, + { + "epoch": 0.13693654026088392, + "grad_norm": 0.8694071173667908, + "learning_rate": 9.893235150595949e-06, + "loss": 0.803, + "step": 2488 + }, + { + "epoch": 0.13699157906323958, + "grad_norm": 0.8238615989685059, + "learning_rate": 9.893146034001054e-06, + "loss": 0.7909, + "step": 2489 + }, + { + "epoch": 0.13704661786559524, + "grad_norm": 0.7782405018806458, + "learning_rate": 9.893056880630583e-06, + "loss": 0.6859, + "step": 2490 + }, + { + "epoch": 0.1371016566679509, + "grad_norm": 0.7865599989891052, + "learning_rate": 9.892967690485207e-06, + "loss": 0.7982, + "step": 2491 + }, + { + "epoch": 0.13715669547030657, + "grad_norm": 0.768120288848877, + "learning_rate": 9.892878463565595e-06, + "loss": 0.8234, + "step": 2492 + }, + { + "epoch": 0.13721173427266223, + "grad_norm": 0.812493085861206, + "learning_rate": 9.89278919987242e-06, + "loss": 0.9152, + "step": 2493 + }, + { + "epoch": 0.1372667730750179, + "grad_norm": 0.7256335616111755, + "learning_rate": 9.892699899406348e-06, + "loss": 0.6703, + "step": 2494 + }, + { + "epoch": 0.13732181187737355, + "grad_norm": 0.8022804260253906, + "learning_rate": 9.892610562168054e-06, + "loss": 0.7918, + "step": 2495 + }, + { + "epoch": 0.1373768506797292, + "grad_norm": 0.8204907774925232, + "learning_rate": 9.89252118815821e-06, + "loss": 0.9094, + "step": 2496 + }, + { + "epoch": 0.13743188948208487, + "grad_norm": 0.9986788630485535, + "learning_rate": 9.892431777377484e-06, + "loss": 0.8921, + "step": 2497 + }, + { + "epoch": 0.13748692828444053, + "grad_norm": 0.7937983870506287, + "learning_rate": 9.892342329826554e-06, + "loss": 0.8048, + "step": 2498 + }, + { + "epoch": 0.1375419670867962, + "grad_norm": 0.9295744895935059, + "learning_rate": 9.892252845506086e-06, + "loss": 0.755, + "step": 2499 + }, + { + "epoch": 0.13759700588915186, + "grad_norm": 0.7920984625816345, + "learning_rate": 9.892163324416757e-06, + "loss": 0.7603, + "step": 2500 + }, + { + "epoch": 0.13765204469150752, + "grad_norm": 0.9229464530944824, + "learning_rate": 9.892073766559236e-06, + "loss": 0.8115, + "step": 2501 + }, + { + "epoch": 0.13770708349386318, + "grad_norm": 0.8205353021621704, + "learning_rate": 9.8919841719342e-06, + "loss": 0.8357, + "step": 2502 + }, + { + "epoch": 0.13776212229621884, + "grad_norm": 0.86461341381073, + "learning_rate": 9.891894540542318e-06, + "loss": 0.748, + "step": 2503 + }, + { + "epoch": 0.1378171610985745, + "grad_norm": 0.767145574092865, + "learning_rate": 9.891804872384267e-06, + "loss": 0.7404, + "step": 2504 + }, + { + "epoch": 0.13787219990093016, + "grad_norm": 0.7492040991783142, + "learning_rate": 9.891715167460721e-06, + "loss": 0.6958, + "step": 2505 + }, + { + "epoch": 0.13792723870328583, + "grad_norm": 0.8643150329589844, + "learning_rate": 9.891625425772353e-06, + "loss": 0.8408, + "step": 2506 + }, + { + "epoch": 0.1379822775056415, + "grad_norm": 0.8026981353759766, + "learning_rate": 9.891535647319838e-06, + "loss": 0.7895, + "step": 2507 + }, + { + "epoch": 0.13803731630799715, + "grad_norm": 1.2780394554138184, + "learning_rate": 9.89144583210385e-06, + "loss": 0.9113, + "step": 2508 + }, + { + "epoch": 0.1380923551103528, + "grad_norm": 0.8476191163063049, + "learning_rate": 9.891355980125064e-06, + "loss": 0.8224, + "step": 2509 + }, + { + "epoch": 0.13814739391270847, + "grad_norm": 1.048682689666748, + "learning_rate": 9.891266091384157e-06, + "loss": 0.8913, + "step": 2510 + }, + { + "epoch": 0.13820243271506413, + "grad_norm": 1.0314993858337402, + "learning_rate": 9.891176165881801e-06, + "loss": 0.8315, + "step": 2511 + }, + { + "epoch": 0.1382574715174198, + "grad_norm": 0.9500058889389038, + "learning_rate": 9.891086203618676e-06, + "loss": 0.9185, + "step": 2512 + }, + { + "epoch": 0.13831251031977543, + "grad_norm": 0.7860653400421143, + "learning_rate": 9.890996204595457e-06, + "loss": 0.804, + "step": 2513 + }, + { + "epoch": 0.1383675491221311, + "grad_norm": 0.8354741930961609, + "learning_rate": 9.89090616881282e-06, + "loss": 0.8214, + "step": 2514 + }, + { + "epoch": 0.13842258792448675, + "grad_norm": 0.9115905165672302, + "learning_rate": 9.890816096271438e-06, + "loss": 0.8801, + "step": 2515 + }, + { + "epoch": 0.1384776267268424, + "grad_norm": 0.8852075338363647, + "learning_rate": 9.890725986971994e-06, + "loss": 0.8821, + "step": 2516 + }, + { + "epoch": 0.13853266552919807, + "grad_norm": 0.804314374923706, + "learning_rate": 9.890635840915164e-06, + "loss": 0.8412, + "step": 2517 + }, + { + "epoch": 0.13858770433155373, + "grad_norm": 0.8242805600166321, + "learning_rate": 9.890545658101623e-06, + "loss": 0.8447, + "step": 2518 + }, + { + "epoch": 0.1386427431339094, + "grad_norm": 0.8385655879974365, + "learning_rate": 9.890455438532048e-06, + "loss": 0.8161, + "step": 2519 + }, + { + "epoch": 0.13869778193626506, + "grad_norm": 0.7950524687767029, + "learning_rate": 9.89036518220712e-06, + "loss": 0.8024, + "step": 2520 + }, + { + "epoch": 0.13875282073862072, + "grad_norm": 1.0031861066818237, + "learning_rate": 9.890274889127518e-06, + "loss": 0.8399, + "step": 2521 + }, + { + "epoch": 0.13880785954097638, + "grad_norm": 0.8403242230415344, + "learning_rate": 9.890184559293917e-06, + "loss": 0.8115, + "step": 2522 + }, + { + "epoch": 0.13886289834333204, + "grad_norm": 0.8389976024627686, + "learning_rate": 9.890094192706998e-06, + "loss": 0.9573, + "step": 2523 + }, + { + "epoch": 0.1389179371456877, + "grad_norm": 0.8408516645431519, + "learning_rate": 9.890003789367442e-06, + "loss": 0.8572, + "step": 2524 + }, + { + "epoch": 0.13897297594804336, + "grad_norm": 0.7607787251472473, + "learning_rate": 9.889913349275925e-06, + "loss": 0.8119, + "step": 2525 + }, + { + "epoch": 0.13902801475039903, + "grad_norm": 0.7696373462677002, + "learning_rate": 9.889822872433127e-06, + "loss": 0.8287, + "step": 2526 + }, + { + "epoch": 0.1390830535527547, + "grad_norm": 0.8518380522727966, + "learning_rate": 9.889732358839732e-06, + "loss": 0.9008, + "step": 2527 + }, + { + "epoch": 0.13913809235511035, + "grad_norm": 0.8851314783096313, + "learning_rate": 9.889641808496416e-06, + "loss": 0.8148, + "step": 2528 + }, + { + "epoch": 0.139193131157466, + "grad_norm": 0.9245797395706177, + "learning_rate": 9.889551221403862e-06, + "loss": 0.846, + "step": 2529 + }, + { + "epoch": 0.13924816995982167, + "grad_norm": 0.8445762991905212, + "learning_rate": 9.889460597562748e-06, + "loss": 0.8306, + "step": 2530 + }, + { + "epoch": 0.13930320876217733, + "grad_norm": 0.9149277806282043, + "learning_rate": 9.88936993697376e-06, + "loss": 0.8033, + "step": 2531 + }, + { + "epoch": 0.139358247564533, + "grad_norm": 0.894666850566864, + "learning_rate": 9.889279239637572e-06, + "loss": 0.8299, + "step": 2532 + }, + { + "epoch": 0.13941328636688866, + "grad_norm": 1.2897371053695679, + "learning_rate": 9.889188505554871e-06, + "loss": 0.7776, + "step": 2533 + }, + { + "epoch": 0.13946832516924432, + "grad_norm": 0.8927022218704224, + "learning_rate": 9.889097734726341e-06, + "loss": 0.8706, + "step": 2534 + }, + { + "epoch": 0.13952336397159998, + "grad_norm": 0.7688571214675903, + "learning_rate": 9.889006927152658e-06, + "loss": 0.8191, + "step": 2535 + }, + { + "epoch": 0.13957840277395564, + "grad_norm": 0.926671028137207, + "learning_rate": 9.88891608283451e-06, + "loss": 0.7489, + "step": 2536 + }, + { + "epoch": 0.1396334415763113, + "grad_norm": 0.8316965699195862, + "learning_rate": 9.888825201772577e-06, + "loss": 0.7783, + "step": 2537 + }, + { + "epoch": 0.13968848037866696, + "grad_norm": 0.8619750738143921, + "learning_rate": 9.88873428396754e-06, + "loss": 0.8269, + "step": 2538 + }, + { + "epoch": 0.13974351918102262, + "grad_norm": 0.8588540554046631, + "learning_rate": 9.888643329420086e-06, + "loss": 0.8133, + "step": 2539 + }, + { + "epoch": 0.13979855798337829, + "grad_norm": 0.7947841882705688, + "learning_rate": 9.8885523381309e-06, + "loss": 0.8041, + "step": 2540 + }, + { + "epoch": 0.13985359678573395, + "grad_norm": 0.8440257906913757, + "learning_rate": 9.888461310100661e-06, + "loss": 0.8324, + "step": 2541 + }, + { + "epoch": 0.1399086355880896, + "grad_norm": 0.7842260003089905, + "learning_rate": 9.888370245330055e-06, + "loss": 0.8031, + "step": 2542 + }, + { + "epoch": 0.13996367439044527, + "grad_norm": 0.8108223080635071, + "learning_rate": 9.888279143819768e-06, + "loss": 0.7998, + "step": 2543 + }, + { + "epoch": 0.14001871319280093, + "grad_norm": 0.9748625159263611, + "learning_rate": 9.888188005570482e-06, + "loss": 0.9553, + "step": 2544 + }, + { + "epoch": 0.1400737519951566, + "grad_norm": 0.8465562462806702, + "learning_rate": 9.888096830582883e-06, + "loss": 0.7884, + "step": 2545 + }, + { + "epoch": 0.14012879079751225, + "grad_norm": 0.9339833855628967, + "learning_rate": 9.88800561885766e-06, + "loss": 0.8135, + "step": 2546 + }, + { + "epoch": 0.14018382959986792, + "grad_norm": 0.7749297022819519, + "learning_rate": 9.887914370395492e-06, + "loss": 0.8411, + "step": 2547 + }, + { + "epoch": 0.14023886840222358, + "grad_norm": 0.862606942653656, + "learning_rate": 9.887823085197068e-06, + "loss": 0.7631, + "step": 2548 + }, + { + "epoch": 0.14029390720457924, + "grad_norm": 1.3383793830871582, + "learning_rate": 9.887731763263076e-06, + "loss": 0.7979, + "step": 2549 + }, + { + "epoch": 0.1403489460069349, + "grad_norm": 0.8092008233070374, + "learning_rate": 9.887640404594199e-06, + "loss": 0.7566, + "step": 2550 + }, + { + "epoch": 0.14040398480929056, + "grad_norm": 0.9233745336532593, + "learning_rate": 9.887549009191126e-06, + "loss": 0.8954, + "step": 2551 + }, + { + "epoch": 0.14045902361164622, + "grad_norm": 0.8533664345741272, + "learning_rate": 9.887457577054542e-06, + "loss": 0.8311, + "step": 2552 + }, + { + "epoch": 0.14051406241400188, + "grad_norm": 0.7679287791252136, + "learning_rate": 9.887366108185135e-06, + "loss": 0.7641, + "step": 2553 + }, + { + "epoch": 0.14056910121635754, + "grad_norm": 0.7998354434967041, + "learning_rate": 9.887274602583594e-06, + "loss": 0.7759, + "step": 2554 + }, + { + "epoch": 0.1406241400187132, + "grad_norm": 0.8877138495445251, + "learning_rate": 9.887183060250605e-06, + "loss": 0.8928, + "step": 2555 + }, + { + "epoch": 0.14067917882106884, + "grad_norm": 0.8022066354751587, + "learning_rate": 9.887091481186855e-06, + "loss": 0.8233, + "step": 2556 + }, + { + "epoch": 0.1407342176234245, + "grad_norm": 0.8419097065925598, + "learning_rate": 9.886999865393035e-06, + "loss": 0.8044, + "step": 2557 + }, + { + "epoch": 0.14078925642578016, + "grad_norm": 0.9581286311149597, + "learning_rate": 9.88690821286983e-06, + "loss": 0.8531, + "step": 2558 + }, + { + "epoch": 0.14084429522813582, + "grad_norm": 0.894851803779602, + "learning_rate": 9.886816523617933e-06, + "loss": 0.8594, + "step": 2559 + }, + { + "epoch": 0.14089933403049149, + "grad_norm": 0.7813432812690735, + "learning_rate": 9.886724797638032e-06, + "loss": 0.7311, + "step": 2560 + }, + { + "epoch": 0.14095437283284715, + "grad_norm": 0.8194118142127991, + "learning_rate": 9.886633034930814e-06, + "loss": 0.8067, + "step": 2561 + }, + { + "epoch": 0.1410094116352028, + "grad_norm": 0.8091121912002563, + "learning_rate": 9.88654123549697e-06, + "loss": 0.7558, + "step": 2562 + }, + { + "epoch": 0.14106445043755847, + "grad_norm": 0.8334764242172241, + "learning_rate": 9.88644939933719e-06, + "loss": 0.8375, + "step": 2563 + }, + { + "epoch": 0.14111948923991413, + "grad_norm": 0.8283817768096924, + "learning_rate": 9.886357526452166e-06, + "loss": 0.7839, + "step": 2564 + }, + { + "epoch": 0.1411745280422698, + "grad_norm": 0.8708772659301758, + "learning_rate": 9.886265616842585e-06, + "loss": 0.8193, + "step": 2565 + }, + { + "epoch": 0.14122956684462545, + "grad_norm": 0.9883641600608826, + "learning_rate": 9.886173670509141e-06, + "loss": 0.9409, + "step": 2566 + }, + { + "epoch": 0.14128460564698112, + "grad_norm": 0.8601766228675842, + "learning_rate": 9.886081687452523e-06, + "loss": 0.9391, + "step": 2567 + }, + { + "epoch": 0.14133964444933678, + "grad_norm": 0.8729620575904846, + "learning_rate": 9.885989667673422e-06, + "loss": 0.8372, + "step": 2568 + }, + { + "epoch": 0.14139468325169244, + "grad_norm": 0.7899564504623413, + "learning_rate": 9.885897611172532e-06, + "loss": 0.7773, + "step": 2569 + }, + { + "epoch": 0.1414497220540481, + "grad_norm": 0.8120512962341309, + "learning_rate": 9.885805517950542e-06, + "loss": 0.887, + "step": 2570 + }, + { + "epoch": 0.14150476085640376, + "grad_norm": 0.8475256562232971, + "learning_rate": 9.885713388008148e-06, + "loss": 0.7935, + "step": 2571 + }, + { + "epoch": 0.14155979965875942, + "grad_norm": 0.7669919729232788, + "learning_rate": 9.885621221346038e-06, + "loss": 0.7728, + "step": 2572 + }, + { + "epoch": 0.14161483846111508, + "grad_norm": 0.8298916220664978, + "learning_rate": 9.885529017964906e-06, + "loss": 0.7723, + "step": 2573 + }, + { + "epoch": 0.14166987726347074, + "grad_norm": 0.8630721569061279, + "learning_rate": 9.885436777865447e-06, + "loss": 0.8395, + "step": 2574 + }, + { + "epoch": 0.1417249160658264, + "grad_norm": 0.7566008567810059, + "learning_rate": 9.885344501048352e-06, + "loss": 0.806, + "step": 2575 + }, + { + "epoch": 0.14177995486818207, + "grad_norm": 0.7870769500732422, + "learning_rate": 9.885252187514316e-06, + "loss": 0.7683, + "step": 2576 + }, + { + "epoch": 0.14183499367053773, + "grad_norm": 0.879648745059967, + "learning_rate": 9.885159837264033e-06, + "loss": 0.8472, + "step": 2577 + }, + { + "epoch": 0.1418900324728934, + "grad_norm": 0.76839280128479, + "learning_rate": 9.885067450298196e-06, + "loss": 0.8534, + "step": 2578 + }, + { + "epoch": 0.14194507127524905, + "grad_norm": 0.8268701434135437, + "learning_rate": 9.884975026617498e-06, + "loss": 0.7799, + "step": 2579 + }, + { + "epoch": 0.1420001100776047, + "grad_norm": 0.8226090669631958, + "learning_rate": 9.884882566222638e-06, + "loss": 0.6756, + "step": 2580 + }, + { + "epoch": 0.14205514887996037, + "grad_norm": 0.8299756050109863, + "learning_rate": 9.884790069114307e-06, + "loss": 0.734, + "step": 2581 + }, + { + "epoch": 0.14211018768231604, + "grad_norm": 0.8241812586784363, + "learning_rate": 9.8846975352932e-06, + "loss": 0.8335, + "step": 2582 + }, + { + "epoch": 0.1421652264846717, + "grad_norm": 0.8458926677703857, + "learning_rate": 9.884604964760016e-06, + "loss": 0.7376, + "step": 2583 + }, + { + "epoch": 0.14222026528702736, + "grad_norm": 0.876966655254364, + "learning_rate": 9.884512357515447e-06, + "loss": 0.9414, + "step": 2584 + }, + { + "epoch": 0.14227530408938302, + "grad_norm": 0.770252525806427, + "learning_rate": 9.88441971356019e-06, + "loss": 0.8312, + "step": 2585 + }, + { + "epoch": 0.14233034289173868, + "grad_norm": 0.7883023023605347, + "learning_rate": 9.884327032894945e-06, + "loss": 0.8568, + "step": 2586 + }, + { + "epoch": 0.14238538169409434, + "grad_norm": 0.9092289209365845, + "learning_rate": 9.884234315520405e-06, + "loss": 0.9078, + "step": 2587 + }, + { + "epoch": 0.14244042049645, + "grad_norm": 0.7946531176567078, + "learning_rate": 9.884141561437266e-06, + "loss": 0.6895, + "step": 2588 + }, + { + "epoch": 0.14249545929880567, + "grad_norm": 0.7791070342063904, + "learning_rate": 9.884048770646227e-06, + "loss": 0.6984, + "step": 2589 + }, + { + "epoch": 0.14255049810116133, + "grad_norm": 0.7775537371635437, + "learning_rate": 9.883955943147982e-06, + "loss": 0.7568, + "step": 2590 + }, + { + "epoch": 0.142605536903517, + "grad_norm": 0.7735158801078796, + "learning_rate": 9.883863078943234e-06, + "loss": 0.8215, + "step": 2591 + }, + { + "epoch": 0.14266057570587265, + "grad_norm": 0.881365180015564, + "learning_rate": 9.88377017803268e-06, + "loss": 0.8817, + "step": 2592 + }, + { + "epoch": 0.1427156145082283, + "grad_norm": 0.8643443584442139, + "learning_rate": 9.883677240417014e-06, + "loss": 0.8024, + "step": 2593 + }, + { + "epoch": 0.14277065331058397, + "grad_norm": 0.885713517665863, + "learning_rate": 9.883584266096938e-06, + "loss": 0.7612, + "step": 2594 + }, + { + "epoch": 0.14282569211293963, + "grad_norm": 0.771340012550354, + "learning_rate": 9.88349125507315e-06, + "loss": 0.8293, + "step": 2595 + }, + { + "epoch": 0.1428807309152953, + "grad_norm": 0.8284093737602234, + "learning_rate": 9.88339820734635e-06, + "loss": 0.8539, + "step": 2596 + }, + { + "epoch": 0.14293576971765096, + "grad_norm": 0.9597725868225098, + "learning_rate": 9.883305122917233e-06, + "loss": 0.9054, + "step": 2597 + }, + { + "epoch": 0.14299080852000662, + "grad_norm": 0.7552937269210815, + "learning_rate": 9.883212001786504e-06, + "loss": 0.8047, + "step": 2598 + }, + { + "epoch": 0.14304584732236225, + "grad_norm": 0.8008492588996887, + "learning_rate": 9.883118843954861e-06, + "loss": 0.802, + "step": 2599 + }, + { + "epoch": 0.1431008861247179, + "grad_norm": 0.8169753551483154, + "learning_rate": 9.883025649423003e-06, + "loss": 0.8837, + "step": 2600 + }, + { + "epoch": 0.14315592492707357, + "grad_norm": 0.8521036505699158, + "learning_rate": 9.882932418191632e-06, + "loss": 0.8266, + "step": 2601 + }, + { + "epoch": 0.14321096372942924, + "grad_norm": 0.8647341728210449, + "learning_rate": 9.882839150261449e-06, + "loss": 0.8949, + "step": 2602 + }, + { + "epoch": 0.1432660025317849, + "grad_norm": 0.9236162304878235, + "learning_rate": 9.882745845633153e-06, + "loss": 0.8474, + "step": 2603 + }, + { + "epoch": 0.14332104133414056, + "grad_norm": 0.8422677516937256, + "learning_rate": 9.882652504307445e-06, + "loss": 0.8396, + "step": 2604 + }, + { + "epoch": 0.14337608013649622, + "grad_norm": 0.902036190032959, + "learning_rate": 9.88255912628503e-06, + "loss": 0.8075, + "step": 2605 + }, + { + "epoch": 0.14343111893885188, + "grad_norm": 0.8972339630126953, + "learning_rate": 9.882465711566605e-06, + "loss": 0.8143, + "step": 2606 + }, + { + "epoch": 0.14348615774120754, + "grad_norm": 0.8025243282318115, + "learning_rate": 9.882372260152877e-06, + "loss": 0.771, + "step": 2607 + }, + { + "epoch": 0.1435411965435632, + "grad_norm": 0.8260911107063293, + "learning_rate": 9.882278772044545e-06, + "loss": 0.7679, + "step": 2608 + }, + { + "epoch": 0.14359623534591887, + "grad_norm": 0.8069774508476257, + "learning_rate": 9.882185247242313e-06, + "loss": 0.8489, + "step": 2609 + }, + { + "epoch": 0.14365127414827453, + "grad_norm": 0.8702567219734192, + "learning_rate": 9.882091685746883e-06, + "loss": 0.9258, + "step": 2610 + }, + { + "epoch": 0.1437063129506302, + "grad_norm": 0.8841683268547058, + "learning_rate": 9.881998087558959e-06, + "loss": 0.7858, + "step": 2611 + }, + { + "epoch": 0.14376135175298585, + "grad_norm": 0.7302986979484558, + "learning_rate": 9.881904452679246e-06, + "loss": 0.7339, + "step": 2612 + }, + { + "epoch": 0.1438163905553415, + "grad_norm": 0.7852466106414795, + "learning_rate": 9.881810781108442e-06, + "loss": 0.8397, + "step": 2613 + }, + { + "epoch": 0.14387142935769717, + "grad_norm": 0.7986249327659607, + "learning_rate": 9.881717072847258e-06, + "loss": 0.7573, + "step": 2614 + }, + { + "epoch": 0.14392646816005283, + "grad_norm": 0.750000536441803, + "learning_rate": 9.881623327896395e-06, + "loss": 0.7128, + "step": 2615 + }, + { + "epoch": 0.1439815069624085, + "grad_norm": 0.8796436786651611, + "learning_rate": 9.881529546256557e-06, + "loss": 0.9364, + "step": 2616 + }, + { + "epoch": 0.14403654576476416, + "grad_norm": 0.8621297478675842, + "learning_rate": 9.881435727928449e-06, + "loss": 0.9323, + "step": 2617 + }, + { + "epoch": 0.14409158456711982, + "grad_norm": 0.8213173151016235, + "learning_rate": 9.881341872912777e-06, + "loss": 0.7746, + "step": 2618 + }, + { + "epoch": 0.14414662336947548, + "grad_norm": 0.7761938571929932, + "learning_rate": 9.881247981210247e-06, + "loss": 0.8065, + "step": 2619 + }, + { + "epoch": 0.14420166217183114, + "grad_norm": 0.8333988785743713, + "learning_rate": 9.881154052821564e-06, + "loss": 0.8727, + "step": 2620 + }, + { + "epoch": 0.1442567009741868, + "grad_norm": 0.7263909578323364, + "learning_rate": 9.881060087747433e-06, + "loss": 0.8194, + "step": 2621 + }, + { + "epoch": 0.14431173977654246, + "grad_norm": 0.7472667098045349, + "learning_rate": 9.880966085988562e-06, + "loss": 0.77, + "step": 2622 + }, + { + "epoch": 0.14436677857889813, + "grad_norm": 0.7999943494796753, + "learning_rate": 9.880872047545656e-06, + "loss": 0.7936, + "step": 2623 + }, + { + "epoch": 0.1444218173812538, + "grad_norm": 0.8359610438346863, + "learning_rate": 9.88077797241942e-06, + "loss": 0.7946, + "step": 2624 + }, + { + "epoch": 0.14447685618360945, + "grad_norm": 0.8666403889656067, + "learning_rate": 9.880683860610566e-06, + "loss": 0.8152, + "step": 2625 + }, + { + "epoch": 0.1445318949859651, + "grad_norm": 0.7883741855621338, + "learning_rate": 9.880589712119797e-06, + "loss": 0.7972, + "step": 2626 + }, + { + "epoch": 0.14458693378832077, + "grad_norm": 0.8048827648162842, + "learning_rate": 9.880495526947824e-06, + "loss": 0.8221, + "step": 2627 + }, + { + "epoch": 0.14464197259067643, + "grad_norm": 0.718292236328125, + "learning_rate": 9.88040130509535e-06, + "loss": 0.7648, + "step": 2628 + }, + { + "epoch": 0.1446970113930321, + "grad_norm": 0.7748421430587769, + "learning_rate": 9.880307046563088e-06, + "loss": 0.8146, + "step": 2629 + }, + { + "epoch": 0.14475205019538775, + "grad_norm": 0.8015987873077393, + "learning_rate": 9.880212751351745e-06, + "loss": 0.7935, + "step": 2630 + }, + { + "epoch": 0.14480708899774342, + "grad_norm": 0.7628459930419922, + "learning_rate": 9.88011841946203e-06, + "loss": 0.7469, + "step": 2631 + }, + { + "epoch": 0.14486212780009908, + "grad_norm": 0.7152888774871826, + "learning_rate": 9.88002405089465e-06, + "loss": 0.7721, + "step": 2632 + }, + { + "epoch": 0.14491716660245474, + "grad_norm": 0.8075545430183411, + "learning_rate": 9.879929645650315e-06, + "loss": 0.8799, + "step": 2633 + }, + { + "epoch": 0.1449722054048104, + "grad_norm": 0.7981964945793152, + "learning_rate": 9.879835203729736e-06, + "loss": 0.8265, + "step": 2634 + }, + { + "epoch": 0.14502724420716606, + "grad_norm": 0.7699866890907288, + "learning_rate": 9.879740725133623e-06, + "loss": 0.8489, + "step": 2635 + }, + { + "epoch": 0.14508228300952172, + "grad_norm": 0.7991634011268616, + "learning_rate": 9.879646209862682e-06, + "loss": 0.8754, + "step": 2636 + }, + { + "epoch": 0.14513732181187738, + "grad_norm": 0.8284991383552551, + "learning_rate": 9.879551657917628e-06, + "loss": 0.811, + "step": 2637 + }, + { + "epoch": 0.14519236061423305, + "grad_norm": 0.9189227819442749, + "learning_rate": 9.87945706929917e-06, + "loss": 0.8486, + "step": 2638 + }, + { + "epoch": 0.1452473994165887, + "grad_norm": 0.8599026799201965, + "learning_rate": 9.879362444008018e-06, + "loss": 0.8383, + "step": 2639 + }, + { + "epoch": 0.14530243821894437, + "grad_norm": 0.8764603137969971, + "learning_rate": 9.879267782044885e-06, + "loss": 0.7918, + "step": 2640 + }, + { + "epoch": 0.14535747702130003, + "grad_norm": 0.8061341047286987, + "learning_rate": 9.87917308341048e-06, + "loss": 0.8292, + "step": 2641 + }, + { + "epoch": 0.14541251582365566, + "grad_norm": 1.031220555305481, + "learning_rate": 9.879078348105518e-06, + "loss": 0.6612, + "step": 2642 + }, + { + "epoch": 0.14546755462601133, + "grad_norm": 1.014491319656372, + "learning_rate": 9.878983576130708e-06, + "loss": 0.8512, + "step": 2643 + }, + { + "epoch": 0.145522593428367, + "grad_norm": 0.8365896940231323, + "learning_rate": 9.878888767486764e-06, + "loss": 0.7995, + "step": 2644 + }, + { + "epoch": 0.14557763223072265, + "grad_norm": 0.8086197972297668, + "learning_rate": 9.878793922174397e-06, + "loss": 0.8069, + "step": 2645 + }, + { + "epoch": 0.1456326710330783, + "grad_norm": 0.8075234889984131, + "learning_rate": 9.878699040194322e-06, + "loss": 0.8415, + "step": 2646 + }, + { + "epoch": 0.14568770983543397, + "grad_norm": 0.9413748979568481, + "learning_rate": 9.87860412154725e-06, + "loss": 0.7811, + "step": 2647 + }, + { + "epoch": 0.14574274863778963, + "grad_norm": 0.7744552493095398, + "learning_rate": 9.878509166233895e-06, + "loss": 0.7983, + "step": 2648 + }, + { + "epoch": 0.1457977874401453, + "grad_norm": 0.8184664845466614, + "learning_rate": 9.878414174254974e-06, + "loss": 0.8052, + "step": 2649 + }, + { + "epoch": 0.14585282624250095, + "grad_norm": 0.928814172744751, + "learning_rate": 9.878319145611195e-06, + "loss": 0.7695, + "step": 2650 + }, + { + "epoch": 0.14590786504485662, + "grad_norm": 0.9623318314552307, + "learning_rate": 9.878224080303276e-06, + "loss": 0.9025, + "step": 2651 + }, + { + "epoch": 0.14596290384721228, + "grad_norm": 0.866538405418396, + "learning_rate": 9.87812897833193e-06, + "loss": 0.7895, + "step": 2652 + }, + { + "epoch": 0.14601794264956794, + "grad_norm": 0.9248599410057068, + "learning_rate": 9.878033839697874e-06, + "loss": 0.8532, + "step": 2653 + }, + { + "epoch": 0.1460729814519236, + "grad_norm": 0.7866301536560059, + "learning_rate": 9.87793866440182e-06, + "loss": 0.8724, + "step": 2654 + }, + { + "epoch": 0.14612802025427926, + "grad_norm": 0.8471634387969971, + "learning_rate": 9.877843452444485e-06, + "loss": 0.9184, + "step": 2655 + }, + { + "epoch": 0.14618305905663492, + "grad_norm": 0.7367103695869446, + "learning_rate": 9.877748203826585e-06, + "loss": 0.7328, + "step": 2656 + }, + { + "epoch": 0.14623809785899058, + "grad_norm": 0.95980304479599, + "learning_rate": 9.877652918548834e-06, + "loss": 0.9274, + "step": 2657 + }, + { + "epoch": 0.14629313666134625, + "grad_norm": 1.0511064529418945, + "learning_rate": 9.87755759661195e-06, + "loss": 0.8223, + "step": 2658 + }, + { + "epoch": 0.1463481754637019, + "grad_norm": 0.7616510391235352, + "learning_rate": 9.877462238016649e-06, + "loss": 0.7473, + "step": 2659 + }, + { + "epoch": 0.14640321426605757, + "grad_norm": 0.7814056873321533, + "learning_rate": 9.877366842763647e-06, + "loss": 0.8898, + "step": 2660 + }, + { + "epoch": 0.14645825306841323, + "grad_norm": 0.8707298636436462, + "learning_rate": 9.877271410853662e-06, + "loss": 0.8792, + "step": 2661 + }, + { + "epoch": 0.1465132918707689, + "grad_norm": 0.8618701696395874, + "learning_rate": 9.877175942287409e-06, + "loss": 0.8761, + "step": 2662 + }, + { + "epoch": 0.14656833067312455, + "grad_norm": 0.9437732100486755, + "learning_rate": 9.877080437065609e-06, + "loss": 0.7922, + "step": 2663 + }, + { + "epoch": 0.14662336947548021, + "grad_norm": 0.9465780258178711, + "learning_rate": 9.876984895188976e-06, + "loss": 0.8449, + "step": 2664 + }, + { + "epoch": 0.14667840827783588, + "grad_norm": 0.7149911522865295, + "learning_rate": 9.876889316658233e-06, + "loss": 0.6408, + "step": 2665 + }, + { + "epoch": 0.14673344708019154, + "grad_norm": 0.9996811151504517, + "learning_rate": 9.876793701474092e-06, + "loss": 0.9324, + "step": 2666 + }, + { + "epoch": 0.1467884858825472, + "grad_norm": 0.7941329479217529, + "learning_rate": 9.876698049637277e-06, + "loss": 0.8115, + "step": 2667 + }, + { + "epoch": 0.14684352468490286, + "grad_norm": 0.754175066947937, + "learning_rate": 9.876602361148504e-06, + "loss": 0.7709, + "step": 2668 + }, + { + "epoch": 0.14689856348725852, + "grad_norm": 0.7867946624755859, + "learning_rate": 9.876506636008494e-06, + "loss": 0.8578, + "step": 2669 + }, + { + "epoch": 0.14695360228961418, + "grad_norm": 0.7441185116767883, + "learning_rate": 9.876410874217965e-06, + "loss": 0.8491, + "step": 2670 + }, + { + "epoch": 0.14700864109196984, + "grad_norm": 0.8414027690887451, + "learning_rate": 9.876315075777638e-06, + "loss": 0.8404, + "step": 2671 + }, + { + "epoch": 0.1470636798943255, + "grad_norm": 0.7911489009857178, + "learning_rate": 9.876219240688231e-06, + "loss": 0.8606, + "step": 2672 + }, + { + "epoch": 0.14711871869668117, + "grad_norm": 0.8601381778717041, + "learning_rate": 9.876123368950465e-06, + "loss": 0.7753, + "step": 2673 + }, + { + "epoch": 0.14717375749903683, + "grad_norm": 0.8672378659248352, + "learning_rate": 9.876027460565062e-06, + "loss": 0.7763, + "step": 2674 + }, + { + "epoch": 0.1472287963013925, + "grad_norm": 0.7192933559417725, + "learning_rate": 9.875931515532742e-06, + "loss": 0.7681, + "step": 2675 + }, + { + "epoch": 0.14728383510374815, + "grad_norm": 0.7483426332473755, + "learning_rate": 9.875835533854226e-06, + "loss": 0.8129, + "step": 2676 + }, + { + "epoch": 0.1473388739061038, + "grad_norm": 0.8883694410324097, + "learning_rate": 9.875739515530235e-06, + "loss": 0.8912, + "step": 2677 + }, + { + "epoch": 0.14739391270845947, + "grad_norm": 0.8440148234367371, + "learning_rate": 9.87564346056149e-06, + "loss": 0.8411, + "step": 2678 + }, + { + "epoch": 0.14744895151081513, + "grad_norm": 0.8916668891906738, + "learning_rate": 9.875547368948715e-06, + "loss": 0.8484, + "step": 2679 + }, + { + "epoch": 0.1475039903131708, + "grad_norm": 0.805258572101593, + "learning_rate": 9.875451240692631e-06, + "loss": 0.8172, + "step": 2680 + }, + { + "epoch": 0.14755902911552646, + "grad_norm": 0.8322305679321289, + "learning_rate": 9.87535507579396e-06, + "loss": 0.809, + "step": 2681 + }, + { + "epoch": 0.14761406791788212, + "grad_norm": 0.7320597767829895, + "learning_rate": 9.875258874253424e-06, + "loss": 0.7346, + "step": 2682 + }, + { + "epoch": 0.14766910672023778, + "grad_norm": 1.018036127090454, + "learning_rate": 9.875162636071749e-06, + "loss": 0.931, + "step": 2683 + }, + { + "epoch": 0.14772414552259344, + "grad_norm": 0.8601503968238831, + "learning_rate": 9.875066361249657e-06, + "loss": 0.7689, + "step": 2684 + }, + { + "epoch": 0.14777918432494908, + "grad_norm": 0.8478472232818604, + "learning_rate": 9.87497004978787e-06, + "loss": 0.9545, + "step": 2685 + }, + { + "epoch": 0.14783422312730474, + "grad_norm": 0.7510890364646912, + "learning_rate": 9.874873701687115e-06, + "loss": 0.7794, + "step": 2686 + }, + { + "epoch": 0.1478892619296604, + "grad_norm": 0.8226999044418335, + "learning_rate": 9.874777316948112e-06, + "loss": 0.8477, + "step": 2687 + }, + { + "epoch": 0.14794430073201606, + "grad_norm": 0.8284991979598999, + "learning_rate": 9.874680895571588e-06, + "loss": 0.8498, + "step": 2688 + }, + { + "epoch": 0.14799933953437172, + "grad_norm": 0.9007356762886047, + "learning_rate": 9.874584437558267e-06, + "loss": 0.8526, + "step": 2689 + }, + { + "epoch": 0.14805437833672738, + "grad_norm": 0.8770126104354858, + "learning_rate": 9.874487942908877e-06, + "loss": 0.844, + "step": 2690 + }, + { + "epoch": 0.14810941713908304, + "grad_norm": 1.1561466455459595, + "learning_rate": 9.874391411624138e-06, + "loss": 0.976, + "step": 2691 + }, + { + "epoch": 0.1481644559414387, + "grad_norm": 0.8162640929222107, + "learning_rate": 9.874294843704777e-06, + "loss": 0.8581, + "step": 2692 + }, + { + "epoch": 0.14821949474379437, + "grad_norm": 0.8308132290840149, + "learning_rate": 9.874198239151522e-06, + "loss": 0.8303, + "step": 2693 + }, + { + "epoch": 0.14827453354615003, + "grad_norm": 0.771024227142334, + "learning_rate": 9.874101597965098e-06, + "loss": 0.8351, + "step": 2694 + }, + { + "epoch": 0.1483295723485057, + "grad_norm": 0.7588162422180176, + "learning_rate": 9.874004920146232e-06, + "loss": 0.7858, + "step": 2695 + }, + { + "epoch": 0.14838461115086135, + "grad_norm": 0.8282446265220642, + "learning_rate": 9.873908205695648e-06, + "loss": 0.8465, + "step": 2696 + }, + { + "epoch": 0.148439649953217, + "grad_norm": 0.8342786431312561, + "learning_rate": 9.873811454614076e-06, + "loss": 0.8688, + "step": 2697 + }, + { + "epoch": 0.14849468875557267, + "grad_norm": 0.7957108020782471, + "learning_rate": 9.87371466690224e-06, + "loss": 0.8381, + "step": 2698 + }, + { + "epoch": 0.14854972755792833, + "grad_norm": 0.8763726353645325, + "learning_rate": 9.87361784256087e-06, + "loss": 0.8922, + "step": 2699 + }, + { + "epoch": 0.148604766360284, + "grad_norm": 0.7760055661201477, + "learning_rate": 9.873520981590693e-06, + "loss": 0.8384, + "step": 2700 + }, + { + "epoch": 0.14865980516263966, + "grad_norm": 0.9691097736358643, + "learning_rate": 9.873424083992436e-06, + "loss": 0.8581, + "step": 2701 + }, + { + "epoch": 0.14871484396499532, + "grad_norm": 0.9072558879852295, + "learning_rate": 9.87332714976683e-06, + "loss": 0.8942, + "step": 2702 + }, + { + "epoch": 0.14876988276735098, + "grad_norm": 0.8961714506149292, + "learning_rate": 9.8732301789146e-06, + "loss": 0.8062, + "step": 2703 + }, + { + "epoch": 0.14882492156970664, + "grad_norm": 1.4835050106048584, + "learning_rate": 9.873133171436477e-06, + "loss": 0.886, + "step": 2704 + }, + { + "epoch": 0.1488799603720623, + "grad_norm": 0.8153702616691589, + "learning_rate": 9.87303612733319e-06, + "loss": 0.8369, + "step": 2705 + }, + { + "epoch": 0.14893499917441796, + "grad_norm": 0.8755800724029541, + "learning_rate": 9.872939046605467e-06, + "loss": 0.7591, + "step": 2706 + }, + { + "epoch": 0.14899003797677363, + "grad_norm": 0.8173243403434753, + "learning_rate": 9.872841929254038e-06, + "loss": 0.8626, + "step": 2707 + }, + { + "epoch": 0.1490450767791293, + "grad_norm": 0.7891639471054077, + "learning_rate": 9.872744775279634e-06, + "loss": 0.737, + "step": 2708 + }, + { + "epoch": 0.14910011558148495, + "grad_norm": 1.0270631313323975, + "learning_rate": 9.872647584682985e-06, + "loss": 0.9202, + "step": 2709 + }, + { + "epoch": 0.1491551543838406, + "grad_norm": 0.7736123204231262, + "learning_rate": 9.872550357464822e-06, + "loss": 0.7835, + "step": 2710 + }, + { + "epoch": 0.14921019318619627, + "grad_norm": 0.7791550159454346, + "learning_rate": 9.872453093625873e-06, + "loss": 0.8375, + "step": 2711 + }, + { + "epoch": 0.14926523198855193, + "grad_norm": 0.8410583734512329, + "learning_rate": 9.872355793166872e-06, + "loss": 0.877, + "step": 2712 + }, + { + "epoch": 0.1493202707909076, + "grad_norm": 0.8277738094329834, + "learning_rate": 9.87225845608855e-06, + "loss": 0.7255, + "step": 2713 + }, + { + "epoch": 0.14937530959326326, + "grad_norm": 0.8617290258407593, + "learning_rate": 9.872161082391635e-06, + "loss": 0.7885, + "step": 2714 + }, + { + "epoch": 0.14943034839561892, + "grad_norm": 0.8866406679153442, + "learning_rate": 9.872063672076864e-06, + "loss": 0.8621, + "step": 2715 + }, + { + "epoch": 0.14948538719797458, + "grad_norm": 0.7581049799919128, + "learning_rate": 9.871966225144964e-06, + "loss": 0.8177, + "step": 2716 + }, + { + "epoch": 0.14954042600033024, + "grad_norm": 0.833696722984314, + "learning_rate": 9.871868741596673e-06, + "loss": 0.8382, + "step": 2717 + }, + { + "epoch": 0.1495954648026859, + "grad_norm": 1.0857365131378174, + "learning_rate": 9.871771221432718e-06, + "loss": 0.9254, + "step": 2718 + }, + { + "epoch": 0.14965050360504156, + "grad_norm": 0.7622446417808533, + "learning_rate": 9.871673664653837e-06, + "loss": 0.832, + "step": 2719 + }, + { + "epoch": 0.14970554240739722, + "grad_norm": 0.7436832785606384, + "learning_rate": 9.871576071260758e-06, + "loss": 0.7642, + "step": 2720 + }, + { + "epoch": 0.14976058120975289, + "grad_norm": 0.8547641634941101, + "learning_rate": 9.87147844125422e-06, + "loss": 0.7584, + "step": 2721 + }, + { + "epoch": 0.14981562001210855, + "grad_norm": 0.7634096145629883, + "learning_rate": 9.871380774634953e-06, + "loss": 0.8332, + "step": 2722 + }, + { + "epoch": 0.1498706588144642, + "grad_norm": 0.7949081063270569, + "learning_rate": 9.871283071403692e-06, + "loss": 0.7812, + "step": 2723 + }, + { + "epoch": 0.14992569761681987, + "grad_norm": 0.8089914321899414, + "learning_rate": 9.871185331561171e-06, + "loss": 0.8503, + "step": 2724 + }, + { + "epoch": 0.14998073641917553, + "grad_norm": 0.8451627492904663, + "learning_rate": 9.871087555108125e-06, + "loss": 0.9101, + "step": 2725 + }, + { + "epoch": 0.1500357752215312, + "grad_norm": 0.8399865627288818, + "learning_rate": 9.87098974204529e-06, + "loss": 0.8222, + "step": 2726 + }, + { + "epoch": 0.15009081402388685, + "grad_norm": 0.7786773443222046, + "learning_rate": 9.870891892373397e-06, + "loss": 0.8069, + "step": 2727 } ], "logging_steps": 1, @@ -12752,7 +19115,7 @@ "attributes": {} } }, - "total_flos": 5.365029428243988e+18, + "total_flos": 8.047544142365983e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null