diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,30752 @@ +{ + "best_global_step": 3816, + "best_metric": 0.46624913811683655, + "best_model_checkpoint": "saves/lora/llama-3-8b-instruct/train_codealpacapy_1754507520/checkpoint-3816", + "epoch": 10.0, + "eval_steps": 954, + "global_step": 19080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002620545073375262, + "grad_norm": 1.1329799890518188, + "learning_rate": 1.048218029350105e-07, + "loss": 1.2569, + "num_input_tokens_seen": 2944, + "step": 5 + }, + { + "epoch": 0.005241090146750524, + "grad_norm": 0.44206225872039795, + "learning_rate": 2.3584905660377358e-07, + "loss": 1.1164, + "num_input_tokens_seen": 6816, + "step": 10 + }, + { + "epoch": 0.007861635220125786, + "grad_norm": 0.7355162501335144, + "learning_rate": 3.6687631027253674e-07, + "loss": 1.2228, + "num_input_tokens_seen": 9760, + "step": 15 + }, + { + "epoch": 0.010482180293501049, + "grad_norm": 0.42181098461151123, + "learning_rate": 4.979035639412998e-07, + "loss": 1.1638, + "num_input_tokens_seen": 13472, + "step": 20 + }, + { + "epoch": 0.01310272536687631, + "grad_norm": 1.0762665271759033, + "learning_rate": 6.28930817610063e-07, + "loss": 1.2356, + "num_input_tokens_seen": 16864, + "step": 25 + }, + { + "epoch": 0.015723270440251572, + "grad_norm": 0.798069179058075, + "learning_rate": 7.59958071278826e-07, + "loss": 1.3566, + "num_input_tokens_seen": 20352, + "step": 30 + }, + { + "epoch": 0.018343815513626835, + "grad_norm": 0.9464090466499329, + "learning_rate": 8.90985324947589e-07, + "loss": 1.3357, + "num_input_tokens_seen": 23008, + "step": 35 + }, + { + "epoch": 0.020964360587002098, + "grad_norm": 0.7717921733856201, + "learning_rate": 1.0220125786163522e-06, + "loss": 1.075, + "num_input_tokens_seen": 25920, + "step": 40 + }, + { + "epoch": 0.02358490566037736, + "grad_norm": 1.1683149337768555, + "learning_rate": 1.1530398322851154e-06, + "loss": 1.2322, + "num_input_tokens_seen": 28672, + "step": 45 + }, + { + "epoch": 0.02620545073375262, + "grad_norm": 0.5954431891441345, + "learning_rate": 1.2840670859538784e-06, + "loss": 1.0773, + "num_input_tokens_seen": 31488, + "step": 50 + }, + { + "epoch": 0.028825995807127882, + "grad_norm": 0.886174738407135, + "learning_rate": 1.4150943396226415e-06, + "loss": 1.2132, + "num_input_tokens_seen": 34496, + "step": 55 + }, + { + "epoch": 0.031446540880503145, + "grad_norm": 0.5222442746162415, + "learning_rate": 1.5461215932914047e-06, + "loss": 1.291, + "num_input_tokens_seen": 38112, + "step": 60 + }, + { + "epoch": 0.034067085953878404, + "grad_norm": 0.8157989382743835, + "learning_rate": 1.677148846960168e-06, + "loss": 1.3672, + "num_input_tokens_seen": 40864, + "step": 65 + }, + { + "epoch": 0.03668763102725367, + "grad_norm": 0.4730589985847473, + "learning_rate": 1.8081761006289309e-06, + "loss": 1.2761, + "num_input_tokens_seen": 44192, + "step": 70 + }, + { + "epoch": 0.03930817610062893, + "grad_norm": 1.5447367429733276, + "learning_rate": 1.939203354297694e-06, + "loss": 1.3983, + "num_input_tokens_seen": 46784, + "step": 75 + }, + { + "epoch": 0.041928721174004195, + "grad_norm": 2.6971452236175537, + "learning_rate": 2.0702306079664572e-06, + "loss": 1.3794, + "num_input_tokens_seen": 50176, + "step": 80 + }, + { + "epoch": 0.044549266247379454, + "grad_norm": 0.5718836188316345, + "learning_rate": 2.20125786163522e-06, + "loss": 1.0783, + "num_input_tokens_seen": 54400, + "step": 85 + }, + { + "epoch": 0.04716981132075472, + "grad_norm": 1.485183835029602, + "learning_rate": 2.3322851153039836e-06, + "loss": 1.4152, + "num_input_tokens_seen": 56992, + "step": 90 + }, + { + "epoch": 0.04979035639412998, + "grad_norm": 1.5645537376403809, + "learning_rate": 2.4633123689727464e-06, + "loss": 0.9882, + "num_input_tokens_seen": 60896, + "step": 95 + }, + { + "epoch": 0.05241090146750524, + "grad_norm": 0.5941735506057739, + "learning_rate": 2.5943396226415095e-06, + "loss": 1.1127, + "num_input_tokens_seen": 64576, + "step": 100 + }, + { + "epoch": 0.055031446540880505, + "grad_norm": 2.248969316482544, + "learning_rate": 2.7253668763102727e-06, + "loss": 1.3204, + "num_input_tokens_seen": 67616, + "step": 105 + }, + { + "epoch": 0.057651991614255764, + "grad_norm": 0.772114098072052, + "learning_rate": 2.8563941299790355e-06, + "loss": 0.966, + "num_input_tokens_seen": 72544, + "step": 110 + }, + { + "epoch": 0.06027253668763103, + "grad_norm": 0.9727751612663269, + "learning_rate": 2.987421383647799e-06, + "loss": 1.1791, + "num_input_tokens_seen": 75648, + "step": 115 + }, + { + "epoch": 0.06289308176100629, + "grad_norm": 1.2088732719421387, + "learning_rate": 3.118448637316562e-06, + "loss": 1.2416, + "num_input_tokens_seen": 78368, + "step": 120 + }, + { + "epoch": 0.06551362683438156, + "grad_norm": 1.828019142150879, + "learning_rate": 3.249475890985325e-06, + "loss": 1.0691, + "num_input_tokens_seen": 81952, + "step": 125 + }, + { + "epoch": 0.06813417190775681, + "grad_norm": 1.4937382936477661, + "learning_rate": 3.380503144654088e-06, + "loss": 0.9815, + "num_input_tokens_seen": 84960, + "step": 130 + }, + { + "epoch": 0.07075471698113207, + "grad_norm": 1.791987657546997, + "learning_rate": 3.5115303983228514e-06, + "loss": 1.1367, + "num_input_tokens_seen": 88576, + "step": 135 + }, + { + "epoch": 0.07337526205450734, + "grad_norm": 1.5433217287063599, + "learning_rate": 3.642557651991614e-06, + "loss": 0.7881, + "num_input_tokens_seen": 92160, + "step": 140 + }, + { + "epoch": 0.0759958071278826, + "grad_norm": 0.8648506999015808, + "learning_rate": 3.7735849056603773e-06, + "loss": 0.955, + "num_input_tokens_seen": 95840, + "step": 145 + }, + { + "epoch": 0.07861635220125786, + "grad_norm": 1.545575737953186, + "learning_rate": 3.9046121593291405e-06, + "loss": 0.7917, + "num_input_tokens_seen": 98816, + "step": 150 + }, + { + "epoch": 0.08123689727463312, + "grad_norm": 0.8775690197944641, + "learning_rate": 4.035639412997904e-06, + "loss": 0.6595, + "num_input_tokens_seen": 102880, + "step": 155 + }, + { + "epoch": 0.08385744234800839, + "grad_norm": 1.759342908859253, + "learning_rate": 4.166666666666667e-06, + "loss": 0.8159, + "num_input_tokens_seen": 105920, + "step": 160 + }, + { + "epoch": 0.08647798742138364, + "grad_norm": 0.9140380024909973, + "learning_rate": 4.29769392033543e-06, + "loss": 0.8719, + "num_input_tokens_seen": 108800, + "step": 165 + }, + { + "epoch": 0.08909853249475891, + "grad_norm": 0.860956072807312, + "learning_rate": 4.428721174004193e-06, + "loss": 0.7009, + "num_input_tokens_seen": 111968, + "step": 170 + }, + { + "epoch": 0.09171907756813417, + "grad_norm": 1.1469119787216187, + "learning_rate": 4.559748427672956e-06, + "loss": 0.6641, + "num_input_tokens_seen": 114976, + "step": 175 + }, + { + "epoch": 0.09433962264150944, + "grad_norm": 6.3839802742004395, + "learning_rate": 4.69077568134172e-06, + "loss": 0.7039, + "num_input_tokens_seen": 117568, + "step": 180 + }, + { + "epoch": 0.09696016771488469, + "grad_norm": 0.8989048004150391, + "learning_rate": 4.821802935010482e-06, + "loss": 0.5202, + "num_input_tokens_seen": 120384, + "step": 185 + }, + { + "epoch": 0.09958071278825996, + "grad_norm": 0.6958763003349304, + "learning_rate": 4.952830188679246e-06, + "loss": 0.5848, + "num_input_tokens_seen": 123680, + "step": 190 + }, + { + "epoch": 0.10220125786163523, + "grad_norm": 0.7917189002037048, + "learning_rate": 5.083857442348009e-06, + "loss": 0.631, + "num_input_tokens_seen": 127168, + "step": 195 + }, + { + "epoch": 0.10482180293501048, + "grad_norm": 0.8649274110794067, + "learning_rate": 5.2148846960167715e-06, + "loss": 0.4923, + "num_input_tokens_seen": 130176, + "step": 200 + }, + { + "epoch": 0.10744234800838574, + "grad_norm": 0.9330058097839355, + "learning_rate": 5.345911949685535e-06, + "loss": 0.5768, + "num_input_tokens_seen": 133056, + "step": 205 + }, + { + "epoch": 0.11006289308176101, + "grad_norm": 0.8483762741088867, + "learning_rate": 5.476939203354298e-06, + "loss": 0.4682, + "num_input_tokens_seen": 136864, + "step": 210 + }, + { + "epoch": 0.11268343815513626, + "grad_norm": 1.0640465021133423, + "learning_rate": 5.607966457023061e-06, + "loss": 0.6685, + "num_input_tokens_seen": 140064, + "step": 215 + }, + { + "epoch": 0.11530398322851153, + "grad_norm": 1.0772100687026978, + "learning_rate": 5.738993710691824e-06, + "loss": 0.613, + "num_input_tokens_seen": 143712, + "step": 220 + }, + { + "epoch": 0.1179245283018868, + "grad_norm": 0.7169722318649292, + "learning_rate": 5.870020964360588e-06, + "loss": 0.5398, + "num_input_tokens_seen": 146880, + "step": 225 + }, + { + "epoch": 0.12054507337526206, + "grad_norm": 1.0740278959274292, + "learning_rate": 6.0010482180293506e-06, + "loss": 0.5495, + "num_input_tokens_seen": 150560, + "step": 230 + }, + { + "epoch": 0.12316561844863731, + "grad_norm": 0.8666868805885315, + "learning_rate": 6.132075471698113e-06, + "loss": 0.7203, + "num_input_tokens_seen": 153664, + "step": 235 + }, + { + "epoch": 0.12578616352201258, + "grad_norm": 1.1675362586975098, + "learning_rate": 6.263102725366876e-06, + "loss": 0.501, + "num_input_tokens_seen": 157568, + "step": 240 + }, + { + "epoch": 0.12840670859538783, + "grad_norm": 0.8522683382034302, + "learning_rate": 6.3941299790356405e-06, + "loss": 0.5383, + "num_input_tokens_seen": 160512, + "step": 245 + }, + { + "epoch": 0.1310272536687631, + "grad_norm": 0.36474698781967163, + "learning_rate": 6.5251572327044024e-06, + "loss": 0.5235, + "num_input_tokens_seen": 164512, + "step": 250 + }, + { + "epoch": 0.13364779874213836, + "grad_norm": 2.245542049407959, + "learning_rate": 6.656184486373165e-06, + "loss": 0.4482, + "num_input_tokens_seen": 167200, + "step": 255 + }, + { + "epoch": 0.13626834381551362, + "grad_norm": 0.9947577118873596, + "learning_rate": 6.78721174004193e-06, + "loss": 0.5524, + "num_input_tokens_seen": 169984, + "step": 260 + }, + { + "epoch": 0.1388888888888889, + "grad_norm": 0.7001338601112366, + "learning_rate": 6.918238993710692e-06, + "loss": 0.4988, + "num_input_tokens_seen": 173856, + "step": 265 + }, + { + "epoch": 0.14150943396226415, + "grad_norm": 1.1030653715133667, + "learning_rate": 7.049266247379454e-06, + "loss": 0.5879, + "num_input_tokens_seen": 176192, + "step": 270 + }, + { + "epoch": 0.1441299790356394, + "grad_norm": 0.4992482662200928, + "learning_rate": 7.180293501048219e-06, + "loss": 0.4094, + "num_input_tokens_seen": 179840, + "step": 275 + }, + { + "epoch": 0.14675052410901468, + "grad_norm": 0.7680016160011292, + "learning_rate": 7.3113207547169815e-06, + "loss": 0.4894, + "num_input_tokens_seen": 182976, + "step": 280 + }, + { + "epoch": 0.14937106918238993, + "grad_norm": 0.7882890105247498, + "learning_rate": 7.442348008385745e-06, + "loss": 0.5322, + "num_input_tokens_seen": 186400, + "step": 285 + }, + { + "epoch": 0.1519916142557652, + "grad_norm": 0.9793110489845276, + "learning_rate": 7.573375262054508e-06, + "loss": 0.5422, + "num_input_tokens_seen": 190144, + "step": 290 + }, + { + "epoch": 0.15461215932914046, + "grad_norm": 1.391627311706543, + "learning_rate": 7.70440251572327e-06, + "loss": 0.5559, + "num_input_tokens_seen": 193344, + "step": 295 + }, + { + "epoch": 0.15723270440251572, + "grad_norm": 1.2318620681762695, + "learning_rate": 7.835429769392034e-06, + "loss": 0.3757, + "num_input_tokens_seen": 195904, + "step": 300 + }, + { + "epoch": 0.159853249475891, + "grad_norm": 1.2254750728607178, + "learning_rate": 7.966457023060797e-06, + "loss": 0.5078, + "num_input_tokens_seen": 198944, + "step": 305 + }, + { + "epoch": 0.16247379454926625, + "grad_norm": 1.7892051935195923, + "learning_rate": 8.09748427672956e-06, + "loss": 0.5101, + "num_input_tokens_seen": 201792, + "step": 310 + }, + { + "epoch": 0.1650943396226415, + "grad_norm": 0.9085080027580261, + "learning_rate": 8.228511530398324e-06, + "loss": 0.4689, + "num_input_tokens_seen": 206080, + "step": 315 + }, + { + "epoch": 0.16771488469601678, + "grad_norm": 0.7474177479743958, + "learning_rate": 8.359538784067087e-06, + "loss": 0.5962, + "num_input_tokens_seen": 209376, + "step": 320 + }, + { + "epoch": 0.17033542976939203, + "grad_norm": 1.0011258125305176, + "learning_rate": 8.49056603773585e-06, + "loss": 0.5748, + "num_input_tokens_seen": 212160, + "step": 325 + }, + { + "epoch": 0.17295597484276728, + "grad_norm": 0.6904563903808594, + "learning_rate": 8.621593291404612e-06, + "loss": 0.5387, + "num_input_tokens_seen": 216128, + "step": 330 + }, + { + "epoch": 0.17557651991614256, + "grad_norm": 0.7396156191825867, + "learning_rate": 8.752620545073375e-06, + "loss": 0.6544, + "num_input_tokens_seen": 219232, + "step": 335 + }, + { + "epoch": 0.17819706498951782, + "grad_norm": 0.8651964068412781, + "learning_rate": 8.883647798742138e-06, + "loss": 0.4495, + "num_input_tokens_seen": 222464, + "step": 340 + }, + { + "epoch": 0.18081761006289307, + "grad_norm": 0.7717373967170715, + "learning_rate": 9.014675052410902e-06, + "loss": 0.7009, + "num_input_tokens_seen": 225184, + "step": 345 + }, + { + "epoch": 0.18343815513626835, + "grad_norm": 0.7471546530723572, + "learning_rate": 9.145702306079665e-06, + "loss": 0.4536, + "num_input_tokens_seen": 228000, + "step": 350 + }, + { + "epoch": 0.1860587002096436, + "grad_norm": 0.7326545119285583, + "learning_rate": 9.276729559748428e-06, + "loss": 0.4279, + "num_input_tokens_seen": 230944, + "step": 355 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 1.2235350608825684, + "learning_rate": 9.40775681341719e-06, + "loss": 0.4566, + "num_input_tokens_seen": 233728, + "step": 360 + }, + { + "epoch": 0.19129979035639413, + "grad_norm": 0.9556767344474792, + "learning_rate": 9.538784067085953e-06, + "loss": 0.4894, + "num_input_tokens_seen": 236480, + "step": 365 + }, + { + "epoch": 0.19392033542976939, + "grad_norm": 1.3948463201522827, + "learning_rate": 9.669811320754718e-06, + "loss": 0.467, + "num_input_tokens_seen": 239488, + "step": 370 + }, + { + "epoch": 0.19654088050314467, + "grad_norm": 1.1374164819717407, + "learning_rate": 9.80083857442348e-06, + "loss": 0.5921, + "num_input_tokens_seen": 242144, + "step": 375 + }, + { + "epoch": 0.19916142557651992, + "grad_norm": 0.7017232179641724, + "learning_rate": 9.931865828092243e-06, + "loss": 0.5052, + "num_input_tokens_seen": 245216, + "step": 380 + }, + { + "epoch": 0.20178197064989517, + "grad_norm": 0.961683452129364, + "learning_rate": 1.0062893081761008e-05, + "loss": 0.6477, + "num_input_tokens_seen": 249632, + "step": 385 + }, + { + "epoch": 0.20440251572327045, + "grad_norm": 0.5592197179794312, + "learning_rate": 1.019392033542977e-05, + "loss": 0.542, + "num_input_tokens_seen": 253312, + "step": 390 + }, + { + "epoch": 0.2070230607966457, + "grad_norm": 0.7247169017791748, + "learning_rate": 1.0324947589098532e-05, + "loss": 0.4957, + "num_input_tokens_seen": 256384, + "step": 395 + }, + { + "epoch": 0.20964360587002095, + "grad_norm": 6.50959587097168, + "learning_rate": 1.0455974842767296e-05, + "loss": 0.4748, + "num_input_tokens_seen": 259744, + "step": 400 + }, + { + "epoch": 0.21226415094339623, + "grad_norm": 1.0247383117675781, + "learning_rate": 1.0587002096436059e-05, + "loss": 0.7088, + "num_input_tokens_seen": 263456, + "step": 405 + }, + { + "epoch": 0.2148846960167715, + "grad_norm": 1.0258949995040894, + "learning_rate": 1.0718029350104822e-05, + "loss": 0.5006, + "num_input_tokens_seen": 266496, + "step": 410 + }, + { + "epoch": 0.21750524109014674, + "grad_norm": 1.0033241510391235, + "learning_rate": 1.0849056603773586e-05, + "loss": 0.595, + "num_input_tokens_seen": 269600, + "step": 415 + }, + { + "epoch": 0.22012578616352202, + "grad_norm": 0.8237833976745605, + "learning_rate": 1.0980083857442349e-05, + "loss": 0.4878, + "num_input_tokens_seen": 273152, + "step": 420 + }, + { + "epoch": 0.22274633123689727, + "grad_norm": 0.8439940214157104, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.6407, + "num_input_tokens_seen": 276992, + "step": 425 + }, + { + "epoch": 0.22536687631027252, + "grad_norm": 1.3609600067138672, + "learning_rate": 1.1242138364779874e-05, + "loss": 0.6209, + "num_input_tokens_seen": 279296, + "step": 430 + }, + { + "epoch": 0.2279874213836478, + "grad_norm": 0.5186997652053833, + "learning_rate": 1.1373165618448637e-05, + "loss": 0.4143, + "num_input_tokens_seen": 283296, + "step": 435 + }, + { + "epoch": 0.23060796645702306, + "grad_norm": 1.6151751279830933, + "learning_rate": 1.1504192872117402e-05, + "loss": 0.6658, + "num_input_tokens_seen": 286016, + "step": 440 + }, + { + "epoch": 0.23322851153039834, + "grad_norm": 1.689827561378479, + "learning_rate": 1.1635220125786164e-05, + "loss": 0.5545, + "num_input_tokens_seen": 289248, + "step": 445 + }, + { + "epoch": 0.2358490566037736, + "grad_norm": 0.684535026550293, + "learning_rate": 1.1766247379454927e-05, + "loss": 0.5986, + "num_input_tokens_seen": 292160, + "step": 450 + }, + { + "epoch": 0.23846960167714884, + "grad_norm": 1.055497407913208, + "learning_rate": 1.1897274633123692e-05, + "loss": 0.5218, + "num_input_tokens_seen": 295296, + "step": 455 + }, + { + "epoch": 0.24109014675052412, + "grad_norm": 0.9728020429611206, + "learning_rate": 1.2028301886792454e-05, + "loss": 0.5788, + "num_input_tokens_seen": 299360, + "step": 460 + }, + { + "epoch": 0.24371069182389937, + "grad_norm": 0.6924186944961548, + "learning_rate": 1.2159329140461215e-05, + "loss": 0.4819, + "num_input_tokens_seen": 303072, + "step": 465 + }, + { + "epoch": 0.24633123689727462, + "grad_norm": 0.7920917272567749, + "learning_rate": 1.229035639412998e-05, + "loss": 0.4638, + "num_input_tokens_seen": 307616, + "step": 470 + }, + { + "epoch": 0.2489517819706499, + "grad_norm": 0.7843032479286194, + "learning_rate": 1.2421383647798743e-05, + "loss": 0.4584, + "num_input_tokens_seen": 310112, + "step": 475 + }, + { + "epoch": 0.25157232704402516, + "grad_norm": 0.9971636533737183, + "learning_rate": 1.2552410901467507e-05, + "loss": 0.4612, + "num_input_tokens_seen": 313472, + "step": 480 + }, + { + "epoch": 0.25419287211740044, + "grad_norm": 0.9198432564735413, + "learning_rate": 1.2683438155136268e-05, + "loss": 0.4928, + "num_input_tokens_seen": 316128, + "step": 485 + }, + { + "epoch": 0.25681341719077566, + "grad_norm": 0.7736453413963318, + "learning_rate": 1.2814465408805033e-05, + "loss": 0.4552, + "num_input_tokens_seen": 319584, + "step": 490 + }, + { + "epoch": 0.25943396226415094, + "grad_norm": 0.6796345114707947, + "learning_rate": 1.2945492662473795e-05, + "loss": 0.5348, + "num_input_tokens_seen": 322496, + "step": 495 + }, + { + "epoch": 0.2620545073375262, + "grad_norm": 0.8698579668998718, + "learning_rate": 1.3076519916142556e-05, + "loss": 0.7985, + "num_input_tokens_seen": 325120, + "step": 500 + }, + { + "epoch": 0.26467505241090145, + "grad_norm": 1.1811022758483887, + "learning_rate": 1.320754716981132e-05, + "loss": 0.5888, + "num_input_tokens_seen": 327840, + "step": 505 + }, + { + "epoch": 0.2672955974842767, + "grad_norm": 0.8138794302940369, + "learning_rate": 1.3338574423480085e-05, + "loss": 0.4123, + "num_input_tokens_seen": 331776, + "step": 510 + }, + { + "epoch": 0.269916142557652, + "grad_norm": 1.991489291191101, + "learning_rate": 1.3469601677148846e-05, + "loss": 0.5027, + "num_input_tokens_seen": 336544, + "step": 515 + }, + { + "epoch": 0.27253668763102723, + "grad_norm": 0.7809500098228455, + "learning_rate": 1.360062893081761e-05, + "loss": 0.6585, + "num_input_tokens_seen": 340768, + "step": 520 + }, + { + "epoch": 0.2751572327044025, + "grad_norm": 1.2404972314834595, + "learning_rate": 1.3731656184486375e-05, + "loss": 0.6478, + "num_input_tokens_seen": 343296, + "step": 525 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.8562243580818176, + "learning_rate": 1.3862683438155136e-05, + "loss": 0.3793, + "num_input_tokens_seen": 346848, + "step": 530 + }, + { + "epoch": 0.280398322851153, + "grad_norm": 0.8456347584724426, + "learning_rate": 1.3993710691823899e-05, + "loss": 0.537, + "num_input_tokens_seen": 349632, + "step": 535 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 1.036941409111023, + "learning_rate": 1.4124737945492664e-05, + "loss": 0.5075, + "num_input_tokens_seen": 353024, + "step": 540 + }, + { + "epoch": 0.2856394129979036, + "grad_norm": 0.8966272473335266, + "learning_rate": 1.4255765199161425e-05, + "loss": 0.496, + "num_input_tokens_seen": 355552, + "step": 545 + }, + { + "epoch": 0.2882599580712788, + "grad_norm": 1.1913000345230103, + "learning_rate": 1.4386792452830189e-05, + "loss": 0.4617, + "num_input_tokens_seen": 360128, + "step": 550 + }, + { + "epoch": 0.2908805031446541, + "grad_norm": 0.6880296468734741, + "learning_rate": 1.4517819706498954e-05, + "loss": 0.5104, + "num_input_tokens_seen": 363360, + "step": 555 + }, + { + "epoch": 0.29350104821802936, + "grad_norm": 1.043920874595642, + "learning_rate": 1.4648846960167716e-05, + "loss": 0.507, + "num_input_tokens_seen": 366144, + "step": 560 + }, + { + "epoch": 0.29612159329140464, + "grad_norm": 0.916695237159729, + "learning_rate": 1.4779874213836479e-05, + "loss": 0.5337, + "num_input_tokens_seen": 369216, + "step": 565 + }, + { + "epoch": 0.29874213836477986, + "grad_norm": 1.5181792974472046, + "learning_rate": 1.4910901467505242e-05, + "loss": 0.5856, + "num_input_tokens_seen": 372512, + "step": 570 + }, + { + "epoch": 0.30136268343815514, + "grad_norm": 1.5025418996810913, + "learning_rate": 1.5041928721174006e-05, + "loss": 0.5356, + "num_input_tokens_seen": 374976, + "step": 575 + }, + { + "epoch": 0.3039832285115304, + "grad_norm": 1.8223342895507812, + "learning_rate": 1.5172955974842767e-05, + "loss": 0.4309, + "num_input_tokens_seen": 377632, + "step": 580 + }, + { + "epoch": 0.30660377358490565, + "grad_norm": 0.8003756403923035, + "learning_rate": 1.530398322851153e-05, + "loss": 0.6588, + "num_input_tokens_seen": 380768, + "step": 585 + }, + { + "epoch": 0.30922431865828093, + "grad_norm": 0.9612182974815369, + "learning_rate": 1.5435010482180296e-05, + "loss": 0.4924, + "num_input_tokens_seen": 383520, + "step": 590 + }, + { + "epoch": 0.3118448637316562, + "grad_norm": 2.175323486328125, + "learning_rate": 1.5566037735849056e-05, + "loss": 0.7615, + "num_input_tokens_seen": 386496, + "step": 595 + }, + { + "epoch": 0.31446540880503143, + "grad_norm": 1.0355792045593262, + "learning_rate": 1.5697064989517822e-05, + "loss": 0.5797, + "num_input_tokens_seen": 390464, + "step": 600 + }, + { + "epoch": 0.3170859538784067, + "grad_norm": 1.0655884742736816, + "learning_rate": 1.5828092243186584e-05, + "loss": 0.4388, + "num_input_tokens_seen": 393440, + "step": 605 + }, + { + "epoch": 0.319706498951782, + "grad_norm": 0.9436463713645935, + "learning_rate": 1.5959119496855347e-05, + "loss": 0.4367, + "num_input_tokens_seen": 398336, + "step": 610 + }, + { + "epoch": 0.3223270440251572, + "grad_norm": 1.6125998497009277, + "learning_rate": 1.609014675052411e-05, + "loss": 0.4054, + "num_input_tokens_seen": 401504, + "step": 615 + }, + { + "epoch": 0.3249475890985325, + "grad_norm": 1.5156126022338867, + "learning_rate": 1.6221174004192873e-05, + "loss": 0.4886, + "num_input_tokens_seen": 404864, + "step": 620 + }, + { + "epoch": 0.3275681341719078, + "grad_norm": 0.6368691325187683, + "learning_rate": 1.6352201257861635e-05, + "loss": 0.5544, + "num_input_tokens_seen": 409056, + "step": 625 + }, + { + "epoch": 0.330188679245283, + "grad_norm": 1.0655624866485596, + "learning_rate": 1.6483228511530398e-05, + "loss": 0.552, + "num_input_tokens_seen": 412544, + "step": 630 + }, + { + "epoch": 0.3328092243186583, + "grad_norm": 1.7277559041976929, + "learning_rate": 1.6614255765199164e-05, + "loss": 0.5259, + "num_input_tokens_seen": 415520, + "step": 635 + }, + { + "epoch": 0.33542976939203356, + "grad_norm": 1.4159108400344849, + "learning_rate": 1.6745283018867924e-05, + "loss": 0.9164, + "num_input_tokens_seen": 417760, + "step": 640 + }, + { + "epoch": 0.3380503144654088, + "grad_norm": 0.6982753276824951, + "learning_rate": 1.687631027253669e-05, + "loss": 0.5559, + "num_input_tokens_seen": 421728, + "step": 645 + }, + { + "epoch": 0.34067085953878407, + "grad_norm": 5.861119747161865, + "learning_rate": 1.7007337526205453e-05, + "loss": 0.4972, + "num_input_tokens_seen": 425664, + "step": 650 + }, + { + "epoch": 0.34329140461215935, + "grad_norm": 1.1906477212905884, + "learning_rate": 1.7138364779874212e-05, + "loss": 0.4526, + "num_input_tokens_seen": 428896, + "step": 655 + }, + { + "epoch": 0.34591194968553457, + "grad_norm": 1.570335865020752, + "learning_rate": 1.7269392033542978e-05, + "loss": 0.5117, + "num_input_tokens_seen": 431744, + "step": 660 + }, + { + "epoch": 0.34853249475890985, + "grad_norm": 0.9298561811447144, + "learning_rate": 1.740041928721174e-05, + "loss": 0.5271, + "num_input_tokens_seen": 435520, + "step": 665 + }, + { + "epoch": 0.35115303983228513, + "grad_norm": 0.7308741807937622, + "learning_rate": 1.7531446540880504e-05, + "loss": 0.4673, + "num_input_tokens_seen": 438592, + "step": 670 + }, + { + "epoch": 0.35377358490566035, + "grad_norm": 0.8939027786254883, + "learning_rate": 1.7662473794549266e-05, + "loss": 0.5195, + "num_input_tokens_seen": 441664, + "step": 675 + }, + { + "epoch": 0.35639412997903563, + "grad_norm": 1.619928240776062, + "learning_rate": 1.779350104821803e-05, + "loss": 0.5729, + "num_input_tokens_seen": 444832, + "step": 680 + }, + { + "epoch": 0.3590146750524109, + "grad_norm": 1.1400911808013916, + "learning_rate": 1.7924528301886792e-05, + "loss": 0.5938, + "num_input_tokens_seen": 447392, + "step": 685 + }, + { + "epoch": 0.36163522012578614, + "grad_norm": 1.3737945556640625, + "learning_rate": 1.8055555555555555e-05, + "loss": 0.547, + "num_input_tokens_seen": 450528, + "step": 690 + }, + { + "epoch": 0.3642557651991614, + "grad_norm": 1.1419432163238525, + "learning_rate": 1.818658280922432e-05, + "loss": 0.4085, + "num_input_tokens_seen": 453504, + "step": 695 + }, + { + "epoch": 0.3668763102725367, + "grad_norm": 1.363502860069275, + "learning_rate": 1.831761006289308e-05, + "loss": 0.6111, + "num_input_tokens_seen": 456288, + "step": 700 + }, + { + "epoch": 0.3694968553459119, + "grad_norm": 1.540914535522461, + "learning_rate": 1.8448637316561846e-05, + "loss": 0.3956, + "num_input_tokens_seen": 459840, + "step": 705 + }, + { + "epoch": 0.3721174004192872, + "grad_norm": 0.7595596313476562, + "learning_rate": 1.857966457023061e-05, + "loss": 0.4125, + "num_input_tokens_seen": 463168, + "step": 710 + }, + { + "epoch": 0.3747379454926625, + "grad_norm": 1.0865188837051392, + "learning_rate": 1.8710691823899372e-05, + "loss": 0.478, + "num_input_tokens_seen": 466112, + "step": 715 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 1.4115959405899048, + "learning_rate": 1.8841719077568135e-05, + "loss": 0.629, + "num_input_tokens_seen": 469568, + "step": 720 + }, + { + "epoch": 0.379979035639413, + "grad_norm": 0.9635273814201355, + "learning_rate": 1.8972746331236897e-05, + "loss": 0.4086, + "num_input_tokens_seen": 472896, + "step": 725 + }, + { + "epoch": 0.38259958071278827, + "grad_norm": 1.1972815990447998, + "learning_rate": 1.9103773584905664e-05, + "loss": 0.325, + "num_input_tokens_seen": 475744, + "step": 730 + }, + { + "epoch": 0.38522012578616355, + "grad_norm": 0.5146929621696472, + "learning_rate": 1.9234800838574423e-05, + "loss": 0.5138, + "num_input_tokens_seen": 481216, + "step": 735 + }, + { + "epoch": 0.38784067085953877, + "grad_norm": 1.2256056070327759, + "learning_rate": 1.936582809224319e-05, + "loss": 0.5292, + "num_input_tokens_seen": 483488, + "step": 740 + }, + { + "epoch": 0.39046121593291405, + "grad_norm": 0.9205886721611023, + "learning_rate": 1.9496855345911952e-05, + "loss": 0.5691, + "num_input_tokens_seen": 487072, + "step": 745 + }, + { + "epoch": 0.39308176100628933, + "grad_norm": 0.8949298858642578, + "learning_rate": 1.9627882599580715e-05, + "loss": 0.5145, + "num_input_tokens_seen": 490496, + "step": 750 + }, + { + "epoch": 0.39570230607966456, + "grad_norm": 0.8545956611633301, + "learning_rate": 1.9758909853249477e-05, + "loss": 0.4523, + "num_input_tokens_seen": 493824, + "step": 755 + }, + { + "epoch": 0.39832285115303984, + "grad_norm": 1.2030757665634155, + "learning_rate": 1.988993710691824e-05, + "loss": 0.6662, + "num_input_tokens_seen": 496192, + "step": 760 + }, + { + "epoch": 0.4009433962264151, + "grad_norm": 0.8649848103523254, + "learning_rate": 2.0020964360587003e-05, + "loss": 0.5718, + "num_input_tokens_seen": 498880, + "step": 765 + }, + { + "epoch": 0.40356394129979034, + "grad_norm": 0.8810229301452637, + "learning_rate": 2.0151991614255766e-05, + "loss": 0.5095, + "num_input_tokens_seen": 502016, + "step": 770 + }, + { + "epoch": 0.4061844863731656, + "grad_norm": 0.6040758490562439, + "learning_rate": 2.0283018867924532e-05, + "loss": 0.5094, + "num_input_tokens_seen": 505120, + "step": 775 + }, + { + "epoch": 0.4088050314465409, + "grad_norm": 1.0357438325881958, + "learning_rate": 2.041404612159329e-05, + "loss": 0.4895, + "num_input_tokens_seen": 508256, + "step": 780 + }, + { + "epoch": 0.4114255765199161, + "grad_norm": 1.027489185333252, + "learning_rate": 2.0545073375262054e-05, + "loss": 0.4881, + "num_input_tokens_seen": 511776, + "step": 785 + }, + { + "epoch": 0.4140461215932914, + "grad_norm": 2.0480377674102783, + "learning_rate": 2.067610062893082e-05, + "loss": 0.49, + "num_input_tokens_seen": 514464, + "step": 790 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 1.03103768825531, + "learning_rate": 2.080712788259958e-05, + "loss": 0.4869, + "num_input_tokens_seen": 518144, + "step": 795 + }, + { + "epoch": 0.4192872117400419, + "grad_norm": 1.3457624912261963, + "learning_rate": 2.0938155136268346e-05, + "loss": 0.5128, + "num_input_tokens_seen": 520768, + "step": 800 + }, + { + "epoch": 0.4219077568134172, + "grad_norm": 1.4920198917388916, + "learning_rate": 2.106918238993711e-05, + "loss": 0.6324, + "num_input_tokens_seen": 523808, + "step": 805 + }, + { + "epoch": 0.42452830188679247, + "grad_norm": 1.1666946411132812, + "learning_rate": 2.120020964360587e-05, + "loss": 0.3684, + "num_input_tokens_seen": 527136, + "step": 810 + }, + { + "epoch": 0.4271488469601677, + "grad_norm": 2.379878282546997, + "learning_rate": 2.1331236897274634e-05, + "loss": 0.613, + "num_input_tokens_seen": 530240, + "step": 815 + }, + { + "epoch": 0.429769392033543, + "grad_norm": 1.343873143196106, + "learning_rate": 2.1462264150943397e-05, + "loss": 0.502, + "num_input_tokens_seen": 534496, + "step": 820 + }, + { + "epoch": 0.43238993710691825, + "grad_norm": 1.6689718961715698, + "learning_rate": 2.159329140461216e-05, + "loss": 0.5265, + "num_input_tokens_seen": 538624, + "step": 825 + }, + { + "epoch": 0.4350104821802935, + "grad_norm": 1.6153682470321655, + "learning_rate": 2.1724318658280922e-05, + "loss": 0.5027, + "num_input_tokens_seen": 542112, + "step": 830 + }, + { + "epoch": 0.43763102725366876, + "grad_norm": 1.0619564056396484, + "learning_rate": 2.1855345911949688e-05, + "loss": 0.6202, + "num_input_tokens_seen": 545408, + "step": 835 + }, + { + "epoch": 0.44025157232704404, + "grad_norm": 0.7109149694442749, + "learning_rate": 2.1986373165618448e-05, + "loss": 0.494, + "num_input_tokens_seen": 548416, + "step": 840 + }, + { + "epoch": 0.44287211740041926, + "grad_norm": 1.0601003170013428, + "learning_rate": 2.2117400419287214e-05, + "loss": 0.5325, + "num_input_tokens_seen": 551264, + "step": 845 + }, + { + "epoch": 0.44549266247379454, + "grad_norm": 0.754993736743927, + "learning_rate": 2.2248427672955977e-05, + "loss": 0.4137, + "num_input_tokens_seen": 554592, + "step": 850 + }, + { + "epoch": 0.4481132075471698, + "grad_norm": 1.3108211755752563, + "learning_rate": 2.237945492662474e-05, + "loss": 0.3737, + "num_input_tokens_seen": 558304, + "step": 855 + }, + { + "epoch": 0.45073375262054505, + "grad_norm": 1.086026906967163, + "learning_rate": 2.2510482180293502e-05, + "loss": 0.5376, + "num_input_tokens_seen": 561248, + "step": 860 + }, + { + "epoch": 0.4533542976939203, + "grad_norm": 1.2115050554275513, + "learning_rate": 2.2641509433962265e-05, + "loss": 0.3515, + "num_input_tokens_seen": 564352, + "step": 865 + }, + { + "epoch": 0.4559748427672956, + "grad_norm": 2.1600208282470703, + "learning_rate": 2.2772536687631028e-05, + "loss": 0.4744, + "num_input_tokens_seen": 567712, + "step": 870 + }, + { + "epoch": 0.4585953878406709, + "grad_norm": 1.6078932285308838, + "learning_rate": 2.290356394129979e-05, + "loss": 0.5253, + "num_input_tokens_seen": 570240, + "step": 875 + }, + { + "epoch": 0.4612159329140461, + "grad_norm": 1.805027723312378, + "learning_rate": 2.3034591194968556e-05, + "loss": 0.5407, + "num_input_tokens_seen": 572768, + "step": 880 + }, + { + "epoch": 0.4638364779874214, + "grad_norm": 1.3988354206085205, + "learning_rate": 2.316561844863732e-05, + "loss": 0.548, + "num_input_tokens_seen": 576512, + "step": 885 + }, + { + "epoch": 0.46645702306079667, + "grad_norm": 1.2440086603164673, + "learning_rate": 2.329664570230608e-05, + "loss": 0.5338, + "num_input_tokens_seen": 579264, + "step": 890 + }, + { + "epoch": 0.4690775681341719, + "grad_norm": 0.8754032254219055, + "learning_rate": 2.3427672955974845e-05, + "loss": 0.5145, + "num_input_tokens_seen": 581440, + "step": 895 + }, + { + "epoch": 0.4716981132075472, + "grad_norm": 0.8028460144996643, + "learning_rate": 2.3558700209643607e-05, + "loss": 0.3516, + "num_input_tokens_seen": 584544, + "step": 900 + }, + { + "epoch": 0.47431865828092246, + "grad_norm": 1.3208998441696167, + "learning_rate": 2.368972746331237e-05, + "loss": 0.5138, + "num_input_tokens_seen": 588160, + "step": 905 + }, + { + "epoch": 0.4769392033542977, + "grad_norm": 2.019037961959839, + "learning_rate": 2.3820754716981133e-05, + "loss": 0.4542, + "num_input_tokens_seen": 590400, + "step": 910 + }, + { + "epoch": 0.47955974842767296, + "grad_norm": 1.0043576955795288, + "learning_rate": 2.39517819706499e-05, + "loss": 0.4432, + "num_input_tokens_seen": 593440, + "step": 915 + }, + { + "epoch": 0.48218029350104824, + "grad_norm": 1.3934003114700317, + "learning_rate": 2.408280922431866e-05, + "loss": 0.4644, + "num_input_tokens_seen": 596480, + "step": 920 + }, + { + "epoch": 0.48480083857442346, + "grad_norm": 0.925674557685852, + "learning_rate": 2.421383647798742e-05, + "loss": 0.4517, + "num_input_tokens_seen": 599392, + "step": 925 + }, + { + "epoch": 0.48742138364779874, + "grad_norm": 1.2982006072998047, + "learning_rate": 2.4344863731656187e-05, + "loss": 0.3984, + "num_input_tokens_seen": 602432, + "step": 930 + }, + { + "epoch": 0.490041928721174, + "grad_norm": 0.9743600487709045, + "learning_rate": 2.4475890985324947e-05, + "loss": 0.4459, + "num_input_tokens_seen": 604512, + "step": 935 + }, + { + "epoch": 0.49266247379454925, + "grad_norm": 0.6091085076332092, + "learning_rate": 2.4606918238993713e-05, + "loss": 0.4298, + "num_input_tokens_seen": 608864, + "step": 940 + }, + { + "epoch": 0.49528301886792453, + "grad_norm": 1.3479996919631958, + "learning_rate": 2.4737945492662476e-05, + "loss": 0.4996, + "num_input_tokens_seen": 612448, + "step": 945 + }, + { + "epoch": 0.4979035639412998, + "grad_norm": 3.3380227088928223, + "learning_rate": 2.486897274633124e-05, + "loss": 0.4542, + "num_input_tokens_seen": 614944, + "step": 950 + }, + { + "epoch": 0.5, + "eval_loss": 0.49471116065979004, + "eval_runtime": 15.9455, + "eval_samples_per_second": 53.181, + "eval_steps_per_second": 13.295, + "num_input_tokens_seen": 616992, + "step": 954 + }, + { + "epoch": 0.500524109014675, + "grad_norm": 1.3500741720199585, + "learning_rate": 2.5e-05, + "loss": 0.4662, + "num_input_tokens_seen": 617472, + "step": 955 + }, + { + "epoch": 0.5031446540880503, + "grad_norm": 1.7900996208190918, + "learning_rate": 2.5131027253668764e-05, + "loss": 0.4149, + "num_input_tokens_seen": 620192, + "step": 960 + }, + { + "epoch": 0.5057651991614256, + "grad_norm": 0.8886694312095642, + "learning_rate": 2.526205450733753e-05, + "loss": 0.5272, + "num_input_tokens_seen": 623232, + "step": 965 + }, + { + "epoch": 0.5083857442348009, + "grad_norm": 1.4506951570510864, + "learning_rate": 2.5393081761006293e-05, + "loss": 0.6347, + "num_input_tokens_seen": 626016, + "step": 970 + }, + { + "epoch": 0.5110062893081762, + "grad_norm": 1.1908502578735352, + "learning_rate": 2.5524109014675052e-05, + "loss": 0.4504, + "num_input_tokens_seen": 629472, + "step": 975 + }, + { + "epoch": 0.5136268343815513, + "grad_norm": 1.2886977195739746, + "learning_rate": 2.5655136268343815e-05, + "loss": 0.5337, + "num_input_tokens_seen": 632608, + "step": 980 + }, + { + "epoch": 0.5162473794549266, + "grad_norm": 1.4067749977111816, + "learning_rate": 2.578616352201258e-05, + "loss": 0.3889, + "num_input_tokens_seen": 636192, + "step": 985 + }, + { + "epoch": 0.5188679245283019, + "grad_norm": 1.0364179611206055, + "learning_rate": 2.5917190775681344e-05, + "loss": 0.4645, + "num_input_tokens_seen": 640544, + "step": 990 + }, + { + "epoch": 0.5214884696016772, + "grad_norm": 1.254272699356079, + "learning_rate": 2.6048218029350107e-05, + "loss": 0.4517, + "num_input_tokens_seen": 644448, + "step": 995 + }, + { + "epoch": 0.5241090146750524, + "grad_norm": 0.7881894707679749, + "learning_rate": 2.6179245283018873e-05, + "loss": 0.4407, + "num_input_tokens_seen": 648288, + "step": 1000 + }, + { + "epoch": 0.5267295597484277, + "grad_norm": 2.1381165981292725, + "learning_rate": 2.631027253668763e-05, + "loss": 0.628, + "num_input_tokens_seen": 651296, + "step": 1005 + }, + { + "epoch": 0.5293501048218029, + "grad_norm": 0.8951306343078613, + "learning_rate": 2.6441299790356395e-05, + "loss": 0.3911, + "num_input_tokens_seen": 654176, + "step": 1010 + }, + { + "epoch": 0.5319706498951782, + "grad_norm": 1.0670815706253052, + "learning_rate": 2.6572327044025158e-05, + "loss": 0.5192, + "num_input_tokens_seen": 657664, + "step": 1015 + }, + { + "epoch": 0.5345911949685535, + "grad_norm": 1.1627840995788574, + "learning_rate": 2.6703354297693924e-05, + "loss": 0.5797, + "num_input_tokens_seen": 660064, + "step": 1020 + }, + { + "epoch": 0.5372117400419287, + "grad_norm": 1.4620821475982666, + "learning_rate": 2.6834381551362687e-05, + "loss": 0.5739, + "num_input_tokens_seen": 662880, + "step": 1025 + }, + { + "epoch": 0.539832285115304, + "grad_norm": 1.1687567234039307, + "learning_rate": 2.696540880503145e-05, + "loss": 0.4157, + "num_input_tokens_seen": 665920, + "step": 1030 + }, + { + "epoch": 0.5424528301886793, + "grad_norm": 1.5211421251296997, + "learning_rate": 2.709643605870021e-05, + "loss": 0.4259, + "num_input_tokens_seen": 668864, + "step": 1035 + }, + { + "epoch": 0.5450733752620545, + "grad_norm": 1.349327564239502, + "learning_rate": 2.722746331236897e-05, + "loss": 0.4429, + "num_input_tokens_seen": 671776, + "step": 1040 + }, + { + "epoch": 0.5476939203354297, + "grad_norm": 1.204770565032959, + "learning_rate": 2.7358490566037738e-05, + "loss": 0.5242, + "num_input_tokens_seen": 674912, + "step": 1045 + }, + { + "epoch": 0.550314465408805, + "grad_norm": 0.8893855214118958, + "learning_rate": 2.74895178197065e-05, + "loss": 0.3789, + "num_input_tokens_seen": 678368, + "step": 1050 + }, + { + "epoch": 0.5529350104821803, + "grad_norm": 0.7021135091781616, + "learning_rate": 2.7620545073375263e-05, + "loss": 0.5393, + "num_input_tokens_seen": 682336, + "step": 1055 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 2.1586036682128906, + "learning_rate": 2.775157232704403e-05, + "loss": 0.4741, + "num_input_tokens_seen": 685760, + "step": 1060 + }, + { + "epoch": 0.5581761006289309, + "grad_norm": 1.6409112215042114, + "learning_rate": 2.788259958071279e-05, + "loss": 0.4036, + "num_input_tokens_seen": 688480, + "step": 1065 + }, + { + "epoch": 0.560796645702306, + "grad_norm": 1.0242619514465332, + "learning_rate": 2.801362683438155e-05, + "loss": 0.5662, + "num_input_tokens_seen": 692064, + "step": 1070 + }, + { + "epoch": 0.5634171907756813, + "grad_norm": 1.8925127983093262, + "learning_rate": 2.8144654088050314e-05, + "loss": 0.5548, + "num_input_tokens_seen": 694464, + "step": 1075 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 1.0462273359298706, + "learning_rate": 2.827568134171908e-05, + "loss": 0.4853, + "num_input_tokens_seen": 697344, + "step": 1080 + }, + { + "epoch": 0.5686582809224319, + "grad_norm": 1.519573450088501, + "learning_rate": 2.8406708595387843e-05, + "loss": 0.4577, + "num_input_tokens_seen": 700256, + "step": 1085 + }, + { + "epoch": 0.5712788259958071, + "grad_norm": 0.9503867030143738, + "learning_rate": 2.8537735849056606e-05, + "loss": 0.4372, + "num_input_tokens_seen": 703968, + "step": 1090 + }, + { + "epoch": 0.5738993710691824, + "grad_norm": 0.8229944705963135, + "learning_rate": 2.8668763102725365e-05, + "loss": 0.4578, + "num_input_tokens_seen": 708160, + "step": 1095 + }, + { + "epoch": 0.5765199161425576, + "grad_norm": 1.165449857711792, + "learning_rate": 2.8799790356394128e-05, + "loss": 0.6707, + "num_input_tokens_seen": 710976, + "step": 1100 + }, + { + "epoch": 0.5791404612159329, + "grad_norm": 2.6347739696502686, + "learning_rate": 2.8930817610062894e-05, + "loss": 0.5778, + "num_input_tokens_seen": 713696, + "step": 1105 + }, + { + "epoch": 0.5817610062893082, + "grad_norm": 1.617785096168518, + "learning_rate": 2.9061844863731657e-05, + "loss": 0.4256, + "num_input_tokens_seen": 716608, + "step": 1110 + }, + { + "epoch": 0.5843815513626834, + "grad_norm": 1.4327939748764038, + "learning_rate": 2.9192872117400423e-05, + "loss": 0.4735, + "num_input_tokens_seen": 719488, + "step": 1115 + }, + { + "epoch": 0.5870020964360587, + "grad_norm": 0.880470335483551, + "learning_rate": 2.9323899371069186e-05, + "loss": 0.375, + "num_input_tokens_seen": 722304, + "step": 1120 + }, + { + "epoch": 0.589622641509434, + "grad_norm": 1.223748803138733, + "learning_rate": 2.945492662473795e-05, + "loss": 0.4008, + "num_input_tokens_seen": 725472, + "step": 1125 + }, + { + "epoch": 0.5922431865828093, + "grad_norm": 1.068710446357727, + "learning_rate": 2.9585953878406708e-05, + "loss": 0.3536, + "num_input_tokens_seen": 728896, + "step": 1130 + }, + { + "epoch": 0.5948637316561844, + "grad_norm": 0.8500136733055115, + "learning_rate": 2.971698113207547e-05, + "loss": 0.3943, + "num_input_tokens_seen": 732000, + "step": 1135 + }, + { + "epoch": 0.5974842767295597, + "grad_norm": 1.1471060514450073, + "learning_rate": 2.9848008385744237e-05, + "loss": 0.5403, + "num_input_tokens_seen": 735328, + "step": 1140 + }, + { + "epoch": 0.600104821802935, + "grad_norm": 0.8781283497810364, + "learning_rate": 2.9979035639413e-05, + "loss": 0.3363, + "num_input_tokens_seen": 738528, + "step": 1145 + }, + { + "epoch": 0.6027253668763103, + "grad_norm": 1.1101824045181274, + "learning_rate": 3.0110062893081766e-05, + "loss": 0.5633, + "num_input_tokens_seen": 741376, + "step": 1150 + }, + { + "epoch": 0.6053459119496856, + "grad_norm": 1.0582693815231323, + "learning_rate": 3.024109014675053e-05, + "loss": 0.4941, + "num_input_tokens_seen": 743872, + "step": 1155 + }, + { + "epoch": 0.6079664570230608, + "grad_norm": 1.0258983373641968, + "learning_rate": 3.0372117400419288e-05, + "loss": 0.504, + "num_input_tokens_seen": 746720, + "step": 1160 + }, + { + "epoch": 0.610587002096436, + "grad_norm": 1.0611976385116577, + "learning_rate": 3.050314465408805e-05, + "loss": 0.5397, + "num_input_tokens_seen": 750240, + "step": 1165 + }, + { + "epoch": 0.6132075471698113, + "grad_norm": 1.260906457901001, + "learning_rate": 3.063417190775681e-05, + "loss": 0.472, + "num_input_tokens_seen": 753344, + "step": 1170 + }, + { + "epoch": 0.6158280922431866, + "grad_norm": 0.9108681082725525, + "learning_rate": 3.076519916142558e-05, + "loss": 0.4478, + "num_input_tokens_seen": 755968, + "step": 1175 + }, + { + "epoch": 0.6184486373165619, + "grad_norm": 0.825501561164856, + "learning_rate": 3.0896226415094346e-05, + "loss": 0.5576, + "num_input_tokens_seen": 759520, + "step": 1180 + }, + { + "epoch": 0.6210691823899371, + "grad_norm": 1.200038194656372, + "learning_rate": 3.1027253668763105e-05, + "loss": 0.4801, + "num_input_tokens_seen": 762048, + "step": 1185 + }, + { + "epoch": 0.6236897274633124, + "grad_norm": 1.3916493654251099, + "learning_rate": 3.1158280922431864e-05, + "loss": 0.5691, + "num_input_tokens_seen": 765504, + "step": 1190 + }, + { + "epoch": 0.6263102725366876, + "grad_norm": 0.9444966316223145, + "learning_rate": 3.128930817610063e-05, + "loss": 0.443, + "num_input_tokens_seen": 768192, + "step": 1195 + }, + { + "epoch": 0.6289308176100629, + "grad_norm": 1.0770262479782104, + "learning_rate": 3.142033542976939e-05, + "loss": 0.487, + "num_input_tokens_seen": 771168, + "step": 1200 + }, + { + "epoch": 0.6315513626834381, + "grad_norm": 0.8401181697845459, + "learning_rate": 3.1551362683438156e-05, + "loss": 0.2963, + "num_input_tokens_seen": 777504, + "step": 1205 + }, + { + "epoch": 0.6341719077568134, + "grad_norm": 1.367497444152832, + "learning_rate": 3.168238993710692e-05, + "loss": 0.4401, + "num_input_tokens_seen": 780512, + "step": 1210 + }, + { + "epoch": 0.6367924528301887, + "grad_norm": 1.3445719480514526, + "learning_rate": 3.181341719077569e-05, + "loss": 0.4998, + "num_input_tokens_seen": 784192, + "step": 1215 + }, + { + "epoch": 0.639412997903564, + "grad_norm": 1.1309243440628052, + "learning_rate": 3.194444444444444e-05, + "loss": 0.5548, + "num_input_tokens_seen": 787552, + "step": 1220 + }, + { + "epoch": 0.6420335429769392, + "grad_norm": 0.8281728625297546, + "learning_rate": 3.207547169811321e-05, + "loss": 0.6041, + "num_input_tokens_seen": 790720, + "step": 1225 + }, + { + "epoch": 0.6446540880503144, + "grad_norm": 1.6736555099487305, + "learning_rate": 3.220649895178197e-05, + "loss": 0.5532, + "num_input_tokens_seen": 793696, + "step": 1230 + }, + { + "epoch": 0.6472746331236897, + "grad_norm": 1.513587236404419, + "learning_rate": 3.233752620545073e-05, + "loss": 0.5273, + "num_input_tokens_seen": 796640, + "step": 1235 + }, + { + "epoch": 0.649895178197065, + "grad_norm": 1.8802945613861084, + "learning_rate": 3.24685534591195e-05, + "loss": 0.5041, + "num_input_tokens_seen": 799648, + "step": 1240 + }, + { + "epoch": 0.6525157232704403, + "grad_norm": 1.6726081371307373, + "learning_rate": 3.2599580712788265e-05, + "loss": 0.3328, + "num_input_tokens_seen": 802496, + "step": 1245 + }, + { + "epoch": 0.6551362683438156, + "grad_norm": 1.1021676063537598, + "learning_rate": 3.2730607966457024e-05, + "loss": 0.5131, + "num_input_tokens_seen": 805760, + "step": 1250 + }, + { + "epoch": 0.6577568134171907, + "grad_norm": 2.38026762008667, + "learning_rate": 3.2861635220125784e-05, + "loss": 0.5153, + "num_input_tokens_seen": 808832, + "step": 1255 + }, + { + "epoch": 0.660377358490566, + "grad_norm": 1.2496347427368164, + "learning_rate": 3.299266247379455e-05, + "loss": 0.4344, + "num_input_tokens_seen": 812256, + "step": 1260 + }, + { + "epoch": 0.6629979035639413, + "grad_norm": 1.4650437831878662, + "learning_rate": 3.3123689727463316e-05, + "loss": 0.4477, + "num_input_tokens_seen": 815680, + "step": 1265 + }, + { + "epoch": 0.6656184486373166, + "grad_norm": 0.717955470085144, + "learning_rate": 3.3254716981132075e-05, + "loss": 0.3542, + "num_input_tokens_seen": 818400, + "step": 1270 + }, + { + "epoch": 0.6682389937106918, + "grad_norm": 1.1214158535003662, + "learning_rate": 3.338574423480084e-05, + "loss": 0.541, + "num_input_tokens_seen": 821536, + "step": 1275 + }, + { + "epoch": 0.6708595387840671, + "grad_norm": 1.4707571268081665, + "learning_rate": 3.351677148846961e-05, + "loss": 0.4598, + "num_input_tokens_seen": 825024, + "step": 1280 + }, + { + "epoch": 0.6734800838574424, + "grad_norm": 0.7733718156814575, + "learning_rate": 3.364779874213837e-05, + "loss": 0.4414, + "num_input_tokens_seen": 829248, + "step": 1285 + }, + { + "epoch": 0.6761006289308176, + "grad_norm": 1.0387868881225586, + "learning_rate": 3.3778825995807126e-05, + "loss": 0.4079, + "num_input_tokens_seen": 831872, + "step": 1290 + }, + { + "epoch": 0.6787211740041929, + "grad_norm": 1.2547913789749146, + "learning_rate": 3.390985324947589e-05, + "loss": 0.5671, + "num_input_tokens_seen": 834880, + "step": 1295 + }, + { + "epoch": 0.6813417190775681, + "grad_norm": 0.8173801302909851, + "learning_rate": 3.404088050314466e-05, + "loss": 0.5557, + "num_input_tokens_seen": 837824, + "step": 1300 + }, + { + "epoch": 0.6839622641509434, + "grad_norm": 0.9595037698745728, + "learning_rate": 3.417190775681342e-05, + "loss": 0.4656, + "num_input_tokens_seen": 840896, + "step": 1305 + }, + { + "epoch": 0.6865828092243187, + "grad_norm": 1.0143568515777588, + "learning_rate": 3.4302935010482184e-05, + "loss": 0.6657, + "num_input_tokens_seen": 844000, + "step": 1310 + }, + { + "epoch": 0.689203354297694, + "grad_norm": 0.6064816117286682, + "learning_rate": 3.4433962264150943e-05, + "loss": 0.4463, + "num_input_tokens_seen": 847840, + "step": 1315 + }, + { + "epoch": 0.6918238993710691, + "grad_norm": 0.908545970916748, + "learning_rate": 3.456498951781971e-05, + "loss": 0.5581, + "num_input_tokens_seen": 850976, + "step": 1320 + }, + { + "epoch": 0.6944444444444444, + "grad_norm": 0.7620278000831604, + "learning_rate": 3.469601677148847e-05, + "loss": 0.4242, + "num_input_tokens_seen": 855616, + "step": 1325 + }, + { + "epoch": 0.6970649895178197, + "grad_norm": 0.9090112447738647, + "learning_rate": 3.4827044025157235e-05, + "loss": 0.3062, + "num_input_tokens_seen": 858528, + "step": 1330 + }, + { + "epoch": 0.699685534591195, + "grad_norm": 1.4122806787490845, + "learning_rate": 3.4958071278826e-05, + "loss": 0.485, + "num_input_tokens_seen": 862144, + "step": 1335 + }, + { + "epoch": 0.7023060796645703, + "grad_norm": 0.9676330089569092, + "learning_rate": 3.508909853249476e-05, + "loss": 0.5698, + "num_input_tokens_seen": 865440, + "step": 1340 + }, + { + "epoch": 0.7049266247379455, + "grad_norm": 1.0063731670379639, + "learning_rate": 3.522012578616352e-05, + "loss": 0.5007, + "num_input_tokens_seen": 868992, + "step": 1345 + }, + { + "epoch": 0.7075471698113207, + "grad_norm": 1.5224899053573608, + "learning_rate": 3.5351153039832286e-05, + "loss": 0.4733, + "num_input_tokens_seen": 871968, + "step": 1350 + }, + { + "epoch": 0.710167714884696, + "grad_norm": 1.152663230895996, + "learning_rate": 3.548218029350105e-05, + "loss": 0.4664, + "num_input_tokens_seen": 874912, + "step": 1355 + }, + { + "epoch": 0.7127882599580713, + "grad_norm": 1.1674985885620117, + "learning_rate": 3.561320754716981e-05, + "loss": 0.508, + "num_input_tokens_seen": 878112, + "step": 1360 + }, + { + "epoch": 0.7154088050314465, + "grad_norm": 1.4763439893722534, + "learning_rate": 3.574423480083858e-05, + "loss": 0.4877, + "num_input_tokens_seen": 880928, + "step": 1365 + }, + { + "epoch": 0.7180293501048218, + "grad_norm": 0.7789502143859863, + "learning_rate": 3.5875262054507344e-05, + "loss": 0.5095, + "num_input_tokens_seen": 884832, + "step": 1370 + }, + { + "epoch": 0.7206498951781971, + "grad_norm": 0.6680644154548645, + "learning_rate": 3.6006289308176097e-05, + "loss": 0.4631, + "num_input_tokens_seen": 888800, + "step": 1375 + }, + { + "epoch": 0.7232704402515723, + "grad_norm": 0.8397420048713684, + "learning_rate": 3.613731656184486e-05, + "loss": 0.4751, + "num_input_tokens_seen": 892416, + "step": 1380 + }, + { + "epoch": 0.7258909853249476, + "grad_norm": 1.06929612159729, + "learning_rate": 3.626834381551363e-05, + "loss": 0.5614, + "num_input_tokens_seen": 895840, + "step": 1385 + }, + { + "epoch": 0.7285115303983228, + "grad_norm": 1.1642292737960815, + "learning_rate": 3.6399371069182395e-05, + "loss": 0.5073, + "num_input_tokens_seen": 899392, + "step": 1390 + }, + { + "epoch": 0.7311320754716981, + "grad_norm": 1.0081895589828491, + "learning_rate": 3.6530398322851154e-05, + "loss": 0.4382, + "num_input_tokens_seen": 903264, + "step": 1395 + }, + { + "epoch": 0.7337526205450734, + "grad_norm": 1.034812331199646, + "learning_rate": 3.666142557651992e-05, + "loss": 0.448, + "num_input_tokens_seen": 906016, + "step": 1400 + }, + { + "epoch": 0.7363731656184487, + "grad_norm": 1.4724457263946533, + "learning_rate": 3.679245283018868e-05, + "loss": 0.5822, + "num_input_tokens_seen": 909152, + "step": 1405 + }, + { + "epoch": 0.7389937106918238, + "grad_norm": 1.2866677045822144, + "learning_rate": 3.692348008385744e-05, + "loss": 0.5294, + "num_input_tokens_seen": 912416, + "step": 1410 + }, + { + "epoch": 0.7416142557651991, + "grad_norm": 1.2879081964492798, + "learning_rate": 3.7054507337526205e-05, + "loss": 0.6122, + "num_input_tokens_seen": 915616, + "step": 1415 + }, + { + "epoch": 0.7442348008385744, + "grad_norm": 1.8310232162475586, + "learning_rate": 3.718553459119497e-05, + "loss": 0.5061, + "num_input_tokens_seen": 921568, + "step": 1420 + }, + { + "epoch": 0.7468553459119497, + "grad_norm": 0.8084191083908081, + "learning_rate": 3.731656184486374e-05, + "loss": 0.4443, + "num_input_tokens_seen": 924864, + "step": 1425 + }, + { + "epoch": 0.749475890985325, + "grad_norm": 1.1850011348724365, + "learning_rate": 3.74475890985325e-05, + "loss": 0.5136, + "num_input_tokens_seen": 927968, + "step": 1430 + }, + { + "epoch": 0.7520964360587002, + "grad_norm": 1.0675768852233887, + "learning_rate": 3.757861635220126e-05, + "loss": 0.4996, + "num_input_tokens_seen": 931584, + "step": 1435 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 1.6253714561462402, + "learning_rate": 3.770964360587002e-05, + "loss": 0.4602, + "num_input_tokens_seen": 934816, + "step": 1440 + }, + { + "epoch": 0.7573375262054507, + "grad_norm": 1.2322351932525635, + "learning_rate": 3.784067085953878e-05, + "loss": 0.5247, + "num_input_tokens_seen": 937856, + "step": 1445 + }, + { + "epoch": 0.759958071278826, + "grad_norm": 1.2224183082580566, + "learning_rate": 3.797169811320755e-05, + "loss": 0.4997, + "num_input_tokens_seen": 945632, + "step": 1450 + }, + { + "epoch": 0.7625786163522013, + "grad_norm": 1.024849534034729, + "learning_rate": 3.8102725366876314e-05, + "loss": 0.5333, + "num_input_tokens_seen": 948608, + "step": 1455 + }, + { + "epoch": 0.7651991614255765, + "grad_norm": 1.0879597663879395, + "learning_rate": 3.8233752620545074e-05, + "loss": 0.3943, + "num_input_tokens_seen": 951488, + "step": 1460 + }, + { + "epoch": 0.7678197064989518, + "grad_norm": 1.083091139793396, + "learning_rate": 3.836477987421384e-05, + "loss": 0.4868, + "num_input_tokens_seen": 955072, + "step": 1465 + }, + { + "epoch": 0.7704402515723271, + "grad_norm": 1.3005597591400146, + "learning_rate": 3.84958071278826e-05, + "loss": 0.5597, + "num_input_tokens_seen": 958272, + "step": 1470 + }, + { + "epoch": 0.7730607966457023, + "grad_norm": 0.6882370114326477, + "learning_rate": 3.8626834381551365e-05, + "loss": 0.6423, + "num_input_tokens_seen": 961888, + "step": 1475 + }, + { + "epoch": 0.7756813417190775, + "grad_norm": 2.279172658920288, + "learning_rate": 3.8757861635220125e-05, + "loss": 0.6664, + "num_input_tokens_seen": 964256, + "step": 1480 + }, + { + "epoch": 0.7783018867924528, + "grad_norm": 1.3799962997436523, + "learning_rate": 3.888888888888889e-05, + "loss": 0.5052, + "num_input_tokens_seen": 968224, + "step": 1485 + }, + { + "epoch": 0.7809224318658281, + "grad_norm": 0.6687362790107727, + "learning_rate": 3.901991614255766e-05, + "loss": 0.4743, + "num_input_tokens_seen": 971136, + "step": 1490 + }, + { + "epoch": 0.7835429769392034, + "grad_norm": 1.2772607803344727, + "learning_rate": 3.9150943396226416e-05, + "loss": 0.485, + "num_input_tokens_seen": 975936, + "step": 1495 + }, + { + "epoch": 0.7861635220125787, + "grad_norm": 1.3253453969955444, + "learning_rate": 3.9281970649895176e-05, + "loss": 0.5117, + "num_input_tokens_seen": 979360, + "step": 1500 + }, + { + "epoch": 0.7887840670859538, + "grad_norm": 0.5450798869132996, + "learning_rate": 3.941299790356394e-05, + "loss": 0.4376, + "num_input_tokens_seen": 983616, + "step": 1505 + }, + { + "epoch": 0.7914046121593291, + "grad_norm": 1.1536977291107178, + "learning_rate": 3.954402515723271e-05, + "loss": 0.4546, + "num_input_tokens_seen": 986944, + "step": 1510 + }, + { + "epoch": 0.7940251572327044, + "grad_norm": 0.9984588623046875, + "learning_rate": 3.967505241090147e-05, + "loss": 0.4823, + "num_input_tokens_seen": 989888, + "step": 1515 + }, + { + "epoch": 0.7966457023060797, + "grad_norm": 1.162560224533081, + "learning_rate": 3.9806079664570233e-05, + "loss": 0.4868, + "num_input_tokens_seen": 992768, + "step": 1520 + }, + { + "epoch": 0.799266247379455, + "grad_norm": 1.0175426006317139, + "learning_rate": 3.9937106918239e-05, + "loss": 0.3688, + "num_input_tokens_seen": 996896, + "step": 1525 + }, + { + "epoch": 0.8018867924528302, + "grad_norm": 0.8534324765205383, + "learning_rate": 4.006813417190776e-05, + "loss": 0.5169, + "num_input_tokens_seen": 999840, + "step": 1530 + }, + { + "epoch": 0.8045073375262054, + "grad_norm": 0.7690382599830627, + "learning_rate": 4.019916142557652e-05, + "loss": 0.4106, + "num_input_tokens_seen": 1003680, + "step": 1535 + }, + { + "epoch": 0.8071278825995807, + "grad_norm": 1.120474934577942, + "learning_rate": 4.0330188679245284e-05, + "loss": 0.4437, + "num_input_tokens_seen": 1006656, + "step": 1540 + }, + { + "epoch": 0.809748427672956, + "grad_norm": 1.9628252983093262, + "learning_rate": 4.046121593291405e-05, + "loss": 0.387, + "num_input_tokens_seen": 1008768, + "step": 1545 + }, + { + "epoch": 0.8123689727463312, + "grad_norm": 0.7493323087692261, + "learning_rate": 4.059224318658281e-05, + "loss": 0.4151, + "num_input_tokens_seen": 1011968, + "step": 1550 + }, + { + "epoch": 0.8149895178197065, + "grad_norm": 0.9795581698417664, + "learning_rate": 4.0723270440251576e-05, + "loss": 0.4665, + "num_input_tokens_seen": 1018784, + "step": 1555 + }, + { + "epoch": 0.8176100628930818, + "grad_norm": 0.8289888501167297, + "learning_rate": 4.0854297693920336e-05, + "loss": 0.348, + "num_input_tokens_seen": 1023520, + "step": 1560 + }, + { + "epoch": 0.820230607966457, + "grad_norm": 0.9891926646232605, + "learning_rate": 4.09853249475891e-05, + "loss": 0.4525, + "num_input_tokens_seen": 1027200, + "step": 1565 + }, + { + "epoch": 0.8228511530398323, + "grad_norm": 0.5674022436141968, + "learning_rate": 4.111635220125786e-05, + "loss": 0.7697, + "num_input_tokens_seen": 1030848, + "step": 1570 + }, + { + "epoch": 0.8254716981132075, + "grad_norm": 0.9372828602790833, + "learning_rate": 4.124737945492663e-05, + "loss": 0.5325, + "num_input_tokens_seen": 1035232, + "step": 1575 + }, + { + "epoch": 0.8280922431865828, + "grad_norm": 1.2675105333328247, + "learning_rate": 4.137840670859539e-05, + "loss": 0.4919, + "num_input_tokens_seen": 1038176, + "step": 1580 + }, + { + "epoch": 0.8307127882599581, + "grad_norm": 2.1916871070861816, + "learning_rate": 4.150943396226415e-05, + "loss": 0.3897, + "num_input_tokens_seen": 1041216, + "step": 1585 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.9461628794670105, + "learning_rate": 4.164046121593291e-05, + "loss": 0.5064, + "num_input_tokens_seen": 1043840, + "step": 1590 + }, + { + "epoch": 0.8359538784067087, + "grad_norm": 0.8400557041168213, + "learning_rate": 4.177148846960168e-05, + "loss": 0.3923, + "num_input_tokens_seen": 1046656, + "step": 1595 + }, + { + "epoch": 0.8385744234800838, + "grad_norm": 0.9414968490600586, + "learning_rate": 4.1902515723270444e-05, + "loss": 0.4207, + "num_input_tokens_seen": 1051616, + "step": 1600 + }, + { + "epoch": 0.8411949685534591, + "grad_norm": 1.2903246879577637, + "learning_rate": 4.2033542976939204e-05, + "loss": 0.5876, + "num_input_tokens_seen": 1053824, + "step": 1605 + }, + { + "epoch": 0.8438155136268344, + "grad_norm": 1.0650029182434082, + "learning_rate": 4.216457023060797e-05, + "loss": 0.4431, + "num_input_tokens_seen": 1056512, + "step": 1610 + }, + { + "epoch": 0.8464360587002097, + "grad_norm": 0.7673878073692322, + "learning_rate": 4.2295597484276736e-05, + "loss": 0.4726, + "num_input_tokens_seen": 1060160, + "step": 1615 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 1.4193532466888428, + "learning_rate": 4.2426624737945495e-05, + "loss": 0.6032, + "num_input_tokens_seen": 1062880, + "step": 1620 + }, + { + "epoch": 0.8516771488469602, + "grad_norm": 1.3776535987854004, + "learning_rate": 4.2557651991614255e-05, + "loss": 0.5627, + "num_input_tokens_seen": 1066528, + "step": 1625 + }, + { + "epoch": 0.8542976939203354, + "grad_norm": 0.9663233757019043, + "learning_rate": 4.268867924528302e-05, + "loss": 0.3933, + "num_input_tokens_seen": 1069472, + "step": 1630 + }, + { + "epoch": 0.8569182389937107, + "grad_norm": 0.997472882270813, + "learning_rate": 4.281970649895179e-05, + "loss": 0.4844, + "num_input_tokens_seen": 1074304, + "step": 1635 + }, + { + "epoch": 0.859538784067086, + "grad_norm": 1.409064769744873, + "learning_rate": 4.2950733752620546e-05, + "loss": 0.4221, + "num_input_tokens_seen": 1077376, + "step": 1640 + }, + { + "epoch": 0.8621593291404612, + "grad_norm": 0.7887558937072754, + "learning_rate": 4.308176100628931e-05, + "loss": 0.4913, + "num_input_tokens_seen": 1080672, + "step": 1645 + }, + { + "epoch": 0.8647798742138365, + "grad_norm": 0.9368734955787659, + "learning_rate": 4.321278825995808e-05, + "loss": 0.5246, + "num_input_tokens_seen": 1084224, + "step": 1650 + }, + { + "epoch": 0.8674004192872118, + "grad_norm": 1.0434441566467285, + "learning_rate": 4.334381551362683e-05, + "loss": 0.6436, + "num_input_tokens_seen": 1086912, + "step": 1655 + }, + { + "epoch": 0.870020964360587, + "grad_norm": 1.3519736528396606, + "learning_rate": 4.34748427672956e-05, + "loss": 0.5746, + "num_input_tokens_seen": 1089696, + "step": 1660 + }, + { + "epoch": 0.8726415094339622, + "grad_norm": 0.8625402450561523, + "learning_rate": 4.3605870020964364e-05, + "loss": 0.4204, + "num_input_tokens_seen": 1092608, + "step": 1665 + }, + { + "epoch": 0.8752620545073375, + "grad_norm": 0.9488956928253174, + "learning_rate": 4.373689727463312e-05, + "loss": 0.6534, + "num_input_tokens_seen": 1095808, + "step": 1670 + }, + { + "epoch": 0.8778825995807128, + "grad_norm": 0.7375586628913879, + "learning_rate": 4.386792452830189e-05, + "loss": 0.5481, + "num_input_tokens_seen": 1098816, + "step": 1675 + }, + { + "epoch": 0.8805031446540881, + "grad_norm": 0.6724284291267395, + "learning_rate": 4.3998951781970655e-05, + "loss": 0.5537, + "num_input_tokens_seen": 1103136, + "step": 1680 + }, + { + "epoch": 0.8831236897274634, + "grad_norm": 0.7764517068862915, + "learning_rate": 4.4129979035639415e-05, + "loss": 0.437, + "num_input_tokens_seen": 1107872, + "step": 1685 + }, + { + "epoch": 0.8857442348008385, + "grad_norm": 0.9861476421356201, + "learning_rate": 4.4261006289308174e-05, + "loss": 0.618, + "num_input_tokens_seen": 1111072, + "step": 1690 + }, + { + "epoch": 0.8883647798742138, + "grad_norm": 1.3961912393569946, + "learning_rate": 4.439203354297694e-05, + "loss": 0.5344, + "num_input_tokens_seen": 1113792, + "step": 1695 + }, + { + "epoch": 0.8909853249475891, + "grad_norm": 1.3887982368469238, + "learning_rate": 4.4523060796645706e-05, + "loss": 0.4316, + "num_input_tokens_seen": 1116384, + "step": 1700 + }, + { + "epoch": 0.8936058700209644, + "grad_norm": 0.6735685467720032, + "learning_rate": 4.4654088050314466e-05, + "loss": 0.4291, + "num_input_tokens_seen": 1119296, + "step": 1705 + }, + { + "epoch": 0.8962264150943396, + "grad_norm": 0.7763240933418274, + "learning_rate": 4.478511530398323e-05, + "loss": 0.5904, + "num_input_tokens_seen": 1122720, + "step": 1710 + }, + { + "epoch": 0.8988469601677149, + "grad_norm": 2.8756637573242188, + "learning_rate": 4.491614255765199e-05, + "loss": 0.5418, + "num_input_tokens_seen": 1125088, + "step": 1715 + }, + { + "epoch": 0.9014675052410901, + "grad_norm": 1.0745000839233398, + "learning_rate": 4.504716981132076e-05, + "loss": 0.6086, + "num_input_tokens_seen": 1128288, + "step": 1720 + }, + { + "epoch": 0.9040880503144654, + "grad_norm": 1.6246923208236694, + "learning_rate": 4.517819706498952e-05, + "loss": 0.5343, + "num_input_tokens_seen": 1131040, + "step": 1725 + }, + { + "epoch": 0.9067085953878407, + "grad_norm": 0.699201762676239, + "learning_rate": 4.530922431865828e-05, + "loss": 0.5153, + "num_input_tokens_seen": 1135136, + "step": 1730 + }, + { + "epoch": 0.9093291404612159, + "grad_norm": 0.8787443041801453, + "learning_rate": 4.544025157232705e-05, + "loss": 0.6253, + "num_input_tokens_seen": 1139008, + "step": 1735 + }, + { + "epoch": 0.9119496855345912, + "grad_norm": 0.8806815147399902, + "learning_rate": 4.557127882599581e-05, + "loss": 0.4548, + "num_input_tokens_seen": 1141920, + "step": 1740 + }, + { + "epoch": 0.9145702306079665, + "grad_norm": 1.0828437805175781, + "learning_rate": 4.570230607966457e-05, + "loss": 0.4333, + "num_input_tokens_seen": 1144960, + "step": 1745 + }, + { + "epoch": 0.9171907756813418, + "grad_norm": 1.0258169174194336, + "learning_rate": 4.5833333333333334e-05, + "loss": 0.5326, + "num_input_tokens_seen": 1147712, + "step": 1750 + }, + { + "epoch": 0.9198113207547169, + "grad_norm": 0.9765433669090271, + "learning_rate": 4.59643605870021e-05, + "loss": 0.5189, + "num_input_tokens_seen": 1150528, + "step": 1755 + }, + { + "epoch": 0.9224318658280922, + "grad_norm": 1.3686145544052124, + "learning_rate": 4.609538784067086e-05, + "loss": 0.5525, + "num_input_tokens_seen": 1154272, + "step": 1760 + }, + { + "epoch": 0.9250524109014675, + "grad_norm": 1.5498641729354858, + "learning_rate": 4.6226415094339625e-05, + "loss": 0.4369, + "num_input_tokens_seen": 1157088, + "step": 1765 + }, + { + "epoch": 0.9276729559748428, + "grad_norm": 0.8960267901420593, + "learning_rate": 4.635744234800839e-05, + "loss": 0.6084, + "num_input_tokens_seen": 1160192, + "step": 1770 + }, + { + "epoch": 0.9302935010482181, + "grad_norm": 0.974982500076294, + "learning_rate": 4.648846960167715e-05, + "loss": 0.5283, + "num_input_tokens_seen": 1163424, + "step": 1775 + }, + { + "epoch": 0.9329140461215933, + "grad_norm": 0.762740969657898, + "learning_rate": 4.661949685534591e-05, + "loss": 0.5772, + "num_input_tokens_seen": 1167104, + "step": 1780 + }, + { + "epoch": 0.9355345911949685, + "grad_norm": 1.2664331197738647, + "learning_rate": 4.6750524109014677e-05, + "loss": 0.485, + "num_input_tokens_seen": 1170368, + "step": 1785 + }, + { + "epoch": 0.9381551362683438, + "grad_norm": 0.7627193331718445, + "learning_rate": 4.688155136268344e-05, + "loss": 0.5926, + "num_input_tokens_seen": 1173888, + "step": 1790 + }, + { + "epoch": 0.9407756813417191, + "grad_norm": 0.6363877654075623, + "learning_rate": 4.70125786163522e-05, + "loss": 0.4907, + "num_input_tokens_seen": 1177408, + "step": 1795 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 1.1750450134277344, + "learning_rate": 4.714360587002097e-05, + "loss": 0.4745, + "num_input_tokens_seen": 1181056, + "step": 1800 + }, + { + "epoch": 0.9460167714884696, + "grad_norm": 1.114333987236023, + "learning_rate": 4.7274633123689734e-05, + "loss": 0.6566, + "num_input_tokens_seen": 1184256, + "step": 1805 + }, + { + "epoch": 0.9486373165618449, + "grad_norm": 0.8970564007759094, + "learning_rate": 4.7405660377358494e-05, + "loss": 0.4175, + "num_input_tokens_seen": 1187168, + "step": 1810 + }, + { + "epoch": 0.9512578616352201, + "grad_norm": 0.6136268377304077, + "learning_rate": 4.753668763102725e-05, + "loss": 0.4456, + "num_input_tokens_seen": 1190784, + "step": 1815 + }, + { + "epoch": 0.9538784067085954, + "grad_norm": 0.9896544814109802, + "learning_rate": 4.766771488469602e-05, + "loss": 0.5445, + "num_input_tokens_seen": 1193312, + "step": 1820 + }, + { + "epoch": 0.9564989517819706, + "grad_norm": 0.7866264581680298, + "learning_rate": 4.7798742138364785e-05, + "loss": 0.4739, + "num_input_tokens_seen": 1196608, + "step": 1825 + }, + { + "epoch": 0.9591194968553459, + "grad_norm": 1.5880359411239624, + "learning_rate": 4.7929769392033545e-05, + "loss": 0.5522, + "num_input_tokens_seen": 1200160, + "step": 1830 + }, + { + "epoch": 0.9617400419287212, + "grad_norm": 2.0305678844451904, + "learning_rate": 4.806079664570231e-05, + "loss": 0.3973, + "num_input_tokens_seen": 1203040, + "step": 1835 + }, + { + "epoch": 0.9643605870020965, + "grad_norm": 0.7545301914215088, + "learning_rate": 4.819182389937107e-05, + "loss": 0.3714, + "num_input_tokens_seen": 1205408, + "step": 1840 + }, + { + "epoch": 0.9669811320754716, + "grad_norm": 3.2850940227508545, + "learning_rate": 4.8322851153039836e-05, + "loss": 0.349, + "num_input_tokens_seen": 1208896, + "step": 1845 + }, + { + "epoch": 0.9696016771488469, + "grad_norm": 0.8142299056053162, + "learning_rate": 4.8453878406708596e-05, + "loss": 0.4665, + "num_input_tokens_seen": 1212576, + "step": 1850 + }, + { + "epoch": 0.9722222222222222, + "grad_norm": 0.6876987218856812, + "learning_rate": 4.858490566037736e-05, + "loss": 0.4703, + "num_input_tokens_seen": 1215872, + "step": 1855 + }, + { + "epoch": 0.9748427672955975, + "grad_norm": 2.8036534786224365, + "learning_rate": 4.871593291404613e-05, + "loss": 0.4468, + "num_input_tokens_seen": 1219968, + "step": 1860 + }, + { + "epoch": 0.9774633123689728, + "grad_norm": 0.7710445523262024, + "learning_rate": 4.884696016771489e-05, + "loss": 0.4866, + "num_input_tokens_seen": 1222048, + "step": 1865 + }, + { + "epoch": 0.980083857442348, + "grad_norm": 1.1830052137374878, + "learning_rate": 4.897798742138365e-05, + "loss": 0.514, + "num_input_tokens_seen": 1225120, + "step": 1870 + }, + { + "epoch": 0.9827044025157232, + "grad_norm": 0.8191472887992859, + "learning_rate": 4.910901467505241e-05, + "loss": 0.5125, + "num_input_tokens_seen": 1229056, + "step": 1875 + }, + { + "epoch": 0.9853249475890985, + "grad_norm": 0.8089073896408081, + "learning_rate": 4.924004192872117e-05, + "loss": 0.3696, + "num_input_tokens_seen": 1231776, + "step": 1880 + }, + { + "epoch": 0.9879454926624738, + "grad_norm": 0.6687107682228088, + "learning_rate": 4.937106918238994e-05, + "loss": 0.4585, + "num_input_tokens_seen": 1235040, + "step": 1885 + }, + { + "epoch": 0.9905660377358491, + "grad_norm": 1.1852471828460693, + "learning_rate": 4.9502096436058705e-05, + "loss": 0.3882, + "num_input_tokens_seen": 1238048, + "step": 1890 + }, + { + "epoch": 0.9931865828092243, + "grad_norm": 0.8214172720909119, + "learning_rate": 4.963312368972747e-05, + "loss": 0.5264, + "num_input_tokens_seen": 1241344, + "step": 1895 + }, + { + "epoch": 0.9958071278825996, + "grad_norm": 1.0274542570114136, + "learning_rate": 4.976415094339622e-05, + "loss": 0.3466, + "num_input_tokens_seen": 1243840, + "step": 1900 + }, + { + "epoch": 0.9984276729559748, + "grad_norm": 0.8158193230628967, + "learning_rate": 4.989517819706499e-05, + "loss": 0.581, + "num_input_tokens_seen": 1246816, + "step": 1905 + }, + { + "epoch": 1.0, + "eval_loss": 0.4810025095939636, + "eval_runtime": 16.0149, + "eval_samples_per_second": 52.951, + "eval_steps_per_second": 13.238, + "num_input_tokens_seen": 1248304, + "step": 1908 + }, + { + "epoch": 1.00104821802935, + "grad_norm": 1.0742979049682617, + "learning_rate": 4.9999999581622816e-05, + "loss": 0.4705, + "num_input_tokens_seen": 1249200, + "step": 1910 + }, + { + "epoch": 1.0036687631027255, + "grad_norm": 1.1787995100021362, + "learning_rate": 4.999998493842267e-05, + "loss": 0.445, + "num_input_tokens_seen": 1251664, + "step": 1915 + }, + { + "epoch": 1.0062893081761006, + "grad_norm": 0.4732556641101837, + "learning_rate": 4.999994937637709e-05, + "loss": 0.4375, + "num_input_tokens_seen": 1254928, + "step": 1920 + }, + { + "epoch": 1.0089098532494758, + "grad_norm": 1.4841618537902832, + "learning_rate": 4.999989289551581e-05, + "loss": 0.3812, + "num_input_tokens_seen": 1257744, + "step": 1925 + }, + { + "epoch": 1.0115303983228512, + "grad_norm": 1.050187587738037, + "learning_rate": 4.999981549588612e-05, + "loss": 0.4126, + "num_input_tokens_seen": 1261904, + "step": 1930 + }, + { + "epoch": 1.0141509433962264, + "grad_norm": 0.628690779209137, + "learning_rate": 4.9999717177552764e-05, + "loss": 0.3835, + "num_input_tokens_seen": 1264432, + "step": 1935 + }, + { + "epoch": 1.0167714884696017, + "grad_norm": 0.9771276116371155, + "learning_rate": 4.999959794059801e-05, + "loss": 0.5186, + "num_input_tokens_seen": 1268400, + "step": 1940 + }, + { + "epoch": 1.019392033542977, + "grad_norm": 0.836229681968689, + "learning_rate": 4.999945778512164e-05, + "loss": 0.4003, + "num_input_tokens_seen": 1272752, + "step": 1945 + }, + { + "epoch": 1.0220125786163523, + "grad_norm": 0.8851866126060486, + "learning_rate": 4.999929671124093e-05, + "loss": 0.5479, + "num_input_tokens_seen": 1275568, + "step": 1950 + }, + { + "epoch": 1.0246331236897275, + "grad_norm": 0.6954601407051086, + "learning_rate": 4.9999114719090645e-05, + "loss": 0.5315, + "num_input_tokens_seen": 1278160, + "step": 1955 + }, + { + "epoch": 1.0272536687631026, + "grad_norm": 1.1170744895935059, + "learning_rate": 4.999891180882308e-05, + "loss": 0.438, + "num_input_tokens_seen": 1281424, + "step": 1960 + }, + { + "epoch": 1.029874213836478, + "grad_norm": 0.8316051959991455, + "learning_rate": 4.9998687980608014e-05, + "loss": 0.4385, + "num_input_tokens_seen": 1283920, + "step": 1965 + }, + { + "epoch": 1.0324947589098532, + "grad_norm": 1.3205347061157227, + "learning_rate": 4.9998443234632744e-05, + "loss": 0.4072, + "num_input_tokens_seen": 1287920, + "step": 1970 + }, + { + "epoch": 1.0351153039832286, + "grad_norm": 1.6038148403167725, + "learning_rate": 4.999817757110206e-05, + "loss": 0.4972, + "num_input_tokens_seen": 1290960, + "step": 1975 + }, + { + "epoch": 1.0377358490566038, + "grad_norm": 0.7955760955810547, + "learning_rate": 4.999789099023826e-05, + "loss": 0.3383, + "num_input_tokens_seen": 1294544, + "step": 1980 + }, + { + "epoch": 1.040356394129979, + "grad_norm": 0.6737096905708313, + "learning_rate": 4.9997583492281126e-05, + "loss": 0.6128, + "num_input_tokens_seen": 1298160, + "step": 1985 + }, + { + "epoch": 1.0429769392033543, + "grad_norm": 0.8089790940284729, + "learning_rate": 4.999725507748798e-05, + "loss": 0.4411, + "num_input_tokens_seen": 1300368, + "step": 1990 + }, + { + "epoch": 1.0455974842767295, + "grad_norm": 0.5768561363220215, + "learning_rate": 4.9996905746133606e-05, + "loss": 0.3987, + "num_input_tokens_seen": 1307184, + "step": 1995 + }, + { + "epoch": 1.0482180293501049, + "grad_norm": 1.187424898147583, + "learning_rate": 4.999653549851032e-05, + "loss": 0.7058, + "num_input_tokens_seen": 1309840, + "step": 2000 + }, + { + "epoch": 1.05083857442348, + "grad_norm": 1.0152928829193115, + "learning_rate": 4.999614433492792e-05, + "loss": 0.4344, + "num_input_tokens_seen": 1313648, + "step": 2005 + }, + { + "epoch": 1.0534591194968554, + "grad_norm": 0.9261406064033508, + "learning_rate": 4.9995732255713725e-05, + "loss": 0.4588, + "num_input_tokens_seen": 1316496, + "step": 2010 + }, + { + "epoch": 1.0560796645702306, + "grad_norm": 1.382073998451233, + "learning_rate": 4.9995299261212536e-05, + "loss": 0.4705, + "num_input_tokens_seen": 1319952, + "step": 2015 + }, + { + "epoch": 1.0587002096436058, + "grad_norm": 0.8699270486831665, + "learning_rate": 4.999484535178667e-05, + "loss": 0.4166, + "num_input_tokens_seen": 1323344, + "step": 2020 + }, + { + "epoch": 1.0613207547169812, + "grad_norm": 1.0845222473144531, + "learning_rate": 4.9994370527815925e-05, + "loss": 0.5126, + "num_input_tokens_seen": 1326704, + "step": 2025 + }, + { + "epoch": 1.0639412997903563, + "grad_norm": 0.6870549321174622, + "learning_rate": 4.999387478969762e-05, + "loss": 0.4452, + "num_input_tokens_seen": 1330128, + "step": 2030 + }, + { + "epoch": 1.0665618448637317, + "grad_norm": 0.9056400060653687, + "learning_rate": 4.999335813784657e-05, + "loss": 0.4056, + "num_input_tokens_seen": 1333040, + "step": 2035 + }, + { + "epoch": 1.069182389937107, + "grad_norm": 0.9430304765701294, + "learning_rate": 4.999282057269508e-05, + "loss": 0.4711, + "num_input_tokens_seen": 1335504, + "step": 2040 + }, + { + "epoch": 1.0718029350104823, + "grad_norm": 1.2468879222869873, + "learning_rate": 4.999226209469295e-05, + "loss": 0.5015, + "num_input_tokens_seen": 1339280, + "step": 2045 + }, + { + "epoch": 1.0744234800838575, + "grad_norm": 1.3531601428985596, + "learning_rate": 4.999168270430752e-05, + "loss": 0.52, + "num_input_tokens_seen": 1342576, + "step": 2050 + }, + { + "epoch": 1.0770440251572326, + "grad_norm": 0.8856658935546875, + "learning_rate": 4.999108240202356e-05, + "loss": 0.459, + "num_input_tokens_seen": 1346256, + "step": 2055 + }, + { + "epoch": 1.079664570230608, + "grad_norm": 1.056297779083252, + "learning_rate": 4.999046118834341e-05, + "loss": 0.5484, + "num_input_tokens_seen": 1349008, + "step": 2060 + }, + { + "epoch": 1.0822851153039832, + "grad_norm": 0.9018176794052124, + "learning_rate": 4.998981906378684e-05, + "loss": 0.52, + "num_input_tokens_seen": 1353552, + "step": 2065 + }, + { + "epoch": 1.0849056603773586, + "grad_norm": 1.057755708694458, + "learning_rate": 4.998915602889117e-05, + "loss": 0.4244, + "num_input_tokens_seen": 1356400, + "step": 2070 + }, + { + "epoch": 1.0875262054507338, + "grad_norm": 1.263851284980774, + "learning_rate": 4.9988472084211203e-05, + "loss": 0.4647, + "num_input_tokens_seen": 1361008, + "step": 2075 + }, + { + "epoch": 1.090146750524109, + "grad_norm": 1.9587774276733398, + "learning_rate": 4.9987767230319215e-05, + "loss": 0.5123, + "num_input_tokens_seen": 1363184, + "step": 2080 + }, + { + "epoch": 1.0927672955974843, + "grad_norm": 1.4504437446594238, + "learning_rate": 4.998704146780501e-05, + "loss": 0.4464, + "num_input_tokens_seen": 1365872, + "step": 2085 + }, + { + "epoch": 1.0953878406708595, + "grad_norm": 0.7393147945404053, + "learning_rate": 4.9986294797275857e-05, + "loss": 0.5093, + "num_input_tokens_seen": 1369168, + "step": 2090 + }, + { + "epoch": 1.0980083857442349, + "grad_norm": 0.7789657115936279, + "learning_rate": 4.9985527219356554e-05, + "loss": 0.4523, + "num_input_tokens_seen": 1372208, + "step": 2095 + }, + { + "epoch": 1.10062893081761, + "grad_norm": 0.8802303075790405, + "learning_rate": 4.998473873468937e-05, + "loss": 0.3695, + "num_input_tokens_seen": 1377360, + "step": 2100 + }, + { + "epoch": 1.1032494758909852, + "grad_norm": 1.60611891746521, + "learning_rate": 4.998392934393407e-05, + "loss": 0.398, + "num_input_tokens_seen": 1380400, + "step": 2105 + }, + { + "epoch": 1.1058700209643606, + "grad_norm": 1.350777268409729, + "learning_rate": 4.9983099047767905e-05, + "loss": 0.4369, + "num_input_tokens_seen": 1383056, + "step": 2110 + }, + { + "epoch": 1.1084905660377358, + "grad_norm": 1.2078945636749268, + "learning_rate": 4.9982247846885644e-05, + "loss": 0.5236, + "num_input_tokens_seen": 1386160, + "step": 2115 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 1.0692567825317383, + "learning_rate": 4.9981375741999534e-05, + "loss": 0.4301, + "num_input_tokens_seen": 1389232, + "step": 2120 + }, + { + "epoch": 1.1137316561844863, + "grad_norm": 0.8742273449897766, + "learning_rate": 4.99804827338393e-05, + "loss": 0.5455, + "num_input_tokens_seen": 1393328, + "step": 2125 + }, + { + "epoch": 1.1163522012578617, + "grad_norm": 0.8781280517578125, + "learning_rate": 4.997956882315218e-05, + "loss": 0.408, + "num_input_tokens_seen": 1396816, + "step": 2130 + }, + { + "epoch": 1.118972746331237, + "grad_norm": 0.9955752491950989, + "learning_rate": 4.997863401070289e-05, + "loss": 0.3961, + "num_input_tokens_seen": 1400176, + "step": 2135 + }, + { + "epoch": 1.121593291404612, + "grad_norm": 0.9488391280174255, + "learning_rate": 4.9977678297273634e-05, + "loss": 0.5429, + "num_input_tokens_seen": 1403408, + "step": 2140 + }, + { + "epoch": 1.1242138364779874, + "grad_norm": 0.8460773825645447, + "learning_rate": 4.997670168366412e-05, + "loss": 0.3935, + "num_input_tokens_seen": 1406448, + "step": 2145 + }, + { + "epoch": 1.1268343815513626, + "grad_norm": 2.673248767852783, + "learning_rate": 4.997570417069152e-05, + "loss": 0.4819, + "num_input_tokens_seen": 1409584, + "step": 2150 + }, + { + "epoch": 1.129454926624738, + "grad_norm": 1.185036301612854, + "learning_rate": 4.997468575919052e-05, + "loss": 0.4712, + "num_input_tokens_seen": 1413872, + "step": 2155 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 1.0000262260437012, + "learning_rate": 4.9973646450013275e-05, + "loss": 0.6094, + "num_input_tokens_seen": 1416528, + "step": 2160 + }, + { + "epoch": 1.1346960167714886, + "grad_norm": 1.172716498374939, + "learning_rate": 4.997258624402943e-05, + "loss": 0.4793, + "num_input_tokens_seen": 1418896, + "step": 2165 + }, + { + "epoch": 1.1373165618448637, + "grad_norm": 1.255769968032837, + "learning_rate": 4.997150514212611e-05, + "loss": 0.5115, + "num_input_tokens_seen": 1421712, + "step": 2170 + }, + { + "epoch": 1.139937106918239, + "grad_norm": 0.8062251210212708, + "learning_rate": 4.997040314520795e-05, + "loss": 0.4456, + "num_input_tokens_seen": 1425072, + "step": 2175 + }, + { + "epoch": 1.1425576519916143, + "grad_norm": 0.807964563369751, + "learning_rate": 4.9969280254197035e-05, + "loss": 0.6374, + "num_input_tokens_seen": 1428848, + "step": 2180 + }, + { + "epoch": 1.1451781970649895, + "grad_norm": 1.036013126373291, + "learning_rate": 4.996813647003296e-05, + "loss": 0.4905, + "num_input_tokens_seen": 1432432, + "step": 2185 + }, + { + "epoch": 1.1477987421383649, + "grad_norm": 0.8877140879631042, + "learning_rate": 4.9966971793672784e-05, + "loss": 0.6338, + "num_input_tokens_seen": 1436048, + "step": 2190 + }, + { + "epoch": 1.15041928721174, + "grad_norm": 1.3733699321746826, + "learning_rate": 4.9965786226091054e-05, + "loss": 0.6085, + "num_input_tokens_seen": 1440144, + "step": 2195 + }, + { + "epoch": 1.1530398322851152, + "grad_norm": 1.034093976020813, + "learning_rate": 4.9964579768279803e-05, + "loss": 0.5383, + "num_input_tokens_seen": 1443760, + "step": 2200 + }, + { + "epoch": 1.1556603773584906, + "grad_norm": 0.970409095287323, + "learning_rate": 4.996335242124854e-05, + "loss": 0.7393, + "num_input_tokens_seen": 1446864, + "step": 2205 + }, + { + "epoch": 1.1582809224318658, + "grad_norm": 0.9072503447532654, + "learning_rate": 4.996210418602425e-05, + "loss": 0.3069, + "num_input_tokens_seen": 1449456, + "step": 2210 + }, + { + "epoch": 1.1609014675052411, + "grad_norm": 0.8870097398757935, + "learning_rate": 4.99608350636514e-05, + "loss": 0.5277, + "num_input_tokens_seen": 1452944, + "step": 2215 + }, + { + "epoch": 1.1635220125786163, + "grad_norm": 0.700185239315033, + "learning_rate": 4.995954505519193e-05, + "loss": 0.4376, + "num_input_tokens_seen": 1455952, + "step": 2220 + }, + { + "epoch": 1.1661425576519917, + "grad_norm": 0.9235810041427612, + "learning_rate": 4.995823416172527e-05, + "loss": 0.5658, + "num_input_tokens_seen": 1459376, + "step": 2225 + }, + { + "epoch": 1.1687631027253669, + "grad_norm": 0.8589577674865723, + "learning_rate": 4.995690238434831e-05, + "loss": 0.4468, + "num_input_tokens_seen": 1462352, + "step": 2230 + }, + { + "epoch": 1.171383647798742, + "grad_norm": 1.1888647079467773, + "learning_rate": 4.995554972417541e-05, + "loss": 0.5421, + "num_input_tokens_seen": 1465104, + "step": 2235 + }, + { + "epoch": 1.1740041928721174, + "grad_norm": 0.7779944539070129, + "learning_rate": 4.995417618233844e-05, + "loss": 0.5131, + "num_input_tokens_seen": 1468624, + "step": 2240 + }, + { + "epoch": 1.1766247379454926, + "grad_norm": 1.0345760583877563, + "learning_rate": 4.9952781759986694e-05, + "loss": 0.3968, + "num_input_tokens_seen": 1475184, + "step": 2245 + }, + { + "epoch": 1.179245283018868, + "grad_norm": 0.46633586287498474, + "learning_rate": 4.995136645828697e-05, + "loss": 0.3658, + "num_input_tokens_seen": 1478640, + "step": 2250 + }, + { + "epoch": 1.1818658280922432, + "grad_norm": 1.153000831604004, + "learning_rate": 4.994993027842353e-05, + "loss": 0.5201, + "num_input_tokens_seen": 1481552, + "step": 2255 + }, + { + "epoch": 1.1844863731656186, + "grad_norm": 0.9975373148918152, + "learning_rate": 4.9948473221598094e-05, + "loss": 0.4209, + "num_input_tokens_seen": 1485104, + "step": 2260 + }, + { + "epoch": 1.1871069182389937, + "grad_norm": 0.9484853148460388, + "learning_rate": 4.994699528902987e-05, + "loss": 0.492, + "num_input_tokens_seen": 1487728, + "step": 2265 + }, + { + "epoch": 1.189727463312369, + "grad_norm": 0.7492456436157227, + "learning_rate": 4.994549648195552e-05, + "loss": 0.4324, + "num_input_tokens_seen": 1491312, + "step": 2270 + }, + { + "epoch": 1.1923480083857443, + "grad_norm": 1.4831198453903198, + "learning_rate": 4.994397680162918e-05, + "loss": 0.4853, + "num_input_tokens_seen": 1494192, + "step": 2275 + }, + { + "epoch": 1.1949685534591195, + "grad_norm": 1.3034517765045166, + "learning_rate": 4.9942436249322444e-05, + "loss": 0.606, + "num_input_tokens_seen": 1497168, + "step": 2280 + }, + { + "epoch": 1.1975890985324948, + "grad_norm": 0.9795873761177063, + "learning_rate": 4.994087482632438e-05, + "loss": 0.4757, + "num_input_tokens_seen": 1501936, + "step": 2285 + }, + { + "epoch": 1.20020964360587, + "grad_norm": 0.7105223536491394, + "learning_rate": 4.993929253394152e-05, + "loss": 0.394, + "num_input_tokens_seen": 1505040, + "step": 2290 + }, + { + "epoch": 1.2028301886792452, + "grad_norm": 1.2792354822158813, + "learning_rate": 4.993768937349784e-05, + "loss": 0.604, + "num_input_tokens_seen": 1507856, + "step": 2295 + }, + { + "epoch": 1.2054507337526206, + "grad_norm": 1.1788899898529053, + "learning_rate": 4.993606534633481e-05, + "loss": 0.446, + "num_input_tokens_seen": 1510768, + "step": 2300 + }, + { + "epoch": 1.2080712788259957, + "grad_norm": 1.0587245225906372, + "learning_rate": 4.9934420453811334e-05, + "loss": 0.5837, + "num_input_tokens_seen": 1513808, + "step": 2305 + }, + { + "epoch": 1.2106918238993711, + "grad_norm": 0.6296498775482178, + "learning_rate": 4.993275469730377e-05, + "loss": 0.4276, + "num_input_tokens_seen": 1517392, + "step": 2310 + }, + { + "epoch": 1.2133123689727463, + "grad_norm": 0.6112010478973389, + "learning_rate": 4.993106807820597e-05, + "loss": 0.4738, + "num_input_tokens_seen": 1521648, + "step": 2315 + }, + { + "epoch": 1.2159329140461215, + "grad_norm": 0.9701637625694275, + "learning_rate": 4.99293605979292e-05, + "loss": 0.4183, + "num_input_tokens_seen": 1524496, + "step": 2320 + }, + { + "epoch": 1.2185534591194969, + "grad_norm": 0.9613224267959595, + "learning_rate": 4.992763225790221e-05, + "loss": 0.4863, + "num_input_tokens_seen": 1528144, + "step": 2325 + }, + { + "epoch": 1.221174004192872, + "grad_norm": 1.068890929222107, + "learning_rate": 4.992588305957119e-05, + "loss": 0.4457, + "num_input_tokens_seen": 1531664, + "step": 2330 + }, + { + "epoch": 1.2237945492662474, + "grad_norm": 1.1266165971755981, + "learning_rate": 4.99241130043998e-05, + "loss": 0.5727, + "num_input_tokens_seen": 1535408, + "step": 2335 + }, + { + "epoch": 1.2264150943396226, + "grad_norm": 0.763270914554596, + "learning_rate": 4.992232209386914e-05, + "loss": 0.4676, + "num_input_tokens_seen": 1538448, + "step": 2340 + }, + { + "epoch": 1.229035639412998, + "grad_norm": 0.9213160276412964, + "learning_rate": 4.9920510329477756e-05, + "loss": 0.5028, + "num_input_tokens_seen": 1541968, + "step": 2345 + }, + { + "epoch": 1.2316561844863732, + "grad_norm": 0.8430405855178833, + "learning_rate": 4.9918677712741644e-05, + "loss": 0.5094, + "num_input_tokens_seen": 1544560, + "step": 2350 + }, + { + "epoch": 1.2342767295597485, + "grad_norm": 1.1504417657852173, + "learning_rate": 4.991682424519427e-05, + "loss": 0.4719, + "num_input_tokens_seen": 1548080, + "step": 2355 + }, + { + "epoch": 1.2368972746331237, + "grad_norm": 1.2291706800460815, + "learning_rate": 4.9914949928386524e-05, + "loss": 0.4132, + "num_input_tokens_seen": 1551472, + "step": 2360 + }, + { + "epoch": 1.2395178197064989, + "grad_norm": 0.8182924389839172, + "learning_rate": 4.991305476388673e-05, + "loss": 0.5327, + "num_input_tokens_seen": 1553904, + "step": 2365 + }, + { + "epoch": 1.2421383647798743, + "grad_norm": 0.793321967124939, + "learning_rate": 4.991113875328072e-05, + "loss": 0.4151, + "num_input_tokens_seen": 1558352, + "step": 2370 + }, + { + "epoch": 1.2447589098532494, + "grad_norm": 1.2904354333877563, + "learning_rate": 4.9909201898171676e-05, + "loss": 0.3633, + "num_input_tokens_seen": 1561616, + "step": 2375 + }, + { + "epoch": 1.2473794549266248, + "grad_norm": 0.7734314799308777, + "learning_rate": 4.9907244200180295e-05, + "loss": 0.3482, + "num_input_tokens_seen": 1566000, + "step": 2380 + }, + { + "epoch": 1.25, + "grad_norm": 0.7268831729888916, + "learning_rate": 4.990526566094469e-05, + "loss": 0.4813, + "num_input_tokens_seen": 1569776, + "step": 2385 + }, + { + "epoch": 1.2526205450733752, + "grad_norm": 1.2944151163101196, + "learning_rate": 4.99032662821204e-05, + "loss": 0.5265, + "num_input_tokens_seen": 1572112, + "step": 2390 + }, + { + "epoch": 1.2552410901467506, + "grad_norm": 0.6390265226364136, + "learning_rate": 4.990124606538042e-05, + "loss": 0.4888, + "num_input_tokens_seen": 1575312, + "step": 2395 + }, + { + "epoch": 1.2578616352201257, + "grad_norm": 0.7560932040214539, + "learning_rate": 4.9899205012415184e-05, + "loss": 0.5422, + "num_input_tokens_seen": 1578256, + "step": 2400 + }, + { + "epoch": 1.2604821802935011, + "grad_norm": 0.9199072122573853, + "learning_rate": 4.9897143124932547e-05, + "loss": 0.6436, + "num_input_tokens_seen": 1581328, + "step": 2405 + }, + { + "epoch": 1.2631027253668763, + "grad_norm": 0.8926464915275574, + "learning_rate": 4.9895060404657786e-05, + "loss": 0.495, + "num_input_tokens_seen": 1583952, + "step": 2410 + }, + { + "epoch": 1.2657232704402515, + "grad_norm": 1.5563676357269287, + "learning_rate": 4.9892956853333644e-05, + "loss": 0.4787, + "num_input_tokens_seen": 1587472, + "step": 2415 + }, + { + "epoch": 1.2683438155136268, + "grad_norm": 1.1136178970336914, + "learning_rate": 4.989083247272027e-05, + "loss": 0.4206, + "num_input_tokens_seen": 1590384, + "step": 2420 + }, + { + "epoch": 1.270964360587002, + "grad_norm": 0.9627929329872131, + "learning_rate": 4.988868726459526e-05, + "loss": 0.4284, + "num_input_tokens_seen": 1593264, + "step": 2425 + }, + { + "epoch": 1.2735849056603774, + "grad_norm": 0.6252536177635193, + "learning_rate": 4.988652123075361e-05, + "loss": 0.4134, + "num_input_tokens_seen": 1597680, + "step": 2430 + }, + { + "epoch": 1.2762054507337526, + "grad_norm": 1.4072520732879639, + "learning_rate": 4.988433437300776e-05, + "loss": 0.5027, + "num_input_tokens_seen": 1600912, + "step": 2435 + }, + { + "epoch": 1.2788259958071277, + "grad_norm": 1.094447374343872, + "learning_rate": 4.988212669318758e-05, + "loss": 0.4068, + "num_input_tokens_seen": 1603472, + "step": 2440 + }, + { + "epoch": 1.2814465408805031, + "grad_norm": 2.021796226501465, + "learning_rate": 4.987989819314036e-05, + "loss": 0.4313, + "num_input_tokens_seen": 1606384, + "step": 2445 + }, + { + "epoch": 1.2840670859538785, + "grad_norm": 0.7851201295852661, + "learning_rate": 4.98776488747308e-05, + "loss": 0.4581, + "num_input_tokens_seen": 1609360, + "step": 2450 + }, + { + "epoch": 1.2866876310272537, + "grad_norm": 1.277802586555481, + "learning_rate": 4.9875378739841016e-05, + "loss": 0.4812, + "num_input_tokens_seen": 1612336, + "step": 2455 + }, + { + "epoch": 1.2893081761006289, + "grad_norm": 0.8310542702674866, + "learning_rate": 4.9873087790370576e-05, + "loss": 0.3967, + "num_input_tokens_seen": 1616048, + "step": 2460 + }, + { + "epoch": 1.2919287211740043, + "grad_norm": 1.554909586906433, + "learning_rate": 4.9870776028236424e-05, + "loss": 0.3664, + "num_input_tokens_seen": 1618960, + "step": 2465 + }, + { + "epoch": 1.2945492662473794, + "grad_norm": 0.6267876625061035, + "learning_rate": 4.9868443455372945e-05, + "loss": 0.3319, + "num_input_tokens_seen": 1622544, + "step": 2470 + }, + { + "epoch": 1.2971698113207548, + "grad_norm": 0.777036726474762, + "learning_rate": 4.986609007373193e-05, + "loss": 0.4428, + "num_input_tokens_seen": 1625872, + "step": 2475 + }, + { + "epoch": 1.29979035639413, + "grad_norm": 0.6305606961250305, + "learning_rate": 4.986371588528257e-05, + "loss": 0.456, + "num_input_tokens_seen": 1629456, + "step": 2480 + }, + { + "epoch": 1.3024109014675052, + "grad_norm": 0.7708353400230408, + "learning_rate": 4.98613208920115e-05, + "loss": 0.5563, + "num_input_tokens_seen": 1632432, + "step": 2485 + }, + { + "epoch": 1.3050314465408805, + "grad_norm": 0.6484007239341736, + "learning_rate": 4.985890509592271e-05, + "loss": 0.4384, + "num_input_tokens_seen": 1636080, + "step": 2490 + }, + { + "epoch": 1.3076519916142557, + "grad_norm": 0.7659841775894165, + "learning_rate": 4.985646849903766e-05, + "loss": 0.4534, + "num_input_tokens_seen": 1638960, + "step": 2495 + }, + { + "epoch": 1.310272536687631, + "grad_norm": 0.41712164878845215, + "learning_rate": 4.985401110339517e-05, + "loss": 0.446, + "num_input_tokens_seen": 1643120, + "step": 2500 + }, + { + "epoch": 1.3128930817610063, + "grad_norm": 1.0955899953842163, + "learning_rate": 4.985153291105146e-05, + "loss": 0.3762, + "num_input_tokens_seen": 1646768, + "step": 2505 + }, + { + "epoch": 1.3155136268343814, + "grad_norm": 0.7508005499839783, + "learning_rate": 4.984903392408019e-05, + "loss": 0.4803, + "num_input_tokens_seen": 1650256, + "step": 2510 + }, + { + "epoch": 1.3181341719077568, + "grad_norm": 0.8121232986450195, + "learning_rate": 4.984651414457239e-05, + "loss": 0.4872, + "num_input_tokens_seen": 1652912, + "step": 2515 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 0.4928153455257416, + "learning_rate": 4.98439735746365e-05, + "loss": 0.4589, + "num_input_tokens_seen": 1656432, + "step": 2520 + }, + { + "epoch": 1.3233752620545074, + "grad_norm": 0.8091893792152405, + "learning_rate": 4.984141221639835e-05, + "loss": 0.3878, + "num_input_tokens_seen": 1659472, + "step": 2525 + }, + { + "epoch": 1.3259958071278826, + "grad_norm": 1.4634898900985718, + "learning_rate": 4.9838830072001165e-05, + "loss": 0.488, + "num_input_tokens_seen": 1662480, + "step": 2530 + }, + { + "epoch": 1.3286163522012577, + "grad_norm": 0.6570993661880493, + "learning_rate": 4.983622714360557e-05, + "loss": 0.5136, + "num_input_tokens_seen": 1665424, + "step": 2535 + }, + { + "epoch": 1.3312368972746331, + "grad_norm": 1.0097737312316895, + "learning_rate": 4.9833603433389576e-05, + "loss": 0.5172, + "num_input_tokens_seen": 1667760, + "step": 2540 + }, + { + "epoch": 1.3338574423480085, + "grad_norm": 0.8939637541770935, + "learning_rate": 4.983095894354858e-05, + "loss": 0.375, + "num_input_tokens_seen": 1670800, + "step": 2545 + }, + { + "epoch": 1.3364779874213837, + "grad_norm": 0.8916773796081543, + "learning_rate": 4.982829367629537e-05, + "loss": 0.4994, + "num_input_tokens_seen": 1675760, + "step": 2550 + }, + { + "epoch": 1.3390985324947589, + "grad_norm": 1.909305453300476, + "learning_rate": 4.982560763386013e-05, + "loss": 0.5354, + "num_input_tokens_seen": 1677936, + "step": 2555 + }, + { + "epoch": 1.3417190775681342, + "grad_norm": 0.6962745189666748, + "learning_rate": 4.9822900818490404e-05, + "loss": 0.5569, + "num_input_tokens_seen": 1685008, + "step": 2560 + }, + { + "epoch": 1.3443396226415094, + "grad_norm": 1.023728847503662, + "learning_rate": 4.982017323245114e-05, + "loss": 0.5837, + "num_input_tokens_seen": 1688240, + "step": 2565 + }, + { + "epoch": 1.3469601677148848, + "grad_norm": 1.132413387298584, + "learning_rate": 4.981742487802466e-05, + "loss": 0.619, + "num_input_tokens_seen": 1690832, + "step": 2570 + }, + { + "epoch": 1.34958071278826, + "grad_norm": 0.8140695691108704, + "learning_rate": 4.9814655757510644e-05, + "loss": 0.5911, + "num_input_tokens_seen": 1695248, + "step": 2575 + }, + { + "epoch": 1.3522012578616351, + "grad_norm": 0.9371203184127808, + "learning_rate": 4.981186587322619e-05, + "loss": 0.4635, + "num_input_tokens_seen": 1698896, + "step": 2580 + }, + { + "epoch": 1.3548218029350105, + "grad_norm": 1.5073059797286987, + "learning_rate": 4.980905522750573e-05, + "loss": 0.5779, + "num_input_tokens_seen": 1701296, + "step": 2585 + }, + { + "epoch": 1.3574423480083857, + "grad_norm": 1.1648235321044922, + "learning_rate": 4.980622382270108e-05, + "loss": 0.3459, + "num_input_tokens_seen": 1705136, + "step": 2590 + }, + { + "epoch": 1.360062893081761, + "grad_norm": 0.9291535019874573, + "learning_rate": 4.9803371661181456e-05, + "loss": 0.4362, + "num_input_tokens_seen": 1708016, + "step": 2595 + }, + { + "epoch": 1.3626834381551363, + "grad_norm": 2.9464635848999023, + "learning_rate": 4.980049874533338e-05, + "loss": 0.5966, + "num_input_tokens_seen": 1710864, + "step": 2600 + }, + { + "epoch": 1.3653039832285114, + "grad_norm": 0.8245675563812256, + "learning_rate": 4.979760507756081e-05, + "loss": 0.3864, + "num_input_tokens_seen": 1713552, + "step": 2605 + }, + { + "epoch": 1.3679245283018868, + "grad_norm": 0.86576908826828, + "learning_rate": 4.979469066028502e-05, + "loss": 0.4049, + "num_input_tokens_seen": 1716528, + "step": 2610 + }, + { + "epoch": 1.370545073375262, + "grad_norm": 2.656132936477661, + "learning_rate": 4.9791755495944645e-05, + "loss": 0.4235, + "num_input_tokens_seen": 1719216, + "step": 2615 + }, + { + "epoch": 1.3731656184486374, + "grad_norm": 0.7970884442329407, + "learning_rate": 4.978879958699573e-05, + "loss": 0.4969, + "num_input_tokens_seen": 1722384, + "step": 2620 + }, + { + "epoch": 1.3757861635220126, + "grad_norm": 0.6960129737854004, + "learning_rate": 4.978582293591162e-05, + "loss": 0.422, + "num_input_tokens_seen": 1725104, + "step": 2625 + }, + { + "epoch": 1.3784067085953877, + "grad_norm": 2.9517323970794678, + "learning_rate": 4.978282554518305e-05, + "loss": 0.4416, + "num_input_tokens_seen": 1728112, + "step": 2630 + }, + { + "epoch": 1.381027253668763, + "grad_norm": 1.0889865159988403, + "learning_rate": 4.9779807417318096e-05, + "loss": 0.4752, + "num_input_tokens_seen": 1731536, + "step": 2635 + }, + { + "epoch": 1.3836477987421385, + "grad_norm": 0.7341992259025574, + "learning_rate": 4.977676855484219e-05, + "loss": 0.5179, + "num_input_tokens_seen": 1734896, + "step": 2640 + }, + { + "epoch": 1.3862683438155137, + "grad_norm": 1.3548693656921387, + "learning_rate": 4.977370896029812e-05, + "loss": 0.4811, + "num_input_tokens_seen": 1737584, + "step": 2645 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.9121825695037842, + "learning_rate": 4.977062863624601e-05, + "loss": 0.5389, + "num_input_tokens_seen": 1740720, + "step": 2650 + }, + { + "epoch": 1.3915094339622642, + "grad_norm": 0.6982294917106628, + "learning_rate": 4.976752758526333e-05, + "loss": 0.3369, + "num_input_tokens_seen": 1743216, + "step": 2655 + }, + { + "epoch": 1.3941299790356394, + "grad_norm": 1.0777761936187744, + "learning_rate": 4.9764405809944906e-05, + "loss": 0.5122, + "num_input_tokens_seen": 1746000, + "step": 2660 + }, + { + "epoch": 1.3967505241090148, + "grad_norm": 1.5318150520324707, + "learning_rate": 4.9761263312902895e-05, + "loss": 0.3821, + "num_input_tokens_seen": 1748560, + "step": 2665 + }, + { + "epoch": 1.39937106918239, + "grad_norm": 0.9911133050918579, + "learning_rate": 4.9758100096766786e-05, + "loss": 0.5333, + "num_input_tokens_seen": 1751504, + "step": 2670 + }, + { + "epoch": 1.4019916142557651, + "grad_norm": 1.1300935745239258, + "learning_rate": 4.975491616418342e-05, + "loss": 0.4499, + "num_input_tokens_seen": 1754864, + "step": 2675 + }, + { + "epoch": 1.4046121593291405, + "grad_norm": 1.7669901847839355, + "learning_rate": 4.975171151781698e-05, + "loss": 0.4515, + "num_input_tokens_seen": 1757648, + "step": 2680 + }, + { + "epoch": 1.4072327044025157, + "grad_norm": 0.8042817711830139, + "learning_rate": 4.974848616034894e-05, + "loss": 0.4605, + "num_input_tokens_seen": 1760432, + "step": 2685 + }, + { + "epoch": 1.409853249475891, + "grad_norm": 0.5583804845809937, + "learning_rate": 4.974524009447815e-05, + "loss": 0.4697, + "num_input_tokens_seen": 1763664, + "step": 2690 + }, + { + "epoch": 1.4124737945492662, + "grad_norm": 0.8552871346473694, + "learning_rate": 4.974197332292078e-05, + "loss": 0.3924, + "num_input_tokens_seen": 1766640, + "step": 2695 + }, + { + "epoch": 1.4150943396226414, + "grad_norm": 0.96357661485672, + "learning_rate": 4.973868584841028e-05, + "loss": 0.4629, + "num_input_tokens_seen": 1769776, + "step": 2700 + }, + { + "epoch": 1.4177148846960168, + "grad_norm": 1.270294189453125, + "learning_rate": 4.973537767369749e-05, + "loss": 0.5007, + "num_input_tokens_seen": 1772112, + "step": 2705 + }, + { + "epoch": 1.420335429769392, + "grad_norm": 0.8364608883857727, + "learning_rate": 4.973204880155053e-05, + "loss": 0.4675, + "num_input_tokens_seen": 1775312, + "step": 2710 + }, + { + "epoch": 1.4229559748427674, + "grad_norm": 0.6230647563934326, + "learning_rate": 4.972869923475485e-05, + "loss": 0.588, + "num_input_tokens_seen": 1779504, + "step": 2715 + }, + { + "epoch": 1.4255765199161425, + "grad_norm": 0.7107497453689575, + "learning_rate": 4.972532897611321e-05, + "loss": 0.4239, + "num_input_tokens_seen": 1783248, + "step": 2720 + }, + { + "epoch": 1.4281970649895177, + "grad_norm": 1.4759639501571655, + "learning_rate": 4.972193802844569e-05, + "loss": 0.3418, + "num_input_tokens_seen": 1789168, + "step": 2725 + }, + { + "epoch": 1.430817610062893, + "grad_norm": 0.9443889856338501, + "learning_rate": 4.971852639458968e-05, + "loss": 0.4381, + "num_input_tokens_seen": 1792592, + "step": 2730 + }, + { + "epoch": 1.4334381551362683, + "grad_norm": 1.1995207071304321, + "learning_rate": 4.971509407739988e-05, + "loss": 0.4785, + "num_input_tokens_seen": 1795536, + "step": 2735 + }, + { + "epoch": 1.4360587002096437, + "grad_norm": 0.7326859831809998, + "learning_rate": 4.971164107974831e-05, + "loss": 0.4196, + "num_input_tokens_seen": 1798064, + "step": 2740 + }, + { + "epoch": 1.4386792452830188, + "grad_norm": 0.8591048121452332, + "learning_rate": 4.970816740452425e-05, + "loss": 0.447, + "num_input_tokens_seen": 1801392, + "step": 2745 + }, + { + "epoch": 1.441299790356394, + "grad_norm": 1.1418044567108154, + "learning_rate": 4.9704673054634335e-05, + "loss": 0.5208, + "num_input_tokens_seen": 1804368, + "step": 2750 + }, + { + "epoch": 1.4439203354297694, + "grad_norm": 1.283186674118042, + "learning_rate": 4.970115803300247e-05, + "loss": 0.5171, + "num_input_tokens_seen": 1807504, + "step": 2755 + }, + { + "epoch": 1.4465408805031448, + "grad_norm": 1.1464016437530518, + "learning_rate": 4.969762234256987e-05, + "loss": 0.3553, + "num_input_tokens_seen": 1810160, + "step": 2760 + }, + { + "epoch": 1.44916142557652, + "grad_norm": 1.031670093536377, + "learning_rate": 4.969406598629503e-05, + "loss": 0.4599, + "num_input_tokens_seen": 1813616, + "step": 2765 + }, + { + "epoch": 1.4517819706498951, + "grad_norm": 0.81280517578125, + "learning_rate": 4.969048896715376e-05, + "loss": 0.4359, + "num_input_tokens_seen": 1816752, + "step": 2770 + }, + { + "epoch": 1.4544025157232705, + "grad_norm": 0.9195156097412109, + "learning_rate": 4.968689128813914e-05, + "loss": 0.4382, + "num_input_tokens_seen": 1820784, + "step": 2775 + }, + { + "epoch": 1.4570230607966457, + "grad_norm": 1.1050888299942017, + "learning_rate": 4.968327295226153e-05, + "loss": 0.3873, + "num_input_tokens_seen": 1823024, + "step": 2780 + }, + { + "epoch": 1.459643605870021, + "grad_norm": 0.8507685661315918, + "learning_rate": 4.967963396254861e-05, + "loss": 0.5287, + "num_input_tokens_seen": 1828016, + "step": 2785 + }, + { + "epoch": 1.4622641509433962, + "grad_norm": 2.293369770050049, + "learning_rate": 4.967597432204531e-05, + "loss": 0.5032, + "num_input_tokens_seen": 1831280, + "step": 2790 + }, + { + "epoch": 1.4648846960167714, + "grad_norm": 0.5671038031578064, + "learning_rate": 4.9672294033813846e-05, + "loss": 0.4057, + "num_input_tokens_seen": 1834896, + "step": 2795 + }, + { + "epoch": 1.4675052410901468, + "grad_norm": 0.7233836650848389, + "learning_rate": 4.966859310093372e-05, + "loss": 0.4287, + "num_input_tokens_seen": 1837488, + "step": 2800 + }, + { + "epoch": 1.470125786163522, + "grad_norm": 0.7738066911697388, + "learning_rate": 4.966487152650171e-05, + "loss": 0.5067, + "num_input_tokens_seen": 1841584, + "step": 2805 + }, + { + "epoch": 1.4727463312368974, + "grad_norm": 0.9488295912742615, + "learning_rate": 4.966112931363185e-05, + "loss": 0.5192, + "num_input_tokens_seen": 1844688, + "step": 2810 + }, + { + "epoch": 1.4753668763102725, + "grad_norm": 0.9181019067764282, + "learning_rate": 4.965736646545546e-05, + "loss": 0.4604, + "num_input_tokens_seen": 1847472, + "step": 2815 + }, + { + "epoch": 1.4779874213836477, + "grad_norm": 1.2376569509506226, + "learning_rate": 4.96535829851211e-05, + "loss": 0.6717, + "num_input_tokens_seen": 1851088, + "step": 2820 + }, + { + "epoch": 1.480607966457023, + "grad_norm": 0.7197278738021851, + "learning_rate": 4.964977887579464e-05, + "loss": 0.473, + "num_input_tokens_seen": 1855216, + "step": 2825 + }, + { + "epoch": 1.4832285115303983, + "grad_norm": 0.6267632246017456, + "learning_rate": 4.964595414065918e-05, + "loss": 0.3936, + "num_input_tokens_seen": 1857744, + "step": 2830 + }, + { + "epoch": 1.4858490566037736, + "grad_norm": 0.982931911945343, + "learning_rate": 4.9642108782915066e-05, + "loss": 0.3999, + "num_input_tokens_seen": 1860400, + "step": 2835 + }, + { + "epoch": 1.4884696016771488, + "grad_norm": 1.0582060813903809, + "learning_rate": 4.963824280577993e-05, + "loss": 0.5116, + "num_input_tokens_seen": 1863248, + "step": 2840 + }, + { + "epoch": 1.491090146750524, + "grad_norm": 1.0129716396331787, + "learning_rate": 4.963435621248865e-05, + "loss": 0.4548, + "num_input_tokens_seen": 1866576, + "step": 2845 + }, + { + "epoch": 1.4937106918238994, + "grad_norm": 0.9157496690750122, + "learning_rate": 4.9630449006293345e-05, + "loss": 0.4229, + "num_input_tokens_seen": 1869840, + "step": 2850 + }, + { + "epoch": 1.4963312368972748, + "grad_norm": 0.8755952715873718, + "learning_rate": 4.9626521190463375e-05, + "loss": 0.5583, + "num_input_tokens_seen": 1872624, + "step": 2855 + }, + { + "epoch": 1.49895178197065, + "grad_norm": 1.025514841079712, + "learning_rate": 4.9622572768285377e-05, + "loss": 0.4509, + "num_input_tokens_seen": 1876016, + "step": 2860 + }, + { + "epoch": 1.5, + "eval_loss": 0.4744594395160675, + "eval_runtime": 15.9875, + "eval_samples_per_second": 53.041, + "eval_steps_per_second": 13.26, + "num_input_tokens_seen": 1877040, + "step": 2862 + }, + { + "epoch": 1.501572327044025, + "grad_norm": 0.9040143489837646, + "learning_rate": 4.96186037430632e-05, + "loss": 0.3916, + "num_input_tokens_seen": 1880976, + "step": 2865 + }, + { + "epoch": 1.5041928721174003, + "grad_norm": 1.7504807710647583, + "learning_rate": 4.9614614118117934e-05, + "loss": 0.5426, + "num_input_tokens_seen": 1883568, + "step": 2870 + }, + { + "epoch": 1.5068134171907757, + "grad_norm": 1.1855220794677734, + "learning_rate": 4.961060389678793e-05, + "loss": 0.4502, + "num_input_tokens_seen": 1886800, + "step": 2875 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 1.1036823987960815, + "learning_rate": 4.9606573082428754e-05, + "loss": 0.4262, + "num_input_tokens_seen": 1890672, + "step": 2880 + }, + { + "epoch": 1.5120545073375262, + "grad_norm": 0.9053158760070801, + "learning_rate": 4.9602521678413206e-05, + "loss": 0.3468, + "num_input_tokens_seen": 1893456, + "step": 2885 + }, + { + "epoch": 1.5146750524109014, + "grad_norm": 1.2621124982833862, + "learning_rate": 4.959844968813132e-05, + "loss": 0.4645, + "num_input_tokens_seen": 1896752, + "step": 2890 + }, + { + "epoch": 1.5172955974842768, + "grad_norm": 0.5261931419372559, + "learning_rate": 4.959435711499034e-05, + "loss": 0.3961, + "num_input_tokens_seen": 1899664, + "step": 2895 + }, + { + "epoch": 1.519916142557652, + "grad_norm": 0.841161847114563, + "learning_rate": 4.959024396241475e-05, + "loss": 0.3555, + "num_input_tokens_seen": 1902320, + "step": 2900 + }, + { + "epoch": 1.5225366876310273, + "grad_norm": 0.864820659160614, + "learning_rate": 4.958611023384626e-05, + "loss": 0.5174, + "num_input_tokens_seen": 1905488, + "step": 2905 + }, + { + "epoch": 1.5251572327044025, + "grad_norm": 0.899372398853302, + "learning_rate": 4.958195593274376e-05, + "loss": 0.4331, + "num_input_tokens_seen": 1908144, + "step": 2910 + }, + { + "epoch": 1.5277777777777777, + "grad_norm": 0.6398664712905884, + "learning_rate": 4.957778106258341e-05, + "loss": 0.4302, + "num_input_tokens_seen": 1911536, + "step": 2915 + }, + { + "epoch": 1.530398322851153, + "grad_norm": 0.5402488708496094, + "learning_rate": 4.957358562685852e-05, + "loss": 0.4351, + "num_input_tokens_seen": 1916176, + "step": 2920 + }, + { + "epoch": 1.5330188679245285, + "grad_norm": 1.050798773765564, + "learning_rate": 4.956936962907966e-05, + "loss": 0.4489, + "num_input_tokens_seen": 1920592, + "step": 2925 + }, + { + "epoch": 1.5356394129979036, + "grad_norm": 0.934907853603363, + "learning_rate": 4.9565133072774585e-05, + "loss": 0.4311, + "num_input_tokens_seen": 1923152, + "step": 2930 + }, + { + "epoch": 1.5382599580712788, + "grad_norm": 1.077411413192749, + "learning_rate": 4.956087596148824e-05, + "loss": 0.4866, + "num_input_tokens_seen": 1926000, + "step": 2935 + }, + { + "epoch": 1.540880503144654, + "grad_norm": 0.6997095346450806, + "learning_rate": 4.955659829878279e-05, + "loss": 0.5131, + "num_input_tokens_seen": 1929072, + "step": 2940 + }, + { + "epoch": 1.5435010482180294, + "grad_norm": 0.4783366918563843, + "learning_rate": 4.955230008823758e-05, + "loss": 0.3232, + "num_input_tokens_seen": 1932208, + "step": 2945 + }, + { + "epoch": 1.5461215932914047, + "grad_norm": 1.023843765258789, + "learning_rate": 4.954798133344916e-05, + "loss": 0.6153, + "num_input_tokens_seen": 1935152, + "step": 2950 + }, + { + "epoch": 1.54874213836478, + "grad_norm": 0.8002196550369263, + "learning_rate": 4.954364203803127e-05, + "loss": 0.5148, + "num_input_tokens_seen": 1938704, + "step": 2955 + }, + { + "epoch": 1.551362683438155, + "grad_norm": 0.7720306515693665, + "learning_rate": 4.953928220561482e-05, + "loss": 0.5402, + "num_input_tokens_seen": 1942192, + "step": 2960 + }, + { + "epoch": 1.5539832285115303, + "grad_norm": 1.3564172983169556, + "learning_rate": 4.953490183984795e-05, + "loss": 0.4243, + "num_input_tokens_seen": 1945744, + "step": 2965 + }, + { + "epoch": 1.5566037735849056, + "grad_norm": 0.7503104209899902, + "learning_rate": 4.953050094439591e-05, + "loss": 0.4462, + "num_input_tokens_seen": 1948944, + "step": 2970 + }, + { + "epoch": 1.559224318658281, + "grad_norm": 0.9157913327217102, + "learning_rate": 4.95260795229412e-05, + "loss": 0.4209, + "num_input_tokens_seen": 1951792, + "step": 2975 + }, + { + "epoch": 1.5618448637316562, + "grad_norm": 0.7814742922782898, + "learning_rate": 4.952163757918344e-05, + "loss": 0.5005, + "num_input_tokens_seen": 1955376, + "step": 2980 + }, + { + "epoch": 1.5644654088050314, + "grad_norm": 0.754301130771637, + "learning_rate": 4.951717511683947e-05, + "loss": 0.4813, + "num_input_tokens_seen": 1959120, + "step": 2985 + }, + { + "epoch": 1.5670859538784065, + "grad_norm": 0.5610980987548828, + "learning_rate": 4.9512692139643264e-05, + "loss": 0.3679, + "num_input_tokens_seen": 1964304, + "step": 2990 + }, + { + "epoch": 1.569706498951782, + "grad_norm": 1.1088690757751465, + "learning_rate": 4.950818865134596e-05, + "loss": 0.5633, + "num_input_tokens_seen": 1966832, + "step": 2995 + }, + { + "epoch": 1.5723270440251573, + "grad_norm": 0.645404040813446, + "learning_rate": 4.9503664655715885e-05, + "loss": 0.5358, + "num_input_tokens_seen": 1970224, + "step": 3000 + }, + { + "epoch": 1.5749475890985325, + "grad_norm": 0.5579919219017029, + "learning_rate": 4.9499120156538516e-05, + "loss": 0.4638, + "num_input_tokens_seen": 1974288, + "step": 3005 + }, + { + "epoch": 1.5775681341719077, + "grad_norm": 0.7647812366485596, + "learning_rate": 4.949455515761647e-05, + "loss": 0.4025, + "num_input_tokens_seen": 1978064, + "step": 3010 + }, + { + "epoch": 1.580188679245283, + "grad_norm": 1.2589459419250488, + "learning_rate": 4.948996966276953e-05, + "loss": 0.4979, + "num_input_tokens_seen": 1981744, + "step": 3015 + }, + { + "epoch": 1.5828092243186582, + "grad_norm": 1.4147696495056152, + "learning_rate": 4.948536367583464e-05, + "loss": 0.4845, + "num_input_tokens_seen": 1984016, + "step": 3020 + }, + { + "epoch": 1.5854297693920336, + "grad_norm": 0.931585431098938, + "learning_rate": 4.948073720066587e-05, + "loss": 0.3341, + "num_input_tokens_seen": 1987056, + "step": 3025 + }, + { + "epoch": 1.5880503144654088, + "grad_norm": 0.7190506458282471, + "learning_rate": 4.947609024113444e-05, + "loss": 0.503, + "num_input_tokens_seen": 1992048, + "step": 3030 + }, + { + "epoch": 1.590670859538784, + "grad_norm": 0.6777158379554749, + "learning_rate": 4.947142280112873e-05, + "loss": 0.6066, + "num_input_tokens_seen": 1995472, + "step": 3035 + }, + { + "epoch": 1.5932914046121593, + "grad_norm": 0.6768317818641663, + "learning_rate": 4.946673488455422e-05, + "loss": 0.3505, + "num_input_tokens_seen": 1999152, + "step": 3040 + }, + { + "epoch": 1.5959119496855347, + "grad_norm": 0.9045868515968323, + "learning_rate": 4.946202649533356e-05, + "loss": 0.4927, + "num_input_tokens_seen": 2001744, + "step": 3045 + }, + { + "epoch": 1.59853249475891, + "grad_norm": 1.053294062614441, + "learning_rate": 4.9457297637406506e-05, + "loss": 0.5604, + "num_input_tokens_seen": 2004848, + "step": 3050 + }, + { + "epoch": 1.601153039832285, + "grad_norm": 1.1805440187454224, + "learning_rate": 4.9452548314729965e-05, + "loss": 0.5076, + "num_input_tokens_seen": 2007920, + "step": 3055 + }, + { + "epoch": 1.6037735849056602, + "grad_norm": 0.936616063117981, + "learning_rate": 4.944777853127793e-05, + "loss": 0.4373, + "num_input_tokens_seen": 2011152, + "step": 3060 + }, + { + "epoch": 1.6063941299790356, + "grad_norm": 1.099682331085205, + "learning_rate": 4.9442988291041545e-05, + "loss": 0.5447, + "num_input_tokens_seen": 2013968, + "step": 3065 + }, + { + "epoch": 1.609014675052411, + "grad_norm": 1.5645614862442017, + "learning_rate": 4.943817759802908e-05, + "loss": 0.4841, + "num_input_tokens_seen": 2017936, + "step": 3070 + }, + { + "epoch": 1.6116352201257862, + "grad_norm": 0.4708518385887146, + "learning_rate": 4.94333464562659e-05, + "loss": 0.4092, + "num_input_tokens_seen": 2020528, + "step": 3075 + }, + { + "epoch": 1.6142557651991614, + "grad_norm": 1.0597648620605469, + "learning_rate": 4.942849486979446e-05, + "loss": 0.6004, + "num_input_tokens_seen": 2023792, + "step": 3080 + }, + { + "epoch": 1.6168763102725365, + "grad_norm": 0.6889981031417847, + "learning_rate": 4.9423622842674366e-05, + "loss": 0.4975, + "num_input_tokens_seen": 2027152, + "step": 3085 + }, + { + "epoch": 1.619496855345912, + "grad_norm": 0.6894543766975403, + "learning_rate": 4.9418730378982304e-05, + "loss": 0.3318, + "num_input_tokens_seen": 2029968, + "step": 3090 + }, + { + "epoch": 1.6221174004192873, + "grad_norm": 1.1045794486999512, + "learning_rate": 4.9413817482812064e-05, + "loss": 0.4133, + "num_input_tokens_seen": 2033488, + "step": 3095 + }, + { + "epoch": 1.6247379454926625, + "grad_norm": 1.2524356842041016, + "learning_rate": 4.9408884158274534e-05, + "loss": 0.5218, + "num_input_tokens_seen": 2037136, + "step": 3100 + }, + { + "epoch": 1.6273584905660377, + "grad_norm": 0.7781729698181152, + "learning_rate": 4.940393040949769e-05, + "loss": 0.4519, + "num_input_tokens_seen": 2039888, + "step": 3105 + }, + { + "epoch": 1.629979035639413, + "grad_norm": 0.8695936799049377, + "learning_rate": 4.939895624062661e-05, + "loss": 0.634, + "num_input_tokens_seen": 2043024, + "step": 3110 + }, + { + "epoch": 1.6325995807127882, + "grad_norm": 0.9215144515037537, + "learning_rate": 4.9393961655823454e-05, + "loss": 0.3455, + "num_input_tokens_seen": 2047056, + "step": 3115 + }, + { + "epoch": 1.6352201257861636, + "grad_norm": 1.3170740604400635, + "learning_rate": 4.9388946659267444e-05, + "loss": 0.5202, + "num_input_tokens_seen": 2050000, + "step": 3120 + }, + { + "epoch": 1.6378406708595388, + "grad_norm": 0.7563946843147278, + "learning_rate": 4.9383911255154916e-05, + "loss": 0.4611, + "num_input_tokens_seen": 2054032, + "step": 3125 + }, + { + "epoch": 1.640461215932914, + "grad_norm": 1.1154406070709229, + "learning_rate": 4.9378855447699264e-05, + "loss": 0.4804, + "num_input_tokens_seen": 2057072, + "step": 3130 + }, + { + "epoch": 1.6430817610062893, + "grad_norm": 0.6643469929695129, + "learning_rate": 4.9373779241130955e-05, + "loss": 0.4387, + "num_input_tokens_seen": 2060656, + "step": 3135 + }, + { + "epoch": 1.6457023060796647, + "grad_norm": 1.7015659809112549, + "learning_rate": 4.936868263969752e-05, + "loss": 0.4998, + "num_input_tokens_seen": 2063568, + "step": 3140 + }, + { + "epoch": 1.64832285115304, + "grad_norm": 1.0338246822357178, + "learning_rate": 4.936356564766358e-05, + "loss": 0.3794, + "num_input_tokens_seen": 2066960, + "step": 3145 + }, + { + "epoch": 1.650943396226415, + "grad_norm": 0.6922914981842041, + "learning_rate": 4.935842826931078e-05, + "loss": 0.4611, + "num_input_tokens_seen": 2070096, + "step": 3150 + }, + { + "epoch": 1.6535639412997902, + "grad_norm": 0.7206467390060425, + "learning_rate": 4.9353270508937854e-05, + "loss": 0.4606, + "num_input_tokens_seen": 2073072, + "step": 3155 + }, + { + "epoch": 1.6561844863731656, + "grad_norm": 0.7317720055580139, + "learning_rate": 4.934809237086059e-05, + "loss": 0.4374, + "num_input_tokens_seen": 2075632, + "step": 3160 + }, + { + "epoch": 1.658805031446541, + "grad_norm": 1.1413885354995728, + "learning_rate": 4.934289385941179e-05, + "loss": 0.3939, + "num_input_tokens_seen": 2079024, + "step": 3165 + }, + { + "epoch": 1.6614255765199162, + "grad_norm": 0.8386529684066772, + "learning_rate": 4.9337674978941364e-05, + "loss": 0.5958, + "num_input_tokens_seen": 2081744, + "step": 3170 + }, + { + "epoch": 1.6640461215932913, + "grad_norm": 0.573779284954071, + "learning_rate": 4.9332435733816204e-05, + "loss": 0.4985, + "num_input_tokens_seen": 2084784, + "step": 3175 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.7840592265129089, + "learning_rate": 4.932717612842028e-05, + "loss": 0.3948, + "num_input_tokens_seen": 2087984, + "step": 3180 + }, + { + "epoch": 1.669287211740042, + "grad_norm": 0.6742015480995178, + "learning_rate": 4.93218961671546e-05, + "loss": 0.4228, + "num_input_tokens_seen": 2091344, + "step": 3185 + }, + { + "epoch": 1.6719077568134173, + "grad_norm": 0.5258821249008179, + "learning_rate": 4.931659585443719e-05, + "loss": 0.4024, + "num_input_tokens_seen": 2095312, + "step": 3190 + }, + { + "epoch": 1.6745283018867925, + "grad_norm": 0.6267913579940796, + "learning_rate": 4.931127519470311e-05, + "loss": 0.3862, + "num_input_tokens_seen": 2098832, + "step": 3195 + }, + { + "epoch": 1.6771488469601676, + "grad_norm": 1.2268528938293457, + "learning_rate": 4.9305934192404426e-05, + "loss": 0.553, + "num_input_tokens_seen": 2101168, + "step": 3200 + }, + { + "epoch": 1.679769392033543, + "grad_norm": 1.2314720153808594, + "learning_rate": 4.930057285201027e-05, + "loss": 0.6548, + "num_input_tokens_seen": 2104816, + "step": 3205 + }, + { + "epoch": 1.6823899371069182, + "grad_norm": 1.2311400175094604, + "learning_rate": 4.929519117800676e-05, + "loss": 0.4519, + "num_input_tokens_seen": 2107728, + "step": 3210 + }, + { + "epoch": 1.6850104821802936, + "grad_norm": 1.9260826110839844, + "learning_rate": 4.928978917489703e-05, + "loss": 0.6496, + "num_input_tokens_seen": 2110992, + "step": 3215 + }, + { + "epoch": 1.6876310272536688, + "grad_norm": 0.826842725276947, + "learning_rate": 4.928436684720122e-05, + "loss": 0.4222, + "num_input_tokens_seen": 2113840, + "step": 3220 + }, + { + "epoch": 1.690251572327044, + "grad_norm": 0.953839898109436, + "learning_rate": 4.927892419945651e-05, + "loss": 0.4736, + "num_input_tokens_seen": 2116784, + "step": 3225 + }, + { + "epoch": 1.6928721174004193, + "grad_norm": 0.7061343789100647, + "learning_rate": 4.927346123621705e-05, + "loss": 0.6148, + "num_input_tokens_seen": 2119696, + "step": 3230 + }, + { + "epoch": 1.6954926624737947, + "grad_norm": 1.079646348953247, + "learning_rate": 4.926797796205399e-05, + "loss": 0.5255, + "num_input_tokens_seen": 2122608, + "step": 3235 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 1.104776382446289, + "learning_rate": 4.926247438155549e-05, + "loss": 0.4369, + "num_input_tokens_seen": 2125808, + "step": 3240 + }, + { + "epoch": 1.700733752620545, + "grad_norm": 0.8296249508857727, + "learning_rate": 4.9256950499326684e-05, + "loss": 0.3986, + "num_input_tokens_seen": 2129584, + "step": 3245 + }, + { + "epoch": 1.7033542976939202, + "grad_norm": 0.9556183815002441, + "learning_rate": 4.9251406319989725e-05, + "loss": 0.4276, + "num_input_tokens_seen": 2132208, + "step": 3250 + }, + { + "epoch": 1.7059748427672956, + "grad_norm": 0.6989715695381165, + "learning_rate": 4.9245841848183714e-05, + "loss": 0.3993, + "num_input_tokens_seen": 2135632, + "step": 3255 + }, + { + "epoch": 1.708595387840671, + "grad_norm": 1.0478543043136597, + "learning_rate": 4.924025708856475e-05, + "loss": 0.66, + "num_input_tokens_seen": 2139760, + "step": 3260 + }, + { + "epoch": 1.7112159329140462, + "grad_norm": 0.9971013069152832, + "learning_rate": 4.9234652045805895e-05, + "loss": 0.4369, + "num_input_tokens_seen": 2143280, + "step": 3265 + }, + { + "epoch": 1.7138364779874213, + "grad_norm": 0.6585241556167603, + "learning_rate": 4.922902672459722e-05, + "loss": 0.4172, + "num_input_tokens_seen": 2145648, + "step": 3270 + }, + { + "epoch": 1.7164570230607965, + "grad_norm": 1.021687626838684, + "learning_rate": 4.9223381129645706e-05, + "loss": 0.4474, + "num_input_tokens_seen": 2149040, + "step": 3275 + }, + { + "epoch": 1.719077568134172, + "grad_norm": 0.6146479249000549, + "learning_rate": 4.921771526567535e-05, + "loss": 0.494, + "num_input_tokens_seen": 2151792, + "step": 3280 + }, + { + "epoch": 1.7216981132075473, + "grad_norm": 0.7666890621185303, + "learning_rate": 4.921202913742707e-05, + "loss": 0.4165, + "num_input_tokens_seen": 2155600, + "step": 3285 + }, + { + "epoch": 1.7243186582809225, + "grad_norm": 0.6087334752082825, + "learning_rate": 4.920632274965878e-05, + "loss": 0.3894, + "num_input_tokens_seen": 2159088, + "step": 3290 + }, + { + "epoch": 1.7269392033542976, + "grad_norm": 1.0195256471633911, + "learning_rate": 4.920059610714531e-05, + "loss": 0.3453, + "num_input_tokens_seen": 2162800, + "step": 3295 + }, + { + "epoch": 1.7295597484276728, + "grad_norm": 1.1876991987228394, + "learning_rate": 4.919484921467846e-05, + "loss": 0.5195, + "num_input_tokens_seen": 2165872, + "step": 3300 + }, + { + "epoch": 1.7321802935010482, + "grad_norm": 0.9071875214576721, + "learning_rate": 4.9189082077066965e-05, + "loss": 0.529, + "num_input_tokens_seen": 2169456, + "step": 3305 + }, + { + "epoch": 1.7348008385744236, + "grad_norm": 0.7913904190063477, + "learning_rate": 4.918329469913649e-05, + "loss": 0.4538, + "num_input_tokens_seen": 2172368, + "step": 3310 + }, + { + "epoch": 1.7374213836477987, + "grad_norm": 0.6485946774482727, + "learning_rate": 4.917748708572967e-05, + "loss": 0.5567, + "num_input_tokens_seen": 2175504, + "step": 3315 + }, + { + "epoch": 1.740041928721174, + "grad_norm": 0.9313512444496155, + "learning_rate": 4.917165924170604e-05, + "loss": 0.4674, + "num_input_tokens_seen": 2179440, + "step": 3320 + }, + { + "epoch": 1.7426624737945493, + "grad_norm": 1.1463568210601807, + "learning_rate": 4.9165811171942064e-05, + "loss": 0.5506, + "num_input_tokens_seen": 2181904, + "step": 3325 + }, + { + "epoch": 1.7452830188679245, + "grad_norm": 0.9177520871162415, + "learning_rate": 4.915994288133115e-05, + "loss": 0.3775, + "num_input_tokens_seen": 2184816, + "step": 3330 + }, + { + "epoch": 1.7479035639412999, + "grad_norm": 1.0196067094802856, + "learning_rate": 4.9154054374783624e-05, + "loss": 0.543, + "num_input_tokens_seen": 2187920, + "step": 3335 + }, + { + "epoch": 1.750524109014675, + "grad_norm": 0.5377241969108582, + "learning_rate": 4.914814565722671e-05, + "loss": 0.4704, + "num_input_tokens_seen": 2191216, + "step": 3340 + }, + { + "epoch": 1.7531446540880502, + "grad_norm": 1.0635336637496948, + "learning_rate": 4.914221673360455e-05, + "loss": 0.4123, + "num_input_tokens_seen": 2193744, + "step": 3345 + }, + { + "epoch": 1.7557651991614256, + "grad_norm": 0.7078237533569336, + "learning_rate": 4.91362676088782e-05, + "loss": 0.4131, + "num_input_tokens_seen": 2196368, + "step": 3350 + }, + { + "epoch": 1.758385744234801, + "grad_norm": 0.6519614458084106, + "learning_rate": 4.913029828802561e-05, + "loss": 0.3528, + "num_input_tokens_seen": 2199632, + "step": 3355 + }, + { + "epoch": 1.7610062893081762, + "grad_norm": 0.6242289543151855, + "learning_rate": 4.912430877604165e-05, + "loss": 0.4766, + "num_input_tokens_seen": 2202704, + "step": 3360 + }, + { + "epoch": 1.7636268343815513, + "grad_norm": 0.8293642401695251, + "learning_rate": 4.9118299077938054e-05, + "loss": 0.6145, + "num_input_tokens_seen": 2205616, + "step": 3365 + }, + { + "epoch": 1.7662473794549265, + "grad_norm": 0.5299723744392395, + "learning_rate": 4.911226919874347e-05, + "loss": 0.4083, + "num_input_tokens_seen": 2209008, + "step": 3370 + }, + { + "epoch": 1.7688679245283019, + "grad_norm": 0.8112401962280273, + "learning_rate": 4.910621914350343e-05, + "loss": 0.3982, + "num_input_tokens_seen": 2211824, + "step": 3375 + }, + { + "epoch": 1.7714884696016773, + "grad_norm": 0.5307987332344055, + "learning_rate": 4.910014891728033e-05, + "loss": 0.4443, + "num_input_tokens_seen": 2215632, + "step": 3380 + }, + { + "epoch": 1.7741090146750524, + "grad_norm": 1.012789249420166, + "learning_rate": 4.9094058525153475e-05, + "loss": 0.4101, + "num_input_tokens_seen": 2218512, + "step": 3385 + }, + { + "epoch": 1.7767295597484276, + "grad_norm": 0.9255281090736389, + "learning_rate": 4.908794797221902e-05, + "loss": 0.3633, + "num_input_tokens_seen": 2221776, + "step": 3390 + }, + { + "epoch": 1.7793501048218028, + "grad_norm": 0.49620386958122253, + "learning_rate": 4.908181726358999e-05, + "loss": 0.3689, + "num_input_tokens_seen": 2224784, + "step": 3395 + }, + { + "epoch": 1.7819706498951782, + "grad_norm": 1.2273541688919067, + "learning_rate": 4.907566640439628e-05, + "loss": 0.4392, + "num_input_tokens_seen": 2227408, + "step": 3400 + }, + { + "epoch": 1.7845911949685536, + "grad_norm": 0.9090877175331116, + "learning_rate": 4.906949539978467e-05, + "loss": 0.4175, + "num_input_tokens_seen": 2230064, + "step": 3405 + }, + { + "epoch": 1.7872117400419287, + "grad_norm": 0.8446885943412781, + "learning_rate": 4.906330425491875e-05, + "loss": 0.3367, + "num_input_tokens_seen": 2234288, + "step": 3410 + }, + { + "epoch": 1.789832285115304, + "grad_norm": 0.4506591558456421, + "learning_rate": 4.9057092974979e-05, + "loss": 0.3395, + "num_input_tokens_seen": 2237072, + "step": 3415 + }, + { + "epoch": 1.7924528301886793, + "grad_norm": 0.5717880129814148, + "learning_rate": 4.905086156516273e-05, + "loss": 0.337, + "num_input_tokens_seen": 2240016, + "step": 3420 + }, + { + "epoch": 1.7950733752620545, + "grad_norm": 0.6348119974136353, + "learning_rate": 4.904461003068411e-05, + "loss": 0.4228, + "num_input_tokens_seen": 2242800, + "step": 3425 + }, + { + "epoch": 1.7976939203354299, + "grad_norm": 0.9716742038726807, + "learning_rate": 4.9038338376774124e-05, + "loss": 0.4596, + "num_input_tokens_seen": 2245680, + "step": 3430 + }, + { + "epoch": 1.800314465408805, + "grad_norm": 0.9509571194648743, + "learning_rate": 4.9032046608680613e-05, + "loss": 0.4938, + "num_input_tokens_seen": 2248272, + "step": 3435 + }, + { + "epoch": 1.8029350104821802, + "grad_norm": 1.168410301208496, + "learning_rate": 4.902573473166824e-05, + "loss": 0.299, + "num_input_tokens_seen": 2251376, + "step": 3440 + }, + { + "epoch": 1.8055555555555556, + "grad_norm": 1.5922887325286865, + "learning_rate": 4.9019402751018496e-05, + "loss": 0.497, + "num_input_tokens_seen": 2254256, + "step": 3445 + }, + { + "epoch": 1.808176100628931, + "grad_norm": 1.0310759544372559, + "learning_rate": 4.901305067202969e-05, + "loss": 0.5269, + "num_input_tokens_seen": 2257104, + "step": 3450 + }, + { + "epoch": 1.8107966457023061, + "grad_norm": 1.0089747905731201, + "learning_rate": 4.900667850001696e-05, + "loss": 0.4809, + "num_input_tokens_seen": 2259920, + "step": 3455 + }, + { + "epoch": 1.8134171907756813, + "grad_norm": 0.6547533273696899, + "learning_rate": 4.900028624031223e-05, + "loss": 0.3573, + "num_input_tokens_seen": 2262736, + "step": 3460 + }, + { + "epoch": 1.8160377358490565, + "grad_norm": 0.3966623544692993, + "learning_rate": 4.899387389826427e-05, + "loss": 0.4047, + "num_input_tokens_seen": 2266864, + "step": 3465 + }, + { + "epoch": 1.8186582809224319, + "grad_norm": 1.626362919807434, + "learning_rate": 4.898744147923863e-05, + "loss": 0.6399, + "num_input_tokens_seen": 2269584, + "step": 3470 + }, + { + "epoch": 1.8212788259958073, + "grad_norm": 0.5518888235092163, + "learning_rate": 4.898098898861766e-05, + "loss": 0.4097, + "num_input_tokens_seen": 2272720, + "step": 3475 + }, + { + "epoch": 1.8238993710691824, + "grad_norm": 0.6557338237762451, + "learning_rate": 4.897451643180051e-05, + "loss": 0.5464, + "num_input_tokens_seen": 2276880, + "step": 3480 + }, + { + "epoch": 1.8265199161425576, + "grad_norm": 2.4046545028686523, + "learning_rate": 4.896802381420313e-05, + "loss": 0.411, + "num_input_tokens_seen": 2280016, + "step": 3485 + }, + { + "epoch": 1.8291404612159328, + "grad_norm": 0.752528190612793, + "learning_rate": 4.896151114125823e-05, + "loss": 0.4382, + "num_input_tokens_seen": 2282640, + "step": 3490 + }, + { + "epoch": 1.8317610062893082, + "grad_norm": 1.3505322933197021, + "learning_rate": 4.895497841841533e-05, + "loss": 0.4068, + "num_input_tokens_seen": 2286032, + "step": 3495 + }, + { + "epoch": 1.8343815513626835, + "grad_norm": 0.6531990766525269, + "learning_rate": 4.8948425651140704e-05, + "loss": 0.5103, + "num_input_tokens_seen": 2289136, + "step": 3500 + }, + { + "epoch": 1.8370020964360587, + "grad_norm": 0.770585298538208, + "learning_rate": 4.894185284491742e-05, + "loss": 0.3534, + "num_input_tokens_seen": 2292944, + "step": 3505 + }, + { + "epoch": 1.8396226415094339, + "grad_norm": 0.7485274076461792, + "learning_rate": 4.893526000524529e-05, + "loss": 0.4226, + "num_input_tokens_seen": 2296240, + "step": 3510 + }, + { + "epoch": 1.8422431865828093, + "grad_norm": 1.143608808517456, + "learning_rate": 4.892864713764091e-05, + "loss": 0.4255, + "num_input_tokens_seen": 2299088, + "step": 3515 + }, + { + "epoch": 1.8448637316561844, + "grad_norm": 1.143466591835022, + "learning_rate": 4.892201424763762e-05, + "loss": 0.4733, + "num_input_tokens_seen": 2302736, + "step": 3520 + }, + { + "epoch": 1.8474842767295598, + "grad_norm": 0.9357412457466125, + "learning_rate": 4.891536134078553e-05, + "loss": 0.5139, + "num_input_tokens_seen": 2306640, + "step": 3525 + }, + { + "epoch": 1.850104821802935, + "grad_norm": 0.8397156596183777, + "learning_rate": 4.8908688422651465e-05, + "loss": 0.5268, + "num_input_tokens_seen": 2309840, + "step": 3530 + }, + { + "epoch": 1.8527253668763102, + "grad_norm": 1.4215189218521118, + "learning_rate": 4.8901995498819044e-05, + "loss": 0.4845, + "num_input_tokens_seen": 2312208, + "step": 3535 + }, + { + "epoch": 1.8553459119496856, + "grad_norm": 2.0612120628356934, + "learning_rate": 4.8895282574888576e-05, + "loss": 0.5608, + "num_input_tokens_seen": 2315088, + "step": 3540 + }, + { + "epoch": 1.857966457023061, + "grad_norm": 1.0592396259307861, + "learning_rate": 4.888854965647716e-05, + "loss": 0.474, + "num_input_tokens_seen": 2318224, + "step": 3545 + }, + { + "epoch": 1.8605870020964361, + "grad_norm": 0.48451220989227295, + "learning_rate": 4.8881796749218564e-05, + "loss": 0.3889, + "num_input_tokens_seen": 2322800, + "step": 3550 + }, + { + "epoch": 1.8632075471698113, + "grad_norm": 1.2871973514556885, + "learning_rate": 4.8875023858763335e-05, + "loss": 0.3117, + "num_input_tokens_seen": 2329008, + "step": 3555 + }, + { + "epoch": 1.8658280922431865, + "grad_norm": 1.4554080963134766, + "learning_rate": 4.88682309907787e-05, + "loss": 0.6046, + "num_input_tokens_seen": 2331344, + "step": 3560 + }, + { + "epoch": 1.8684486373165619, + "grad_norm": 0.7912168502807617, + "learning_rate": 4.886141815094863e-05, + "loss": 0.4482, + "num_input_tokens_seen": 2334352, + "step": 3565 + }, + { + "epoch": 1.8710691823899372, + "grad_norm": 0.6143674850463867, + "learning_rate": 4.88545853449738e-05, + "loss": 0.5071, + "num_input_tokens_seen": 2338512, + "step": 3570 + }, + { + "epoch": 1.8736897274633124, + "grad_norm": 0.592241644859314, + "learning_rate": 4.8847732578571585e-05, + "loss": 0.5396, + "num_input_tokens_seen": 2342224, + "step": 3575 + }, + { + "epoch": 1.8763102725366876, + "grad_norm": 1.7707849740982056, + "learning_rate": 4.8840859857476074e-05, + "loss": 0.4265, + "num_input_tokens_seen": 2345808, + "step": 3580 + }, + { + "epoch": 1.8789308176100628, + "grad_norm": 0.5506871342658997, + "learning_rate": 4.8833967187438034e-05, + "loss": 0.496, + "num_input_tokens_seen": 2348560, + "step": 3585 + }, + { + "epoch": 1.8815513626834381, + "grad_norm": 0.7687011957168579, + "learning_rate": 4.882705457422495e-05, + "loss": 0.3546, + "num_input_tokens_seen": 2351824, + "step": 3590 + }, + { + "epoch": 1.8841719077568135, + "grad_norm": 0.9790545105934143, + "learning_rate": 4.8820122023620975e-05, + "loss": 0.3626, + "num_input_tokens_seen": 2355696, + "step": 3595 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 0.6160573363304138, + "learning_rate": 4.881316954142694e-05, + "loss": 0.3869, + "num_input_tokens_seen": 2358352, + "step": 3600 + }, + { + "epoch": 1.8894129979035639, + "grad_norm": 0.5691027045249939, + "learning_rate": 4.880619713346039e-05, + "loss": 0.3879, + "num_input_tokens_seen": 2361776, + "step": 3605 + }, + { + "epoch": 1.892033542976939, + "grad_norm": 0.893364429473877, + "learning_rate": 4.879920480555549e-05, + "loss": 0.5119, + "num_input_tokens_seen": 2365264, + "step": 3610 + }, + { + "epoch": 1.8946540880503144, + "grad_norm": 1.1702690124511719, + "learning_rate": 4.8792192563563114e-05, + "loss": 0.6625, + "num_input_tokens_seen": 2368080, + "step": 3615 + }, + { + "epoch": 1.8972746331236898, + "grad_norm": 0.6682045459747314, + "learning_rate": 4.8785160413350797e-05, + "loss": 0.4572, + "num_input_tokens_seen": 2371696, + "step": 3620 + }, + { + "epoch": 1.899895178197065, + "grad_norm": 0.9398805499076843, + "learning_rate": 4.877810836080269e-05, + "loss": 0.4412, + "num_input_tokens_seen": 2374160, + "step": 3625 + }, + { + "epoch": 1.9025157232704402, + "grad_norm": 0.9666318297386169, + "learning_rate": 4.8771036411819656e-05, + "loss": 0.3761, + "num_input_tokens_seen": 2376656, + "step": 3630 + }, + { + "epoch": 1.9051362683438156, + "grad_norm": 0.7522600889205933, + "learning_rate": 4.876394457231917e-05, + "loss": 0.4412, + "num_input_tokens_seen": 2379856, + "step": 3635 + }, + { + "epoch": 1.9077568134171907, + "grad_norm": 0.7685899138450623, + "learning_rate": 4.875683284823537e-05, + "loss": 0.5216, + "num_input_tokens_seen": 2382736, + "step": 3640 + }, + { + "epoch": 1.9103773584905661, + "grad_norm": 0.9114197492599487, + "learning_rate": 4.8749701245519e-05, + "loss": 0.414, + "num_input_tokens_seen": 2386672, + "step": 3645 + }, + { + "epoch": 1.9129979035639413, + "grad_norm": 0.7425282597541809, + "learning_rate": 4.874254977013747e-05, + "loss": 0.3679, + "num_input_tokens_seen": 2389296, + "step": 3650 + }, + { + "epoch": 1.9156184486373165, + "grad_norm": 0.8446131944656372, + "learning_rate": 4.8735378428074806e-05, + "loss": 0.5341, + "num_input_tokens_seen": 2392080, + "step": 3655 + }, + { + "epoch": 1.9182389937106918, + "grad_norm": 0.4886540472507477, + "learning_rate": 4.8728187225331665e-05, + "loss": 0.3792, + "num_input_tokens_seen": 2395952, + "step": 3660 + }, + { + "epoch": 1.9208595387840672, + "grad_norm": 0.5189837217330933, + "learning_rate": 4.872097616792532e-05, + "loss": 0.3701, + "num_input_tokens_seen": 2398512, + "step": 3665 + }, + { + "epoch": 1.9234800838574424, + "grad_norm": 0.7328994870185852, + "learning_rate": 4.871374526188964e-05, + "loss": 0.5135, + "num_input_tokens_seen": 2401904, + "step": 3670 + }, + { + "epoch": 1.9261006289308176, + "grad_norm": 0.5765765309333801, + "learning_rate": 4.8706494513275134e-05, + "loss": 0.4812, + "num_input_tokens_seen": 2405584, + "step": 3675 + }, + { + "epoch": 1.9287211740041927, + "grad_norm": 0.4812271296977997, + "learning_rate": 4.869922392814889e-05, + "loss": 0.5234, + "num_input_tokens_seen": 2409712, + "step": 3680 + }, + { + "epoch": 1.9313417190775681, + "grad_norm": 0.8490468859672546, + "learning_rate": 4.869193351259459e-05, + "loss": 0.5924, + "num_input_tokens_seen": 2412240, + "step": 3685 + }, + { + "epoch": 1.9339622641509435, + "grad_norm": 0.673173189163208, + "learning_rate": 4.868462327271254e-05, + "loss": 0.3817, + "num_input_tokens_seen": 2415056, + "step": 3690 + }, + { + "epoch": 1.9365828092243187, + "grad_norm": 1.0735427141189575, + "learning_rate": 4.86772932146196e-05, + "loss": 0.438, + "num_input_tokens_seen": 2419184, + "step": 3695 + }, + { + "epoch": 1.9392033542976939, + "grad_norm": 1.2832117080688477, + "learning_rate": 4.866994334444923e-05, + "loss": 0.4883, + "num_input_tokens_seen": 2422512, + "step": 3700 + }, + { + "epoch": 1.941823899371069, + "grad_norm": 1.4377793073654175, + "learning_rate": 4.866257366835147e-05, + "loss": 0.4295, + "num_input_tokens_seen": 2425936, + "step": 3705 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 1.0550988912582397, + "learning_rate": 4.865518419249294e-05, + "loss": 0.5679, + "num_input_tokens_seen": 2428944, + "step": 3710 + }, + { + "epoch": 1.9470649895178198, + "grad_norm": 0.7928629517555237, + "learning_rate": 4.864777492305679e-05, + "loss": 0.3608, + "num_input_tokens_seen": 2432080, + "step": 3715 + }, + { + "epoch": 1.949685534591195, + "grad_norm": 0.8851485848426819, + "learning_rate": 4.864034586624277e-05, + "loss": 0.4028, + "num_input_tokens_seen": 2434864, + "step": 3720 + }, + { + "epoch": 1.9523060796645701, + "grad_norm": 0.836521327495575, + "learning_rate": 4.863289702826719e-05, + "loss": 0.4083, + "num_input_tokens_seen": 2437104, + "step": 3725 + }, + { + "epoch": 1.9549266247379455, + "grad_norm": 1.0958480834960938, + "learning_rate": 4.862542841536288e-05, + "loss": 0.6185, + "num_input_tokens_seen": 2439792, + "step": 3730 + }, + { + "epoch": 1.9575471698113207, + "grad_norm": 0.5410102009773254, + "learning_rate": 4.861794003377923e-05, + "loss": 0.5999, + "num_input_tokens_seen": 2442320, + "step": 3735 + }, + { + "epoch": 1.960167714884696, + "grad_norm": 0.6202638149261475, + "learning_rate": 4.8610431889782195e-05, + "loss": 0.3798, + "num_input_tokens_seen": 2445136, + "step": 3740 + }, + { + "epoch": 1.9627882599580713, + "grad_norm": 0.6556448340415955, + "learning_rate": 4.8602903989654224e-05, + "loss": 0.4277, + "num_input_tokens_seen": 2448080, + "step": 3745 + }, + { + "epoch": 1.9654088050314464, + "grad_norm": 0.742984414100647, + "learning_rate": 4.859535633969434e-05, + "loss": 0.4767, + "num_input_tokens_seen": 2451376, + "step": 3750 + }, + { + "epoch": 1.9680293501048218, + "grad_norm": 0.8273113369941711, + "learning_rate": 4.858778894621807e-05, + "loss": 0.5352, + "num_input_tokens_seen": 2454672, + "step": 3755 + }, + { + "epoch": 1.9706498951781972, + "grad_norm": 0.8492875099182129, + "learning_rate": 4.858020181555745e-05, + "loss": 0.4147, + "num_input_tokens_seen": 2457712, + "step": 3760 + }, + { + "epoch": 1.9732704402515724, + "grad_norm": 0.8644865155220032, + "learning_rate": 4.857259495406105e-05, + "loss": 0.4345, + "num_input_tokens_seen": 2461936, + "step": 3765 + }, + { + "epoch": 1.9758909853249476, + "grad_norm": 0.7969812750816345, + "learning_rate": 4.856496836809394e-05, + "loss": 0.3636, + "num_input_tokens_seen": 2465296, + "step": 3770 + }, + { + "epoch": 1.9785115303983227, + "grad_norm": 0.8386428356170654, + "learning_rate": 4.8557322064037714e-05, + "loss": 0.456, + "num_input_tokens_seen": 2469872, + "step": 3775 + }, + { + "epoch": 1.9811320754716981, + "grad_norm": 0.797146737575531, + "learning_rate": 4.854965604829044e-05, + "loss": 0.3381, + "num_input_tokens_seen": 2473840, + "step": 3780 + }, + { + "epoch": 1.9837526205450735, + "grad_norm": 0.8806960582733154, + "learning_rate": 4.8541970327266685e-05, + "loss": 0.4548, + "num_input_tokens_seen": 2476944, + "step": 3785 + }, + { + "epoch": 1.9863731656184487, + "grad_norm": 0.8920587301254272, + "learning_rate": 4.853426490739751e-05, + "loss": 0.426, + "num_input_tokens_seen": 2479792, + "step": 3790 + }, + { + "epoch": 1.9889937106918238, + "grad_norm": 0.9800183773040771, + "learning_rate": 4.852653979513047e-05, + "loss": 0.5526, + "num_input_tokens_seen": 2484464, + "step": 3795 + }, + { + "epoch": 1.991614255765199, + "grad_norm": 0.9677714109420776, + "learning_rate": 4.851879499692958e-05, + "loss": 0.4092, + "num_input_tokens_seen": 2487312, + "step": 3800 + }, + { + "epoch": 1.9942348008385744, + "grad_norm": 0.9310934543609619, + "learning_rate": 4.851103051927532e-05, + "loss": 0.4433, + "num_input_tokens_seen": 2490288, + "step": 3805 + }, + { + "epoch": 1.9968553459119498, + "grad_norm": 0.8258974552154541, + "learning_rate": 4.850324636866468e-05, + "loss": 0.4819, + "num_input_tokens_seen": 2494128, + "step": 3810 + }, + { + "epoch": 1.999475890985325, + "grad_norm": 0.7043586373329163, + "learning_rate": 4.849544255161106e-05, + "loss": 0.4172, + "num_input_tokens_seen": 2496848, + "step": 3815 + }, + { + "epoch": 2.0, + "eval_loss": 0.46624913811683655, + "eval_runtime": 16.0043, + "eval_samples_per_second": 52.986, + "eval_steps_per_second": 13.246, + "num_input_tokens_seen": 2497016, + "step": 3816 + }, + { + "epoch": 2.0020964360587, + "grad_norm": 0.777105450630188, + "learning_rate": 4.848761907464433e-05, + "loss": 0.3247, + "num_input_tokens_seen": 2499256, + "step": 3820 + }, + { + "epoch": 2.0047169811320753, + "grad_norm": 0.7645193934440613, + "learning_rate": 4.847977594431084e-05, + "loss": 0.3979, + "num_input_tokens_seen": 2502200, + "step": 3825 + }, + { + "epoch": 2.007337526205451, + "grad_norm": 1.0611459016799927, + "learning_rate": 4.847191316717335e-05, + "loss": 0.5533, + "num_input_tokens_seen": 2505080, + "step": 3830 + }, + { + "epoch": 2.009958071278826, + "grad_norm": 0.8008416891098022, + "learning_rate": 4.846403074981107e-05, + "loss": 0.4317, + "num_input_tokens_seen": 2508216, + "step": 3835 + }, + { + "epoch": 2.0125786163522013, + "grad_norm": 0.6990626454353333, + "learning_rate": 4.845612869881967e-05, + "loss": 0.3293, + "num_input_tokens_seen": 2511128, + "step": 3840 + }, + { + "epoch": 2.0151991614255764, + "grad_norm": 0.6920326948165894, + "learning_rate": 4.8448207020811194e-05, + "loss": 0.4435, + "num_input_tokens_seen": 2514776, + "step": 3845 + }, + { + "epoch": 2.0178197064989516, + "grad_norm": 1.3585435152053833, + "learning_rate": 4.8440265722414155e-05, + "loss": 0.5415, + "num_input_tokens_seen": 2517528, + "step": 3850 + }, + { + "epoch": 2.020440251572327, + "grad_norm": 0.6074157357215881, + "learning_rate": 4.843230481027347e-05, + "loss": 0.4785, + "num_input_tokens_seen": 2523032, + "step": 3855 + }, + { + "epoch": 2.0230607966457024, + "grad_norm": 0.6190252900123596, + "learning_rate": 4.8424324291050464e-05, + "loss": 0.3929, + "num_input_tokens_seen": 2526200, + "step": 3860 + }, + { + "epoch": 2.0256813417190775, + "grad_norm": 0.9831500053405762, + "learning_rate": 4.841632417142287e-05, + "loss": 0.5303, + "num_input_tokens_seen": 2532760, + "step": 3865 + }, + { + "epoch": 2.0283018867924527, + "grad_norm": 0.7516472935676575, + "learning_rate": 4.840830445808483e-05, + "loss": 0.3504, + "num_input_tokens_seen": 2535640, + "step": 3870 + }, + { + "epoch": 2.030922431865828, + "grad_norm": 0.6724318861961365, + "learning_rate": 4.840026515774686e-05, + "loss": 0.3851, + "num_input_tokens_seen": 2539288, + "step": 3875 + }, + { + "epoch": 2.0335429769392035, + "grad_norm": 1.2951538562774658, + "learning_rate": 4.8392206277135896e-05, + "loss": 0.4752, + "num_input_tokens_seen": 2541880, + "step": 3880 + }, + { + "epoch": 2.0361635220125787, + "grad_norm": 0.7937584519386292, + "learning_rate": 4.8384127822995227e-05, + "loss": 0.4063, + "num_input_tokens_seen": 2545080, + "step": 3885 + }, + { + "epoch": 2.038784067085954, + "grad_norm": 0.8190783858299255, + "learning_rate": 4.8376029802084546e-05, + "loss": 0.3152, + "num_input_tokens_seen": 2548376, + "step": 3890 + }, + { + "epoch": 2.041404612159329, + "grad_norm": 0.7781983017921448, + "learning_rate": 4.836791222117989e-05, + "loss": 0.3578, + "num_input_tokens_seen": 2550616, + "step": 3895 + }, + { + "epoch": 2.0440251572327046, + "grad_norm": 0.8768317699432373, + "learning_rate": 4.83597750870737e-05, + "loss": 0.5031, + "num_input_tokens_seen": 2555000, + "step": 3900 + }, + { + "epoch": 2.04664570230608, + "grad_norm": 1.1290158033370972, + "learning_rate": 4.8351618406574746e-05, + "loss": 0.4069, + "num_input_tokens_seen": 2558328, + "step": 3905 + }, + { + "epoch": 2.049266247379455, + "grad_norm": 0.9850488305091858, + "learning_rate": 4.834344218650817e-05, + "loss": 0.5016, + "num_input_tokens_seen": 2561592, + "step": 3910 + }, + { + "epoch": 2.05188679245283, + "grad_norm": 0.6264485716819763, + "learning_rate": 4.833524643371545e-05, + "loss": 0.2937, + "num_input_tokens_seen": 2564472, + "step": 3915 + }, + { + "epoch": 2.0545073375262053, + "grad_norm": 1.578213095664978, + "learning_rate": 4.8327031155054434e-05, + "loss": 0.4006, + "num_input_tokens_seen": 2567352, + "step": 3920 + }, + { + "epoch": 2.057127882599581, + "grad_norm": 0.8124864101409912, + "learning_rate": 4.831879635739929e-05, + "loss": 0.3101, + "num_input_tokens_seen": 2571448, + "step": 3925 + }, + { + "epoch": 2.059748427672956, + "grad_norm": 1.1771689653396606, + "learning_rate": 4.83105420476405e-05, + "loss": 0.3401, + "num_input_tokens_seen": 2574264, + "step": 3930 + }, + { + "epoch": 2.0623689727463312, + "grad_norm": 1.159608006477356, + "learning_rate": 4.830226823268491e-05, + "loss": 0.5131, + "num_input_tokens_seen": 2577592, + "step": 3935 + }, + { + "epoch": 2.0649895178197064, + "grad_norm": 1.1599745750427246, + "learning_rate": 4.829397491945568e-05, + "loss": 0.4403, + "num_input_tokens_seen": 2581624, + "step": 3940 + }, + { + "epoch": 2.0676100628930816, + "grad_norm": 0.7546862959861755, + "learning_rate": 4.828566211489225e-05, + "loss": 0.4224, + "num_input_tokens_seen": 2584888, + "step": 3945 + }, + { + "epoch": 2.070230607966457, + "grad_norm": 1.6702712774276733, + "learning_rate": 4.827732982595041e-05, + "loss": 0.4359, + "num_input_tokens_seen": 2587672, + "step": 3950 + }, + { + "epoch": 2.0728511530398324, + "grad_norm": 0.9914207458496094, + "learning_rate": 4.826897805960224e-05, + "loss": 0.4206, + "num_input_tokens_seen": 2590136, + "step": 3955 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 0.7342826724052429, + "learning_rate": 4.8260606822836116e-05, + "loss": 0.4111, + "num_input_tokens_seen": 2593976, + "step": 3960 + }, + { + "epoch": 2.0780922431865827, + "grad_norm": 0.959398090839386, + "learning_rate": 4.8252216122656716e-05, + "loss": 0.9059, + "num_input_tokens_seen": 2597208, + "step": 3965 + }, + { + "epoch": 2.080712788259958, + "grad_norm": 0.9145901203155518, + "learning_rate": 4.824380596608497e-05, + "loss": 0.4258, + "num_input_tokens_seen": 2600536, + "step": 3970 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.9551880359649658, + "learning_rate": 4.823537636015812e-05, + "loss": 0.3505, + "num_input_tokens_seen": 2603608, + "step": 3975 + }, + { + "epoch": 2.0859538784067087, + "grad_norm": 0.9049939513206482, + "learning_rate": 4.822692731192969e-05, + "loss": 0.4114, + "num_input_tokens_seen": 2607288, + "step": 3980 + }, + { + "epoch": 2.088574423480084, + "grad_norm": 0.8881303668022156, + "learning_rate": 4.8218458828469445e-05, + "loss": 0.5263, + "num_input_tokens_seen": 2610392, + "step": 3985 + }, + { + "epoch": 2.091194968553459, + "grad_norm": 0.665157675743103, + "learning_rate": 4.820997091686343e-05, + "loss": 0.5104, + "num_input_tokens_seen": 2613208, + "step": 3990 + }, + { + "epoch": 2.0938155136268346, + "grad_norm": 0.7549234628677368, + "learning_rate": 4.8201463584213946e-05, + "loss": 0.5382, + "num_input_tokens_seen": 2616472, + "step": 3995 + }, + { + "epoch": 2.0964360587002098, + "grad_norm": 0.7100829482078552, + "learning_rate": 4.819293683763954e-05, + "loss": 0.4122, + "num_input_tokens_seen": 2620504, + "step": 4000 + }, + { + "epoch": 2.099056603773585, + "grad_norm": 0.9131651520729065, + "learning_rate": 4.818439068427498e-05, + "loss": 0.4901, + "num_input_tokens_seen": 2624632, + "step": 4005 + }, + { + "epoch": 2.10167714884696, + "grad_norm": 1.1091861724853516, + "learning_rate": 4.817582513127133e-05, + "loss": 0.4203, + "num_input_tokens_seen": 2627608, + "step": 4010 + }, + { + "epoch": 2.1042976939203353, + "grad_norm": 0.5477728247642517, + "learning_rate": 4.8167240185795835e-05, + "loss": 0.33, + "num_input_tokens_seen": 2630232, + "step": 4015 + }, + { + "epoch": 2.106918238993711, + "grad_norm": 0.6681999564170837, + "learning_rate": 4.8158635855032e-05, + "loss": 0.3506, + "num_input_tokens_seen": 2633624, + "step": 4020 + }, + { + "epoch": 2.109538784067086, + "grad_norm": 1.2846183776855469, + "learning_rate": 4.8150012146179514e-05, + "loss": 0.4437, + "num_input_tokens_seen": 2636280, + "step": 4025 + }, + { + "epoch": 2.1121593291404612, + "grad_norm": 0.8901273012161255, + "learning_rate": 4.814136906645431e-05, + "loss": 0.4306, + "num_input_tokens_seen": 2640152, + "step": 4030 + }, + { + "epoch": 2.1147798742138364, + "grad_norm": 0.6136233806610107, + "learning_rate": 4.813270662308854e-05, + "loss": 0.4253, + "num_input_tokens_seen": 2643800, + "step": 4035 + }, + { + "epoch": 2.1174004192872116, + "grad_norm": 0.854512095451355, + "learning_rate": 4.812402482333052e-05, + "loss": 0.393, + "num_input_tokens_seen": 2647608, + "step": 4040 + }, + { + "epoch": 2.120020964360587, + "grad_norm": 0.9328063130378723, + "learning_rate": 4.811532367444479e-05, + "loss": 0.4828, + "num_input_tokens_seen": 2650200, + "step": 4045 + }, + { + "epoch": 2.1226415094339623, + "grad_norm": 0.8199171423912048, + "learning_rate": 4.810660318371208e-05, + "loss": 0.5392, + "num_input_tokens_seen": 2653784, + "step": 4050 + }, + { + "epoch": 2.1252620545073375, + "grad_norm": 0.5396403074264526, + "learning_rate": 4.809786335842929e-05, + "loss": 0.4333, + "num_input_tokens_seen": 2658104, + "step": 4055 + }, + { + "epoch": 2.1278825995807127, + "grad_norm": 1.22113037109375, + "learning_rate": 4.8089104205909506e-05, + "loss": 0.3703, + "num_input_tokens_seen": 2661080, + "step": 4060 + }, + { + "epoch": 2.130503144654088, + "grad_norm": 0.766905665397644, + "learning_rate": 4.8080325733482004e-05, + "loss": 0.3631, + "num_input_tokens_seen": 2664312, + "step": 4065 + }, + { + "epoch": 2.1331236897274635, + "grad_norm": 1.436716914176941, + "learning_rate": 4.8071527948492176e-05, + "loss": 0.4663, + "num_input_tokens_seen": 2667704, + "step": 4070 + }, + { + "epoch": 2.1357442348008386, + "grad_norm": 0.9589127898216248, + "learning_rate": 4.806271085830164e-05, + "loss": 0.3454, + "num_input_tokens_seen": 2671160, + "step": 4075 + }, + { + "epoch": 2.138364779874214, + "grad_norm": 0.7042800188064575, + "learning_rate": 4.805387447028812e-05, + "loss": 0.4594, + "num_input_tokens_seen": 2675544, + "step": 4080 + }, + { + "epoch": 2.140985324947589, + "grad_norm": 1.0763548612594604, + "learning_rate": 4.80450187918455e-05, + "loss": 0.4348, + "num_input_tokens_seen": 2678264, + "step": 4085 + }, + { + "epoch": 2.1436058700209646, + "grad_norm": 0.9889065623283386, + "learning_rate": 4.8036143830383807e-05, + "loss": 0.3747, + "num_input_tokens_seen": 2684120, + "step": 4090 + }, + { + "epoch": 2.1462264150943398, + "grad_norm": 1.3986936807632446, + "learning_rate": 4.8027249593329206e-05, + "loss": 0.5781, + "num_input_tokens_seen": 2686872, + "step": 4095 + }, + { + "epoch": 2.148846960167715, + "grad_norm": 0.8649975657463074, + "learning_rate": 4.8018336088123986e-05, + "loss": 0.3855, + "num_input_tokens_seen": 2690456, + "step": 4100 + }, + { + "epoch": 2.15146750524109, + "grad_norm": 0.8798226714134216, + "learning_rate": 4.800940332222656e-05, + "loss": 0.4121, + "num_input_tokens_seen": 2692824, + "step": 4105 + }, + { + "epoch": 2.1540880503144653, + "grad_norm": 1.0019736289978027, + "learning_rate": 4.8000451303111474e-05, + "loss": 0.507, + "num_input_tokens_seen": 2695704, + "step": 4110 + }, + { + "epoch": 2.156708595387841, + "grad_norm": 0.6111348867416382, + "learning_rate": 4.799148003826936e-05, + "loss": 0.4191, + "num_input_tokens_seen": 2699480, + "step": 4115 + }, + { + "epoch": 2.159329140461216, + "grad_norm": 0.7186006307601929, + "learning_rate": 4.798248953520694e-05, + "loss": 0.482, + "num_input_tokens_seen": 2703960, + "step": 4120 + }, + { + "epoch": 2.161949685534591, + "grad_norm": 0.7773911356925964, + "learning_rate": 4.7973479801447084e-05, + "loss": 0.3697, + "num_input_tokens_seen": 2706552, + "step": 4125 + }, + { + "epoch": 2.1645702306079664, + "grad_norm": 0.8523171544075012, + "learning_rate": 4.796445084452871e-05, + "loss": 0.4566, + "num_input_tokens_seen": 2709368, + "step": 4130 + }, + { + "epoch": 2.1671907756813416, + "grad_norm": 1.1112538576126099, + "learning_rate": 4.7955402672006854e-05, + "loss": 0.4708, + "num_input_tokens_seen": 2712248, + "step": 4135 + }, + { + "epoch": 2.169811320754717, + "grad_norm": 0.8870540857315063, + "learning_rate": 4.794633529145259e-05, + "loss": 0.3738, + "num_input_tokens_seen": 2715448, + "step": 4140 + }, + { + "epoch": 2.1724318658280923, + "grad_norm": 0.5395047664642334, + "learning_rate": 4.793724871045312e-05, + "loss": 0.3458, + "num_input_tokens_seen": 2719064, + "step": 4145 + }, + { + "epoch": 2.1750524109014675, + "grad_norm": 0.7204754948616028, + "learning_rate": 4.792814293661164e-05, + "loss": 0.3965, + "num_input_tokens_seen": 2722488, + "step": 4150 + }, + { + "epoch": 2.1776729559748427, + "grad_norm": 1.08787202835083, + "learning_rate": 4.791901797754748e-05, + "loss": 0.3371, + "num_input_tokens_seen": 2725528, + "step": 4155 + }, + { + "epoch": 2.180293501048218, + "grad_norm": 0.5393990278244019, + "learning_rate": 4.790987384089597e-05, + "loss": 0.3671, + "num_input_tokens_seen": 2728824, + "step": 4160 + }, + { + "epoch": 2.1829140461215935, + "grad_norm": 0.6779523491859436, + "learning_rate": 4.790071053430851e-05, + "loss": 0.5133, + "num_input_tokens_seen": 2732472, + "step": 4165 + }, + { + "epoch": 2.1855345911949686, + "grad_norm": 0.6041650772094727, + "learning_rate": 4.7891528065452544e-05, + "loss": 0.4329, + "num_input_tokens_seen": 2735544, + "step": 4170 + }, + { + "epoch": 2.188155136268344, + "grad_norm": 0.8024056553840637, + "learning_rate": 4.788232644201153e-05, + "loss": 0.4354, + "num_input_tokens_seen": 2739032, + "step": 4175 + }, + { + "epoch": 2.190775681341719, + "grad_norm": 1.0426281690597534, + "learning_rate": 4.787310567168498e-05, + "loss": 0.5219, + "num_input_tokens_seen": 2742456, + "step": 4180 + }, + { + "epoch": 2.1933962264150946, + "grad_norm": 1.2483441829681396, + "learning_rate": 4.78638657621884e-05, + "loss": 0.3638, + "num_input_tokens_seen": 2744888, + "step": 4185 + }, + { + "epoch": 2.1960167714884697, + "grad_norm": 0.8211708068847656, + "learning_rate": 4.785460672125332e-05, + "loss": 0.4921, + "num_input_tokens_seen": 2747512, + "step": 4190 + }, + { + "epoch": 2.198637316561845, + "grad_norm": 0.8558998107910156, + "learning_rate": 4.7845328556627306e-05, + "loss": 0.4527, + "num_input_tokens_seen": 2750648, + "step": 4195 + }, + { + "epoch": 2.20125786163522, + "grad_norm": 1.1886229515075684, + "learning_rate": 4.783603127607388e-05, + "loss": 0.3346, + "num_input_tokens_seen": 2753368, + "step": 4200 + }, + { + "epoch": 2.2038784067085953, + "grad_norm": 0.952170729637146, + "learning_rate": 4.78267148873726e-05, + "loss": 0.384, + "num_input_tokens_seen": 2756696, + "step": 4205 + }, + { + "epoch": 2.2064989517819704, + "grad_norm": 1.4667011499404907, + "learning_rate": 4.781737939831898e-05, + "loss": 0.4388, + "num_input_tokens_seen": 2759256, + "step": 4210 + }, + { + "epoch": 2.209119496855346, + "grad_norm": 0.89683997631073, + "learning_rate": 4.7808024816724536e-05, + "loss": 0.4769, + "num_input_tokens_seen": 2762296, + "step": 4215 + }, + { + "epoch": 2.211740041928721, + "grad_norm": 1.4685626029968262, + "learning_rate": 4.7798651150416754e-05, + "loss": 0.4138, + "num_input_tokens_seen": 2765400, + "step": 4220 + }, + { + "epoch": 2.2143605870020964, + "grad_norm": 1.2142530679702759, + "learning_rate": 4.778925840723909e-05, + "loss": 0.3993, + "num_input_tokens_seen": 2768120, + "step": 4225 + }, + { + "epoch": 2.2169811320754715, + "grad_norm": 0.8623555302619934, + "learning_rate": 4.777984659505096e-05, + "loss": 0.4728, + "num_input_tokens_seen": 2771096, + "step": 4230 + }, + { + "epoch": 2.219601677148847, + "grad_norm": 1.2266653776168823, + "learning_rate": 4.777041572172774e-05, + "loss": 0.4798, + "num_input_tokens_seen": 2774328, + "step": 4235 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.6406659483909607, + "learning_rate": 4.776096579516076e-05, + "loss": 0.486, + "num_input_tokens_seen": 2778008, + "step": 4240 + }, + { + "epoch": 2.2248427672955975, + "grad_norm": 1.9605166912078857, + "learning_rate": 4.775149682325728e-05, + "loss": 0.4225, + "num_input_tokens_seen": 2780216, + "step": 4245 + }, + { + "epoch": 2.2274633123689727, + "grad_norm": 1.1101921796798706, + "learning_rate": 4.77420088139405e-05, + "loss": 0.4002, + "num_input_tokens_seen": 2784088, + "step": 4250 + }, + { + "epoch": 2.230083857442348, + "grad_norm": 0.8717358112335205, + "learning_rate": 4.7732501775149564e-05, + "loss": 0.338, + "num_input_tokens_seen": 2786488, + "step": 4255 + }, + { + "epoch": 2.2327044025157234, + "grad_norm": 1.0911259651184082, + "learning_rate": 4.7722975714839526e-05, + "loss": 0.3393, + "num_input_tokens_seen": 2788920, + "step": 4260 + }, + { + "epoch": 2.2353249475890986, + "grad_norm": 0.9638444185256958, + "learning_rate": 4.7713430640981346e-05, + "loss": 0.5692, + "num_input_tokens_seen": 2791640, + "step": 4265 + }, + { + "epoch": 2.237945492662474, + "grad_norm": 1.064584732055664, + "learning_rate": 4.7703866561561915e-05, + "loss": 0.3151, + "num_input_tokens_seen": 2794904, + "step": 4270 + }, + { + "epoch": 2.240566037735849, + "grad_norm": 0.9129518270492554, + "learning_rate": 4.769428348458402e-05, + "loss": 0.3703, + "num_input_tokens_seen": 2798840, + "step": 4275 + }, + { + "epoch": 2.243186582809224, + "grad_norm": 0.8561168909072876, + "learning_rate": 4.7684681418066334e-05, + "loss": 0.4649, + "num_input_tokens_seen": 2802616, + "step": 4280 + }, + { + "epoch": 2.2458071278825997, + "grad_norm": 1.4074697494506836, + "learning_rate": 4.767506037004344e-05, + "loss": 0.4027, + "num_input_tokens_seen": 2805432, + "step": 4285 + }, + { + "epoch": 2.248427672955975, + "grad_norm": 1.254148006439209, + "learning_rate": 4.766542034856577e-05, + "loss": 0.4558, + "num_input_tokens_seen": 2808696, + "step": 4290 + }, + { + "epoch": 2.25104821802935, + "grad_norm": 0.844550371170044, + "learning_rate": 4.7655761361699676e-05, + "loss": 0.3737, + "num_input_tokens_seen": 2811640, + "step": 4295 + }, + { + "epoch": 2.2536687631027252, + "grad_norm": 0.9207198023796082, + "learning_rate": 4.7646083417527345e-05, + "loss": 0.3726, + "num_input_tokens_seen": 2815224, + "step": 4300 + }, + { + "epoch": 2.2562893081761004, + "grad_norm": 0.8567259311676025, + "learning_rate": 4.7636386524146846e-05, + "loss": 0.4425, + "num_input_tokens_seen": 2817688, + "step": 4305 + }, + { + "epoch": 2.258909853249476, + "grad_norm": 0.7549933195114136, + "learning_rate": 4.7626670689672095e-05, + "loss": 0.3427, + "num_input_tokens_seen": 2821016, + "step": 4310 + }, + { + "epoch": 2.261530398322851, + "grad_norm": 1.0269373655319214, + "learning_rate": 4.761693592223285e-05, + "loss": 0.5144, + "num_input_tokens_seen": 2823608, + "step": 4315 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 1.029028296470642, + "learning_rate": 4.760718222997472e-05, + "loss": 0.368, + "num_input_tokens_seen": 2828248, + "step": 4320 + }, + { + "epoch": 2.2667714884696015, + "grad_norm": 0.873231053352356, + "learning_rate": 4.7597409621059164e-05, + "loss": 0.4691, + "num_input_tokens_seen": 2832344, + "step": 4325 + }, + { + "epoch": 2.269392033542977, + "grad_norm": 1.0330032110214233, + "learning_rate": 4.7587618103663444e-05, + "loss": 0.3713, + "num_input_tokens_seen": 2836408, + "step": 4330 + }, + { + "epoch": 2.2720125786163523, + "grad_norm": 0.735686182975769, + "learning_rate": 4.757780768598066e-05, + "loss": 0.4427, + "num_input_tokens_seen": 2839320, + "step": 4335 + }, + { + "epoch": 2.2746331236897275, + "grad_norm": 1.2915380001068115, + "learning_rate": 4.756797837621971e-05, + "loss": 0.4694, + "num_input_tokens_seen": 2842456, + "step": 4340 + }, + { + "epoch": 2.2772536687631026, + "grad_norm": 1.3666719198226929, + "learning_rate": 4.755813018260532e-05, + "loss": 0.5327, + "num_input_tokens_seen": 2845432, + "step": 4345 + }, + { + "epoch": 2.279874213836478, + "grad_norm": 0.7247224450111389, + "learning_rate": 4.754826311337801e-05, + "loss": 0.4768, + "num_input_tokens_seen": 2848536, + "step": 4350 + }, + { + "epoch": 2.2824947589098534, + "grad_norm": 0.7602744698524475, + "learning_rate": 4.753837717679409e-05, + "loss": 0.5164, + "num_input_tokens_seen": 2851544, + "step": 4355 + }, + { + "epoch": 2.2851153039832286, + "grad_norm": 0.8811573386192322, + "learning_rate": 4.7528472381125653e-05, + "loss": 0.4121, + "num_input_tokens_seen": 2854392, + "step": 4360 + }, + { + "epoch": 2.2877358490566038, + "grad_norm": 0.7466426491737366, + "learning_rate": 4.75185487346606e-05, + "loss": 0.3483, + "num_input_tokens_seen": 2858424, + "step": 4365 + }, + { + "epoch": 2.290356394129979, + "grad_norm": 0.7757810354232788, + "learning_rate": 4.750860624570256e-05, + "loss": 0.4498, + "num_input_tokens_seen": 2861368, + "step": 4370 + }, + { + "epoch": 2.2929769392033545, + "grad_norm": 0.9642816185951233, + "learning_rate": 4.7498644922570966e-05, + "loss": 0.4274, + "num_input_tokens_seen": 2863960, + "step": 4375 + }, + { + "epoch": 2.2955974842767297, + "grad_norm": 1.4252939224243164, + "learning_rate": 4.7488664773601004e-05, + "loss": 0.5009, + "num_input_tokens_seen": 2867352, + "step": 4380 + }, + { + "epoch": 2.298218029350105, + "grad_norm": 1.0357571840286255, + "learning_rate": 4.7478665807143605e-05, + "loss": 0.4778, + "num_input_tokens_seen": 2871992, + "step": 4385 + }, + { + "epoch": 2.30083857442348, + "grad_norm": 1.110084056854248, + "learning_rate": 4.7468648031565434e-05, + "loss": 0.3879, + "num_input_tokens_seen": 2874680, + "step": 4390 + }, + { + "epoch": 2.3034591194968552, + "grad_norm": 1.6297154426574707, + "learning_rate": 4.745861145524892e-05, + "loss": 0.3161, + "num_input_tokens_seen": 2877688, + "step": 4395 + }, + { + "epoch": 2.3060796645702304, + "grad_norm": 0.7920019626617432, + "learning_rate": 4.74485560865922e-05, + "loss": 0.4931, + "num_input_tokens_seen": 2885528, + "step": 4400 + }, + { + "epoch": 2.308700209643606, + "grad_norm": 0.8782615661621094, + "learning_rate": 4.743848193400917e-05, + "loss": 0.5479, + "num_input_tokens_seen": 2889464, + "step": 4405 + }, + { + "epoch": 2.311320754716981, + "grad_norm": 1.040565848350525, + "learning_rate": 4.7428389005929405e-05, + "loss": 0.3286, + "num_input_tokens_seen": 2893464, + "step": 4410 + }, + { + "epoch": 2.3139412997903563, + "grad_norm": 1.3487461805343628, + "learning_rate": 4.74182773107982e-05, + "loss": 0.4597, + "num_input_tokens_seen": 2896792, + "step": 4415 + }, + { + "epoch": 2.3165618448637315, + "grad_norm": 0.9644630551338196, + "learning_rate": 4.7408146857076566e-05, + "loss": 0.4075, + "num_input_tokens_seen": 2900664, + "step": 4420 + }, + { + "epoch": 2.319182389937107, + "grad_norm": 0.878553032875061, + "learning_rate": 4.739799765324121e-05, + "loss": 0.4598, + "num_input_tokens_seen": 2904152, + "step": 4425 + }, + { + "epoch": 2.3218029350104823, + "grad_norm": 0.8255696296691895, + "learning_rate": 4.738782970778452e-05, + "loss": 0.4242, + "num_input_tokens_seen": 2906776, + "step": 4430 + }, + { + "epoch": 2.3244234800838575, + "grad_norm": 0.727022647857666, + "learning_rate": 4.737764302921456e-05, + "loss": 0.4306, + "num_input_tokens_seen": 2909720, + "step": 4435 + }, + { + "epoch": 2.3270440251572326, + "grad_norm": 1.345492959022522, + "learning_rate": 4.7367437626055087e-05, + "loss": 0.303, + "num_input_tokens_seen": 2916536, + "step": 4440 + }, + { + "epoch": 2.329664570230608, + "grad_norm": 1.5187095403671265, + "learning_rate": 4.735721350684551e-05, + "loss": 0.4894, + "num_input_tokens_seen": 2919608, + "step": 4445 + }, + { + "epoch": 2.3322851153039834, + "grad_norm": 0.8001366853713989, + "learning_rate": 4.734697068014091e-05, + "loss": 0.5374, + "num_input_tokens_seen": 2923352, + "step": 4450 + }, + { + "epoch": 2.3349056603773586, + "grad_norm": 1.3480087518692017, + "learning_rate": 4.733670915451202e-05, + "loss": 0.3991, + "num_input_tokens_seen": 2926232, + "step": 4455 + }, + { + "epoch": 2.3375262054507338, + "grad_norm": 0.8997186422348022, + "learning_rate": 4.732642893854519e-05, + "loss": 0.4004, + "num_input_tokens_seen": 2929656, + "step": 4460 + }, + { + "epoch": 2.340146750524109, + "grad_norm": 0.8312307000160217, + "learning_rate": 4.7316130040842466e-05, + "loss": 0.494, + "num_input_tokens_seen": 2932344, + "step": 4465 + }, + { + "epoch": 2.342767295597484, + "grad_norm": 0.9825847744941711, + "learning_rate": 4.730581247002148e-05, + "loss": 0.3508, + "num_input_tokens_seen": 2936152, + "step": 4470 + }, + { + "epoch": 2.3453878406708597, + "grad_norm": 0.8222494125366211, + "learning_rate": 4.7295476234715516e-05, + "loss": 0.3788, + "num_input_tokens_seen": 2939224, + "step": 4475 + }, + { + "epoch": 2.348008385744235, + "grad_norm": 0.8447177410125732, + "learning_rate": 4.728512134357345e-05, + "loss": 0.3473, + "num_input_tokens_seen": 2941912, + "step": 4480 + }, + { + "epoch": 2.35062893081761, + "grad_norm": 0.8459510803222656, + "learning_rate": 4.727474780525979e-05, + "loss": 0.3614, + "num_input_tokens_seen": 2944568, + "step": 4485 + }, + { + "epoch": 2.353249475890985, + "grad_norm": 0.7941240668296814, + "learning_rate": 4.7264355628454636e-05, + "loss": 0.3309, + "num_input_tokens_seen": 2947480, + "step": 4490 + }, + { + "epoch": 2.3558700209643604, + "grad_norm": 1.2616897821426392, + "learning_rate": 4.7253944821853685e-05, + "loss": 0.4467, + "num_input_tokens_seen": 2951192, + "step": 4495 + }, + { + "epoch": 2.358490566037736, + "grad_norm": 1.0420795679092407, + "learning_rate": 4.724351539416822e-05, + "loss": 0.419, + "num_input_tokens_seen": 2954648, + "step": 4500 + }, + { + "epoch": 2.361111111111111, + "grad_norm": 0.5916221141815186, + "learning_rate": 4.7233067354125125e-05, + "loss": 0.3444, + "num_input_tokens_seen": 2958552, + "step": 4505 + }, + { + "epoch": 2.3637316561844863, + "grad_norm": 0.7333043217658997, + "learning_rate": 4.722260071046683e-05, + "loss": 0.3493, + "num_input_tokens_seen": 2962040, + "step": 4510 + }, + { + "epoch": 2.3663522012578615, + "grad_norm": 0.7491455674171448, + "learning_rate": 4.721211547195136e-05, + "loss": 0.4203, + "num_input_tokens_seen": 2964824, + "step": 4515 + }, + { + "epoch": 2.368972746331237, + "grad_norm": 0.6295095086097717, + "learning_rate": 4.7201611647352264e-05, + "loss": 0.391, + "num_input_tokens_seen": 2967448, + "step": 4520 + }, + { + "epoch": 2.3715932914046123, + "grad_norm": 0.9054363369941711, + "learning_rate": 4.719108924545866e-05, + "loss": 0.4928, + "num_input_tokens_seen": 2971000, + "step": 4525 + }, + { + "epoch": 2.3742138364779874, + "grad_norm": 2.005793333053589, + "learning_rate": 4.718054827507524e-05, + "loss": 0.4253, + "num_input_tokens_seen": 2973592, + "step": 4530 + }, + { + "epoch": 2.3768343815513626, + "grad_norm": 0.8807366490364075, + "learning_rate": 4.716998874502218e-05, + "loss": 0.5002, + "num_input_tokens_seen": 2977560, + "step": 4535 + }, + { + "epoch": 2.379454926624738, + "grad_norm": 0.8950674533843994, + "learning_rate": 4.7159410664135225e-05, + "loss": 0.4149, + "num_input_tokens_seen": 2980280, + "step": 4540 + }, + { + "epoch": 2.3820754716981134, + "grad_norm": 0.9782240390777588, + "learning_rate": 4.714881404126563e-05, + "loss": 0.3876, + "num_input_tokens_seen": 2983352, + "step": 4545 + }, + { + "epoch": 2.3846960167714886, + "grad_norm": 0.8232694268226624, + "learning_rate": 4.713819888528016e-05, + "loss": 0.4724, + "num_input_tokens_seen": 2986648, + "step": 4550 + }, + { + "epoch": 2.3873165618448637, + "grad_norm": 1.2366597652435303, + "learning_rate": 4.7127565205061096e-05, + "loss": 0.3994, + "num_input_tokens_seen": 2989880, + "step": 4555 + }, + { + "epoch": 2.389937106918239, + "grad_norm": 1.158815622329712, + "learning_rate": 4.711691300950622e-05, + "loss": 0.3987, + "num_input_tokens_seen": 2992536, + "step": 4560 + }, + { + "epoch": 2.392557651991614, + "grad_norm": 1.658149242401123, + "learning_rate": 4.710624230752879e-05, + "loss": 0.4405, + "num_input_tokens_seen": 2996152, + "step": 4565 + }, + { + "epoch": 2.3951781970649897, + "grad_norm": 2.1598942279815674, + "learning_rate": 4.709555310805758e-05, + "loss": 0.4062, + "num_input_tokens_seen": 2999288, + "step": 4570 + }, + { + "epoch": 2.397798742138365, + "grad_norm": 1.0635714530944824, + "learning_rate": 4.7084845420036805e-05, + "loss": 0.3203, + "num_input_tokens_seen": 3002520, + "step": 4575 + }, + { + "epoch": 2.40041928721174, + "grad_norm": 0.9896450042724609, + "learning_rate": 4.7074119252426175e-05, + "loss": 0.4991, + "num_input_tokens_seen": 3005752, + "step": 4580 + }, + { + "epoch": 2.403039832285115, + "grad_norm": 0.9332050085067749, + "learning_rate": 4.7063374614200866e-05, + "loss": 0.432, + "num_input_tokens_seen": 3009432, + "step": 4585 + }, + { + "epoch": 2.4056603773584904, + "grad_norm": 1.3100768327713013, + "learning_rate": 4.7052611514351495e-05, + "loss": 0.4598, + "num_input_tokens_seen": 3014136, + "step": 4590 + }, + { + "epoch": 2.408280922431866, + "grad_norm": 0.7949943542480469, + "learning_rate": 4.704182996188413e-05, + "loss": 0.4527, + "num_input_tokens_seen": 3017144, + "step": 4595 + }, + { + "epoch": 2.410901467505241, + "grad_norm": 1.7680968046188354, + "learning_rate": 4.703102996582028e-05, + "loss": 0.4654, + "num_input_tokens_seen": 3019448, + "step": 4600 + }, + { + "epoch": 2.4135220125786163, + "grad_norm": 0.8493803143501282, + "learning_rate": 4.70202115351969e-05, + "loss": 0.3272, + "num_input_tokens_seen": 3022104, + "step": 4605 + }, + { + "epoch": 2.4161425576519915, + "grad_norm": 1.3049858808517456, + "learning_rate": 4.700937467906634e-05, + "loss": 0.458, + "num_input_tokens_seen": 3025688, + "step": 4610 + }, + { + "epoch": 2.418763102725367, + "grad_norm": 0.8336288928985596, + "learning_rate": 4.69985194064964e-05, + "loss": 0.305, + "num_input_tokens_seen": 3029816, + "step": 4615 + }, + { + "epoch": 2.4213836477987423, + "grad_norm": 0.5758680701255798, + "learning_rate": 4.698764572657029e-05, + "loss": 0.3019, + "num_input_tokens_seen": 3032824, + "step": 4620 + }, + { + "epoch": 2.4240041928721174, + "grad_norm": 0.9816603064537048, + "learning_rate": 4.697675364838657e-05, + "loss": 0.5333, + "num_input_tokens_seen": 3036632, + "step": 4625 + }, + { + "epoch": 2.4266247379454926, + "grad_norm": 1.1536612510681152, + "learning_rate": 4.6965843181059264e-05, + "loss": 0.4518, + "num_input_tokens_seen": 3039384, + "step": 4630 + }, + { + "epoch": 2.4292452830188678, + "grad_norm": 0.7009607553482056, + "learning_rate": 4.695491433371774e-05, + "loss": 0.4233, + "num_input_tokens_seen": 3042296, + "step": 4635 + }, + { + "epoch": 2.431865828092243, + "grad_norm": 0.9431701898574829, + "learning_rate": 4.694396711550676e-05, + "loss": 0.3858, + "num_input_tokens_seen": 3045304, + "step": 4640 + }, + { + "epoch": 2.4344863731656186, + "grad_norm": 1.1311391592025757, + "learning_rate": 4.693300153558646e-05, + "loss": 0.3407, + "num_input_tokens_seen": 3048824, + "step": 4645 + }, + { + "epoch": 2.4371069182389937, + "grad_norm": 0.7785087823867798, + "learning_rate": 4.692201760313233e-05, + "loss": 0.4553, + "num_input_tokens_seen": 3052120, + "step": 4650 + }, + { + "epoch": 2.439727463312369, + "grad_norm": 0.7809030413627625, + "learning_rate": 4.691101532733524e-05, + "loss": 0.4155, + "num_input_tokens_seen": 3055384, + "step": 4655 + }, + { + "epoch": 2.442348008385744, + "grad_norm": 0.6369892358779907, + "learning_rate": 4.689999471740137e-05, + "loss": 0.3298, + "num_input_tokens_seen": 3058328, + "step": 4660 + }, + { + "epoch": 2.4449685534591197, + "grad_norm": 0.6371482014656067, + "learning_rate": 4.6888955782552274e-05, + "loss": 0.4453, + "num_input_tokens_seen": 3063512, + "step": 4665 + }, + { + "epoch": 2.447589098532495, + "grad_norm": 0.7082908153533936, + "learning_rate": 4.6877898532024825e-05, + "loss": 0.4549, + "num_input_tokens_seen": 3068056, + "step": 4670 + }, + { + "epoch": 2.45020964360587, + "grad_norm": 0.8254973292350769, + "learning_rate": 4.686682297507123e-05, + "loss": 0.3659, + "num_input_tokens_seen": 3071608, + "step": 4675 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 1.4239747524261475, + "learning_rate": 4.6855729120959e-05, + "loss": 0.4579, + "num_input_tokens_seen": 3074872, + "step": 4680 + }, + { + "epoch": 2.4554507337526204, + "grad_norm": 1.2840948104858398, + "learning_rate": 4.684461697897098e-05, + "loss": 0.3546, + "num_input_tokens_seen": 3077720, + "step": 4685 + }, + { + "epoch": 2.458071278825996, + "grad_norm": 1.1082954406738281, + "learning_rate": 4.683348655840529e-05, + "loss": 0.5079, + "num_input_tokens_seen": 3080664, + "step": 4690 + }, + { + "epoch": 2.460691823899371, + "grad_norm": 1.150654911994934, + "learning_rate": 4.682233786857536e-05, + "loss": 0.417, + "num_input_tokens_seen": 3084408, + "step": 4695 + }, + { + "epoch": 2.4633123689727463, + "grad_norm": 0.9058507084846497, + "learning_rate": 4.681117091880991e-05, + "loss": 0.4581, + "num_input_tokens_seen": 3087640, + "step": 4700 + }, + { + "epoch": 2.4659329140461215, + "grad_norm": 0.9406165480613708, + "learning_rate": 4.679998571845293e-05, + "loss": 0.4747, + "num_input_tokens_seen": 3090520, + "step": 4705 + }, + { + "epoch": 2.468553459119497, + "grad_norm": 0.7005019783973694, + "learning_rate": 4.678878227686368e-05, + "loss": 0.3793, + "num_input_tokens_seen": 3093624, + "step": 4710 + }, + { + "epoch": 2.4711740041928723, + "grad_norm": 0.708203911781311, + "learning_rate": 4.677756060341669e-05, + "loss": 0.3913, + "num_input_tokens_seen": 3096184, + "step": 4715 + }, + { + "epoch": 2.4737945492662474, + "grad_norm": 1.4799072742462158, + "learning_rate": 4.676632070750175e-05, + "loss": 0.5135, + "num_input_tokens_seen": 3099928, + "step": 4720 + }, + { + "epoch": 2.4764150943396226, + "grad_norm": 1.0895200967788696, + "learning_rate": 4.6755062598523894e-05, + "loss": 0.4003, + "num_input_tokens_seen": 3102456, + "step": 4725 + }, + { + "epoch": 2.4790356394129978, + "grad_norm": 2.277808427810669, + "learning_rate": 4.674378628590338e-05, + "loss": 0.4624, + "num_input_tokens_seen": 3104728, + "step": 4730 + }, + { + "epoch": 2.481656184486373, + "grad_norm": 1.6635130643844604, + "learning_rate": 4.673249177907571e-05, + "loss": 0.3703, + "num_input_tokens_seen": 3107704, + "step": 4735 + }, + { + "epoch": 2.4842767295597485, + "grad_norm": 0.7697907090187073, + "learning_rate": 4.672117908749164e-05, + "loss": 0.3649, + "num_input_tokens_seen": 3110328, + "step": 4740 + }, + { + "epoch": 2.4868972746331237, + "grad_norm": 0.8594107031822205, + "learning_rate": 4.670984822061708e-05, + "loss": 0.4487, + "num_input_tokens_seen": 3113464, + "step": 4745 + }, + { + "epoch": 2.489517819706499, + "grad_norm": 0.744135320186615, + "learning_rate": 4.6698499187933196e-05, + "loss": 0.3745, + "num_input_tokens_seen": 3117112, + "step": 4750 + }, + { + "epoch": 2.492138364779874, + "grad_norm": 1.1509106159210205, + "learning_rate": 4.668713199893635e-05, + "loss": 0.3732, + "num_input_tokens_seen": 3120280, + "step": 4755 + }, + { + "epoch": 2.4947589098532497, + "grad_norm": 0.7417581081390381, + "learning_rate": 4.6675746663138066e-05, + "loss": 0.4083, + "num_input_tokens_seen": 3123928, + "step": 4760 + }, + { + "epoch": 2.497379454926625, + "grad_norm": 0.8908089399337769, + "learning_rate": 4.666434319006508e-05, + "loss": 0.3568, + "num_input_tokens_seen": 3126584, + "step": 4765 + }, + { + "epoch": 2.5, + "grad_norm": 1.2597172260284424, + "learning_rate": 4.66529215892593e-05, + "loss": 0.5189, + "num_input_tokens_seen": 3129368, + "step": 4770 + }, + { + "epoch": 2.5, + "eval_loss": 0.4697624444961548, + "eval_runtime": 15.98, + "eval_samples_per_second": 53.066, + "eval_steps_per_second": 13.267, + "num_input_tokens_seen": 3129368, + "step": 4770 + }, + { + "epoch": 2.502620545073375, + "grad_norm": 1.7914645671844482, + "learning_rate": 4.664148187027781e-05, + "loss": 0.5097, + "num_input_tokens_seen": 3132248, + "step": 4775 + }, + { + "epoch": 2.5052410901467503, + "grad_norm": 0.6445972919464111, + "learning_rate": 4.663002404269283e-05, + "loss": 0.3104, + "num_input_tokens_seen": 3135320, + "step": 4780 + }, + { + "epoch": 2.507861635220126, + "grad_norm": 1.261978030204773, + "learning_rate": 4.661854811609174e-05, + "loss": 0.6311, + "num_input_tokens_seen": 3139032, + "step": 4785 + }, + { + "epoch": 2.510482180293501, + "grad_norm": 1.0026904344558716, + "learning_rate": 4.6607054100077096e-05, + "loss": 0.5538, + "num_input_tokens_seen": 3141944, + "step": 4790 + }, + { + "epoch": 2.5131027253668763, + "grad_norm": 1.547315001487732, + "learning_rate": 4.6595542004266545e-05, + "loss": 0.4122, + "num_input_tokens_seen": 3145400, + "step": 4795 + }, + { + "epoch": 2.5157232704402515, + "grad_norm": 0.6631594896316528, + "learning_rate": 4.65840118382929e-05, + "loss": 0.5022, + "num_input_tokens_seen": 3149496, + "step": 4800 + }, + { + "epoch": 2.518343815513627, + "grad_norm": 0.8879810571670532, + "learning_rate": 4.657246361180405e-05, + "loss": 0.5156, + "num_input_tokens_seen": 3152344, + "step": 4805 + }, + { + "epoch": 2.5209643605870022, + "grad_norm": 0.7576431632041931, + "learning_rate": 4.656089733446305e-05, + "loss": 0.3616, + "num_input_tokens_seen": 3155192, + "step": 4810 + }, + { + "epoch": 2.5235849056603774, + "grad_norm": 1.1050255298614502, + "learning_rate": 4.6549313015948025e-05, + "loss": 0.3354, + "num_input_tokens_seen": 3157688, + "step": 4815 + }, + { + "epoch": 2.5262054507337526, + "grad_norm": 0.5222830176353455, + "learning_rate": 4.653771066595219e-05, + "loss": 0.4087, + "num_input_tokens_seen": 3161592, + "step": 4820 + }, + { + "epoch": 2.5288259958071277, + "grad_norm": 1.5137090682983398, + "learning_rate": 4.652609029418389e-05, + "loss": 0.4053, + "num_input_tokens_seen": 3164024, + "step": 4825 + }, + { + "epoch": 2.531446540880503, + "grad_norm": 1.134620189666748, + "learning_rate": 4.65144519103665e-05, + "loss": 0.332, + "num_input_tokens_seen": 3166424, + "step": 4830 + }, + { + "epoch": 2.5340670859538785, + "grad_norm": 1.0010987520217896, + "learning_rate": 4.650279552423849e-05, + "loss": 0.5753, + "num_input_tokens_seen": 3169240, + "step": 4835 + }, + { + "epoch": 2.5366876310272537, + "grad_norm": 1.0707064867019653, + "learning_rate": 4.6491121145553386e-05, + "loss": 0.4239, + "num_input_tokens_seen": 3172088, + "step": 4840 + }, + { + "epoch": 2.539308176100629, + "grad_norm": 1.0071207284927368, + "learning_rate": 4.6479428784079796e-05, + "loss": 0.4243, + "num_input_tokens_seen": 3175480, + "step": 4845 + }, + { + "epoch": 2.541928721174004, + "grad_norm": 0.7460083961486816, + "learning_rate": 4.6467718449601326e-05, + "loss": 0.4638, + "num_input_tokens_seen": 3179480, + "step": 4850 + }, + { + "epoch": 2.5445492662473796, + "grad_norm": 0.6872434020042419, + "learning_rate": 4.645599015191667e-05, + "loss": 0.3951, + "num_input_tokens_seen": 3182520, + "step": 4855 + }, + { + "epoch": 2.547169811320755, + "grad_norm": 0.8928537368774414, + "learning_rate": 4.6444243900839525e-05, + "loss": 0.4404, + "num_input_tokens_seen": 3185560, + "step": 4860 + }, + { + "epoch": 2.54979035639413, + "grad_norm": 0.9340757727622986, + "learning_rate": 4.643247970619862e-05, + "loss": 0.4365, + "num_input_tokens_seen": 3189592, + "step": 4865 + }, + { + "epoch": 2.552410901467505, + "grad_norm": 1.9084841012954712, + "learning_rate": 4.642069757783769e-05, + "loss": 0.3769, + "num_input_tokens_seen": 3192760, + "step": 4870 + }, + { + "epoch": 2.5550314465408803, + "grad_norm": 1.3107311725616455, + "learning_rate": 4.640889752561549e-05, + "loss": 0.3533, + "num_input_tokens_seen": 3195992, + "step": 4875 + }, + { + "epoch": 2.5576519916142555, + "grad_norm": 1.1993252038955688, + "learning_rate": 4.639707955940575e-05, + "loss": 0.4332, + "num_input_tokens_seen": 3198200, + "step": 4880 + }, + { + "epoch": 2.560272536687631, + "grad_norm": 0.6327290534973145, + "learning_rate": 4.6385243689097226e-05, + "loss": 0.4102, + "num_input_tokens_seen": 3201848, + "step": 4885 + }, + { + "epoch": 2.5628930817610063, + "grad_norm": 2.253788709640503, + "learning_rate": 4.6373389924593615e-05, + "loss": 0.5817, + "num_input_tokens_seen": 3204504, + "step": 4890 + }, + { + "epoch": 2.5655136268343814, + "grad_norm": 0.9842377305030823, + "learning_rate": 4.6361518275813615e-05, + "loss": 0.5333, + "num_input_tokens_seen": 3207320, + "step": 4895 + }, + { + "epoch": 2.568134171907757, + "grad_norm": 0.8692671656608582, + "learning_rate": 4.6349628752690876e-05, + "loss": 0.3872, + "num_input_tokens_seen": 3210584, + "step": 4900 + }, + { + "epoch": 2.5707547169811322, + "grad_norm": 0.9715347290039062, + "learning_rate": 4.633772136517401e-05, + "loss": 0.3187, + "num_input_tokens_seen": 3213496, + "step": 4905 + }, + { + "epoch": 2.5733752620545074, + "grad_norm": 0.608004629611969, + "learning_rate": 4.6325796123226575e-05, + "loss": 0.4145, + "num_input_tokens_seen": 3217016, + "step": 4910 + }, + { + "epoch": 2.5759958071278826, + "grad_norm": 0.8643375635147095, + "learning_rate": 4.6313853036827057e-05, + "loss": 0.5153, + "num_input_tokens_seen": 3219800, + "step": 4915 + }, + { + "epoch": 2.5786163522012577, + "grad_norm": 1.0464848279953003, + "learning_rate": 4.630189211596891e-05, + "loss": 0.4095, + "num_input_tokens_seen": 3222424, + "step": 4920 + }, + { + "epoch": 2.581236897274633, + "grad_norm": 1.3037142753601074, + "learning_rate": 4.628991337066047e-05, + "loss": 0.4472, + "num_input_tokens_seen": 3225272, + "step": 4925 + }, + { + "epoch": 2.5838574423480085, + "grad_norm": 0.7554773092269897, + "learning_rate": 4.627791681092499e-05, + "loss": 0.3786, + "num_input_tokens_seen": 3228088, + "step": 4930 + }, + { + "epoch": 2.5864779874213837, + "grad_norm": 0.7330243587493896, + "learning_rate": 4.626590244680068e-05, + "loss": 0.4338, + "num_input_tokens_seen": 3231096, + "step": 4935 + }, + { + "epoch": 2.589098532494759, + "grad_norm": 1.355343222618103, + "learning_rate": 4.625387028834057e-05, + "loss": 0.4987, + "num_input_tokens_seen": 3234552, + "step": 4940 + }, + { + "epoch": 2.591719077568134, + "grad_norm": 1.459860920906067, + "learning_rate": 4.6241820345612654e-05, + "loss": 0.3418, + "num_input_tokens_seen": 3238072, + "step": 4945 + }, + { + "epoch": 2.5943396226415096, + "grad_norm": 1.0843544006347656, + "learning_rate": 4.622975262869976e-05, + "loss": 0.4799, + "num_input_tokens_seen": 3241624, + "step": 4950 + }, + { + "epoch": 2.596960167714885, + "grad_norm": 1.0586282014846802, + "learning_rate": 4.62176671476996e-05, + "loss": 0.6838, + "num_input_tokens_seen": 3245080, + "step": 4955 + }, + { + "epoch": 2.59958071278826, + "grad_norm": 0.8712664842605591, + "learning_rate": 4.620556391272476e-05, + "loss": 0.4619, + "num_input_tokens_seen": 3247800, + "step": 4960 + }, + { + "epoch": 2.602201257861635, + "grad_norm": 0.6452222466468811, + "learning_rate": 4.619344293390266e-05, + "loss": 0.3286, + "num_input_tokens_seen": 3251000, + "step": 4965 + }, + { + "epoch": 2.6048218029350103, + "grad_norm": 0.815735399723053, + "learning_rate": 4.61813042213756e-05, + "loss": 0.3796, + "num_input_tokens_seen": 3254872, + "step": 4970 + }, + { + "epoch": 2.6074423480083855, + "grad_norm": 0.9331057667732239, + "learning_rate": 4.6169147785300685e-05, + "loss": 0.4507, + "num_input_tokens_seen": 3258520, + "step": 4975 + }, + { + "epoch": 2.610062893081761, + "grad_norm": 0.988396167755127, + "learning_rate": 4.6156973635849864e-05, + "loss": 0.4581, + "num_input_tokens_seen": 3261944, + "step": 4980 + }, + { + "epoch": 2.6126834381551363, + "grad_norm": 0.8576576709747314, + "learning_rate": 4.614478178320993e-05, + "loss": 0.4808, + "num_input_tokens_seen": 3265432, + "step": 4985 + }, + { + "epoch": 2.6153039832285114, + "grad_norm": 1.6156798601150513, + "learning_rate": 4.613257223758245e-05, + "loss": 0.4383, + "num_input_tokens_seen": 3268632, + "step": 4990 + }, + { + "epoch": 2.617924528301887, + "grad_norm": 1.5049232244491577, + "learning_rate": 4.612034500918381e-05, + "loss": 0.3914, + "num_input_tokens_seen": 3271416, + "step": 4995 + }, + { + "epoch": 2.620545073375262, + "grad_norm": 1.0529022216796875, + "learning_rate": 4.610810010824522e-05, + "loss": 0.3119, + "num_input_tokens_seen": 3274424, + "step": 5000 + }, + { + "epoch": 2.6231656184486374, + "grad_norm": 1.1350876092910767, + "learning_rate": 4.609583754501263e-05, + "loss": 0.436, + "num_input_tokens_seen": 3277688, + "step": 5005 + }, + { + "epoch": 2.6257861635220126, + "grad_norm": 0.5732675194740295, + "learning_rate": 4.6083557329746805e-05, + "loss": 0.337, + "num_input_tokens_seen": 3280440, + "step": 5010 + }, + { + "epoch": 2.6284067085953877, + "grad_norm": 1.451996088027954, + "learning_rate": 4.607125947272326e-05, + "loss": 0.5979, + "num_input_tokens_seen": 3283704, + "step": 5015 + }, + { + "epoch": 2.631027253668763, + "grad_norm": 0.9236838221549988, + "learning_rate": 4.6058943984232286e-05, + "loss": 0.3932, + "num_input_tokens_seen": 3286808, + "step": 5020 + }, + { + "epoch": 2.6336477987421385, + "grad_norm": 0.7672876715660095, + "learning_rate": 4.604661087457893e-05, + "loss": 0.5198, + "num_input_tokens_seen": 3289592, + "step": 5025 + }, + { + "epoch": 2.6362683438155137, + "grad_norm": 0.8135759830474854, + "learning_rate": 4.6034260154082955e-05, + "loss": 0.4335, + "num_input_tokens_seen": 3292696, + "step": 5030 + }, + { + "epoch": 2.638888888888889, + "grad_norm": 0.8956961035728455, + "learning_rate": 4.602189183307889e-05, + "loss": 0.2957, + "num_input_tokens_seen": 3296632, + "step": 5035 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 1.2388856410980225, + "learning_rate": 4.600950592191599e-05, + "loss": 0.3471, + "num_input_tokens_seen": 3300536, + "step": 5040 + }, + { + "epoch": 2.6441299790356396, + "grad_norm": 0.9492870569229126, + "learning_rate": 4.599710243095819e-05, + "loss": 0.4164, + "num_input_tokens_seen": 3304728, + "step": 5045 + }, + { + "epoch": 2.646750524109015, + "grad_norm": 1.1145286560058594, + "learning_rate": 4.59846813705842e-05, + "loss": 0.4707, + "num_input_tokens_seen": 3307928, + "step": 5050 + }, + { + "epoch": 2.64937106918239, + "grad_norm": 1.0140471458435059, + "learning_rate": 4.597224275118738e-05, + "loss": 0.5214, + "num_input_tokens_seen": 3310552, + "step": 5055 + }, + { + "epoch": 2.651991614255765, + "grad_norm": 1.0148290395736694, + "learning_rate": 4.59597865831758e-05, + "loss": 0.5404, + "num_input_tokens_seen": 3314904, + "step": 5060 + }, + { + "epoch": 2.6546121593291403, + "grad_norm": 0.8564881682395935, + "learning_rate": 4.5947312876972214e-05, + "loss": 0.4096, + "num_input_tokens_seen": 3320184, + "step": 5065 + }, + { + "epoch": 2.6572327044025155, + "grad_norm": 1.4552708864212036, + "learning_rate": 4.5934821643014034e-05, + "loss": 0.3804, + "num_input_tokens_seen": 3322904, + "step": 5070 + }, + { + "epoch": 2.659853249475891, + "grad_norm": 0.8273054361343384, + "learning_rate": 4.5922312891753385e-05, + "loss": 0.3721, + "num_input_tokens_seen": 3325784, + "step": 5075 + }, + { + "epoch": 2.6624737945492662, + "grad_norm": 0.8393348455429077, + "learning_rate": 4.590978663365699e-05, + "loss": 0.475, + "num_input_tokens_seen": 3328760, + "step": 5080 + }, + { + "epoch": 2.6650943396226414, + "grad_norm": 1.255098581314087, + "learning_rate": 4.589724287920627e-05, + "loss": 0.4376, + "num_input_tokens_seen": 3331576, + "step": 5085 + }, + { + "epoch": 2.667714884696017, + "grad_norm": 1.4163281917572021, + "learning_rate": 4.5884681638897246e-05, + "loss": 0.4812, + "num_input_tokens_seen": 3333880, + "step": 5090 + }, + { + "epoch": 2.670335429769392, + "grad_norm": 1.2287194728851318, + "learning_rate": 4.587210292324061e-05, + "loss": 0.449, + "num_input_tokens_seen": 3337880, + "step": 5095 + }, + { + "epoch": 2.6729559748427674, + "grad_norm": 0.7543774247169495, + "learning_rate": 4.585950674276164e-05, + "loss": 0.4283, + "num_input_tokens_seen": 3340920, + "step": 5100 + }, + { + "epoch": 2.6755765199161425, + "grad_norm": 0.9646177887916565, + "learning_rate": 4.5846893108000256e-05, + "loss": 0.3798, + "num_input_tokens_seen": 3344312, + "step": 5105 + }, + { + "epoch": 2.6781970649895177, + "grad_norm": 0.7653923630714417, + "learning_rate": 4.5834262029510965e-05, + "loss": 0.3679, + "num_input_tokens_seen": 3347672, + "step": 5110 + }, + { + "epoch": 2.680817610062893, + "grad_norm": 0.7920419573783875, + "learning_rate": 4.5821613517862883e-05, + "loss": 0.4494, + "num_input_tokens_seen": 3351032, + "step": 5115 + }, + { + "epoch": 2.6834381551362685, + "grad_norm": 1.1617299318313599, + "learning_rate": 4.5808947583639693e-05, + "loss": 0.5746, + "num_input_tokens_seen": 3353752, + "step": 5120 + }, + { + "epoch": 2.6860587002096437, + "grad_norm": 1.0381428003311157, + "learning_rate": 4.579626423743969e-05, + "loss": 0.3813, + "num_input_tokens_seen": 3356600, + "step": 5125 + }, + { + "epoch": 2.688679245283019, + "grad_norm": 1.2138789892196655, + "learning_rate": 4.57835634898757e-05, + "loss": 0.4831, + "num_input_tokens_seen": 3360280, + "step": 5130 + }, + { + "epoch": 2.691299790356394, + "grad_norm": 0.9265775680541992, + "learning_rate": 4.577084535157514e-05, + "loss": 0.5483, + "num_input_tokens_seen": 3363768, + "step": 5135 + }, + { + "epoch": 2.6939203354297696, + "grad_norm": 0.7985010147094727, + "learning_rate": 4.5758109833179963e-05, + "loss": 0.3803, + "num_input_tokens_seen": 3367480, + "step": 5140 + }, + { + "epoch": 2.6965408805031448, + "grad_norm": 2.042046070098877, + "learning_rate": 4.5745356945346676e-05, + "loss": 0.4301, + "num_input_tokens_seen": 3370520, + "step": 5145 + }, + { + "epoch": 2.69916142557652, + "grad_norm": 1.2066019773483276, + "learning_rate": 4.57325866987463e-05, + "loss": 0.3678, + "num_input_tokens_seen": 3372984, + "step": 5150 + }, + { + "epoch": 2.701781970649895, + "grad_norm": 1.7148114442825317, + "learning_rate": 4.571979910406441e-05, + "loss": 0.423, + "num_input_tokens_seen": 3375288, + "step": 5155 + }, + { + "epoch": 2.7044025157232703, + "grad_norm": 1.0197051763534546, + "learning_rate": 4.570699417200106e-05, + "loss": 0.3281, + "num_input_tokens_seen": 3378488, + "step": 5160 + }, + { + "epoch": 2.7070230607966455, + "grad_norm": 0.8845809102058411, + "learning_rate": 4.569417191327086e-05, + "loss": 0.3689, + "num_input_tokens_seen": 3381368, + "step": 5165 + }, + { + "epoch": 2.709643605870021, + "grad_norm": 1.7204573154449463, + "learning_rate": 4.5681332338602864e-05, + "loss": 0.431, + "num_input_tokens_seen": 3385016, + "step": 5170 + }, + { + "epoch": 2.7122641509433962, + "grad_norm": 0.6478859186172485, + "learning_rate": 4.5668475458740654e-05, + "loss": 0.4384, + "num_input_tokens_seen": 3387992, + "step": 5175 + }, + { + "epoch": 2.7148846960167714, + "grad_norm": 0.7893748879432678, + "learning_rate": 4.5655601284442276e-05, + "loss": 0.4235, + "num_input_tokens_seen": 3390872, + "step": 5180 + }, + { + "epoch": 2.717505241090147, + "grad_norm": 1.2078436613082886, + "learning_rate": 4.5642709826480256e-05, + "loss": 0.4469, + "num_input_tokens_seen": 3393496, + "step": 5185 + }, + { + "epoch": 2.720125786163522, + "grad_norm": 1.1142756938934326, + "learning_rate": 4.562980109564158e-05, + "loss": 0.6299, + "num_input_tokens_seen": 3395992, + "step": 5190 + }, + { + "epoch": 2.7227463312368974, + "grad_norm": 1.149391531944275, + "learning_rate": 4.561687510272767e-05, + "loss": 0.5094, + "num_input_tokens_seen": 3399160, + "step": 5195 + }, + { + "epoch": 2.7253668763102725, + "grad_norm": 1.033388614654541, + "learning_rate": 4.5603931858554415e-05, + "loss": 0.3945, + "num_input_tokens_seen": 3402072, + "step": 5200 + }, + { + "epoch": 2.7279874213836477, + "grad_norm": 1.2362827062606812, + "learning_rate": 4.559097137395214e-05, + "loss": 0.411, + "num_input_tokens_seen": 3404344, + "step": 5205 + }, + { + "epoch": 2.730607966457023, + "grad_norm": 0.8789539933204651, + "learning_rate": 4.5577993659765574e-05, + "loss": 0.4104, + "num_input_tokens_seen": 3407640, + "step": 5210 + }, + { + "epoch": 2.7332285115303985, + "grad_norm": 1.0468686819076538, + "learning_rate": 4.556499872685387e-05, + "loss": 0.3226, + "num_input_tokens_seen": 3410200, + "step": 5215 + }, + { + "epoch": 2.7358490566037736, + "grad_norm": 0.9751201868057251, + "learning_rate": 4.555198658609061e-05, + "loss": 0.3529, + "num_input_tokens_seen": 3412856, + "step": 5220 + }, + { + "epoch": 2.738469601677149, + "grad_norm": 0.5155064463615417, + "learning_rate": 4.5538957248363756e-05, + "loss": 0.4149, + "num_input_tokens_seen": 3415864, + "step": 5225 + }, + { + "epoch": 2.741090146750524, + "grad_norm": 0.7083256840705872, + "learning_rate": 4.552591072457565e-05, + "loss": 0.4507, + "num_input_tokens_seen": 3419224, + "step": 5230 + }, + { + "epoch": 2.7437106918238996, + "grad_norm": 1.3869246244430542, + "learning_rate": 4.551284702564304e-05, + "loss": 0.3825, + "num_input_tokens_seen": 3422520, + "step": 5235 + }, + { + "epoch": 2.7463312368972748, + "grad_norm": 1.126554250717163, + "learning_rate": 4.5499766162497025e-05, + "loss": 0.4166, + "num_input_tokens_seen": 3425976, + "step": 5240 + }, + { + "epoch": 2.74895178197065, + "grad_norm": 0.8344557881355286, + "learning_rate": 4.548666814608308e-05, + "loss": 0.4293, + "num_input_tokens_seen": 3429368, + "step": 5245 + }, + { + "epoch": 2.751572327044025, + "grad_norm": 1.1329748630523682, + "learning_rate": 4.5473552987361024e-05, + "loss": 0.475, + "num_input_tokens_seen": 3434680, + "step": 5250 + }, + { + "epoch": 2.7541928721174003, + "grad_norm": 1.6217372417449951, + "learning_rate": 4.5460420697305024e-05, + "loss": 0.4998, + "num_input_tokens_seen": 3437752, + "step": 5255 + }, + { + "epoch": 2.7568134171907754, + "grad_norm": 0.6817925572395325, + "learning_rate": 4.544727128690358e-05, + "loss": 0.509, + "num_input_tokens_seen": 3440760, + "step": 5260 + }, + { + "epoch": 2.759433962264151, + "grad_norm": 1.3625164031982422, + "learning_rate": 4.543410476715951e-05, + "loss": 0.4553, + "num_input_tokens_seen": 3443736, + "step": 5265 + }, + { + "epoch": 2.762054507337526, + "grad_norm": 1.13504159450531, + "learning_rate": 4.542092114908997e-05, + "loss": 0.5376, + "num_input_tokens_seen": 3446904, + "step": 5270 + }, + { + "epoch": 2.7646750524109014, + "grad_norm": 1.1802043914794922, + "learning_rate": 4.54077204437264e-05, + "loss": 0.3632, + "num_input_tokens_seen": 3450104, + "step": 5275 + }, + { + "epoch": 2.767295597484277, + "grad_norm": 0.6979948282241821, + "learning_rate": 4.5394502662114555e-05, + "loss": 0.559, + "num_input_tokens_seen": 3453592, + "step": 5280 + }, + { + "epoch": 2.769916142557652, + "grad_norm": 1.425565481185913, + "learning_rate": 4.538126781531446e-05, + "loss": 0.4227, + "num_input_tokens_seen": 3456824, + "step": 5285 + }, + { + "epoch": 2.7725366876310273, + "grad_norm": 1.351806640625, + "learning_rate": 4.536801591440044e-05, + "loss": 0.4558, + "num_input_tokens_seen": 3460792, + "step": 5290 + }, + { + "epoch": 2.7751572327044025, + "grad_norm": 0.8546964526176453, + "learning_rate": 4.535474697046107e-05, + "loss": 0.4666, + "num_input_tokens_seen": 3464280, + "step": 5295 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 2.514061689376831, + "learning_rate": 4.534146099459921e-05, + "loss": 0.5384, + "num_input_tokens_seen": 3467320, + "step": 5300 + }, + { + "epoch": 2.780398322851153, + "grad_norm": 1.2146644592285156, + "learning_rate": 4.5328157997931955e-05, + "loss": 0.4027, + "num_input_tokens_seen": 3470520, + "step": 5305 + }, + { + "epoch": 2.7830188679245285, + "grad_norm": 1.0300226211547852, + "learning_rate": 4.531483799159062e-05, + "loss": 0.4383, + "num_input_tokens_seen": 3474008, + "step": 5310 + }, + { + "epoch": 2.7856394129979036, + "grad_norm": 1.1793044805526733, + "learning_rate": 4.5301500986720816e-05, + "loss": 0.376, + "num_input_tokens_seen": 3477208, + "step": 5315 + }, + { + "epoch": 2.788259958071279, + "grad_norm": 1.1531060934066772, + "learning_rate": 4.528814699448232e-05, + "loss": 0.4393, + "num_input_tokens_seen": 3480088, + "step": 5320 + }, + { + "epoch": 2.790880503144654, + "grad_norm": 0.627663254737854, + "learning_rate": 4.527477602604914e-05, + "loss": 0.4131, + "num_input_tokens_seen": 3484504, + "step": 5325 + }, + { + "epoch": 2.7935010482180296, + "grad_norm": 1.1334360837936401, + "learning_rate": 4.52613880926095e-05, + "loss": 0.3852, + "num_input_tokens_seen": 3487000, + "step": 5330 + }, + { + "epoch": 2.7961215932914047, + "grad_norm": 0.9032422304153442, + "learning_rate": 4.5247983205365806e-05, + "loss": 0.3696, + "num_input_tokens_seen": 3490712, + "step": 5335 + }, + { + "epoch": 2.79874213836478, + "grad_norm": 0.8761667609214783, + "learning_rate": 4.5234561375534655e-05, + "loss": 0.4388, + "num_input_tokens_seen": 3494296, + "step": 5340 + }, + { + "epoch": 2.801362683438155, + "grad_norm": 0.7325816750526428, + "learning_rate": 4.5221122614346823e-05, + "loss": 0.375, + "num_input_tokens_seen": 3498232, + "step": 5345 + }, + { + "epoch": 2.8039832285115303, + "grad_norm": 1.7755851745605469, + "learning_rate": 4.520766693304725e-05, + "loss": 0.4091, + "num_input_tokens_seen": 3501080, + "step": 5350 + }, + { + "epoch": 2.8066037735849054, + "grad_norm": 1.083936333656311, + "learning_rate": 4.519419434289504e-05, + "loss": 0.3385, + "num_input_tokens_seen": 3504024, + "step": 5355 + }, + { + "epoch": 2.809224318658281, + "grad_norm": 0.8813467621803284, + "learning_rate": 4.518070485516344e-05, + "loss": 0.3397, + "num_input_tokens_seen": 3507256, + "step": 5360 + }, + { + "epoch": 2.811844863731656, + "grad_norm": 0.7223554253578186, + "learning_rate": 4.5167198481139825e-05, + "loss": 0.4727, + "num_input_tokens_seen": 3510392, + "step": 5365 + }, + { + "epoch": 2.8144654088050314, + "grad_norm": 1.0303467512130737, + "learning_rate": 4.515367523212573e-05, + "loss": 0.4596, + "num_input_tokens_seen": 3513272, + "step": 5370 + }, + { + "epoch": 2.8170859538784065, + "grad_norm": 0.7046856880187988, + "learning_rate": 4.5140135119436776e-05, + "loss": 0.435, + "num_input_tokens_seen": 3516312, + "step": 5375 + }, + { + "epoch": 2.819706498951782, + "grad_norm": 1.2329988479614258, + "learning_rate": 4.512657815440273e-05, + "loss": 0.4572, + "num_input_tokens_seen": 3519032, + "step": 5380 + }, + { + "epoch": 2.8223270440251573, + "grad_norm": 1.1929303407669067, + "learning_rate": 4.511300434836743e-05, + "loss": 0.3224, + "num_input_tokens_seen": 3521720, + "step": 5385 + }, + { + "epoch": 2.8249475890985325, + "grad_norm": 1.1969496011734009, + "learning_rate": 4.5099413712688805e-05, + "loss": 0.3814, + "num_input_tokens_seen": 3524888, + "step": 5390 + }, + { + "epoch": 2.8275681341719077, + "grad_norm": 0.5228673219680786, + "learning_rate": 4.5085806258738896e-05, + "loss": 0.3597, + "num_input_tokens_seen": 3529688, + "step": 5395 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 0.8462367653846741, + "learning_rate": 4.507218199790379e-05, + "loss": 0.4518, + "num_input_tokens_seen": 3532440, + "step": 5400 + }, + { + "epoch": 2.832809224318658, + "grad_norm": 0.9127882122993469, + "learning_rate": 4.505854094158365e-05, + "loss": 0.4152, + "num_input_tokens_seen": 3535288, + "step": 5405 + }, + { + "epoch": 2.8354297693920336, + "grad_norm": 1.1756995916366577, + "learning_rate": 4.5044883101192695e-05, + "loss": 0.4479, + "num_input_tokens_seen": 3538840, + "step": 5410 + }, + { + "epoch": 2.838050314465409, + "grad_norm": 0.674506664276123, + "learning_rate": 4.503120848815916e-05, + "loss": 0.3161, + "num_input_tokens_seen": 3541880, + "step": 5415 + }, + { + "epoch": 2.840670859538784, + "grad_norm": 0.8888957500457764, + "learning_rate": 4.501751711392536e-05, + "loss": 0.4619, + "num_input_tokens_seen": 3545208, + "step": 5420 + }, + { + "epoch": 2.8432914046121596, + "grad_norm": 1.3335840702056885, + "learning_rate": 4.5003808989947605e-05, + "loss": 0.3666, + "num_input_tokens_seen": 3548120, + "step": 5425 + }, + { + "epoch": 2.8459119496855347, + "grad_norm": 0.7455176115036011, + "learning_rate": 4.499008412769622e-05, + "loss": 0.4219, + "num_input_tokens_seen": 3551512, + "step": 5430 + }, + { + "epoch": 2.84853249475891, + "grad_norm": 1.7777892351150513, + "learning_rate": 4.4976342538655546e-05, + "loss": 0.3556, + "num_input_tokens_seen": 3554520, + "step": 5435 + }, + { + "epoch": 2.851153039832285, + "grad_norm": 0.9051198959350586, + "learning_rate": 4.4962584234323925e-05, + "loss": 0.4774, + "num_input_tokens_seen": 3557208, + "step": 5440 + }, + { + "epoch": 2.8537735849056602, + "grad_norm": 1.6951491832733154, + "learning_rate": 4.4948809226213664e-05, + "loss": 0.5687, + "num_input_tokens_seen": 3560600, + "step": 5445 + }, + { + "epoch": 2.8563941299790354, + "grad_norm": 0.5941191911697388, + "learning_rate": 4.4935017525851067e-05, + "loss": 0.3941, + "num_input_tokens_seen": 3565752, + "step": 5450 + }, + { + "epoch": 2.859014675052411, + "grad_norm": 0.7911731600761414, + "learning_rate": 4.4921209144776414e-05, + "loss": 0.3528, + "num_input_tokens_seen": 3569400, + "step": 5455 + }, + { + "epoch": 2.861635220125786, + "grad_norm": 1.376853346824646, + "learning_rate": 4.490738409454389e-05, + "loss": 0.3868, + "num_input_tokens_seen": 3572088, + "step": 5460 + }, + { + "epoch": 2.8642557651991614, + "grad_norm": 0.6740906834602356, + "learning_rate": 4.48935423867217e-05, + "loss": 0.5119, + "num_input_tokens_seen": 3575032, + "step": 5465 + }, + { + "epoch": 2.8668763102725365, + "grad_norm": 1.1250858306884766, + "learning_rate": 4.487968403289195e-05, + "loss": 0.4023, + "num_input_tokens_seen": 3578232, + "step": 5470 + }, + { + "epoch": 2.869496855345912, + "grad_norm": 0.8367806077003479, + "learning_rate": 4.4865809044650655e-05, + "loss": 0.4428, + "num_input_tokens_seen": 3581304, + "step": 5475 + }, + { + "epoch": 2.8721174004192873, + "grad_norm": 0.8115038871765137, + "learning_rate": 4.48519174336078e-05, + "loss": 0.3382, + "num_input_tokens_seen": 3584664, + "step": 5480 + }, + { + "epoch": 2.8747379454926625, + "grad_norm": 0.6509210467338562, + "learning_rate": 4.483800921138722e-05, + "loss": 0.4973, + "num_input_tokens_seen": 3587544, + "step": 5485 + }, + { + "epoch": 2.8773584905660377, + "grad_norm": 1.1909575462341309, + "learning_rate": 4.4824084389626705e-05, + "loss": 0.4474, + "num_input_tokens_seen": 3590360, + "step": 5490 + }, + { + "epoch": 2.879979035639413, + "grad_norm": 0.8834188580513, + "learning_rate": 4.48101429799779e-05, + "loss": 0.3637, + "num_input_tokens_seen": 3597208, + "step": 5495 + }, + { + "epoch": 2.882599580712788, + "grad_norm": 1.4132260084152222, + "learning_rate": 4.479618499410634e-05, + "loss": 0.3465, + "num_input_tokens_seen": 3600568, + "step": 5500 + }, + { + "epoch": 2.8852201257861636, + "grad_norm": 2.2884161472320557, + "learning_rate": 4.478221044369143e-05, + "loss": 0.4004, + "num_input_tokens_seen": 3603480, + "step": 5505 + }, + { + "epoch": 2.8878406708595388, + "grad_norm": 0.8631753325462341, + "learning_rate": 4.476821934042644e-05, + "loss": 0.4738, + "num_input_tokens_seen": 3606360, + "step": 5510 + }, + { + "epoch": 2.890461215932914, + "grad_norm": 1.5074541568756104, + "learning_rate": 4.4754211696018475e-05, + "loss": 0.3806, + "num_input_tokens_seen": 3608856, + "step": 5515 + }, + { + "epoch": 2.8930817610062896, + "grad_norm": 0.7334145307540894, + "learning_rate": 4.47401875221885e-05, + "loss": 0.3446, + "num_input_tokens_seen": 3611384, + "step": 5520 + }, + { + "epoch": 2.8957023060796647, + "grad_norm": 0.9450438022613525, + "learning_rate": 4.4726146830671304e-05, + "loss": 0.6477, + "num_input_tokens_seen": 3614488, + "step": 5525 + }, + { + "epoch": 2.89832285115304, + "grad_norm": 1.1054750680923462, + "learning_rate": 4.47120896332155e-05, + "loss": 0.3974, + "num_input_tokens_seen": 3617080, + "step": 5530 + }, + { + "epoch": 2.900943396226415, + "grad_norm": 1.348459243774414, + "learning_rate": 4.4698015941583494e-05, + "loss": 0.5244, + "num_input_tokens_seen": 3620728, + "step": 5535 + }, + { + "epoch": 2.9035639412997902, + "grad_norm": 0.9116919040679932, + "learning_rate": 4.4683925767551525e-05, + "loss": 0.4694, + "num_input_tokens_seen": 3624184, + "step": 5540 + }, + { + "epoch": 2.9061844863731654, + "grad_norm": 0.562655508518219, + "learning_rate": 4.466981912290959e-05, + "loss": 0.3712, + "num_input_tokens_seen": 3628856, + "step": 5545 + }, + { + "epoch": 2.908805031446541, + "grad_norm": 0.667197585105896, + "learning_rate": 4.46556960194615e-05, + "loss": 0.4829, + "num_input_tokens_seen": 3631800, + "step": 5550 + }, + { + "epoch": 2.911425576519916, + "grad_norm": 1.7877581119537354, + "learning_rate": 4.464155646902482e-05, + "loss": 0.4115, + "num_input_tokens_seen": 3634680, + "step": 5555 + }, + { + "epoch": 2.9140461215932913, + "grad_norm": 0.8164344429969788, + "learning_rate": 4.462740048343087e-05, + "loss": 0.3559, + "num_input_tokens_seen": 3638264, + "step": 5560 + }, + { + "epoch": 2.9166666666666665, + "grad_norm": 1.0347782373428345, + "learning_rate": 4.461322807452475e-05, + "loss": 0.3702, + "num_input_tokens_seen": 3641240, + "step": 5565 + }, + { + "epoch": 2.919287211740042, + "grad_norm": 0.6840627789497375, + "learning_rate": 4.4599039254165264e-05, + "loss": 0.4652, + "num_input_tokens_seen": 3643640, + "step": 5570 + }, + { + "epoch": 2.9219077568134173, + "grad_norm": 0.6843350529670715, + "learning_rate": 4.458483403422498e-05, + "loss": 0.4222, + "num_input_tokens_seen": 3648088, + "step": 5575 + }, + { + "epoch": 2.9245283018867925, + "grad_norm": 0.6178821325302124, + "learning_rate": 4.457061242659018e-05, + "loss": 0.3876, + "num_input_tokens_seen": 3651352, + "step": 5580 + }, + { + "epoch": 2.9271488469601676, + "grad_norm": 0.7887333631515503, + "learning_rate": 4.455637444316085e-05, + "loss": 0.4513, + "num_input_tokens_seen": 3655288, + "step": 5585 + }, + { + "epoch": 2.929769392033543, + "grad_norm": 1.0248557329177856, + "learning_rate": 4.454212009585068e-05, + "loss": 0.3978, + "num_input_tokens_seen": 3657624, + "step": 5590 + }, + { + "epoch": 2.932389937106918, + "grad_norm": 1.2831767797470093, + "learning_rate": 4.4527849396587065e-05, + "loss": 0.3434, + "num_input_tokens_seen": 3660472, + "step": 5595 + }, + { + "epoch": 2.9350104821802936, + "grad_norm": 0.8872580528259277, + "learning_rate": 4.4513562357311074e-05, + "loss": 0.4865, + "num_input_tokens_seen": 3663992, + "step": 5600 + }, + { + "epoch": 2.9376310272536688, + "grad_norm": 1.1587947607040405, + "learning_rate": 4.449925898997744e-05, + "loss": 0.4447, + "num_input_tokens_seen": 3666424, + "step": 5605 + }, + { + "epoch": 2.940251572327044, + "grad_norm": 0.9038293957710266, + "learning_rate": 4.4484939306554585e-05, + "loss": 0.4845, + "num_input_tokens_seen": 3669400, + "step": 5610 + }, + { + "epoch": 2.9428721174004195, + "grad_norm": 0.9431254267692566, + "learning_rate": 4.4470603319024554e-05, + "loss": 0.5637, + "num_input_tokens_seen": 3672440, + "step": 5615 + }, + { + "epoch": 2.9454926624737947, + "grad_norm": 1.5376920700073242, + "learning_rate": 4.445625103938304e-05, + "loss": 0.3218, + "num_input_tokens_seen": 3675704, + "step": 5620 + }, + { + "epoch": 2.94811320754717, + "grad_norm": 1.2780061960220337, + "learning_rate": 4.4441882479639375e-05, + "loss": 0.5517, + "num_input_tokens_seen": 3679160, + "step": 5625 + }, + { + "epoch": 2.950733752620545, + "grad_norm": 0.8423174023628235, + "learning_rate": 4.442749765181653e-05, + "loss": 0.4055, + "num_input_tokens_seen": 3682328, + "step": 5630 + }, + { + "epoch": 2.95335429769392, + "grad_norm": 0.9989615678787231, + "learning_rate": 4.441309656795106e-05, + "loss": 0.3968, + "num_input_tokens_seen": 3685240, + "step": 5635 + }, + { + "epoch": 2.9559748427672954, + "grad_norm": 1.0722618103027344, + "learning_rate": 4.4398679240093144e-05, + "loss": 0.556, + "num_input_tokens_seen": 3687960, + "step": 5640 + }, + { + "epoch": 2.958595387840671, + "grad_norm": 0.7590206265449524, + "learning_rate": 4.438424568030652e-05, + "loss": 0.3472, + "num_input_tokens_seen": 3691704, + "step": 5645 + }, + { + "epoch": 2.961215932914046, + "grad_norm": 1.3052700757980347, + "learning_rate": 4.436979590066857e-05, + "loss": 0.465, + "num_input_tokens_seen": 3695416, + "step": 5650 + }, + { + "epoch": 2.9638364779874213, + "grad_norm": 1.1633963584899902, + "learning_rate": 4.435532991327017e-05, + "loss": 0.4066, + "num_input_tokens_seen": 3698168, + "step": 5655 + }, + { + "epoch": 2.9664570230607965, + "grad_norm": 1.5827629566192627, + "learning_rate": 4.434084773021582e-05, + "loss": 0.549, + "num_input_tokens_seen": 3700984, + "step": 5660 + }, + { + "epoch": 2.969077568134172, + "grad_norm": 1.0102607011795044, + "learning_rate": 4.432634936362354e-05, + "loss": 0.5935, + "num_input_tokens_seen": 3704472, + "step": 5665 + }, + { + "epoch": 2.9716981132075473, + "grad_norm": 0.9235026240348816, + "learning_rate": 4.431183482562491e-05, + "loss": 0.3457, + "num_input_tokens_seen": 3707960, + "step": 5670 + }, + { + "epoch": 2.9743186582809225, + "grad_norm": 1.2358946800231934, + "learning_rate": 4.429730412836503e-05, + "loss": 0.4454, + "num_input_tokens_seen": 3710520, + "step": 5675 + }, + { + "epoch": 2.9769392033542976, + "grad_norm": 0.9773744344711304, + "learning_rate": 4.4282757284002515e-05, + "loss": 0.4402, + "num_input_tokens_seen": 3713688, + "step": 5680 + }, + { + "epoch": 2.979559748427673, + "grad_norm": 0.8835509419441223, + "learning_rate": 4.426819430470951e-05, + "loss": 0.4046, + "num_input_tokens_seen": 3717304, + "step": 5685 + }, + { + "epoch": 2.982180293501048, + "grad_norm": 0.8533018231391907, + "learning_rate": 4.425361520267165e-05, + "loss": 0.4856, + "num_input_tokens_seen": 3720888, + "step": 5690 + }, + { + "epoch": 2.9848008385744236, + "grad_norm": 0.9646098613739014, + "learning_rate": 4.423901999008805e-05, + "loss": 0.4586, + "num_input_tokens_seen": 3724888, + "step": 5695 + }, + { + "epoch": 2.9874213836477987, + "grad_norm": 1.201683521270752, + "learning_rate": 4.4224408679171324e-05, + "loss": 0.3517, + "num_input_tokens_seen": 3727288, + "step": 5700 + }, + { + "epoch": 2.990041928721174, + "grad_norm": 1.326709270477295, + "learning_rate": 4.4209781282147555e-05, + "loss": 0.4735, + "num_input_tokens_seen": 3730328, + "step": 5705 + }, + { + "epoch": 2.9926624737945495, + "grad_norm": 1.1086126565933228, + "learning_rate": 4.419513781125628e-05, + "loss": 0.3733, + "num_input_tokens_seen": 3733912, + "step": 5710 + }, + { + "epoch": 2.9952830188679247, + "grad_norm": 1.023452639579773, + "learning_rate": 4.418047827875048e-05, + "loss": 0.405, + "num_input_tokens_seen": 3737464, + "step": 5715 + }, + { + "epoch": 2.9979035639413, + "grad_norm": 0.7127342820167542, + "learning_rate": 4.416580269689658e-05, + "loss": 0.3859, + "num_input_tokens_seen": 3740152, + "step": 5720 + }, + { + "epoch": 3.0, + "eval_loss": 0.46909987926483154, + "eval_runtime": 16.0027, + "eval_samples_per_second": 52.991, + "eval_steps_per_second": 13.248, + "num_input_tokens_seen": 3742552, + "step": 5724 + }, + { + "epoch": 3.000524109014675, + "grad_norm": 1.469266653060913, + "learning_rate": 4.415111107797445e-05, + "loss": 0.3431, + "num_input_tokens_seen": 3742968, + "step": 5725 + }, + { + "epoch": 3.00314465408805, + "grad_norm": 0.7359603047370911, + "learning_rate": 4.4136403434277364e-05, + "loss": 0.4105, + "num_input_tokens_seen": 3745880, + "step": 5730 + }, + { + "epoch": 3.0057651991614254, + "grad_norm": 0.717217743396759, + "learning_rate": 4.412167977811199e-05, + "loss": 0.419, + "num_input_tokens_seen": 3749464, + "step": 5735 + }, + { + "epoch": 3.008385744234801, + "grad_norm": 1.3508707284927368, + "learning_rate": 4.4106940121798424e-05, + "loss": 0.3292, + "num_input_tokens_seen": 3752248, + "step": 5740 + }, + { + "epoch": 3.011006289308176, + "grad_norm": 1.0361217260360718, + "learning_rate": 4.409218447767013e-05, + "loss": 0.3964, + "num_input_tokens_seen": 3755480, + "step": 5745 + }, + { + "epoch": 3.0136268343815513, + "grad_norm": 0.7684920430183411, + "learning_rate": 4.4077412858073966e-05, + "loss": 0.3623, + "num_input_tokens_seen": 3758040, + "step": 5750 + }, + { + "epoch": 3.0162473794549265, + "grad_norm": 1.570131778717041, + "learning_rate": 4.406262527537014e-05, + "loss": 0.3719, + "num_input_tokens_seen": 3761368, + "step": 5755 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 1.2229608297348022, + "learning_rate": 4.404782174193223e-05, + "loss": 0.3771, + "num_input_tokens_seen": 3764824, + "step": 5760 + }, + { + "epoch": 3.0214884696016773, + "grad_norm": 0.6157572865486145, + "learning_rate": 4.403300227014716e-05, + "loss": 0.3039, + "num_input_tokens_seen": 3768056, + "step": 5765 + }, + { + "epoch": 3.0241090146750524, + "grad_norm": 1.2287287712097168, + "learning_rate": 4.4018166872415176e-05, + "loss": 0.404, + "num_input_tokens_seen": 3770168, + "step": 5770 + }, + { + "epoch": 3.0267295597484276, + "grad_norm": 0.7126141786575317, + "learning_rate": 4.4003315561149875e-05, + "loss": 0.4015, + "num_input_tokens_seen": 3773112, + "step": 5775 + }, + { + "epoch": 3.029350104821803, + "grad_norm": 0.725141167640686, + "learning_rate": 4.398844834877815e-05, + "loss": 0.4311, + "num_input_tokens_seen": 3776568, + "step": 5780 + }, + { + "epoch": 3.0319706498951784, + "grad_norm": 0.6598429679870605, + "learning_rate": 4.39735652477402e-05, + "loss": 0.5376, + "num_input_tokens_seen": 3780920, + "step": 5785 + }, + { + "epoch": 3.0345911949685536, + "grad_norm": 0.94338059425354, + "learning_rate": 4.395866627048953e-05, + "loss": 0.4373, + "num_input_tokens_seen": 3783960, + "step": 5790 + }, + { + "epoch": 3.0372117400419287, + "grad_norm": 1.0908699035644531, + "learning_rate": 4.3943751429492925e-05, + "loss": 0.346, + "num_input_tokens_seen": 3786968, + "step": 5795 + }, + { + "epoch": 3.039832285115304, + "grad_norm": 0.6315764784812927, + "learning_rate": 4.392882073723043e-05, + "loss": 0.4467, + "num_input_tokens_seen": 3790808, + "step": 5800 + }, + { + "epoch": 3.042452830188679, + "grad_norm": 0.7107840776443481, + "learning_rate": 4.391387420619539e-05, + "loss": 0.3936, + "num_input_tokens_seen": 3794264, + "step": 5805 + }, + { + "epoch": 3.0450733752620547, + "grad_norm": 0.7973026037216187, + "learning_rate": 4.389891184889435e-05, + "loss": 0.4196, + "num_input_tokens_seen": 3797496, + "step": 5810 + }, + { + "epoch": 3.04769392033543, + "grad_norm": 1.3113906383514404, + "learning_rate": 4.3883933677847154e-05, + "loss": 0.5289, + "num_input_tokens_seen": 3800888, + "step": 5815 + }, + { + "epoch": 3.050314465408805, + "grad_norm": 0.7941030263900757, + "learning_rate": 4.3868939705586844e-05, + "loss": 0.3405, + "num_input_tokens_seen": 3803512, + "step": 5820 + }, + { + "epoch": 3.05293501048218, + "grad_norm": 1.3308534622192383, + "learning_rate": 4.385392994465968e-05, + "loss": 0.4571, + "num_input_tokens_seen": 3806552, + "step": 5825 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.5111305713653564, + "learning_rate": 4.383890440762515e-05, + "loss": 0.4329, + "num_input_tokens_seen": 3810648, + "step": 5830 + }, + { + "epoch": 3.058176100628931, + "grad_norm": 0.8437191247940063, + "learning_rate": 4.3823863107055936e-05, + "loss": 0.3243, + "num_input_tokens_seen": 3813976, + "step": 5835 + }, + { + "epoch": 3.060796645702306, + "grad_norm": 1.6951078176498413, + "learning_rate": 4.380880605553792e-05, + "loss": 0.4187, + "num_input_tokens_seen": 3816824, + "step": 5840 + }, + { + "epoch": 3.0634171907756813, + "grad_norm": 0.7214998602867126, + "learning_rate": 4.3793733265670147e-05, + "loss": 0.4637, + "num_input_tokens_seen": 3820376, + "step": 5845 + }, + { + "epoch": 3.0660377358490565, + "grad_norm": 1.087815761566162, + "learning_rate": 4.3778644750064834e-05, + "loss": 0.3431, + "num_input_tokens_seen": 3823736, + "step": 5850 + }, + { + "epoch": 3.068658280922432, + "grad_norm": 1.0277661085128784, + "learning_rate": 4.376354052134738e-05, + "loss": 0.5428, + "num_input_tokens_seen": 3826104, + "step": 5855 + }, + { + "epoch": 3.0712788259958073, + "grad_norm": 1.0313982963562012, + "learning_rate": 4.374842059215629e-05, + "loss": 0.4245, + "num_input_tokens_seen": 3828632, + "step": 5860 + }, + { + "epoch": 3.0738993710691824, + "grad_norm": 0.8534420132637024, + "learning_rate": 4.373328497514325e-05, + "loss": 0.2527, + "num_input_tokens_seen": 3831576, + "step": 5865 + }, + { + "epoch": 3.0765199161425576, + "grad_norm": 0.8121978640556335, + "learning_rate": 4.371813368297304e-05, + "loss": 0.2925, + "num_input_tokens_seen": 3834392, + "step": 5870 + }, + { + "epoch": 3.0791404612159328, + "grad_norm": 1.3369191884994507, + "learning_rate": 4.370296672832358e-05, + "loss": 0.4472, + "num_input_tokens_seen": 3837144, + "step": 5875 + }, + { + "epoch": 3.0817610062893084, + "grad_norm": 0.8817345499992371, + "learning_rate": 4.3687784123885875e-05, + "loss": 0.4254, + "num_input_tokens_seen": 3840504, + "step": 5880 + }, + { + "epoch": 3.0843815513626835, + "grad_norm": 0.7065593004226685, + "learning_rate": 4.3672585882364045e-05, + "loss": 0.4294, + "num_input_tokens_seen": 3844408, + "step": 5885 + }, + { + "epoch": 3.0870020964360587, + "grad_norm": 1.2367149591445923, + "learning_rate": 4.3657372016475275e-05, + "loss": 0.4189, + "num_input_tokens_seen": 3847736, + "step": 5890 + }, + { + "epoch": 3.089622641509434, + "grad_norm": 0.6823593974113464, + "learning_rate": 4.364214253894983e-05, + "loss": 0.3906, + "num_input_tokens_seen": 3850360, + "step": 5895 + }, + { + "epoch": 3.092243186582809, + "grad_norm": 0.6672104597091675, + "learning_rate": 4.3626897462531054e-05, + "loss": 0.3751, + "num_input_tokens_seen": 3853624, + "step": 5900 + }, + { + "epoch": 3.0948637316561847, + "grad_norm": 1.0136375427246094, + "learning_rate": 4.361163679997532e-05, + "loss": 0.2881, + "num_input_tokens_seen": 3856664, + "step": 5905 + }, + { + "epoch": 3.09748427672956, + "grad_norm": 1.1904988288879395, + "learning_rate": 4.359636056405206e-05, + "loss": 0.4347, + "num_input_tokens_seen": 3859640, + "step": 5910 + }, + { + "epoch": 3.100104821802935, + "grad_norm": 1.1986950635910034, + "learning_rate": 4.3581068767543724e-05, + "loss": 0.3095, + "num_input_tokens_seen": 3862712, + "step": 5915 + }, + { + "epoch": 3.10272536687631, + "grad_norm": 0.9301473498344421, + "learning_rate": 4.35657614232458e-05, + "loss": 0.37, + "num_input_tokens_seen": 3866328, + "step": 5920 + }, + { + "epoch": 3.1053459119496853, + "grad_norm": 0.8737086653709412, + "learning_rate": 4.355043854396677e-05, + "loss": 0.3571, + "num_input_tokens_seen": 3869592, + "step": 5925 + }, + { + "epoch": 3.107966457023061, + "grad_norm": 1.242950439453125, + "learning_rate": 4.35351001425281e-05, + "loss": 0.3099, + "num_input_tokens_seen": 3875032, + "step": 5930 + }, + { + "epoch": 3.110587002096436, + "grad_norm": 0.6521815657615662, + "learning_rate": 4.351974623176429e-05, + "loss": 0.4534, + "num_input_tokens_seen": 3879288, + "step": 5935 + }, + { + "epoch": 3.1132075471698113, + "grad_norm": 0.729006290435791, + "learning_rate": 4.3504376824522787e-05, + "loss": 0.285, + "num_input_tokens_seen": 3882936, + "step": 5940 + }, + { + "epoch": 3.1158280922431865, + "grad_norm": 0.6507551074028015, + "learning_rate": 4.348899193366399e-05, + "loss": 0.4196, + "num_input_tokens_seen": 3886296, + "step": 5945 + }, + { + "epoch": 3.1184486373165616, + "grad_norm": 1.5355678796768188, + "learning_rate": 4.34735915720613e-05, + "loss": 0.3682, + "num_input_tokens_seen": 3889816, + "step": 5950 + }, + { + "epoch": 3.1210691823899372, + "grad_norm": 0.8615676760673523, + "learning_rate": 4.345817575260101e-05, + "loss": 0.3134, + "num_input_tokens_seen": 3892888, + "step": 5955 + }, + { + "epoch": 3.1236897274633124, + "grad_norm": 2.740187644958496, + "learning_rate": 4.3442744488182395e-05, + "loss": 0.4513, + "num_input_tokens_seen": 3896152, + "step": 5960 + }, + { + "epoch": 3.1263102725366876, + "grad_norm": 0.4437565505504608, + "learning_rate": 4.342729779171761e-05, + "loss": 0.2741, + "num_input_tokens_seen": 3898776, + "step": 5965 + }, + { + "epoch": 3.1289308176100628, + "grad_norm": 1.2162270545959473, + "learning_rate": 4.341183567613177e-05, + "loss": 0.3746, + "num_input_tokens_seen": 3901912, + "step": 5970 + }, + { + "epoch": 3.131551362683438, + "grad_norm": 0.7551894187927246, + "learning_rate": 4.339635815436286e-05, + "loss": 0.4375, + "num_input_tokens_seen": 3905688, + "step": 5975 + }, + { + "epoch": 3.1341719077568135, + "grad_norm": 1.0952547788619995, + "learning_rate": 4.3380865239361754e-05, + "loss": 0.4066, + "num_input_tokens_seen": 3908888, + "step": 5980 + }, + { + "epoch": 3.1367924528301887, + "grad_norm": 1.372725009918213, + "learning_rate": 4.336535694409222e-05, + "loss": 0.3314, + "num_input_tokens_seen": 3912280, + "step": 5985 + }, + { + "epoch": 3.139412997903564, + "grad_norm": 1.7719688415527344, + "learning_rate": 4.334983328153088e-05, + "loss": 0.3306, + "num_input_tokens_seen": 3915256, + "step": 5990 + }, + { + "epoch": 3.142033542976939, + "grad_norm": 0.9063447117805481, + "learning_rate": 4.3334294264667255e-05, + "loss": 0.3467, + "num_input_tokens_seen": 3917912, + "step": 5995 + }, + { + "epoch": 3.1446540880503147, + "grad_norm": 1.0147356986999512, + "learning_rate": 4.3318739906503655e-05, + "loss": 0.4159, + "num_input_tokens_seen": 3920824, + "step": 6000 + }, + { + "epoch": 3.14727463312369, + "grad_norm": 1.3220438957214355, + "learning_rate": 4.3303170220055264e-05, + "loss": 0.3298, + "num_input_tokens_seen": 3923736, + "step": 6005 + }, + { + "epoch": 3.149895178197065, + "grad_norm": 0.7525033354759216, + "learning_rate": 4.32875852183501e-05, + "loss": 0.4507, + "num_input_tokens_seen": 3926584, + "step": 6010 + }, + { + "epoch": 3.15251572327044, + "grad_norm": 1.0509706735610962, + "learning_rate": 4.3271984914428965e-05, + "loss": 0.5333, + "num_input_tokens_seen": 3929432, + "step": 6015 + }, + { + "epoch": 3.1551362683438153, + "grad_norm": 0.903187096118927, + "learning_rate": 4.325636932134548e-05, + "loss": 0.4092, + "num_input_tokens_seen": 3932088, + "step": 6020 + }, + { + "epoch": 3.157756813417191, + "grad_norm": 1.1435550451278687, + "learning_rate": 4.324073845216606e-05, + "loss": 0.3442, + "num_input_tokens_seen": 3935384, + "step": 6025 + }, + { + "epoch": 3.160377358490566, + "grad_norm": 1.10163152217865, + "learning_rate": 4.322509231996992e-05, + "loss": 0.3748, + "num_input_tokens_seen": 3938424, + "step": 6030 + }, + { + "epoch": 3.1629979035639413, + "grad_norm": 1.160224437713623, + "learning_rate": 4.320943093784901e-05, + "loss": 0.4211, + "num_input_tokens_seen": 3940984, + "step": 6035 + }, + { + "epoch": 3.1656184486373165, + "grad_norm": 1.0087311267852783, + "learning_rate": 4.319375431890806e-05, + "loss": 0.3743, + "num_input_tokens_seen": 3945144, + "step": 6040 + }, + { + "epoch": 3.1682389937106916, + "grad_norm": 0.6966981291770935, + "learning_rate": 4.317806247626456e-05, + "loss": 0.3737, + "num_input_tokens_seen": 3948344, + "step": 6045 + }, + { + "epoch": 3.1708595387840672, + "grad_norm": 1.057544231414795, + "learning_rate": 4.316235542304872e-05, + "loss": 0.4589, + "num_input_tokens_seen": 3951000, + "step": 6050 + }, + { + "epoch": 3.1734800838574424, + "grad_norm": 0.9173097014427185, + "learning_rate": 4.314663317240348e-05, + "loss": 0.3677, + "num_input_tokens_seen": 3954584, + "step": 6055 + }, + { + "epoch": 3.1761006289308176, + "grad_norm": 0.7158564329147339, + "learning_rate": 4.313089573748451e-05, + "loss": 0.2529, + "num_input_tokens_seen": 3957944, + "step": 6060 + }, + { + "epoch": 3.1787211740041927, + "grad_norm": 0.7619892954826355, + "learning_rate": 4.311514313146018e-05, + "loss": 0.3332, + "num_input_tokens_seen": 3961816, + "step": 6065 + }, + { + "epoch": 3.181341719077568, + "grad_norm": 0.7171514630317688, + "learning_rate": 4.309937536751153e-05, + "loss": 0.4097, + "num_input_tokens_seen": 3964824, + "step": 6070 + }, + { + "epoch": 3.1839622641509435, + "grad_norm": 1.118693470954895, + "learning_rate": 4.3083592458832327e-05, + "loss": 0.2997, + "num_input_tokens_seen": 3968056, + "step": 6075 + }, + { + "epoch": 3.1865828092243187, + "grad_norm": 0.9991127848625183, + "learning_rate": 4.3067794418628976e-05, + "loss": 0.3722, + "num_input_tokens_seen": 3970424, + "step": 6080 + }, + { + "epoch": 3.189203354297694, + "grad_norm": 1.4533143043518066, + "learning_rate": 4.305198126012057e-05, + "loss": 0.3617, + "num_input_tokens_seen": 3973208, + "step": 6085 + }, + { + "epoch": 3.191823899371069, + "grad_norm": 1.0571832656860352, + "learning_rate": 4.303615299653881e-05, + "loss": 0.4037, + "num_input_tokens_seen": 3976216, + "step": 6090 + }, + { + "epoch": 3.1944444444444446, + "grad_norm": 1.2544262409210205, + "learning_rate": 4.30203096411281e-05, + "loss": 0.4537, + "num_input_tokens_seen": 3980824, + "step": 6095 + }, + { + "epoch": 3.19706498951782, + "grad_norm": 1.2205969095230103, + "learning_rate": 4.30044512071454e-05, + "loss": 0.4325, + "num_input_tokens_seen": 3987480, + "step": 6100 + }, + { + "epoch": 3.199685534591195, + "grad_norm": 1.0982763767242432, + "learning_rate": 4.2988577707860346e-05, + "loss": 0.3681, + "num_input_tokens_seen": 3991288, + "step": 6105 + }, + { + "epoch": 3.20230607966457, + "grad_norm": 0.871878445148468, + "learning_rate": 4.2972689156555154e-05, + "loss": 0.4374, + "num_input_tokens_seen": 3993848, + "step": 6110 + }, + { + "epoch": 3.2049266247379453, + "grad_norm": 0.9454466700553894, + "learning_rate": 4.295678556652464e-05, + "loss": 0.3948, + "num_input_tokens_seen": 3997848, + "step": 6115 + }, + { + "epoch": 3.207547169811321, + "grad_norm": 1.6090997457504272, + "learning_rate": 4.294086695107619e-05, + "loss": 0.3248, + "num_input_tokens_seen": 4001784, + "step": 6120 + }, + { + "epoch": 3.210167714884696, + "grad_norm": 0.7739579081535339, + "learning_rate": 4.292493332352978e-05, + "loss": 0.4021, + "num_input_tokens_seen": 4005048, + "step": 6125 + }, + { + "epoch": 3.2127882599580713, + "grad_norm": 1.2850641012191772, + "learning_rate": 4.290898469721795e-05, + "loss": 0.4148, + "num_input_tokens_seen": 4007864, + "step": 6130 + }, + { + "epoch": 3.2154088050314464, + "grad_norm": 0.982771098613739, + "learning_rate": 4.2893021085485765e-05, + "loss": 0.4145, + "num_input_tokens_seen": 4011832, + "step": 6135 + }, + { + "epoch": 3.2180293501048216, + "grad_norm": 1.1558164358139038, + "learning_rate": 4.287704250169086e-05, + "loss": 0.3753, + "num_input_tokens_seen": 4015032, + "step": 6140 + }, + { + "epoch": 3.220649895178197, + "grad_norm": 1.0375374555587769, + "learning_rate": 4.2861048959203386e-05, + "loss": 0.5218, + "num_input_tokens_seen": 4017432, + "step": 6145 + }, + { + "epoch": 3.2232704402515724, + "grad_norm": 1.724047064781189, + "learning_rate": 4.284504047140599e-05, + "loss": 0.3215, + "num_input_tokens_seen": 4020216, + "step": 6150 + }, + { + "epoch": 3.2258909853249476, + "grad_norm": 1.1068226099014282, + "learning_rate": 4.282901705169387e-05, + "loss": 0.4404, + "num_input_tokens_seen": 4023096, + "step": 6155 + }, + { + "epoch": 3.2285115303983227, + "grad_norm": 2.037219524383545, + "learning_rate": 4.281297871347468e-05, + "loss": 0.2645, + "num_input_tokens_seen": 4026712, + "step": 6160 + }, + { + "epoch": 3.231132075471698, + "grad_norm": 1.2841979265213013, + "learning_rate": 4.279692547016856e-05, + "loss": 0.433, + "num_input_tokens_seen": 4030872, + "step": 6165 + }, + { + "epoch": 3.2337526205450735, + "grad_norm": 1.1580758094787598, + "learning_rate": 4.278085733520814e-05, + "loss": 0.3692, + "num_input_tokens_seen": 4034744, + "step": 6170 + }, + { + "epoch": 3.2363731656184487, + "grad_norm": 1.924967646598816, + "learning_rate": 4.2764774322038494e-05, + "loss": 0.3581, + "num_input_tokens_seen": 4037752, + "step": 6175 + }, + { + "epoch": 3.238993710691824, + "grad_norm": 1.6118278503417969, + "learning_rate": 4.2748676444117156e-05, + "loss": 0.3174, + "num_input_tokens_seen": 4041048, + "step": 6180 + }, + { + "epoch": 3.241614255765199, + "grad_norm": 1.4182935953140259, + "learning_rate": 4.27325637149141e-05, + "loss": 0.4761, + "num_input_tokens_seen": 4043832, + "step": 6185 + }, + { + "epoch": 3.2442348008385746, + "grad_norm": 1.02171790599823, + "learning_rate": 4.271643614791172e-05, + "loss": 0.3353, + "num_input_tokens_seen": 4047512, + "step": 6190 + }, + { + "epoch": 3.24685534591195, + "grad_norm": 1.0257244110107422, + "learning_rate": 4.2700293756604824e-05, + "loss": 0.4186, + "num_input_tokens_seen": 4050584, + "step": 6195 + }, + { + "epoch": 3.249475890985325, + "grad_norm": 1.0413761138916016, + "learning_rate": 4.268413655450064e-05, + "loss": 0.3057, + "num_input_tokens_seen": 4053752, + "step": 6200 + }, + { + "epoch": 3.2520964360587, + "grad_norm": 1.461964726448059, + "learning_rate": 4.266796455511875e-05, + "loss": 0.403, + "num_input_tokens_seen": 4056824, + "step": 6205 + }, + { + "epoch": 3.2547169811320753, + "grad_norm": 2.3538782596588135, + "learning_rate": 4.2651777771991176e-05, + "loss": 0.5003, + "num_input_tokens_seen": 4059864, + "step": 6210 + }, + { + "epoch": 3.257337526205451, + "grad_norm": 1.6210120916366577, + "learning_rate": 4.2635576218662257e-05, + "loss": 0.3431, + "num_input_tokens_seen": 4063416, + "step": 6215 + }, + { + "epoch": 3.259958071278826, + "grad_norm": 0.565352737903595, + "learning_rate": 4.261935990868871e-05, + "loss": 0.3386, + "num_input_tokens_seen": 4066264, + "step": 6220 + }, + { + "epoch": 3.2625786163522013, + "grad_norm": 5.6944260597229, + "learning_rate": 4.260312885563962e-05, + "loss": 0.3945, + "num_input_tokens_seen": 4068664, + "step": 6225 + }, + { + "epoch": 3.2651991614255764, + "grad_norm": 0.9872909188270569, + "learning_rate": 4.2586883073096386e-05, + "loss": 0.575, + "num_input_tokens_seen": 4072088, + "step": 6230 + }, + { + "epoch": 3.2678197064989516, + "grad_norm": 2.1836061477661133, + "learning_rate": 4.257062257465272e-05, + "loss": 0.5184, + "num_input_tokens_seen": 4075064, + "step": 6235 + }, + { + "epoch": 3.270440251572327, + "grad_norm": 1.2434827089309692, + "learning_rate": 4.255434737391469e-05, + "loss": 0.3927, + "num_input_tokens_seen": 4077752, + "step": 6240 + }, + { + "epoch": 3.2730607966457024, + "grad_norm": 1.0651439428329468, + "learning_rate": 4.2538057484500624e-05, + "loss": 0.3995, + "num_input_tokens_seen": 4080472, + "step": 6245 + }, + { + "epoch": 3.2756813417190775, + "grad_norm": 0.9901625514030457, + "learning_rate": 4.2521752920041155e-05, + "loss": 0.2942, + "num_input_tokens_seen": 4083384, + "step": 6250 + }, + { + "epoch": 3.2783018867924527, + "grad_norm": 0.9138268232345581, + "learning_rate": 4.2505433694179216e-05, + "loss": 0.4216, + "num_input_tokens_seen": 4086488, + "step": 6255 + }, + { + "epoch": 3.280922431865828, + "grad_norm": 0.799767255783081, + "learning_rate": 4.2489099820569974e-05, + "loss": 0.2752, + "num_input_tokens_seen": 4089464, + "step": 6260 + }, + { + "epoch": 3.2835429769392035, + "grad_norm": 1.4182939529418945, + "learning_rate": 4.247275131288086e-05, + "loss": 0.4371, + "num_input_tokens_seen": 4092088, + "step": 6265 + }, + { + "epoch": 3.2861635220125787, + "grad_norm": 1.318189263343811, + "learning_rate": 4.2456388184791584e-05, + "loss": 0.368, + "num_input_tokens_seen": 4095544, + "step": 6270 + }, + { + "epoch": 3.288784067085954, + "grad_norm": 0.7519534230232239, + "learning_rate": 4.2440010449994054e-05, + "loss": 0.4035, + "num_input_tokens_seen": 4098264, + "step": 6275 + }, + { + "epoch": 3.291404612159329, + "grad_norm": 0.8536790609359741, + "learning_rate": 4.24236181221924e-05, + "loss": 0.375, + "num_input_tokens_seen": 4100792, + "step": 6280 + }, + { + "epoch": 3.2940251572327046, + "grad_norm": 0.9687683582305908, + "learning_rate": 4.240721121510298e-05, + "loss": 0.3413, + "num_input_tokens_seen": 4103896, + "step": 6285 + }, + { + "epoch": 3.29664570230608, + "grad_norm": 0.720966637134552, + "learning_rate": 4.2390789742454354e-05, + "loss": 0.3018, + "num_input_tokens_seen": 4106968, + "step": 6290 + }, + { + "epoch": 3.299266247379455, + "grad_norm": 1.2980849742889404, + "learning_rate": 4.2374353717987244e-05, + "loss": 0.4553, + "num_input_tokens_seen": 4110040, + "step": 6295 + }, + { + "epoch": 3.30188679245283, + "grad_norm": 0.8183912634849548, + "learning_rate": 4.235790315545457e-05, + "loss": 0.3362, + "num_input_tokens_seen": 4112664, + "step": 6300 + }, + { + "epoch": 3.3045073375262053, + "grad_norm": 1.6093671321868896, + "learning_rate": 4.234143806862141e-05, + "loss": 0.3713, + "num_input_tokens_seen": 4115832, + "step": 6305 + }, + { + "epoch": 3.307127882599581, + "grad_norm": 0.9905369877815247, + "learning_rate": 4.2324958471265006e-05, + "loss": 0.3559, + "num_input_tokens_seen": 4119160, + "step": 6310 + }, + { + "epoch": 3.309748427672956, + "grad_norm": 2.0058059692382812, + "learning_rate": 4.230846437717472e-05, + "loss": 0.7002, + "num_input_tokens_seen": 4122520, + "step": 6315 + }, + { + "epoch": 3.3123689727463312, + "grad_norm": 1.8331395387649536, + "learning_rate": 4.2291955800152063e-05, + "loss": 0.3671, + "num_input_tokens_seen": 4124888, + "step": 6320 + }, + { + "epoch": 3.3149895178197064, + "grad_norm": 1.2158026695251465, + "learning_rate": 4.2275432754010663e-05, + "loss": 0.4344, + "num_input_tokens_seen": 4127832, + "step": 6325 + }, + { + "epoch": 3.3176100628930816, + "grad_norm": 1.243780493736267, + "learning_rate": 4.225889525257624e-05, + "loss": 0.4879, + "num_input_tokens_seen": 4130232, + "step": 6330 + }, + { + "epoch": 3.320230607966457, + "grad_norm": 1.1317265033721924, + "learning_rate": 4.224234330968663e-05, + "loss": 0.333, + "num_input_tokens_seen": 4133080, + "step": 6335 + }, + { + "epoch": 3.3228511530398324, + "grad_norm": 0.8010901808738708, + "learning_rate": 4.222577693919173e-05, + "loss": 0.3354, + "num_input_tokens_seen": 4136536, + "step": 6340 + }, + { + "epoch": 3.3254716981132075, + "grad_norm": 1.1275304555892944, + "learning_rate": 4.2209196154953536e-05, + "loss": 0.3463, + "num_input_tokens_seen": 4139608, + "step": 6345 + }, + { + "epoch": 3.3280922431865827, + "grad_norm": 2.1105618476867676, + "learning_rate": 4.219260097084608e-05, + "loss": 0.4796, + "num_input_tokens_seen": 4142200, + "step": 6350 + }, + { + "epoch": 3.330712788259958, + "grad_norm": 1.6778674125671387, + "learning_rate": 4.217599140075546e-05, + "loss": 0.7224, + "num_input_tokens_seen": 4146200, + "step": 6355 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.83587247133255, + "learning_rate": 4.2159367458579793e-05, + "loss": 0.3097, + "num_input_tokens_seen": 4149464, + "step": 6360 + }, + { + "epoch": 3.3359538784067087, + "grad_norm": 0.8280583620071411, + "learning_rate": 4.2142729158229256e-05, + "loss": 0.4274, + "num_input_tokens_seen": 4151992, + "step": 6365 + }, + { + "epoch": 3.338574423480084, + "grad_norm": 0.7711096405982971, + "learning_rate": 4.2126076513626004e-05, + "loss": 0.4051, + "num_input_tokens_seen": 4155576, + "step": 6370 + }, + { + "epoch": 3.341194968553459, + "grad_norm": 1.477084755897522, + "learning_rate": 4.210940953870422e-05, + "loss": 0.4814, + "num_input_tokens_seen": 4159672, + "step": 6375 + }, + { + "epoch": 3.3438155136268346, + "grad_norm": 0.9226441979408264, + "learning_rate": 4.209272824741005e-05, + "loss": 0.2857, + "num_input_tokens_seen": 4162360, + "step": 6380 + }, + { + "epoch": 3.3464360587002098, + "grad_norm": 1.1768215894699097, + "learning_rate": 4.207603265370166e-05, + "loss": 0.3923, + "num_input_tokens_seen": 4165304, + "step": 6385 + }, + { + "epoch": 3.349056603773585, + "grad_norm": 0.7537957429885864, + "learning_rate": 4.205932277154914e-05, + "loss": 0.3475, + "num_input_tokens_seen": 4168728, + "step": 6390 + }, + { + "epoch": 3.35167714884696, + "grad_norm": 1.4041340351104736, + "learning_rate": 4.204259861493457e-05, + "loss": 0.2943, + "num_input_tokens_seen": 4171960, + "step": 6395 + }, + { + "epoch": 3.3542976939203353, + "grad_norm": 0.903533399105072, + "learning_rate": 4.202586019785194e-05, + "loss": 0.3774, + "num_input_tokens_seen": 4175544, + "step": 6400 + }, + { + "epoch": 3.3569182389937104, + "grad_norm": 0.7169029116630554, + "learning_rate": 4.2009107534307214e-05, + "loss": 0.3058, + "num_input_tokens_seen": 4178264, + "step": 6405 + }, + { + "epoch": 3.359538784067086, + "grad_norm": 1.316460371017456, + "learning_rate": 4.199234063831825e-05, + "loss": 0.4372, + "num_input_tokens_seen": 4180984, + "step": 6410 + }, + { + "epoch": 3.3621593291404612, + "grad_norm": 1.2573238611221313, + "learning_rate": 4.197555952391482e-05, + "loss": 0.4591, + "num_input_tokens_seen": 4183768, + "step": 6415 + }, + { + "epoch": 3.3647798742138364, + "grad_norm": 1.350689172744751, + "learning_rate": 4.195876420513859e-05, + "loss": 0.3255, + "num_input_tokens_seen": 4187000, + "step": 6420 + }, + { + "epoch": 3.3674004192872116, + "grad_norm": 1.8248589038848877, + "learning_rate": 4.194195469604312e-05, + "loss": 0.3774, + "num_input_tokens_seen": 4189304, + "step": 6425 + }, + { + "epoch": 3.370020964360587, + "grad_norm": 0.6587446331977844, + "learning_rate": 4.192513101069383e-05, + "loss": 0.4019, + "num_input_tokens_seen": 4193080, + "step": 6430 + }, + { + "epoch": 3.3726415094339623, + "grad_norm": 1.1426669359207153, + "learning_rate": 4.190829316316803e-05, + "loss": 0.4063, + "num_input_tokens_seen": 4195608, + "step": 6435 + }, + { + "epoch": 3.3752620545073375, + "grad_norm": 1.3362507820129395, + "learning_rate": 4.189144116755485e-05, + "loss": 0.3134, + "num_input_tokens_seen": 4198744, + "step": 6440 + }, + { + "epoch": 3.3778825995807127, + "grad_norm": 1.563043236732483, + "learning_rate": 4.187457503795527e-05, + "loss": 0.6052, + "num_input_tokens_seen": 4202680, + "step": 6445 + }, + { + "epoch": 3.380503144654088, + "grad_norm": 1.340427279472351, + "learning_rate": 4.1857694788482094e-05, + "loss": 0.3842, + "num_input_tokens_seen": 4205144, + "step": 6450 + }, + { + "epoch": 3.3831236897274635, + "grad_norm": 1.2715801000595093, + "learning_rate": 4.184080043325995e-05, + "loss": 0.352, + "num_input_tokens_seen": 4209112, + "step": 6455 + }, + { + "epoch": 3.3857442348008386, + "grad_norm": 0.9158321619033813, + "learning_rate": 4.1823891986425256e-05, + "loss": 0.361, + "num_input_tokens_seen": 4213048, + "step": 6460 + }, + { + "epoch": 3.388364779874214, + "grad_norm": 0.7182477712631226, + "learning_rate": 4.180696946212624e-05, + "loss": 0.3703, + "num_input_tokens_seen": 4216472, + "step": 6465 + }, + { + "epoch": 3.390985324947589, + "grad_norm": 1.085331916809082, + "learning_rate": 4.1790032874522885e-05, + "loss": 0.3561, + "num_input_tokens_seen": 4219672, + "step": 6470 + }, + { + "epoch": 3.3936058700209646, + "grad_norm": 1.183308482170105, + "learning_rate": 4.177308223778696e-05, + "loss": 0.4076, + "num_input_tokens_seen": 4222872, + "step": 6475 + }, + { + "epoch": 3.3962264150943398, + "grad_norm": 0.9688555598258972, + "learning_rate": 4.175611756610198e-05, + "loss": 0.4411, + "num_input_tokens_seen": 4225528, + "step": 6480 + }, + { + "epoch": 3.398846960167715, + "grad_norm": 1.7325408458709717, + "learning_rate": 4.173913887366322e-05, + "loss": 0.3295, + "num_input_tokens_seen": 4229144, + "step": 6485 + }, + { + "epoch": 3.40146750524109, + "grad_norm": 0.9416486024856567, + "learning_rate": 4.172214617467765e-05, + "loss": 0.4158, + "num_input_tokens_seen": 4232792, + "step": 6490 + }, + { + "epoch": 3.4040880503144653, + "grad_norm": 0.8031437993049622, + "learning_rate": 4.1705139483364e-05, + "loss": 0.3628, + "num_input_tokens_seen": 4236600, + "step": 6495 + }, + { + "epoch": 3.4067085953878404, + "grad_norm": 1.242141842842102, + "learning_rate": 4.1688118813952706e-05, + "loss": 0.4993, + "num_input_tokens_seen": 4239896, + "step": 6500 + }, + { + "epoch": 3.409329140461216, + "grad_norm": 0.660439133644104, + "learning_rate": 4.167108418068585e-05, + "loss": 0.4259, + "num_input_tokens_seen": 4243640, + "step": 6505 + }, + { + "epoch": 3.411949685534591, + "grad_norm": 1.0231177806854248, + "learning_rate": 4.165403559781727e-05, + "loss": 0.4455, + "num_input_tokens_seen": 4247160, + "step": 6510 + }, + { + "epoch": 3.4145702306079664, + "grad_norm": 0.9910377264022827, + "learning_rate": 4.163697307961242e-05, + "loss": 0.4795, + "num_input_tokens_seen": 4251032, + "step": 6515 + }, + { + "epoch": 3.4171907756813416, + "grad_norm": 1.3191938400268555, + "learning_rate": 4.1619896640348445e-05, + "loss": 0.444, + "num_input_tokens_seen": 4254680, + "step": 6520 + }, + { + "epoch": 3.419811320754717, + "grad_norm": 0.9005263447761536, + "learning_rate": 4.160280629431413e-05, + "loss": 0.3312, + "num_input_tokens_seen": 4257720, + "step": 6525 + }, + { + "epoch": 3.4224318658280923, + "grad_norm": 0.7457687854766846, + "learning_rate": 4.158570205580989e-05, + "loss": 0.3974, + "num_input_tokens_seen": 4260696, + "step": 6530 + }, + { + "epoch": 3.4250524109014675, + "grad_norm": 0.6850275993347168, + "learning_rate": 4.156858393914779e-05, + "loss": 0.3412, + "num_input_tokens_seen": 4263416, + "step": 6535 + }, + { + "epoch": 3.4276729559748427, + "grad_norm": 1.061647891998291, + "learning_rate": 4.1551451958651455e-05, + "loss": 0.4776, + "num_input_tokens_seen": 4266232, + "step": 6540 + }, + { + "epoch": 3.430293501048218, + "grad_norm": 1.736820936203003, + "learning_rate": 4.153430612865616e-05, + "loss": 0.327, + "num_input_tokens_seen": 4269400, + "step": 6545 + }, + { + "epoch": 3.4329140461215935, + "grad_norm": 1.314971923828125, + "learning_rate": 4.151714646350876e-05, + "loss": 0.3925, + "num_input_tokens_seen": 4271896, + "step": 6550 + }, + { + "epoch": 3.4355345911949686, + "grad_norm": 0.9321863651275635, + "learning_rate": 4.149997297756767e-05, + "loss": 0.5471, + "num_input_tokens_seen": 4275416, + "step": 6555 + }, + { + "epoch": 3.438155136268344, + "grad_norm": 0.767734169960022, + "learning_rate": 4.148278568520289e-05, + "loss": 0.3655, + "num_input_tokens_seen": 4278936, + "step": 6560 + }, + { + "epoch": 3.440775681341719, + "grad_norm": 1.1234803199768066, + "learning_rate": 4.146558460079595e-05, + "loss": 0.3784, + "num_input_tokens_seen": 4282808, + "step": 6565 + }, + { + "epoch": 3.4433962264150946, + "grad_norm": 0.9392116665840149, + "learning_rate": 4.1448369738739923e-05, + "loss": 0.4186, + "num_input_tokens_seen": 4285400, + "step": 6570 + }, + { + "epoch": 3.4460167714884697, + "grad_norm": 1.09610915184021, + "learning_rate": 4.143114111343944e-05, + "loss": 0.2981, + "num_input_tokens_seen": 4288280, + "step": 6575 + }, + { + "epoch": 3.448637316561845, + "grad_norm": 0.8044507503509521, + "learning_rate": 4.1413898739310605e-05, + "loss": 0.4058, + "num_input_tokens_seen": 4291576, + "step": 6580 + }, + { + "epoch": 3.45125786163522, + "grad_norm": 0.8976778388023376, + "learning_rate": 4.1396642630781076e-05, + "loss": 0.3706, + "num_input_tokens_seen": 4295288, + "step": 6585 + }, + { + "epoch": 3.4538784067085953, + "grad_norm": 1.0593849420547485, + "learning_rate": 4.137937280228996e-05, + "loss": 0.4149, + "num_input_tokens_seen": 4298808, + "step": 6590 + }, + { + "epoch": 3.4564989517819704, + "grad_norm": 1.1691585779190063, + "learning_rate": 4.136208926828786e-05, + "loss": 0.3135, + "num_input_tokens_seen": 4304280, + "step": 6595 + }, + { + "epoch": 3.459119496855346, + "grad_norm": 1.7844265699386597, + "learning_rate": 4.134479204323685e-05, + "loss": 0.5334, + "num_input_tokens_seen": 4306616, + "step": 6600 + }, + { + "epoch": 3.461740041928721, + "grad_norm": 1.6122462749481201, + "learning_rate": 4.132748114161046e-05, + "loss": 0.3875, + "num_input_tokens_seen": 4309112, + "step": 6605 + }, + { + "epoch": 3.4643605870020964, + "grad_norm": 1.894338846206665, + "learning_rate": 4.131015657789365e-05, + "loss": 0.4633, + "num_input_tokens_seen": 4311832, + "step": 6610 + }, + { + "epoch": 3.4669811320754715, + "grad_norm": 0.264605313539505, + "learning_rate": 4.129281836658285e-05, + "loss": 0.3534, + "num_input_tokens_seen": 4318328, + "step": 6615 + }, + { + "epoch": 3.469601677148847, + "grad_norm": 1.5725568532943726, + "learning_rate": 4.127546652218586e-05, + "loss": 0.3396, + "num_input_tokens_seen": 4321848, + "step": 6620 + }, + { + "epoch": 3.4722222222222223, + "grad_norm": 1.2332640886306763, + "learning_rate": 4.1258101059221914e-05, + "loss": 0.2756, + "num_input_tokens_seen": 4324984, + "step": 6625 + }, + { + "epoch": 3.4748427672955975, + "grad_norm": 1.278399109840393, + "learning_rate": 4.124072199222165e-05, + "loss": 0.3336, + "num_input_tokens_seen": 4327704, + "step": 6630 + }, + { + "epoch": 3.4774633123689727, + "grad_norm": 1.1565721035003662, + "learning_rate": 4.122332933572707e-05, + "loss": 0.3183, + "num_input_tokens_seen": 4330904, + "step": 6635 + }, + { + "epoch": 3.480083857442348, + "grad_norm": 1.0504337549209595, + "learning_rate": 4.120592310429154e-05, + "loss": 0.4328, + "num_input_tokens_seen": 4333592, + "step": 6640 + }, + { + "epoch": 3.4827044025157234, + "grad_norm": 1.4641993045806885, + "learning_rate": 4.118850331247982e-05, + "loss": 0.3535, + "num_input_tokens_seen": 4336696, + "step": 6645 + }, + { + "epoch": 3.4853249475890986, + "grad_norm": 2.3862950801849365, + "learning_rate": 4.1171069974868e-05, + "loss": 0.5725, + "num_input_tokens_seen": 4340088, + "step": 6650 + }, + { + "epoch": 3.487945492662474, + "grad_norm": 1.2625432014465332, + "learning_rate": 4.115362310604347e-05, + "loss": 0.4582, + "num_input_tokens_seen": 4343640, + "step": 6655 + }, + { + "epoch": 3.490566037735849, + "grad_norm": 1.0637654066085815, + "learning_rate": 4.113616272060501e-05, + "loss": 0.3836, + "num_input_tokens_seen": 4348568, + "step": 6660 + }, + { + "epoch": 3.4931865828092246, + "grad_norm": 0.8377699255943298, + "learning_rate": 4.111868883316266e-05, + "loss": 0.35, + "num_input_tokens_seen": 4352376, + "step": 6665 + }, + { + "epoch": 3.4958071278825997, + "grad_norm": 0.9779176115989685, + "learning_rate": 4.110120145833775e-05, + "loss": 0.4059, + "num_input_tokens_seen": 4356120, + "step": 6670 + }, + { + "epoch": 3.498427672955975, + "grad_norm": 1.0892395973205566, + "learning_rate": 4.108370061076294e-05, + "loss": 0.3728, + "num_input_tokens_seen": 4359448, + "step": 6675 + }, + { + "epoch": 3.5, + "eval_loss": 0.47961604595184326, + "eval_runtime": 16.0112, + "eval_samples_per_second": 52.963, + "eval_steps_per_second": 13.241, + "num_input_tokens_seen": 4361944, + "step": 6678 + }, + { + "epoch": 3.50104821802935, + "grad_norm": 0.8010838031768799, + "learning_rate": 4.106618630508213e-05, + "loss": 0.2728, + "num_input_tokens_seen": 4362936, + "step": 6680 + }, + { + "epoch": 3.5036687631027252, + "grad_norm": 1.2312431335449219, + "learning_rate": 4.10486585559505e-05, + "loss": 0.3401, + "num_input_tokens_seen": 4366968, + "step": 6685 + }, + { + "epoch": 3.5062893081761004, + "grad_norm": 0.944074273109436, + "learning_rate": 4.103111737803446e-05, + "loss": 0.3004, + "num_input_tokens_seen": 4370264, + "step": 6690 + }, + { + "epoch": 3.508909853249476, + "grad_norm": 0.951328694820404, + "learning_rate": 4.101356278601167e-05, + "loss": 0.2654, + "num_input_tokens_seen": 4373432, + "step": 6695 + }, + { + "epoch": 3.511530398322851, + "grad_norm": 0.9487142562866211, + "learning_rate": 4.0995994794571015e-05, + "loss": 0.5093, + "num_input_tokens_seen": 4376824, + "step": 6700 + }, + { + "epoch": 3.5141509433962264, + "grad_norm": 0.9050124883651733, + "learning_rate": 4.0978413418412574e-05, + "loss": 0.4309, + "num_input_tokens_seen": 4380504, + "step": 6705 + }, + { + "epoch": 3.5167714884696015, + "grad_norm": 1.0489882230758667, + "learning_rate": 4.0960818672247656e-05, + "loss": 0.3909, + "num_input_tokens_seen": 4383704, + "step": 6710 + }, + { + "epoch": 3.519392033542977, + "grad_norm": 1.656227469444275, + "learning_rate": 4.094321057079874e-05, + "loss": 0.5149, + "num_input_tokens_seen": 4387128, + "step": 6715 + }, + { + "epoch": 3.5220125786163523, + "grad_norm": 1.0251632928848267, + "learning_rate": 4.092558912879947e-05, + "loss": 0.3834, + "num_input_tokens_seen": 4390296, + "step": 6720 + }, + { + "epoch": 3.5246331236897275, + "grad_norm": 1.4486626386642456, + "learning_rate": 4.090795436099466e-05, + "loss": 0.4326, + "num_input_tokens_seen": 4393624, + "step": 6725 + }, + { + "epoch": 3.5272536687631026, + "grad_norm": 1.182024598121643, + "learning_rate": 4.089030628214029e-05, + "loss": 0.3448, + "num_input_tokens_seen": 4397944, + "step": 6730 + }, + { + "epoch": 3.529874213836478, + "grad_norm": 1.0379191637039185, + "learning_rate": 4.0872644907003476e-05, + "loss": 0.366, + "num_input_tokens_seen": 4400696, + "step": 6735 + }, + { + "epoch": 3.532494758909853, + "grad_norm": 1.659132957458496, + "learning_rate": 4.0854970250362426e-05, + "loss": 0.3854, + "num_input_tokens_seen": 4403416, + "step": 6740 + }, + { + "epoch": 3.5351153039832286, + "grad_norm": 0.7429251074790955, + "learning_rate": 4.0837282327006495e-05, + "loss": 0.3482, + "num_input_tokens_seen": 4407320, + "step": 6745 + }, + { + "epoch": 3.5377358490566038, + "grad_norm": 1.256585717201233, + "learning_rate": 4.081958115173614e-05, + "loss": 0.3293, + "num_input_tokens_seen": 4409848, + "step": 6750 + }, + { + "epoch": 3.540356394129979, + "grad_norm": 0.8949511051177979, + "learning_rate": 4.080186673936288e-05, + "loss": 0.378, + "num_input_tokens_seen": 4412920, + "step": 6755 + }, + { + "epoch": 3.5429769392033545, + "grad_norm": 1.0603764057159424, + "learning_rate": 4.078413910470934e-05, + "loss": 0.2914, + "num_input_tokens_seen": 4415384, + "step": 6760 + }, + { + "epoch": 3.5455974842767297, + "grad_norm": 1.0091497898101807, + "learning_rate": 4.076639826260919e-05, + "loss": 0.3004, + "num_input_tokens_seen": 4418040, + "step": 6765 + }, + { + "epoch": 3.548218029350105, + "grad_norm": 0.9451886415481567, + "learning_rate": 4.074864422790714e-05, + "loss": 0.3704, + "num_input_tokens_seen": 4421464, + "step": 6770 + }, + { + "epoch": 3.55083857442348, + "grad_norm": 1.1469308137893677, + "learning_rate": 4.073087701545897e-05, + "loss": 0.4034, + "num_input_tokens_seen": 4423672, + "step": 6775 + }, + { + "epoch": 3.5534591194968552, + "grad_norm": 1.137507677078247, + "learning_rate": 4.071309664013148e-05, + "loss": 0.3367, + "num_input_tokens_seen": 4426648, + "step": 6780 + }, + { + "epoch": 3.5560796645702304, + "grad_norm": 2.524995803833008, + "learning_rate": 4.069530311680247e-05, + "loss": 0.4007, + "num_input_tokens_seen": 4429752, + "step": 6785 + }, + { + "epoch": 3.558700209643606, + "grad_norm": 1.0261272192001343, + "learning_rate": 4.0677496460360734e-05, + "loss": 0.4739, + "num_input_tokens_seen": 4433784, + "step": 6790 + }, + { + "epoch": 3.561320754716981, + "grad_norm": 1.4464373588562012, + "learning_rate": 4.0659676685706084e-05, + "loss": 0.3558, + "num_input_tokens_seen": 4436376, + "step": 6795 + }, + { + "epoch": 3.5639412997903563, + "grad_norm": 1.4025294780731201, + "learning_rate": 4.064184380774929e-05, + "loss": 0.3245, + "num_input_tokens_seen": 4439384, + "step": 6800 + }, + { + "epoch": 3.5665618448637315, + "grad_norm": 1.5611987113952637, + "learning_rate": 4.062399784141209e-05, + "loss": 0.4085, + "num_input_tokens_seen": 4442712, + "step": 6805 + }, + { + "epoch": 3.569182389937107, + "grad_norm": 1.1542755365371704, + "learning_rate": 4.060613880162717e-05, + "loss": 0.4222, + "num_input_tokens_seen": 4446936, + "step": 6810 + }, + { + "epoch": 3.5718029350104823, + "grad_norm": 0.878410279750824, + "learning_rate": 4.0588266703338164e-05, + "loss": 0.4412, + "num_input_tokens_seen": 4450328, + "step": 6815 + }, + { + "epoch": 3.5744234800838575, + "grad_norm": 0.855292022228241, + "learning_rate": 4.057038156149961e-05, + "loss": 0.3151, + "num_input_tokens_seen": 4453816, + "step": 6820 + }, + { + "epoch": 3.5770440251572326, + "grad_norm": 0.8283522129058838, + "learning_rate": 4.055248339107701e-05, + "loss": 0.3187, + "num_input_tokens_seen": 4458072, + "step": 6825 + }, + { + "epoch": 3.579664570230608, + "grad_norm": 0.9021967053413391, + "learning_rate": 4.053457220704671e-05, + "loss": 0.3448, + "num_input_tokens_seen": 4461144, + "step": 6830 + }, + { + "epoch": 3.582285115303983, + "grad_norm": 0.8003422617912292, + "learning_rate": 4.0516648024395974e-05, + "loss": 0.2779, + "num_input_tokens_seen": 4463928, + "step": 6835 + }, + { + "epoch": 3.5849056603773586, + "grad_norm": 1.2890727519989014, + "learning_rate": 4.049871085812295e-05, + "loss": 0.3705, + "num_input_tokens_seen": 4466488, + "step": 6840 + }, + { + "epoch": 3.5875262054507338, + "grad_norm": 0.7880327105522156, + "learning_rate": 4.0480760723236633e-05, + "loss": 0.3243, + "num_input_tokens_seen": 4470104, + "step": 6845 + }, + { + "epoch": 3.590146750524109, + "grad_norm": 1.2312421798706055, + "learning_rate": 4.046279763475687e-05, + "loss": 0.4353, + "num_input_tokens_seen": 4472184, + "step": 6850 + }, + { + "epoch": 3.5927672955974845, + "grad_norm": 1.0968170166015625, + "learning_rate": 4.0444821607714366e-05, + "loss": 0.397, + "num_input_tokens_seen": 4474840, + "step": 6855 + }, + { + "epoch": 3.5953878406708597, + "grad_norm": 1.7295169830322266, + "learning_rate": 4.042683265715063e-05, + "loss": 0.3989, + "num_input_tokens_seen": 4477816, + "step": 6860 + }, + { + "epoch": 3.598008385744235, + "grad_norm": 1.0223571062088013, + "learning_rate": 4.040883079811799e-05, + "loss": 0.4646, + "num_input_tokens_seen": 4481816, + "step": 6865 + }, + { + "epoch": 3.60062893081761, + "grad_norm": 1.193010687828064, + "learning_rate": 4.039081604567959e-05, + "loss": 0.3128, + "num_input_tokens_seen": 4485464, + "step": 6870 + }, + { + "epoch": 3.603249475890985, + "grad_norm": 0.8160101771354675, + "learning_rate": 4.037278841490933e-05, + "loss": 0.3203, + "num_input_tokens_seen": 4490392, + "step": 6875 + }, + { + "epoch": 3.6058700209643604, + "grad_norm": 1.384966492652893, + "learning_rate": 4.0354747920891954e-05, + "loss": 0.4013, + "num_input_tokens_seen": 4494712, + "step": 6880 + }, + { + "epoch": 3.608490566037736, + "grad_norm": 1.0394400358200073, + "learning_rate": 4.033669457872288e-05, + "loss": 0.4291, + "num_input_tokens_seen": 4497528, + "step": 6885 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.8926694989204407, + "learning_rate": 4.0318628403508336e-05, + "loss": 0.2866, + "num_input_tokens_seen": 4500536, + "step": 6890 + }, + { + "epoch": 3.6137316561844863, + "grad_norm": 1.3275249004364014, + "learning_rate": 4.0300549410365276e-05, + "loss": 0.4016, + "num_input_tokens_seen": 4504056, + "step": 6895 + }, + { + "epoch": 3.6163522012578615, + "grad_norm": 1.16357421875, + "learning_rate": 4.0282457614421364e-05, + "loss": 0.3761, + "num_input_tokens_seen": 4506936, + "step": 6900 + }, + { + "epoch": 3.618972746331237, + "grad_norm": 0.7372626066207886, + "learning_rate": 4.0264353030814996e-05, + "loss": 0.3045, + "num_input_tokens_seen": 4509560, + "step": 6905 + }, + { + "epoch": 3.6215932914046123, + "grad_norm": 1.1739435195922852, + "learning_rate": 4.0246235674695255e-05, + "loss": 0.3865, + "num_input_tokens_seen": 4513240, + "step": 6910 + }, + { + "epoch": 3.6242138364779874, + "grad_norm": 1.1258035898208618, + "learning_rate": 4.022810556122193e-05, + "loss": 0.3738, + "num_input_tokens_seen": 4515640, + "step": 6915 + }, + { + "epoch": 3.6268343815513626, + "grad_norm": 0.8410833477973938, + "learning_rate": 4.020996270556546e-05, + "loss": 0.3495, + "num_input_tokens_seen": 4520824, + "step": 6920 + }, + { + "epoch": 3.629454926624738, + "grad_norm": 1.3328640460968018, + "learning_rate": 4.0191807122906964e-05, + "loss": 0.4125, + "num_input_tokens_seen": 4523896, + "step": 6925 + }, + { + "epoch": 3.632075471698113, + "grad_norm": 1.8274134397506714, + "learning_rate": 4.01736388284382e-05, + "loss": 0.4887, + "num_input_tokens_seen": 4527000, + "step": 6930 + }, + { + "epoch": 3.6346960167714886, + "grad_norm": 2.1582813262939453, + "learning_rate": 4.015545783736157e-05, + "loss": 0.4784, + "num_input_tokens_seen": 4529912, + "step": 6935 + }, + { + "epoch": 3.6373165618448637, + "grad_norm": 1.477928638458252, + "learning_rate": 4.013726416489009e-05, + "loss": 0.3391, + "num_input_tokens_seen": 4533336, + "step": 6940 + }, + { + "epoch": 3.639937106918239, + "grad_norm": 0.8379085063934326, + "learning_rate": 4.01190578262474e-05, + "loss": 0.3776, + "num_input_tokens_seen": 4536344, + "step": 6945 + }, + { + "epoch": 3.6425576519916145, + "grad_norm": 0.8443257212638855, + "learning_rate": 4.0100838836667735e-05, + "loss": 0.323, + "num_input_tokens_seen": 4540152, + "step": 6950 + }, + { + "epoch": 3.6451781970649897, + "grad_norm": 1.3356608152389526, + "learning_rate": 4.0082607211395904e-05, + "loss": 0.2997, + "num_input_tokens_seen": 4543032, + "step": 6955 + }, + { + "epoch": 3.647798742138365, + "grad_norm": 0.8671276569366455, + "learning_rate": 4.006436296568731e-05, + "loss": 0.3633, + "num_input_tokens_seen": 4547064, + "step": 6960 + }, + { + "epoch": 3.65041928721174, + "grad_norm": 0.9991268515586853, + "learning_rate": 4.00461061148079e-05, + "loss": 0.3994, + "num_input_tokens_seen": 4551000, + "step": 6965 + }, + { + "epoch": 3.653039832285115, + "grad_norm": 0.7814719676971436, + "learning_rate": 4.0027836674034174e-05, + "loss": 0.4403, + "num_input_tokens_seen": 4555000, + "step": 6970 + }, + { + "epoch": 3.6556603773584904, + "grad_norm": 0.6903594136238098, + "learning_rate": 4.000955465865316e-05, + "loss": 0.3368, + "num_input_tokens_seen": 4558680, + "step": 6975 + }, + { + "epoch": 3.658280922431866, + "grad_norm": 0.8884469270706177, + "learning_rate": 3.999126008396242e-05, + "loss": 0.3628, + "num_input_tokens_seen": 4561336, + "step": 6980 + }, + { + "epoch": 3.660901467505241, + "grad_norm": 1.2357146739959717, + "learning_rate": 3.9972952965270006e-05, + "loss": 0.3995, + "num_input_tokens_seen": 4564568, + "step": 6985 + }, + { + "epoch": 3.6635220125786163, + "grad_norm": 1.0811864137649536, + "learning_rate": 3.9954633317894496e-05, + "loss": 0.362, + "num_input_tokens_seen": 4567448, + "step": 6990 + }, + { + "epoch": 3.6661425576519915, + "grad_norm": 0.7572953701019287, + "learning_rate": 3.9936301157164926e-05, + "loss": 0.4416, + "num_input_tokens_seen": 4570136, + "step": 6995 + }, + { + "epoch": 3.668763102725367, + "grad_norm": 0.9880680441856384, + "learning_rate": 3.99179564984208e-05, + "loss": 0.3916, + "num_input_tokens_seen": 4573624, + "step": 7000 + }, + { + "epoch": 3.6713836477987423, + "grad_norm": 2.7492547035217285, + "learning_rate": 3.989959935701211e-05, + "loss": 0.3295, + "num_input_tokens_seen": 4577048, + "step": 7005 + }, + { + "epoch": 3.6740041928721174, + "grad_norm": 1.0916496515274048, + "learning_rate": 3.988122974829926e-05, + "loss": 0.4052, + "num_input_tokens_seen": 4580088, + "step": 7010 + }, + { + "epoch": 3.6766247379454926, + "grad_norm": 1.0050992965698242, + "learning_rate": 3.9862847687653116e-05, + "loss": 0.3308, + "num_input_tokens_seen": 4583480, + "step": 7015 + }, + { + "epoch": 3.6792452830188678, + "grad_norm": 1.2877388000488281, + "learning_rate": 3.9844453190454924e-05, + "loss": 0.5129, + "num_input_tokens_seen": 4586168, + "step": 7020 + }, + { + "epoch": 3.681865828092243, + "grad_norm": 0.9212608933448792, + "learning_rate": 3.982604627209637e-05, + "loss": 0.3295, + "num_input_tokens_seen": 4589592, + "step": 7025 + }, + { + "epoch": 3.6844863731656186, + "grad_norm": 1.2772223949432373, + "learning_rate": 3.980762694797953e-05, + "loss": 0.4317, + "num_input_tokens_seen": 4592440, + "step": 7030 + }, + { + "epoch": 3.6871069182389937, + "grad_norm": 0.9201710820198059, + "learning_rate": 3.978919523351684e-05, + "loss": 0.2832, + "num_input_tokens_seen": 4594904, + "step": 7035 + }, + { + "epoch": 3.689727463312369, + "grad_norm": 1.0540508031845093, + "learning_rate": 3.977075114413112e-05, + "loss": 0.4042, + "num_input_tokens_seen": 4597624, + "step": 7040 + }, + { + "epoch": 3.6923480083857445, + "grad_norm": 1.6584676504135132, + "learning_rate": 3.9752294695255545e-05, + "loss": 0.4596, + "num_input_tokens_seen": 4599928, + "step": 7045 + }, + { + "epoch": 3.6949685534591197, + "grad_norm": 2.1589813232421875, + "learning_rate": 3.973382590233362e-05, + "loss": 0.3873, + "num_input_tokens_seen": 4602328, + "step": 7050 + }, + { + "epoch": 3.697589098532495, + "grad_norm": 0.7364386916160583, + "learning_rate": 3.9715344780819205e-05, + "loss": 0.3961, + "num_input_tokens_seen": 4606328, + "step": 7055 + }, + { + "epoch": 3.70020964360587, + "grad_norm": 0.9038268327713013, + "learning_rate": 3.9696851346176445e-05, + "loss": 0.3148, + "num_input_tokens_seen": 4609176, + "step": 7060 + }, + { + "epoch": 3.702830188679245, + "grad_norm": 1.0088019371032715, + "learning_rate": 3.9678345613879796e-05, + "loss": 0.4841, + "num_input_tokens_seen": 4612056, + "step": 7065 + }, + { + "epoch": 3.7054507337526204, + "grad_norm": 1.304423451423645, + "learning_rate": 3.965982759941403e-05, + "loss": 0.493, + "num_input_tokens_seen": 4615320, + "step": 7070 + }, + { + "epoch": 3.708071278825996, + "grad_norm": 1.1894910335540771, + "learning_rate": 3.964129731827415e-05, + "loss": 0.4347, + "num_input_tokens_seen": 4618200, + "step": 7075 + }, + { + "epoch": 3.710691823899371, + "grad_norm": 1.5164886713027954, + "learning_rate": 3.9622754785965474e-05, + "loss": 0.2801, + "num_input_tokens_seen": 4621368, + "step": 7080 + }, + { + "epoch": 3.7133123689727463, + "grad_norm": 1.608674168586731, + "learning_rate": 3.9604200018003525e-05, + "loss": 0.389, + "num_input_tokens_seen": 4624088, + "step": 7085 + }, + { + "epoch": 3.7159329140461215, + "grad_norm": 1.0652217864990234, + "learning_rate": 3.95856330299141e-05, + "loss": 0.526, + "num_input_tokens_seen": 4628568, + "step": 7090 + }, + { + "epoch": 3.718553459119497, + "grad_norm": 1.0545778274536133, + "learning_rate": 3.956705383723319e-05, + "loss": 0.419, + "num_input_tokens_seen": 4631288, + "step": 7095 + }, + { + "epoch": 3.7211740041928723, + "grad_norm": 0.9075819849967957, + "learning_rate": 3.954846245550704e-05, + "loss": 0.3246, + "num_input_tokens_seen": 4635224, + "step": 7100 + }, + { + "epoch": 3.7237945492662474, + "grad_norm": 0.9797729849815369, + "learning_rate": 3.952985890029205e-05, + "loss": 0.5548, + "num_input_tokens_seen": 4638648, + "step": 7105 + }, + { + "epoch": 3.7264150943396226, + "grad_norm": 1.7539931535720825, + "learning_rate": 3.951124318715482e-05, + "loss": 0.3012, + "num_input_tokens_seen": 4641848, + "step": 7110 + }, + { + "epoch": 3.7290356394129978, + "grad_norm": 1.1188106536865234, + "learning_rate": 3.9492615331672145e-05, + "loss": 0.3837, + "num_input_tokens_seen": 4644408, + "step": 7115 + }, + { + "epoch": 3.731656184486373, + "grad_norm": 0.7577705979347229, + "learning_rate": 3.947397534943096e-05, + "loss": 0.505, + "num_input_tokens_seen": 4647096, + "step": 7120 + }, + { + "epoch": 3.7342767295597485, + "grad_norm": 1.152060627937317, + "learning_rate": 3.9455323256028344e-05, + "loss": 0.3972, + "num_input_tokens_seen": 4649176, + "step": 7125 + }, + { + "epoch": 3.7368972746331237, + "grad_norm": 1.5144633054733276, + "learning_rate": 3.943665906707153e-05, + "loss": 0.4222, + "num_input_tokens_seen": 4653144, + "step": 7130 + }, + { + "epoch": 3.739517819706499, + "grad_norm": 1.4267832040786743, + "learning_rate": 3.9417982798177834e-05, + "loss": 0.2682, + "num_input_tokens_seen": 4655864, + "step": 7135 + }, + { + "epoch": 3.742138364779874, + "grad_norm": 0.8140873312950134, + "learning_rate": 3.939929446497472e-05, + "loss": 0.3964, + "num_input_tokens_seen": 4659000, + "step": 7140 + }, + { + "epoch": 3.7447589098532497, + "grad_norm": 0.8360342979431152, + "learning_rate": 3.938059408309974e-05, + "loss": 0.4186, + "num_input_tokens_seen": 4661720, + "step": 7145 + }, + { + "epoch": 3.747379454926625, + "grad_norm": 1.2552458047866821, + "learning_rate": 3.936188166820051e-05, + "loss": 0.3857, + "num_input_tokens_seen": 4666584, + "step": 7150 + }, + { + "epoch": 3.75, + "grad_norm": 1.1499649286270142, + "learning_rate": 3.9343157235934714e-05, + "loss": 0.3468, + "num_input_tokens_seen": 4669272, + "step": 7155 + }, + { + "epoch": 3.752620545073375, + "grad_norm": 1.3162422180175781, + "learning_rate": 3.932442080197012e-05, + "loss": 0.3945, + "num_input_tokens_seen": 4672280, + "step": 7160 + }, + { + "epoch": 3.7552410901467503, + "grad_norm": 1.370348334312439, + "learning_rate": 3.930567238198451e-05, + "loss": 0.3064, + "num_input_tokens_seen": 4675032, + "step": 7165 + }, + { + "epoch": 3.757861635220126, + "grad_norm": 1.3759313821792603, + "learning_rate": 3.928691199166571e-05, + "loss": 0.3691, + "num_input_tokens_seen": 4678360, + "step": 7170 + }, + { + "epoch": 3.760482180293501, + "grad_norm": 2.4735326766967773, + "learning_rate": 3.926813964671156e-05, + "loss": 0.3803, + "num_input_tokens_seen": 4681208, + "step": 7175 + }, + { + "epoch": 3.7631027253668763, + "grad_norm": 1.1032936573028564, + "learning_rate": 3.9249355362829884e-05, + "loss": 0.3977, + "num_input_tokens_seen": 4683768, + "step": 7180 + }, + { + "epoch": 3.7657232704402515, + "grad_norm": 1.607244610786438, + "learning_rate": 3.923055915573853e-05, + "loss": 0.3904, + "num_input_tokens_seen": 4686872, + "step": 7185 + }, + { + "epoch": 3.768343815513627, + "grad_norm": 1.041572093963623, + "learning_rate": 3.921175104116531e-05, + "loss": 0.3825, + "num_input_tokens_seen": 4690424, + "step": 7190 + }, + { + "epoch": 3.7709643605870022, + "grad_norm": 1.873212456703186, + "learning_rate": 3.9192931034847966e-05, + "loss": 0.4101, + "num_input_tokens_seen": 4692824, + "step": 7195 + }, + { + "epoch": 3.7735849056603774, + "grad_norm": 1.0537465810775757, + "learning_rate": 3.917409915253426e-05, + "loss": 0.4401, + "num_input_tokens_seen": 4696440, + "step": 7200 + }, + { + "epoch": 3.7762054507337526, + "grad_norm": 1.0769675970077515, + "learning_rate": 3.915525540998182e-05, + "loss": 0.3537, + "num_input_tokens_seen": 4700440, + "step": 7205 + }, + { + "epoch": 3.7788259958071277, + "grad_norm": 0.9854370951652527, + "learning_rate": 3.9136399822958235e-05, + "loss": 0.345, + "num_input_tokens_seen": 4703320, + "step": 7210 + }, + { + "epoch": 3.781446540880503, + "grad_norm": 1.2791153192520142, + "learning_rate": 3.911753240724101e-05, + "loss": 0.4799, + "num_input_tokens_seen": 4706552, + "step": 7215 + }, + { + "epoch": 3.7840670859538785, + "grad_norm": 0.6183247566223145, + "learning_rate": 3.909865317861753e-05, + "loss": 0.3176, + "num_input_tokens_seen": 4709592, + "step": 7220 + }, + { + "epoch": 3.7866876310272537, + "grad_norm": 1.1409404277801514, + "learning_rate": 3.907976215288507e-05, + "loss": 0.4558, + "num_input_tokens_seen": 4712344, + "step": 7225 + }, + { + "epoch": 3.789308176100629, + "grad_norm": 1.2683377265930176, + "learning_rate": 3.9060859345850774e-05, + "loss": 0.3903, + "num_input_tokens_seen": 4715000, + "step": 7230 + }, + { + "epoch": 3.791928721174004, + "grad_norm": 0.9262044429779053, + "learning_rate": 3.904194477333166e-05, + "loss": 0.3683, + "num_input_tokens_seen": 4718904, + "step": 7235 + }, + { + "epoch": 3.7945492662473796, + "grad_norm": 1.4408495426177979, + "learning_rate": 3.902301845115456e-05, + "loss": 0.4495, + "num_input_tokens_seen": 4721656, + "step": 7240 + }, + { + "epoch": 3.797169811320755, + "grad_norm": 2.307800531387329, + "learning_rate": 3.900408039515617e-05, + "loss": 0.4684, + "num_input_tokens_seen": 4726264, + "step": 7245 + }, + { + "epoch": 3.79979035639413, + "grad_norm": 1.071450114250183, + "learning_rate": 3.8985130621182985e-05, + "loss": 0.3202, + "num_input_tokens_seen": 4729112, + "step": 7250 + }, + { + "epoch": 3.802410901467505, + "grad_norm": 1.0675016641616821, + "learning_rate": 3.896616914509131e-05, + "loss": 0.445, + "num_input_tokens_seen": 4731832, + "step": 7255 + }, + { + "epoch": 3.8050314465408803, + "grad_norm": 1.3051769733428955, + "learning_rate": 3.894719598274725e-05, + "loss": 0.4666, + "num_input_tokens_seen": 4738136, + "step": 7260 + }, + { + "epoch": 3.8076519916142555, + "grad_norm": 0.8935559391975403, + "learning_rate": 3.892821115002667e-05, + "loss": 0.339, + "num_input_tokens_seen": 4741080, + "step": 7265 + }, + { + "epoch": 3.810272536687631, + "grad_norm": 0.9727830290794373, + "learning_rate": 3.8909214662815216e-05, + "loss": 0.2778, + "num_input_tokens_seen": 4744472, + "step": 7270 + }, + { + "epoch": 3.8128930817610063, + "grad_norm": 1.13785719871521, + "learning_rate": 3.889020653700828e-05, + "loss": 0.3296, + "num_input_tokens_seen": 4748664, + "step": 7275 + }, + { + "epoch": 3.8155136268343814, + "grad_norm": 2.4037158489227295, + "learning_rate": 3.8871186788511e-05, + "loss": 0.3934, + "num_input_tokens_seen": 4751704, + "step": 7280 + }, + { + "epoch": 3.818134171907757, + "grad_norm": 3.0284829139709473, + "learning_rate": 3.8852155433238214e-05, + "loss": 0.3794, + "num_input_tokens_seen": 4754104, + "step": 7285 + }, + { + "epoch": 3.8207547169811322, + "grad_norm": 1.0998848676681519, + "learning_rate": 3.8833112487114505e-05, + "loss": 0.35, + "num_input_tokens_seen": 4757528, + "step": 7290 + }, + { + "epoch": 3.8233752620545074, + "grad_norm": 0.6477729678153992, + "learning_rate": 3.881405796607414e-05, + "loss": 0.4976, + "num_input_tokens_seen": 4761208, + "step": 7295 + }, + { + "epoch": 3.8259958071278826, + "grad_norm": 1.1955393552780151, + "learning_rate": 3.879499188606107e-05, + "loss": 0.3751, + "num_input_tokens_seen": 4764088, + "step": 7300 + }, + { + "epoch": 3.8286163522012577, + "grad_norm": 0.8536476492881775, + "learning_rate": 3.877591426302892e-05, + "loss": 0.4448, + "num_input_tokens_seen": 4767352, + "step": 7305 + }, + { + "epoch": 3.831236897274633, + "grad_norm": 1.1128385066986084, + "learning_rate": 3.8756825112940964e-05, + "loss": 0.3905, + "num_input_tokens_seen": 4770648, + "step": 7310 + }, + { + "epoch": 3.8338574423480085, + "grad_norm": 0.6781826019287109, + "learning_rate": 3.873772445177015e-05, + "loss": 0.3676, + "num_input_tokens_seen": 4777624, + "step": 7315 + }, + { + "epoch": 3.8364779874213837, + "grad_norm": 1.5855016708374023, + "learning_rate": 3.8718612295499036e-05, + "loss": 0.4643, + "num_input_tokens_seen": 4780376, + "step": 7320 + }, + { + "epoch": 3.839098532494759, + "grad_norm": 1.0260591506958008, + "learning_rate": 3.8699488660119784e-05, + "loss": 0.3633, + "num_input_tokens_seen": 4783096, + "step": 7325 + }, + { + "epoch": 3.841719077568134, + "grad_norm": 0.8852398991584778, + "learning_rate": 3.868035356163419e-05, + "loss": 0.2738, + "num_input_tokens_seen": 4787000, + "step": 7330 + }, + { + "epoch": 3.8443396226415096, + "grad_norm": 1.9096487760543823, + "learning_rate": 3.866120701605363e-05, + "loss": 0.4062, + "num_input_tokens_seen": 4790168, + "step": 7335 + }, + { + "epoch": 3.846960167714885, + "grad_norm": 1.0435830354690552, + "learning_rate": 3.8642049039399054e-05, + "loss": 0.4391, + "num_input_tokens_seen": 4793944, + "step": 7340 + }, + { + "epoch": 3.84958071278826, + "grad_norm": 1.0726282596588135, + "learning_rate": 3.862287964770099e-05, + "loss": 0.314, + "num_input_tokens_seen": 4797016, + "step": 7345 + }, + { + "epoch": 3.852201257861635, + "grad_norm": 0.8580519556999207, + "learning_rate": 3.86036988569995e-05, + "loss": 0.4072, + "num_input_tokens_seen": 4800216, + "step": 7350 + }, + { + "epoch": 3.8548218029350103, + "grad_norm": 1.195745825767517, + "learning_rate": 3.8584506683344216e-05, + "loss": 0.3134, + "num_input_tokens_seen": 4803992, + "step": 7355 + }, + { + "epoch": 3.8574423480083855, + "grad_norm": 1.231759786605835, + "learning_rate": 3.8565303142794234e-05, + "loss": 0.3652, + "num_input_tokens_seen": 4806872, + "step": 7360 + }, + { + "epoch": 3.860062893081761, + "grad_norm": 0.8415560126304626, + "learning_rate": 3.8546088251418224e-05, + "loss": 0.2581, + "num_input_tokens_seen": 4809912, + "step": 7365 + }, + { + "epoch": 3.8626834381551363, + "grad_norm": 0.7027899622917175, + "learning_rate": 3.8526862025294336e-05, + "loss": 0.2718, + "num_input_tokens_seen": 4816984, + "step": 7370 + }, + { + "epoch": 3.8653039832285114, + "grad_norm": 1.0682406425476074, + "learning_rate": 3.8507624480510186e-05, + "loss": 0.4466, + "num_input_tokens_seen": 4819608, + "step": 7375 + }, + { + "epoch": 3.867924528301887, + "grad_norm": 0.653344452381134, + "learning_rate": 3.848837563316287e-05, + "loss": 0.3402, + "num_input_tokens_seen": 4823448, + "step": 7380 + }, + { + "epoch": 3.870545073375262, + "grad_norm": 1.0065646171569824, + "learning_rate": 3.8469115499358945e-05, + "loss": 0.3982, + "num_input_tokens_seen": 4826328, + "step": 7385 + }, + { + "epoch": 3.8731656184486374, + "grad_norm": 1.646185278892517, + "learning_rate": 3.844984409521442e-05, + "loss": 0.4009, + "num_input_tokens_seen": 4829048, + "step": 7390 + }, + { + "epoch": 3.8757861635220126, + "grad_norm": 1.008074402809143, + "learning_rate": 3.843056143685472e-05, + "loss": 0.3494, + "num_input_tokens_seen": 4833048, + "step": 7395 + }, + { + "epoch": 3.8784067085953877, + "grad_norm": 0.856590211391449, + "learning_rate": 3.841126754041468e-05, + "loss": 0.331, + "num_input_tokens_seen": 4836184, + "step": 7400 + }, + { + "epoch": 3.881027253668763, + "grad_norm": 1.9266842603683472, + "learning_rate": 3.839196242203859e-05, + "loss": 0.3849, + "num_input_tokens_seen": 4839896, + "step": 7405 + }, + { + "epoch": 3.8836477987421385, + "grad_norm": 1.3479008674621582, + "learning_rate": 3.837264609788005e-05, + "loss": 0.3276, + "num_input_tokens_seen": 4842936, + "step": 7410 + }, + { + "epoch": 3.8862683438155137, + "grad_norm": 1.6327521800994873, + "learning_rate": 3.8353318584102096e-05, + "loss": 0.4812, + "num_input_tokens_seen": 4846200, + "step": 7415 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 1.161097764968872, + "learning_rate": 3.83339798968771e-05, + "loss": 0.3937, + "num_input_tokens_seen": 4849336, + "step": 7420 + }, + { + "epoch": 3.891509433962264, + "grad_norm": 1.3068815469741821, + "learning_rate": 3.83146300523868e-05, + "loss": 0.3864, + "num_input_tokens_seen": 4852632, + "step": 7425 + }, + { + "epoch": 3.8941299790356396, + "grad_norm": 1.1398427486419678, + "learning_rate": 3.829526906682227e-05, + "loss": 0.3907, + "num_input_tokens_seen": 4855384, + "step": 7430 + }, + { + "epoch": 3.896750524109015, + "grad_norm": 0.97249835729599, + "learning_rate": 3.827589695638388e-05, + "loss": 0.421, + "num_input_tokens_seen": 4858936, + "step": 7435 + }, + { + "epoch": 3.89937106918239, + "grad_norm": 1.631010890007019, + "learning_rate": 3.825651373728133e-05, + "loss": 0.4259, + "num_input_tokens_seen": 4862488, + "step": 7440 + }, + { + "epoch": 3.901991614255765, + "grad_norm": 0.8973783254623413, + "learning_rate": 3.8237119425733625e-05, + "loss": 0.3551, + "num_input_tokens_seen": 4865240, + "step": 7445 + }, + { + "epoch": 3.9046121593291403, + "grad_norm": 0.9998116493225098, + "learning_rate": 3.8217714037969035e-05, + "loss": 0.349, + "num_input_tokens_seen": 4868152, + "step": 7450 + }, + { + "epoch": 3.9072327044025155, + "grad_norm": 1.0244877338409424, + "learning_rate": 3.8198297590225095e-05, + "loss": 0.2777, + "num_input_tokens_seen": 4870808, + "step": 7455 + }, + { + "epoch": 3.909853249475891, + "grad_norm": 0.7518368363380432, + "learning_rate": 3.817887009874861e-05, + "loss": 0.3573, + "num_input_tokens_seen": 4873944, + "step": 7460 + }, + { + "epoch": 3.9124737945492662, + "grad_norm": 1.2486989498138428, + "learning_rate": 3.815943157979561e-05, + "loss": 0.4399, + "num_input_tokens_seen": 4877528, + "step": 7465 + }, + { + "epoch": 3.9150943396226414, + "grad_norm": 1.4983527660369873, + "learning_rate": 3.813998204963136e-05, + "loss": 0.3413, + "num_input_tokens_seen": 4880504, + "step": 7470 + }, + { + "epoch": 3.917714884696017, + "grad_norm": 1.237634301185608, + "learning_rate": 3.812052152453035e-05, + "loss": 0.3445, + "num_input_tokens_seen": 4884312, + "step": 7475 + }, + { + "epoch": 3.920335429769392, + "grad_norm": 1.435648798942566, + "learning_rate": 3.8101050020776244e-05, + "loss": 0.3125, + "num_input_tokens_seen": 4887032, + "step": 7480 + }, + { + "epoch": 3.9229559748427674, + "grad_norm": 0.9781675934791565, + "learning_rate": 3.808156755466191e-05, + "loss": 0.4413, + "num_input_tokens_seen": 4889976, + "step": 7485 + }, + { + "epoch": 3.9255765199161425, + "grad_norm": 0.8445286154747009, + "learning_rate": 3.806207414248939e-05, + "loss": 0.4876, + "num_input_tokens_seen": 4892600, + "step": 7490 + }, + { + "epoch": 3.9281970649895177, + "grad_norm": 1.201065182685852, + "learning_rate": 3.804256980056988e-05, + "loss": 0.377, + "num_input_tokens_seen": 4895640, + "step": 7495 + }, + { + "epoch": 3.930817610062893, + "grad_norm": 1.0015331506729126, + "learning_rate": 3.8023054545223723e-05, + "loss": 0.3144, + "num_input_tokens_seen": 4898200, + "step": 7500 + }, + { + "epoch": 3.9334381551362685, + "grad_norm": 1.5023002624511719, + "learning_rate": 3.8003528392780385e-05, + "loss": 0.3161, + "num_input_tokens_seen": 4901080, + "step": 7505 + }, + { + "epoch": 3.9360587002096437, + "grad_norm": 1.2076185941696167, + "learning_rate": 3.798399135957847e-05, + "loss": 0.4348, + "num_input_tokens_seen": 4904280, + "step": 7510 + }, + { + "epoch": 3.938679245283019, + "grad_norm": 0.9720194935798645, + "learning_rate": 3.7964443461965674e-05, + "loss": 0.5645, + "num_input_tokens_seen": 4907832, + "step": 7515 + }, + { + "epoch": 3.941299790356394, + "grad_norm": 1.550282597541809, + "learning_rate": 3.794488471629878e-05, + "loss": 0.5006, + "num_input_tokens_seen": 4911320, + "step": 7520 + }, + { + "epoch": 3.9439203354297696, + "grad_norm": 0.7030708193778992, + "learning_rate": 3.7925315138943655e-05, + "loss": 0.3691, + "num_input_tokens_seen": 4914072, + "step": 7525 + }, + { + "epoch": 3.9465408805031448, + "grad_norm": 1.276421070098877, + "learning_rate": 3.790573474627522e-05, + "loss": 0.4146, + "num_input_tokens_seen": 4918584, + "step": 7530 + }, + { + "epoch": 3.94916142557652, + "grad_norm": 1.3502362966537476, + "learning_rate": 3.7886143554677466e-05, + "loss": 0.3513, + "num_input_tokens_seen": 4921496, + "step": 7535 + }, + { + "epoch": 3.951781970649895, + "grad_norm": 1.101816177368164, + "learning_rate": 3.7866541580543405e-05, + "loss": 0.408, + "num_input_tokens_seen": 4924600, + "step": 7540 + }, + { + "epoch": 3.9544025157232703, + "grad_norm": 0.6562595367431641, + "learning_rate": 3.7846928840275056e-05, + "loss": 0.389, + "num_input_tokens_seen": 4927512, + "step": 7545 + }, + { + "epoch": 3.9570230607966455, + "grad_norm": 1.1058262586593628, + "learning_rate": 3.782730535028348e-05, + "loss": 0.378, + "num_input_tokens_seen": 4930360, + "step": 7550 + }, + { + "epoch": 3.959643605870021, + "grad_norm": 0.7622987031936646, + "learning_rate": 3.780767112698872e-05, + "loss": 0.2884, + "num_input_tokens_seen": 4933496, + "step": 7555 + }, + { + "epoch": 3.9622641509433962, + "grad_norm": 1.0803675651550293, + "learning_rate": 3.77880261868198e-05, + "loss": 0.3122, + "num_input_tokens_seen": 4936152, + "step": 7560 + }, + { + "epoch": 3.9648846960167714, + "grad_norm": 1.4296083450317383, + "learning_rate": 3.7768370546214685e-05, + "loss": 0.5613, + "num_input_tokens_seen": 4939832, + "step": 7565 + }, + { + "epoch": 3.967505241090147, + "grad_norm": 0.9086940288543701, + "learning_rate": 3.774870422162034e-05, + "loss": 0.4067, + "num_input_tokens_seen": 4943192, + "step": 7570 + }, + { + "epoch": 3.970125786163522, + "grad_norm": 1.5245287418365479, + "learning_rate": 3.7729027229492645e-05, + "loss": 0.367, + "num_input_tokens_seen": 4946392, + "step": 7575 + }, + { + "epoch": 3.9727463312368974, + "grad_norm": 0.9987620711326599, + "learning_rate": 3.770933958629639e-05, + "loss": 0.2925, + "num_input_tokens_seen": 4949464, + "step": 7580 + }, + { + "epoch": 3.9753668763102725, + "grad_norm": 1.0498383045196533, + "learning_rate": 3.768964130850532e-05, + "loss": 0.3138, + "num_input_tokens_seen": 4955288, + "step": 7585 + }, + { + "epoch": 3.9779874213836477, + "grad_norm": 1.0394786596298218, + "learning_rate": 3.766993241260204e-05, + "loss": 0.3013, + "num_input_tokens_seen": 4958008, + "step": 7590 + }, + { + "epoch": 3.980607966457023, + "grad_norm": 1.0178802013397217, + "learning_rate": 3.765021291507805e-05, + "loss": 0.2932, + "num_input_tokens_seen": 4961400, + "step": 7595 + }, + { + "epoch": 3.9832285115303985, + "grad_norm": 1.2811628580093384, + "learning_rate": 3.763048283243374e-05, + "loss": 0.4802, + "num_input_tokens_seen": 4964536, + "step": 7600 + }, + { + "epoch": 3.9858490566037736, + "grad_norm": 1.123040795326233, + "learning_rate": 3.7610742181178325e-05, + "loss": 0.4728, + "num_input_tokens_seen": 4968280, + "step": 7605 + }, + { + "epoch": 3.988469601677149, + "grad_norm": 0.8979718685150146, + "learning_rate": 3.75909909778299e-05, + "loss": 0.4569, + "num_input_tokens_seen": 4971640, + "step": 7610 + }, + { + "epoch": 3.991090146750524, + "grad_norm": 0.858383297920227, + "learning_rate": 3.757122923891534e-05, + "loss": 0.399, + "num_input_tokens_seen": 4975512, + "step": 7615 + }, + { + "epoch": 3.9937106918238996, + "grad_norm": 1.0501387119293213, + "learning_rate": 3.75514569809704e-05, + "loss": 0.4423, + "num_input_tokens_seen": 4978520, + "step": 7620 + }, + { + "epoch": 3.9963312368972748, + "grad_norm": 1.2124780416488647, + "learning_rate": 3.7531674220539584e-05, + "loss": 0.3145, + "num_input_tokens_seen": 4981944, + "step": 7625 + }, + { + "epoch": 3.99895178197065, + "grad_norm": 1.3892039060592651, + "learning_rate": 3.751188097417619e-05, + "loss": 0.3182, + "num_input_tokens_seen": 4984632, + "step": 7630 + }, + { + "epoch": 4.0, + "eval_loss": 0.4808174669742584, + "eval_runtime": 15.9714, + "eval_samples_per_second": 53.095, + "eval_steps_per_second": 13.274, + "num_input_tokens_seen": 4985200, + "step": 7632 + }, + { + "epoch": 4.001572327044025, + "grad_norm": 0.8712502121925354, + "learning_rate": 3.749207725844234e-05, + "loss": 0.3454, + "num_input_tokens_seen": 4987024, + "step": 7635 + }, + { + "epoch": 4.0041928721174, + "grad_norm": 0.8519287109375, + "learning_rate": 3.747226308990884e-05, + "loss": 0.2941, + "num_input_tokens_seen": 4989552, + "step": 7640 + }, + { + "epoch": 4.006813417190775, + "grad_norm": 1.2300043106079102, + "learning_rate": 3.74524384851553e-05, + "loss": 0.5277, + "num_input_tokens_seen": 4992080, + "step": 7645 + }, + { + "epoch": 4.009433962264151, + "grad_norm": 1.150903344154358, + "learning_rate": 3.743260346077004e-05, + "loss": 0.4865, + "num_input_tokens_seen": 4994512, + "step": 7650 + }, + { + "epoch": 4.012054507337526, + "grad_norm": 0.9182013869285583, + "learning_rate": 3.741275803335011e-05, + "loss": 0.2318, + "num_input_tokens_seen": 4998000, + "step": 7655 + }, + { + "epoch": 4.014675052410902, + "grad_norm": 1.0874757766723633, + "learning_rate": 3.7392902219501234e-05, + "loss": 0.3218, + "num_input_tokens_seen": 5002000, + "step": 7660 + }, + { + "epoch": 4.017295597484277, + "grad_norm": 0.8466274738311768, + "learning_rate": 3.737303603583788e-05, + "loss": 0.3045, + "num_input_tokens_seen": 5005648, + "step": 7665 + }, + { + "epoch": 4.019916142557652, + "grad_norm": 1.3324286937713623, + "learning_rate": 3.735315949898314e-05, + "loss": 0.4367, + "num_input_tokens_seen": 5009104, + "step": 7670 + }, + { + "epoch": 4.022536687631027, + "grad_norm": 0.642522394657135, + "learning_rate": 3.7333272625568804e-05, + "loss": 0.2694, + "num_input_tokens_seen": 5012304, + "step": 7675 + }, + { + "epoch": 4.0251572327044025, + "grad_norm": 3.341203451156616, + "learning_rate": 3.7313375432235295e-05, + "loss": 0.378, + "num_input_tokens_seen": 5014608, + "step": 7680 + }, + { + "epoch": 4.027777777777778, + "grad_norm": 1.566676139831543, + "learning_rate": 3.729346793563167e-05, + "loss": 0.3715, + "num_input_tokens_seen": 5017776, + "step": 7685 + }, + { + "epoch": 4.030398322851153, + "grad_norm": 1.1899265050888062, + "learning_rate": 3.7273550152415635e-05, + "loss": 0.4276, + "num_input_tokens_seen": 5020432, + "step": 7690 + }, + { + "epoch": 4.033018867924528, + "grad_norm": 0.9002857208251953, + "learning_rate": 3.725362209925346e-05, + "loss": 0.2272, + "num_input_tokens_seen": 5023824, + "step": 7695 + }, + { + "epoch": 4.035639412997903, + "grad_norm": 1.0881047248840332, + "learning_rate": 3.7233683792820036e-05, + "loss": 0.3425, + "num_input_tokens_seen": 5027344, + "step": 7700 + }, + { + "epoch": 4.038259958071279, + "grad_norm": 1.8817112445831299, + "learning_rate": 3.721373524979883e-05, + "loss": 0.3542, + "num_input_tokens_seen": 5030928, + "step": 7705 + }, + { + "epoch": 4.040880503144654, + "grad_norm": 0.8192150592803955, + "learning_rate": 3.7193776486881854e-05, + "loss": 0.2912, + "num_input_tokens_seen": 5034320, + "step": 7710 + }, + { + "epoch": 4.04350104821803, + "grad_norm": 0.9730916619300842, + "learning_rate": 3.717380752076971e-05, + "loss": 0.3311, + "num_input_tokens_seen": 5037456, + "step": 7715 + }, + { + "epoch": 4.046121593291405, + "grad_norm": 0.8413804769515991, + "learning_rate": 3.715382836817152e-05, + "loss": 0.2547, + "num_input_tokens_seen": 5040784, + "step": 7720 + }, + { + "epoch": 4.04874213836478, + "grad_norm": 1.1738662719726562, + "learning_rate": 3.7133839045804906e-05, + "loss": 0.4617, + "num_input_tokens_seen": 5044176, + "step": 7725 + }, + { + "epoch": 4.051362683438155, + "grad_norm": 1.1454644203186035, + "learning_rate": 3.711383957039602e-05, + "loss": 0.4082, + "num_input_tokens_seen": 5047376, + "step": 7730 + }, + { + "epoch": 4.05398322851153, + "grad_norm": 1.1196210384368896, + "learning_rate": 3.709382995867954e-05, + "loss": 0.3378, + "num_input_tokens_seen": 5053712, + "step": 7735 + }, + { + "epoch": 4.056603773584905, + "grad_norm": 0.6414330005645752, + "learning_rate": 3.707381022739856e-05, + "loss": 0.3543, + "num_input_tokens_seen": 5057968, + "step": 7740 + }, + { + "epoch": 4.059224318658281, + "grad_norm": 2.24448299407959, + "learning_rate": 3.7053780393304705e-05, + "loss": 0.4517, + "num_input_tokens_seen": 5061328, + "step": 7745 + }, + { + "epoch": 4.061844863731656, + "grad_norm": 1.177159070968628, + "learning_rate": 3.7033740473158e-05, + "loss": 0.3796, + "num_input_tokens_seen": 5064464, + "step": 7750 + }, + { + "epoch": 4.064465408805032, + "grad_norm": 1.1603004932403564, + "learning_rate": 3.701369048372695e-05, + "loss": 0.2646, + "num_input_tokens_seen": 5067344, + "step": 7755 + }, + { + "epoch": 4.067085953878407, + "grad_norm": 0.9118911623954773, + "learning_rate": 3.699363044178847e-05, + "loss": 0.2785, + "num_input_tokens_seen": 5070512, + "step": 7760 + }, + { + "epoch": 4.069706498951782, + "grad_norm": 1.833330512046814, + "learning_rate": 3.697356036412788e-05, + "loss": 0.2876, + "num_input_tokens_seen": 5073488, + "step": 7765 + }, + { + "epoch": 4.072327044025157, + "grad_norm": 1.969106912612915, + "learning_rate": 3.695348026753891e-05, + "loss": 0.3307, + "num_input_tokens_seen": 5079984, + "step": 7770 + }, + { + "epoch": 4.0749475890985325, + "grad_norm": 1.0131076574325562, + "learning_rate": 3.6933390168823655e-05, + "loss": 0.3511, + "num_input_tokens_seen": 5083056, + "step": 7775 + }, + { + "epoch": 4.077568134171908, + "grad_norm": 1.4848763942718506, + "learning_rate": 3.6913290084792616e-05, + "loss": 0.3452, + "num_input_tokens_seen": 5085904, + "step": 7780 + }, + { + "epoch": 4.080188679245283, + "grad_norm": 1.2157073020935059, + "learning_rate": 3.689318003226461e-05, + "loss": 0.3104, + "num_input_tokens_seen": 5088368, + "step": 7785 + }, + { + "epoch": 4.082809224318658, + "grad_norm": 0.9151837229728699, + "learning_rate": 3.687306002806681e-05, + "loss": 0.3216, + "num_input_tokens_seen": 5091088, + "step": 7790 + }, + { + "epoch": 4.085429769392033, + "grad_norm": 0.9689928889274597, + "learning_rate": 3.685293008903471e-05, + "loss": 0.3437, + "num_input_tokens_seen": 5094960, + "step": 7795 + }, + { + "epoch": 4.088050314465409, + "grad_norm": 1.0149915218353271, + "learning_rate": 3.683279023201213e-05, + "loss": 0.3389, + "num_input_tokens_seen": 5098352, + "step": 7800 + }, + { + "epoch": 4.090670859538784, + "grad_norm": 1.5099459886550903, + "learning_rate": 3.681264047385119e-05, + "loss": 0.3494, + "num_input_tokens_seen": 5102416, + "step": 7805 + }, + { + "epoch": 4.09329140461216, + "grad_norm": 1.194740653038025, + "learning_rate": 3.6792480831412293e-05, + "loss": 0.4484, + "num_input_tokens_seen": 5105072, + "step": 7810 + }, + { + "epoch": 4.095911949685535, + "grad_norm": 1.524809718132019, + "learning_rate": 3.677231132156408e-05, + "loss": 0.3347, + "num_input_tokens_seen": 5108528, + "step": 7815 + }, + { + "epoch": 4.09853249475891, + "grad_norm": 1.0977622270584106, + "learning_rate": 3.675213196118349e-05, + "loss": 0.4278, + "num_input_tokens_seen": 5112176, + "step": 7820 + }, + { + "epoch": 4.101153039832285, + "grad_norm": 0.8895376920700073, + "learning_rate": 3.67319427671557e-05, + "loss": 0.3567, + "num_input_tokens_seen": 5116848, + "step": 7825 + }, + { + "epoch": 4.10377358490566, + "grad_norm": 0.7927892208099365, + "learning_rate": 3.6711743756374103e-05, + "loss": 0.3548, + "num_input_tokens_seen": 5120464, + "step": 7830 + }, + { + "epoch": 4.106394129979035, + "grad_norm": 0.6752859354019165, + "learning_rate": 3.6691534945740284e-05, + "loss": 0.2958, + "num_input_tokens_seen": 5123312, + "step": 7835 + }, + { + "epoch": 4.109014675052411, + "grad_norm": 0.9161554574966431, + "learning_rate": 3.667131635216408e-05, + "loss": 0.3362, + "num_input_tokens_seen": 5127088, + "step": 7840 + }, + { + "epoch": 4.111635220125786, + "grad_norm": 0.8166965246200562, + "learning_rate": 3.665108799256348e-05, + "loss": 0.3415, + "num_input_tokens_seen": 5130384, + "step": 7845 + }, + { + "epoch": 4.114255765199162, + "grad_norm": 0.8535136580467224, + "learning_rate": 3.663084988386464e-05, + "loss": 0.303, + "num_input_tokens_seen": 5134288, + "step": 7850 + }, + { + "epoch": 4.116876310272537, + "grad_norm": 1.7405937910079956, + "learning_rate": 3.6610602043001894e-05, + "loss": 0.4113, + "num_input_tokens_seen": 5137392, + "step": 7855 + }, + { + "epoch": 4.119496855345912, + "grad_norm": 1.3475873470306396, + "learning_rate": 3.659034448691771e-05, + "loss": 0.3452, + "num_input_tokens_seen": 5139824, + "step": 7860 + }, + { + "epoch": 4.122117400419287, + "grad_norm": 1.0555108785629272, + "learning_rate": 3.657007723256268e-05, + "loss": 0.4102, + "num_input_tokens_seen": 5143280, + "step": 7865 + }, + { + "epoch": 4.1247379454926625, + "grad_norm": 1.2028048038482666, + "learning_rate": 3.654980029689553e-05, + "loss": 0.4052, + "num_input_tokens_seen": 5145584, + "step": 7870 + }, + { + "epoch": 4.127358490566038, + "grad_norm": 1.1400669813156128, + "learning_rate": 3.6529513696883075e-05, + "loss": 0.3141, + "num_input_tokens_seen": 5148240, + "step": 7875 + }, + { + "epoch": 4.129979035639413, + "grad_norm": 1.2795156240463257, + "learning_rate": 3.650921744950019e-05, + "loss": 0.4232, + "num_input_tokens_seen": 5151632, + "step": 7880 + }, + { + "epoch": 4.132599580712788, + "grad_norm": 2.2455179691314697, + "learning_rate": 3.6488911571729864e-05, + "loss": 0.353, + "num_input_tokens_seen": 5154576, + "step": 7885 + }, + { + "epoch": 4.135220125786163, + "grad_norm": 1.3776997327804565, + "learning_rate": 3.6468596080563134e-05, + "loss": 0.3227, + "num_input_tokens_seen": 5158224, + "step": 7890 + }, + { + "epoch": 4.137840670859539, + "grad_norm": 1.070574402809143, + "learning_rate": 3.6448270992999065e-05, + "loss": 0.3032, + "num_input_tokens_seen": 5161360, + "step": 7895 + }, + { + "epoch": 4.140461215932914, + "grad_norm": 1.4767957925796509, + "learning_rate": 3.6427936326044756e-05, + "loss": 0.2836, + "num_input_tokens_seen": 5164656, + "step": 7900 + }, + { + "epoch": 4.1430817610062896, + "grad_norm": 0.7599817514419556, + "learning_rate": 3.6407592096715345e-05, + "loss": 0.3475, + "num_input_tokens_seen": 5168336, + "step": 7905 + }, + { + "epoch": 4.145702306079665, + "grad_norm": 1.2490651607513428, + "learning_rate": 3.638723832203396e-05, + "loss": 0.3243, + "num_input_tokens_seen": 5170736, + "step": 7910 + }, + { + "epoch": 4.14832285115304, + "grad_norm": 1.638503074645996, + "learning_rate": 3.6366875019031676e-05, + "loss": 0.2031, + "num_input_tokens_seen": 5173424, + "step": 7915 + }, + { + "epoch": 4.150943396226415, + "grad_norm": 0.9837068915367126, + "learning_rate": 3.6346502204747596e-05, + "loss": 0.2853, + "num_input_tokens_seen": 5175856, + "step": 7920 + }, + { + "epoch": 4.15356394129979, + "grad_norm": 0.993317723274231, + "learning_rate": 3.6326119896228766e-05, + "loss": 0.303, + "num_input_tokens_seen": 5178672, + "step": 7925 + }, + { + "epoch": 4.156184486373165, + "grad_norm": 1.0583416223526, + "learning_rate": 3.630572811053016e-05, + "loss": 0.2887, + "num_input_tokens_seen": 5184496, + "step": 7930 + }, + { + "epoch": 4.158805031446541, + "grad_norm": 0.888070285320282, + "learning_rate": 3.62853268647147e-05, + "loss": 0.3809, + "num_input_tokens_seen": 5188144, + "step": 7935 + }, + { + "epoch": 4.161425576519916, + "grad_norm": 2.0195252895355225, + "learning_rate": 3.6264916175853204e-05, + "loss": 0.323, + "num_input_tokens_seen": 5192112, + "step": 7940 + }, + { + "epoch": 4.164046121593292, + "grad_norm": 1.600537896156311, + "learning_rate": 3.624449606102441e-05, + "loss": 0.3524, + "num_input_tokens_seen": 5194320, + "step": 7945 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 2.8423337936401367, + "learning_rate": 3.622406653731495e-05, + "loss": 0.3219, + "num_input_tokens_seen": 5197456, + "step": 7950 + }, + { + "epoch": 4.169287211740042, + "grad_norm": 0.9286392331123352, + "learning_rate": 3.620362762181931e-05, + "loss": 0.3573, + "num_input_tokens_seen": 5199792, + "step": 7955 + }, + { + "epoch": 4.171907756813417, + "grad_norm": 1.4971479177474976, + "learning_rate": 3.6183179331639825e-05, + "loss": 0.3941, + "num_input_tokens_seen": 5202672, + "step": 7960 + }, + { + "epoch": 4.1745283018867925, + "grad_norm": 1.2705193758010864, + "learning_rate": 3.616272168388671e-05, + "loss": 0.3883, + "num_input_tokens_seen": 5205808, + "step": 7965 + }, + { + "epoch": 4.177148846960168, + "grad_norm": 1.172237753868103, + "learning_rate": 3.614225469567798e-05, + "loss": 0.2407, + "num_input_tokens_seen": 5208176, + "step": 7970 + }, + { + "epoch": 4.179769392033543, + "grad_norm": 0.6981173753738403, + "learning_rate": 3.612177838413948e-05, + "loss": 0.3456, + "num_input_tokens_seen": 5215376, + "step": 7975 + }, + { + "epoch": 4.182389937106918, + "grad_norm": 1.270318627357483, + "learning_rate": 3.6101292766404854e-05, + "loss": 0.372, + "num_input_tokens_seen": 5218064, + "step": 7980 + }, + { + "epoch": 4.185010482180293, + "grad_norm": 1.1371474266052246, + "learning_rate": 3.608079785961552e-05, + "loss": 0.3195, + "num_input_tokens_seen": 5221328, + "step": 7985 + }, + { + "epoch": 4.187631027253669, + "grad_norm": 0.8592715859413147, + "learning_rate": 3.60602936809207e-05, + "loss": 0.3557, + "num_input_tokens_seen": 5224432, + "step": 7990 + }, + { + "epoch": 4.190251572327044, + "grad_norm": 1.3438341617584229, + "learning_rate": 3.603978024747733e-05, + "loss": 0.3591, + "num_input_tokens_seen": 5226896, + "step": 7995 + }, + { + "epoch": 4.1928721174004195, + "grad_norm": 1.1500701904296875, + "learning_rate": 3.601925757645013e-05, + "loss": 0.2897, + "num_input_tokens_seen": 5231056, + "step": 8000 + }, + { + "epoch": 4.195492662473795, + "grad_norm": 1.3019765615463257, + "learning_rate": 3.599872568501152e-05, + "loss": 0.2771, + "num_input_tokens_seen": 5234480, + "step": 8005 + }, + { + "epoch": 4.19811320754717, + "grad_norm": 7.017221927642822, + "learning_rate": 3.5978184590341676e-05, + "loss": 0.4836, + "num_input_tokens_seen": 5236816, + "step": 8010 + }, + { + "epoch": 4.200733752620545, + "grad_norm": 4.1116156578063965, + "learning_rate": 3.5957634309628424e-05, + "loss": 0.3593, + "num_input_tokens_seen": 5239600, + "step": 8015 + }, + { + "epoch": 4.20335429769392, + "grad_norm": 1.1284507513046265, + "learning_rate": 3.59370748600673e-05, + "loss": 0.4009, + "num_input_tokens_seen": 5242896, + "step": 8020 + }, + { + "epoch": 4.205974842767295, + "grad_norm": 1.552432894706726, + "learning_rate": 3.591650625886152e-05, + "loss": 0.4251, + "num_input_tokens_seen": 5245904, + "step": 8025 + }, + { + "epoch": 4.2085953878406706, + "grad_norm": 1.0981113910675049, + "learning_rate": 3.5895928523221955e-05, + "loss": 0.4171, + "num_input_tokens_seen": 5249968, + "step": 8030 + }, + { + "epoch": 4.211215932914046, + "grad_norm": 1.4086765050888062, + "learning_rate": 3.58753416703671e-05, + "loss": 0.3064, + "num_input_tokens_seen": 5253680, + "step": 8035 + }, + { + "epoch": 4.213836477987422, + "grad_norm": 1.8517180681228638, + "learning_rate": 3.585474571752311e-05, + "loss": 0.4809, + "num_input_tokens_seen": 5256144, + "step": 8040 + }, + { + "epoch": 4.216457023060797, + "grad_norm": 1.1731607913970947, + "learning_rate": 3.583414068192372e-05, + "loss": 0.2575, + "num_input_tokens_seen": 5261456, + "step": 8045 + }, + { + "epoch": 4.219077568134172, + "grad_norm": 1.1323686838150024, + "learning_rate": 3.58135265808103e-05, + "loss": 0.382, + "num_input_tokens_seen": 5264688, + "step": 8050 + }, + { + "epoch": 4.221698113207547, + "grad_norm": 1.0514044761657715, + "learning_rate": 3.5792903431431775e-05, + "loss": 0.4294, + "num_input_tokens_seen": 5267184, + "step": 8055 + }, + { + "epoch": 4.2243186582809225, + "grad_norm": 1.6703070402145386, + "learning_rate": 3.577227125104466e-05, + "loss": 0.3232, + "num_input_tokens_seen": 5269776, + "step": 8060 + }, + { + "epoch": 4.226939203354298, + "grad_norm": 1.2364603281021118, + "learning_rate": 3.575163005691302e-05, + "loss": 0.3452, + "num_input_tokens_seen": 5273296, + "step": 8065 + }, + { + "epoch": 4.229559748427673, + "grad_norm": 1.1103260517120361, + "learning_rate": 3.573097986630845e-05, + "loss": 0.3354, + "num_input_tokens_seen": 5276432, + "step": 8070 + }, + { + "epoch": 4.232180293501048, + "grad_norm": 0.9679189324378967, + "learning_rate": 3.5710320696510114e-05, + "loss": 0.291, + "num_input_tokens_seen": 5279696, + "step": 8075 + }, + { + "epoch": 4.234800838574423, + "grad_norm": 1.5879613161087036, + "learning_rate": 3.5689652564804646e-05, + "loss": 0.332, + "num_input_tokens_seen": 5283312, + "step": 8080 + }, + { + "epoch": 4.237421383647799, + "grad_norm": 1.4980419874191284, + "learning_rate": 3.566897548848619e-05, + "loss": 0.2614, + "num_input_tokens_seen": 5285712, + "step": 8085 + }, + { + "epoch": 4.240041928721174, + "grad_norm": 1.2711124420166016, + "learning_rate": 3.564828948485639e-05, + "loss": 0.397, + "num_input_tokens_seen": 5289136, + "step": 8090 + }, + { + "epoch": 4.2426624737945495, + "grad_norm": 0.775330662727356, + "learning_rate": 3.562759457122434e-05, + "loss": 0.1899, + "num_input_tokens_seen": 5292528, + "step": 8095 + }, + { + "epoch": 4.245283018867925, + "grad_norm": 1.0126073360443115, + "learning_rate": 3.5606890764906603e-05, + "loss": 0.3911, + "num_input_tokens_seen": 5295984, + "step": 8100 + }, + { + "epoch": 4.2479035639413, + "grad_norm": 1.295328974723816, + "learning_rate": 3.5586178083227175e-05, + "loss": 0.3308, + "num_input_tokens_seen": 5299120, + "step": 8105 + }, + { + "epoch": 4.250524109014675, + "grad_norm": 1.3143222332000732, + "learning_rate": 3.556545654351749e-05, + "loss": 0.2988, + "num_input_tokens_seen": 5302192, + "step": 8110 + }, + { + "epoch": 4.25314465408805, + "grad_norm": 2.093186378479004, + "learning_rate": 3.554472616311638e-05, + "loss": 0.5968, + "num_input_tokens_seen": 5304880, + "step": 8115 + }, + { + "epoch": 4.255765199161425, + "grad_norm": 1.3556334972381592, + "learning_rate": 3.552398695937007e-05, + "loss": 0.3828, + "num_input_tokens_seen": 5308304, + "step": 8120 + }, + { + "epoch": 4.2583857442348005, + "grad_norm": 1.4358805418014526, + "learning_rate": 3.55032389496322e-05, + "loss": 0.3896, + "num_input_tokens_seen": 5310960, + "step": 8125 + }, + { + "epoch": 4.261006289308176, + "grad_norm": 1.5762577056884766, + "learning_rate": 3.548248215126374e-05, + "loss": 0.3293, + "num_input_tokens_seen": 5313264, + "step": 8130 + }, + { + "epoch": 4.263626834381552, + "grad_norm": 1.9536433219909668, + "learning_rate": 3.546171658163304e-05, + "loss": 0.3617, + "num_input_tokens_seen": 5315600, + "step": 8135 + }, + { + "epoch": 4.266247379454927, + "grad_norm": 0.767054557800293, + "learning_rate": 3.544094225811577e-05, + "loss": 0.2776, + "num_input_tokens_seen": 5319024, + "step": 8140 + }, + { + "epoch": 4.268867924528302, + "grad_norm": 1.2784672975540161, + "learning_rate": 3.542015919809495e-05, + "loss": 0.3646, + "num_input_tokens_seen": 5322160, + "step": 8145 + }, + { + "epoch": 4.271488469601677, + "grad_norm": 1.096287488937378, + "learning_rate": 3.539936741896088e-05, + "loss": 0.2936, + "num_input_tokens_seen": 5327152, + "step": 8150 + }, + { + "epoch": 4.274109014675052, + "grad_norm": 1.4791191816329956, + "learning_rate": 3.537856693811118e-05, + "loss": 0.2812, + "num_input_tokens_seen": 5330256, + "step": 8155 + }, + { + "epoch": 4.276729559748428, + "grad_norm": 1.8212032318115234, + "learning_rate": 3.5357757772950746e-05, + "loss": 0.3714, + "num_input_tokens_seen": 5333392, + "step": 8160 + }, + { + "epoch": 4.279350104821803, + "grad_norm": 1.2460838556289673, + "learning_rate": 3.533693994089173e-05, + "loss": 0.3219, + "num_input_tokens_seen": 5336720, + "step": 8165 + }, + { + "epoch": 4.281970649895178, + "grad_norm": 0.6985361576080322, + "learning_rate": 3.531611345935353e-05, + "loss": 0.2603, + "num_input_tokens_seen": 5339696, + "step": 8170 + }, + { + "epoch": 4.284591194968553, + "grad_norm": 1.4513005018234253, + "learning_rate": 3.529527834576282e-05, + "loss": 0.437, + "num_input_tokens_seen": 5342896, + "step": 8175 + }, + { + "epoch": 4.287211740041929, + "grad_norm": 1.4872125387191772, + "learning_rate": 3.527443461755346e-05, + "loss": 0.3296, + "num_input_tokens_seen": 5345392, + "step": 8180 + }, + { + "epoch": 4.289832285115304, + "grad_norm": 0.9531895518302917, + "learning_rate": 3.525358229216653e-05, + "loss": 0.3012, + "num_input_tokens_seen": 5349392, + "step": 8185 + }, + { + "epoch": 4.2924528301886795, + "grad_norm": 0.857496976852417, + "learning_rate": 3.52327213870503e-05, + "loss": 0.3628, + "num_input_tokens_seen": 5353008, + "step": 8190 + }, + { + "epoch": 4.295073375262055, + "grad_norm": 1.8894137144088745, + "learning_rate": 3.521185191966022e-05, + "loss": 0.5549, + "num_input_tokens_seen": 5356464, + "step": 8195 + }, + { + "epoch": 4.29769392033543, + "grad_norm": 0.8412109613418579, + "learning_rate": 3.5190973907458924e-05, + "loss": 0.288, + "num_input_tokens_seen": 5359440, + "step": 8200 + }, + { + "epoch": 4.300314465408805, + "grad_norm": 1.9809881448745728, + "learning_rate": 3.517008736791616e-05, + "loss": 0.4526, + "num_input_tokens_seen": 5361968, + "step": 8205 + }, + { + "epoch": 4.30293501048218, + "grad_norm": 1.1276721954345703, + "learning_rate": 3.514919231850885e-05, + "loss": 0.4802, + "num_input_tokens_seen": 5365744, + "step": 8210 + }, + { + "epoch": 4.305555555555555, + "grad_norm": 1.3352841138839722, + "learning_rate": 3.512828877672099e-05, + "loss": 0.2891, + "num_input_tokens_seen": 5368720, + "step": 8215 + }, + { + "epoch": 4.3081761006289305, + "grad_norm": 1.820926308631897, + "learning_rate": 3.510737676004372e-05, + "loss": 0.3169, + "num_input_tokens_seen": 5371408, + "step": 8220 + }, + { + "epoch": 4.310796645702306, + "grad_norm": 1.7642731666564941, + "learning_rate": 3.5086456285975274e-05, + "loss": 0.3895, + "num_input_tokens_seen": 5374256, + "step": 8225 + }, + { + "epoch": 4.313417190775682, + "grad_norm": 1.834460735321045, + "learning_rate": 3.5065527372020935e-05, + "loss": 0.3881, + "num_input_tokens_seen": 5376624, + "step": 8230 + }, + { + "epoch": 4.316037735849057, + "grad_norm": 1.7083576917648315, + "learning_rate": 3.504459003569306e-05, + "loss": 0.3535, + "num_input_tokens_seen": 5380048, + "step": 8235 + }, + { + "epoch": 4.318658280922432, + "grad_norm": 1.052553653717041, + "learning_rate": 3.5023644294511074e-05, + "loss": 0.2728, + "num_input_tokens_seen": 5382416, + "step": 8240 + }, + { + "epoch": 4.321278825995807, + "grad_norm": 1.09736168384552, + "learning_rate": 3.50026901660014e-05, + "loss": 0.444, + "num_input_tokens_seen": 5385520, + "step": 8245 + }, + { + "epoch": 4.323899371069182, + "grad_norm": 1.4560025930404663, + "learning_rate": 3.4981727667697497e-05, + "loss": 0.5016, + "num_input_tokens_seen": 5388368, + "step": 8250 + }, + { + "epoch": 4.326519916142558, + "grad_norm": 1.1046124696731567, + "learning_rate": 3.4960756817139825e-05, + "loss": 0.3041, + "num_input_tokens_seen": 5391920, + "step": 8255 + }, + { + "epoch": 4.329140461215933, + "grad_norm": 1.0555000305175781, + "learning_rate": 3.493977763187584e-05, + "loss": 0.245, + "num_input_tokens_seen": 5394864, + "step": 8260 + }, + { + "epoch": 4.331761006289308, + "grad_norm": 1.0201241970062256, + "learning_rate": 3.4918790129459975e-05, + "loss": 0.3309, + "num_input_tokens_seen": 5398448, + "step": 8265 + }, + { + "epoch": 4.334381551362683, + "grad_norm": 0.7888246774673462, + "learning_rate": 3.4897794327453586e-05, + "loss": 0.3096, + "num_input_tokens_seen": 5401904, + "step": 8270 + }, + { + "epoch": 4.337002096436059, + "grad_norm": 1.6580758094787598, + "learning_rate": 3.487679024342502e-05, + "loss": 0.2914, + "num_input_tokens_seen": 5404432, + "step": 8275 + }, + { + "epoch": 4.339622641509434, + "grad_norm": 1.7431737184524536, + "learning_rate": 3.4855777894949536e-05, + "loss": 0.3517, + "num_input_tokens_seen": 5407376, + "step": 8280 + }, + { + "epoch": 4.3422431865828095, + "grad_norm": 1.5073769092559814, + "learning_rate": 3.4834757299609306e-05, + "loss": 0.3441, + "num_input_tokens_seen": 5410768, + "step": 8285 + }, + { + "epoch": 4.344863731656185, + "grad_norm": 1.1210559606552124, + "learning_rate": 3.48137284749934e-05, + "loss": 0.3613, + "num_input_tokens_seen": 5414256, + "step": 8290 + }, + { + "epoch": 4.34748427672956, + "grad_norm": 2.2495765686035156, + "learning_rate": 3.479269143869777e-05, + "loss": 0.2954, + "num_input_tokens_seen": 5417296, + "step": 8295 + }, + { + "epoch": 4.350104821802935, + "grad_norm": 1.4518811702728271, + "learning_rate": 3.477164620832527e-05, + "loss": 0.3479, + "num_input_tokens_seen": 5420560, + "step": 8300 + }, + { + "epoch": 4.35272536687631, + "grad_norm": 1.6299513578414917, + "learning_rate": 3.4750592801485564e-05, + "loss": 0.321, + "num_input_tokens_seen": 5424496, + "step": 8305 + }, + { + "epoch": 4.355345911949685, + "grad_norm": 1.101489782333374, + "learning_rate": 3.47295312357952e-05, + "loss": 0.3379, + "num_input_tokens_seen": 5428944, + "step": 8310 + }, + { + "epoch": 4.3579664570230605, + "grad_norm": 0.9130959510803223, + "learning_rate": 3.4708461528877514e-05, + "loss": 0.3986, + "num_input_tokens_seen": 5432624, + "step": 8315 + }, + { + "epoch": 4.360587002096436, + "grad_norm": 1.9711322784423828, + "learning_rate": 3.468738369836269e-05, + "loss": 0.3604, + "num_input_tokens_seen": 5435152, + "step": 8320 + }, + { + "epoch": 4.363207547169811, + "grad_norm": 1.3580713272094727, + "learning_rate": 3.466629776188769e-05, + "loss": 0.335, + "num_input_tokens_seen": 5438640, + "step": 8325 + }, + { + "epoch": 4.365828092243187, + "grad_norm": 1.4655462503433228, + "learning_rate": 3.464520373709627e-05, + "loss": 0.3175, + "num_input_tokens_seen": 5441264, + "step": 8330 + }, + { + "epoch": 4.368448637316562, + "grad_norm": 0.9492835998535156, + "learning_rate": 3.462410164163893e-05, + "loss": 0.3228, + "num_input_tokens_seen": 5444752, + "step": 8335 + }, + { + "epoch": 4.371069182389937, + "grad_norm": 1.672666072845459, + "learning_rate": 3.460299149317294e-05, + "loss": 0.3391, + "num_input_tokens_seen": 5447632, + "step": 8340 + }, + { + "epoch": 4.373689727463312, + "grad_norm": 1.2024283409118652, + "learning_rate": 3.4581873309362326e-05, + "loss": 0.4394, + "num_input_tokens_seen": 5451344, + "step": 8345 + }, + { + "epoch": 4.376310272536688, + "grad_norm": 1.3129512071609497, + "learning_rate": 3.456074710787781e-05, + "loss": 0.4088, + "num_input_tokens_seen": 5454736, + "step": 8350 + }, + { + "epoch": 4.378930817610063, + "grad_norm": 0.9211092591285706, + "learning_rate": 3.453961290639683e-05, + "loss": 0.371, + "num_input_tokens_seen": 5458096, + "step": 8355 + }, + { + "epoch": 4.381551362683438, + "grad_norm": 1.2446508407592773, + "learning_rate": 3.451847072260351e-05, + "loss": 0.3587, + "num_input_tokens_seen": 5460560, + "step": 8360 + }, + { + "epoch": 4.384171907756813, + "grad_norm": 1.2896003723144531, + "learning_rate": 3.4497320574188694e-05, + "loss": 0.3185, + "num_input_tokens_seen": 5463792, + "step": 8365 + }, + { + "epoch": 4.386792452830189, + "grad_norm": 0.9192762970924377, + "learning_rate": 3.447616247884983e-05, + "loss": 0.3038, + "num_input_tokens_seen": 5467216, + "step": 8370 + }, + { + "epoch": 4.389412997903564, + "grad_norm": 0.7725765109062195, + "learning_rate": 3.445499645429107e-05, + "loss": 0.3157, + "num_input_tokens_seen": 5470320, + "step": 8375 + }, + { + "epoch": 4.3920335429769395, + "grad_norm": 0.8446463942527771, + "learning_rate": 3.443382251822315e-05, + "loss": 0.3175, + "num_input_tokens_seen": 5473232, + "step": 8380 + }, + { + "epoch": 4.394654088050315, + "grad_norm": 1.2691420316696167, + "learning_rate": 3.4412640688363475e-05, + "loss": 0.4538, + "num_input_tokens_seen": 5476432, + "step": 8385 + }, + { + "epoch": 4.39727463312369, + "grad_norm": 1.724208950996399, + "learning_rate": 3.439145098243601e-05, + "loss": 0.4003, + "num_input_tokens_seen": 5478928, + "step": 8390 + }, + { + "epoch": 4.399895178197065, + "grad_norm": 1.2046347856521606, + "learning_rate": 3.437025341817137e-05, + "loss": 0.2626, + "num_input_tokens_seen": 5481776, + "step": 8395 + }, + { + "epoch": 4.40251572327044, + "grad_norm": 1.0026038885116577, + "learning_rate": 3.434904801330667e-05, + "loss": 0.3634, + "num_input_tokens_seen": 5485008, + "step": 8400 + }, + { + "epoch": 4.405136268343815, + "grad_norm": 0.8488423228263855, + "learning_rate": 3.432783478558564e-05, + "loss": 0.3542, + "num_input_tokens_seen": 5492496, + "step": 8405 + }, + { + "epoch": 4.4077568134171905, + "grad_norm": 1.7378876209259033, + "learning_rate": 3.430661375275854e-05, + "loss": 0.3417, + "num_input_tokens_seen": 5495664, + "step": 8410 + }, + { + "epoch": 4.410377358490566, + "grad_norm": 1.4988701343536377, + "learning_rate": 3.4285384932582175e-05, + "loss": 0.3274, + "num_input_tokens_seen": 5499792, + "step": 8415 + }, + { + "epoch": 4.412997903563941, + "grad_norm": 1.0510319471359253, + "learning_rate": 3.426414834281982e-05, + "loss": 0.2937, + "num_input_tokens_seen": 5502736, + "step": 8420 + }, + { + "epoch": 4.415618448637317, + "grad_norm": 1.3640896081924438, + "learning_rate": 3.424290400124131e-05, + "loss": 0.3138, + "num_input_tokens_seen": 5505328, + "step": 8425 + }, + { + "epoch": 4.418238993710692, + "grad_norm": 1.4461265802383423, + "learning_rate": 3.422165192562293e-05, + "loss": 0.3614, + "num_input_tokens_seen": 5507952, + "step": 8430 + }, + { + "epoch": 4.420859538784067, + "grad_norm": 1.9665647745132446, + "learning_rate": 3.420039213374745e-05, + "loss": 0.2947, + "num_input_tokens_seen": 5510544, + "step": 8435 + }, + { + "epoch": 4.423480083857442, + "grad_norm": 1.182289719581604, + "learning_rate": 3.4179124643404084e-05, + "loss": 0.3996, + "num_input_tokens_seen": 5512976, + "step": 8440 + }, + { + "epoch": 4.426100628930818, + "grad_norm": 1.1866062879562378, + "learning_rate": 3.41578494723885e-05, + "loss": 0.4048, + "num_input_tokens_seen": 5516496, + "step": 8445 + }, + { + "epoch": 4.428721174004193, + "grad_norm": 0.9753671288490295, + "learning_rate": 3.4136566638502795e-05, + "loss": 0.2575, + "num_input_tokens_seen": 5520720, + "step": 8450 + }, + { + "epoch": 4.431341719077568, + "grad_norm": 0.6552973389625549, + "learning_rate": 3.4115276159555464e-05, + "loss": 0.3632, + "num_input_tokens_seen": 5524336, + "step": 8455 + }, + { + "epoch": 4.433962264150943, + "grad_norm": 1.1773169040679932, + "learning_rate": 3.409397805336142e-05, + "loss": 0.3131, + "num_input_tokens_seen": 5528112, + "step": 8460 + }, + { + "epoch": 4.436582809224318, + "grad_norm": 0.842990517616272, + "learning_rate": 3.407267233774193e-05, + "loss": 0.3659, + "num_input_tokens_seen": 5531664, + "step": 8465 + }, + { + "epoch": 4.439203354297694, + "grad_norm": 2.4101169109344482, + "learning_rate": 3.4051359030524654e-05, + "loss": 0.2857, + "num_input_tokens_seen": 5536368, + "step": 8470 + }, + { + "epoch": 4.4418238993710695, + "grad_norm": 1.2124384641647339, + "learning_rate": 3.4030038149543594e-05, + "loss": 0.3097, + "num_input_tokens_seen": 5539600, + "step": 8475 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.8002151846885681, + "learning_rate": 3.4008709712639084e-05, + "loss": 0.2807, + "num_input_tokens_seen": 5542544, + "step": 8480 + }, + { + "epoch": 4.44706498951782, + "grad_norm": 0.9082409143447876, + "learning_rate": 3.398737373765779e-05, + "loss": 0.3139, + "num_input_tokens_seen": 5546704, + "step": 8485 + }, + { + "epoch": 4.449685534591195, + "grad_norm": 1.3512462377548218, + "learning_rate": 3.396603024245267e-05, + "loss": 0.2691, + "num_input_tokens_seen": 5549520, + "step": 8490 + }, + { + "epoch": 4.45230607966457, + "grad_norm": 1.351981282234192, + "learning_rate": 3.3944679244883e-05, + "loss": 0.3446, + "num_input_tokens_seen": 5553040, + "step": 8495 + }, + { + "epoch": 4.454926624737945, + "grad_norm": 1.2784541845321655, + "learning_rate": 3.392332076281433e-05, + "loss": 0.3749, + "num_input_tokens_seen": 5556880, + "step": 8500 + }, + { + "epoch": 4.4575471698113205, + "grad_norm": 1.458040714263916, + "learning_rate": 3.390195481411842e-05, + "loss": 0.3095, + "num_input_tokens_seen": 5559760, + "step": 8505 + }, + { + "epoch": 4.460167714884696, + "grad_norm": 1.2806816101074219, + "learning_rate": 3.3880581416673366e-05, + "loss": 0.3297, + "num_input_tokens_seen": 5564688, + "step": 8510 + }, + { + "epoch": 4.462788259958071, + "grad_norm": 1.6236982345581055, + "learning_rate": 3.385920058836342e-05, + "loss": 0.4113, + "num_input_tokens_seen": 5568112, + "step": 8515 + }, + { + "epoch": 4.465408805031447, + "grad_norm": 1.2069507837295532, + "learning_rate": 3.38378123470791e-05, + "loss": 0.4451, + "num_input_tokens_seen": 5570928, + "step": 8520 + }, + { + "epoch": 4.468029350104822, + "grad_norm": 2.1279921531677246, + "learning_rate": 3.381641671071709e-05, + "loss": 0.3312, + "num_input_tokens_seen": 5574480, + "step": 8525 + }, + { + "epoch": 4.470649895178197, + "grad_norm": 2.1657121181488037, + "learning_rate": 3.379501369718031e-05, + "loss": 0.3512, + "num_input_tokens_seen": 5577296, + "step": 8530 + }, + { + "epoch": 4.473270440251572, + "grad_norm": 0.7011824250221252, + "learning_rate": 3.377360332437781e-05, + "loss": 0.2377, + "num_input_tokens_seen": 5580784, + "step": 8535 + }, + { + "epoch": 4.475890985324948, + "grad_norm": 1.0545942783355713, + "learning_rate": 3.37521856102248e-05, + "loss": 0.3218, + "num_input_tokens_seen": 5583856, + "step": 8540 + }, + { + "epoch": 4.478511530398323, + "grad_norm": 1.2680487632751465, + "learning_rate": 3.373076057264266e-05, + "loss": 0.3121, + "num_input_tokens_seen": 5586256, + "step": 8545 + }, + { + "epoch": 4.481132075471698, + "grad_norm": 1.136324405670166, + "learning_rate": 3.370932822955888e-05, + "loss": 0.3248, + "num_input_tokens_seen": 5589200, + "step": 8550 + }, + { + "epoch": 4.483752620545073, + "grad_norm": 0.9241526126861572, + "learning_rate": 3.368788859890706e-05, + "loss": 0.3942, + "num_input_tokens_seen": 5592976, + "step": 8555 + }, + { + "epoch": 4.486373165618448, + "grad_norm": 1.1523241996765137, + "learning_rate": 3.3666441698626906e-05, + "loss": 0.3631, + "num_input_tokens_seen": 5595696, + "step": 8560 + }, + { + "epoch": 4.488993710691824, + "grad_norm": 1.4129058122634888, + "learning_rate": 3.364498754666421e-05, + "loss": 0.3822, + "num_input_tokens_seen": 5598288, + "step": 8565 + }, + { + "epoch": 4.4916142557651995, + "grad_norm": 1.6284734010696411, + "learning_rate": 3.362352616097082e-05, + "loss": 0.4293, + "num_input_tokens_seen": 5601296, + "step": 8570 + }, + { + "epoch": 4.494234800838575, + "grad_norm": 1.4776948690414429, + "learning_rate": 3.360205755950464e-05, + "loss": 0.2938, + "num_input_tokens_seen": 5604752, + "step": 8575 + }, + { + "epoch": 4.49685534591195, + "grad_norm": 1.0349596738815308, + "learning_rate": 3.358058176022963e-05, + "loss": 0.416, + "num_input_tokens_seen": 5607824, + "step": 8580 + }, + { + "epoch": 4.499475890985325, + "grad_norm": 1.259002447128296, + "learning_rate": 3.355909878111574e-05, + "loss": 0.3908, + "num_input_tokens_seen": 5611248, + "step": 8585 + }, + { + "epoch": 4.5, + "eval_loss": 0.4978715479373932, + "eval_runtime": 15.9989, + "eval_samples_per_second": 53.003, + "eval_steps_per_second": 13.251, + "num_input_tokens_seen": 5611760, + "step": 8586 + }, + { + "epoch": 4.5020964360587, + "grad_norm": 0.5886470675468445, + "learning_rate": 3.3537608640138954e-05, + "loss": 0.2949, + "num_input_tokens_seen": 5614192, + "step": 8590 + }, + { + "epoch": 4.504716981132075, + "grad_norm": 1.6407116651535034, + "learning_rate": 3.351611135528125e-05, + "loss": 0.3594, + "num_input_tokens_seen": 5616944, + "step": 8595 + }, + { + "epoch": 4.5073375262054505, + "grad_norm": 1.4091613292694092, + "learning_rate": 3.349460694453056e-05, + "loss": 0.3435, + "num_input_tokens_seen": 5619344, + "step": 8600 + }, + { + "epoch": 4.509958071278826, + "grad_norm": 1.127608299255371, + "learning_rate": 3.3473095425880796e-05, + "loss": 0.2661, + "num_input_tokens_seen": 5622608, + "step": 8605 + }, + { + "epoch": 4.512578616352201, + "grad_norm": 1.5801893472671509, + "learning_rate": 3.345157681733181e-05, + "loss": 0.3636, + "num_input_tokens_seen": 5625296, + "step": 8610 + }, + { + "epoch": 4.515199161425577, + "grad_norm": 1.0047849416732788, + "learning_rate": 3.3430051136889404e-05, + "loss": 0.2645, + "num_input_tokens_seen": 5628272, + "step": 8615 + }, + { + "epoch": 4.517819706498952, + "grad_norm": 1.0874544382095337, + "learning_rate": 3.3408518402565276e-05, + "loss": 0.2989, + "num_input_tokens_seen": 5632944, + "step": 8620 + }, + { + "epoch": 4.520440251572327, + "grad_norm": 1.87656569480896, + "learning_rate": 3.338697863237703e-05, + "loss": 0.3159, + "num_input_tokens_seen": 5636016, + "step": 8625 + }, + { + "epoch": 4.523060796645702, + "grad_norm": 1.3067272901535034, + "learning_rate": 3.336543184434817e-05, + "loss": 0.272, + "num_input_tokens_seen": 5642064, + "step": 8630 + }, + { + "epoch": 4.5256813417190775, + "grad_norm": 1.3641289472579956, + "learning_rate": 3.334387805650805e-05, + "loss": 0.4194, + "num_input_tokens_seen": 5645072, + "step": 8635 + }, + { + "epoch": 4.528301886792453, + "grad_norm": 0.9832966923713684, + "learning_rate": 3.3322317286891913e-05, + "loss": 0.394, + "num_input_tokens_seen": 5648304, + "step": 8640 + }, + { + "epoch": 4.530922431865828, + "grad_norm": 1.103370189666748, + "learning_rate": 3.330074955354082e-05, + "loss": 0.4125, + "num_input_tokens_seen": 5651120, + "step": 8645 + }, + { + "epoch": 4.533542976939203, + "grad_norm": 1.1850203275680542, + "learning_rate": 3.3279174874501664e-05, + "loss": 0.2642, + "num_input_tokens_seen": 5653712, + "step": 8650 + }, + { + "epoch": 4.536163522012579, + "grad_norm": 1.113952875137329, + "learning_rate": 3.325759326782715e-05, + "loss": 0.3028, + "num_input_tokens_seen": 5656624, + "step": 8655 + }, + { + "epoch": 4.538784067085954, + "grad_norm": 1.3482344150543213, + "learning_rate": 3.323600475157578e-05, + "loss": 0.3552, + "num_input_tokens_seen": 5658960, + "step": 8660 + }, + { + "epoch": 4.5414046121593294, + "grad_norm": 1.4205938577651978, + "learning_rate": 3.321440934381184e-05, + "loss": 0.3493, + "num_input_tokens_seen": 5661872, + "step": 8665 + }, + { + "epoch": 4.544025157232705, + "grad_norm": 1.177615761756897, + "learning_rate": 3.319280706260538e-05, + "loss": 0.4055, + "num_input_tokens_seen": 5664560, + "step": 8670 + }, + { + "epoch": 4.54664570230608, + "grad_norm": 1.3683922290802002, + "learning_rate": 3.31711979260322e-05, + "loss": 0.3069, + "num_input_tokens_seen": 5667664, + "step": 8675 + }, + { + "epoch": 4.549266247379455, + "grad_norm": 1.4596214294433594, + "learning_rate": 3.3149581952173846e-05, + "loss": 0.3278, + "num_input_tokens_seen": 5670128, + "step": 8680 + }, + { + "epoch": 4.55188679245283, + "grad_norm": 1.4464242458343506, + "learning_rate": 3.312795915911757e-05, + "loss": 0.4042, + "num_input_tokens_seen": 5673296, + "step": 8685 + }, + { + "epoch": 4.554507337526205, + "grad_norm": 1.7085386514663696, + "learning_rate": 3.310632956495634e-05, + "loss": 0.4234, + "num_input_tokens_seen": 5675920, + "step": 8690 + }, + { + "epoch": 4.5571278825995805, + "grad_norm": 2.14363956451416, + "learning_rate": 3.308469318778881e-05, + "loss": 0.2866, + "num_input_tokens_seen": 5678896, + "step": 8695 + }, + { + "epoch": 4.559748427672956, + "grad_norm": 1.9729502201080322, + "learning_rate": 3.306305004571932e-05, + "loss": 0.3273, + "num_input_tokens_seen": 5681488, + "step": 8700 + }, + { + "epoch": 4.562368972746331, + "grad_norm": 2.505772352218628, + "learning_rate": 3.304140015685785e-05, + "loss": 0.3592, + "num_input_tokens_seen": 5684432, + "step": 8705 + }, + { + "epoch": 4.564989517819707, + "grad_norm": 0.8020384907722473, + "learning_rate": 3.301974353932005e-05, + "loss": 0.3266, + "num_input_tokens_seen": 5688112, + "step": 8710 + }, + { + "epoch": 4.567610062893082, + "grad_norm": 0.801102340221405, + "learning_rate": 3.2998080211227185e-05, + "loss": 0.3807, + "num_input_tokens_seen": 5690416, + "step": 8715 + }, + { + "epoch": 4.570230607966457, + "grad_norm": 1.9956717491149902, + "learning_rate": 3.297641019070613e-05, + "loss": 0.4023, + "num_input_tokens_seen": 5693744, + "step": 8720 + }, + { + "epoch": 4.572851153039832, + "grad_norm": 0.8861028552055359, + "learning_rate": 3.2954733495889376e-05, + "loss": 0.3675, + "num_input_tokens_seen": 5697776, + "step": 8725 + }, + { + "epoch": 4.5754716981132075, + "grad_norm": 1.968856930732727, + "learning_rate": 3.2933050144915e-05, + "loss": 0.5364, + "num_input_tokens_seen": 5700240, + "step": 8730 + }, + { + "epoch": 4.578092243186583, + "grad_norm": 1.6183362007141113, + "learning_rate": 3.2911360155926624e-05, + "loss": 0.3179, + "num_input_tokens_seen": 5702992, + "step": 8735 + }, + { + "epoch": 4.580712788259958, + "grad_norm": 2.64227557182312, + "learning_rate": 3.2889663547073444e-05, + "loss": 0.3097, + "num_input_tokens_seen": 5706544, + "step": 8740 + }, + { + "epoch": 4.583333333333333, + "grad_norm": 1.4033111333847046, + "learning_rate": 3.286796033651019e-05, + "loss": 0.351, + "num_input_tokens_seen": 5709328, + "step": 8745 + }, + { + "epoch": 4.585953878406709, + "grad_norm": 0.9095118641853333, + "learning_rate": 3.284625054239714e-05, + "loss": 0.3058, + "num_input_tokens_seen": 5712720, + "step": 8750 + }, + { + "epoch": 4.588574423480084, + "grad_norm": 1.431565523147583, + "learning_rate": 3.282453418290002e-05, + "loss": 0.4158, + "num_input_tokens_seen": 5715536, + "step": 8755 + }, + { + "epoch": 4.591194968553459, + "grad_norm": 1.4092451333999634, + "learning_rate": 3.28028112761901e-05, + "loss": 0.3666, + "num_input_tokens_seen": 5718352, + "step": 8760 + }, + { + "epoch": 4.593815513626835, + "grad_norm": 0.7068199515342712, + "learning_rate": 3.278108184044414e-05, + "loss": 0.3365, + "num_input_tokens_seen": 5722096, + "step": 8765 + }, + { + "epoch": 4.59643605870021, + "grad_norm": 1.4158157110214233, + "learning_rate": 3.275934589384432e-05, + "loss": 0.3223, + "num_input_tokens_seen": 5724816, + "step": 8770 + }, + { + "epoch": 4.599056603773585, + "grad_norm": 1.28522527217865, + "learning_rate": 3.273760345457828e-05, + "loss": 0.3397, + "num_input_tokens_seen": 5729040, + "step": 8775 + }, + { + "epoch": 4.60167714884696, + "grad_norm": 1.3387162685394287, + "learning_rate": 3.2715854540839106e-05, + "loss": 0.3442, + "num_input_tokens_seen": 5732464, + "step": 8780 + }, + { + "epoch": 4.604297693920335, + "grad_norm": 1.0524277687072754, + "learning_rate": 3.269409917082531e-05, + "loss": 0.2272, + "num_input_tokens_seen": 5735472, + "step": 8785 + }, + { + "epoch": 4.6069182389937104, + "grad_norm": 1.4226911067962646, + "learning_rate": 3.2672337362740765e-05, + "loss": 0.3314, + "num_input_tokens_seen": 5738896, + "step": 8790 + }, + { + "epoch": 4.609538784067086, + "grad_norm": 0.662600040435791, + "learning_rate": 3.265056913479479e-05, + "loss": 0.244, + "num_input_tokens_seen": 5742032, + "step": 8795 + }, + { + "epoch": 4.612159329140461, + "grad_norm": 0.9680163860321045, + "learning_rate": 3.262879450520201e-05, + "loss": 0.356, + "num_input_tokens_seen": 5745168, + "step": 8800 + }, + { + "epoch": 4.614779874213837, + "grad_norm": 1.6760804653167725, + "learning_rate": 3.260701349218248e-05, + "loss": 0.4321, + "num_input_tokens_seen": 5747920, + "step": 8805 + }, + { + "epoch": 4.617400419287212, + "grad_norm": 1.50858473777771, + "learning_rate": 3.258522611396151e-05, + "loss": 0.3422, + "num_input_tokens_seen": 5751120, + "step": 8810 + }, + { + "epoch": 4.620020964360587, + "grad_norm": 1.437058448791504, + "learning_rate": 3.256343238876983e-05, + "loss": 0.3479, + "num_input_tokens_seen": 5754352, + "step": 8815 + }, + { + "epoch": 4.622641509433962, + "grad_norm": 3.4102559089660645, + "learning_rate": 3.2541632334843394e-05, + "loss": 0.3195, + "num_input_tokens_seen": 5757200, + "step": 8820 + }, + { + "epoch": 4.6252620545073375, + "grad_norm": 1.5239863395690918, + "learning_rate": 3.251982597042351e-05, + "loss": 0.2471, + "num_input_tokens_seen": 5760944, + "step": 8825 + }, + { + "epoch": 4.627882599580713, + "grad_norm": 1.2239450216293335, + "learning_rate": 3.249801331375675e-05, + "loss": 0.3825, + "num_input_tokens_seen": 5764848, + "step": 8830 + }, + { + "epoch": 4.630503144654088, + "grad_norm": 1.1343847513198853, + "learning_rate": 3.2476194383094946e-05, + "loss": 0.2939, + "num_input_tokens_seen": 5767984, + "step": 8835 + }, + { + "epoch": 4.633123689727463, + "grad_norm": 1.055489182472229, + "learning_rate": 3.245436919669517e-05, + "loss": 0.2888, + "num_input_tokens_seen": 5771472, + "step": 8840 + }, + { + "epoch": 4.635744234800838, + "grad_norm": 1.8912805318832397, + "learning_rate": 3.243253777281977e-05, + "loss": 0.3354, + "num_input_tokens_seen": 5775536, + "step": 8845 + }, + { + "epoch": 4.638364779874214, + "grad_norm": 1.273618221282959, + "learning_rate": 3.241070012973625e-05, + "loss": 0.3455, + "num_input_tokens_seen": 5778704, + "step": 8850 + }, + { + "epoch": 4.640985324947589, + "grad_norm": 1.853013515472412, + "learning_rate": 3.238885628571738e-05, + "loss": 0.3446, + "num_input_tokens_seen": 5781104, + "step": 8855 + }, + { + "epoch": 4.643605870020965, + "grad_norm": 1.2935553789138794, + "learning_rate": 3.236700625904107e-05, + "loss": 0.3847, + "num_input_tokens_seen": 5784528, + "step": 8860 + }, + { + "epoch": 4.64622641509434, + "grad_norm": 1.412458896636963, + "learning_rate": 3.234515006799045e-05, + "loss": 0.5616, + "num_input_tokens_seen": 5787184, + "step": 8865 + }, + { + "epoch": 4.648846960167715, + "grad_norm": 2.170436382293701, + "learning_rate": 3.232328773085375e-05, + "loss": 0.3581, + "num_input_tokens_seen": 5790032, + "step": 8870 + }, + { + "epoch": 4.65146750524109, + "grad_norm": 1.6354900598526, + "learning_rate": 3.2301419265924395e-05, + "loss": 0.3485, + "num_input_tokens_seen": 5793104, + "step": 8875 + }, + { + "epoch": 4.654088050314465, + "grad_norm": 2.411390542984009, + "learning_rate": 3.2279544691500915e-05, + "loss": 0.3812, + "num_input_tokens_seen": 5796368, + "step": 8880 + }, + { + "epoch": 4.65670859538784, + "grad_norm": 2.055542230606079, + "learning_rate": 3.2257664025886956e-05, + "loss": 0.4427, + "num_input_tokens_seen": 5799248, + "step": 8885 + }, + { + "epoch": 4.659329140461216, + "grad_norm": 0.9750780463218689, + "learning_rate": 3.2235777287391256e-05, + "loss": 0.4003, + "num_input_tokens_seen": 5802448, + "step": 8890 + }, + { + "epoch": 4.661949685534591, + "grad_norm": 1.4513804912567139, + "learning_rate": 3.221388449432764e-05, + "loss": 0.3991, + "num_input_tokens_seen": 5805936, + "step": 8895 + }, + { + "epoch": 4.664570230607967, + "grad_norm": 1.4358595609664917, + "learning_rate": 3.219198566501499e-05, + "loss": 0.2811, + "num_input_tokens_seen": 5809744, + "step": 8900 + }, + { + "epoch": 4.667190775681342, + "grad_norm": 1.1983482837677002, + "learning_rate": 3.217008081777726e-05, + "loss": 0.4242, + "num_input_tokens_seen": 5812656, + "step": 8905 + }, + { + "epoch": 4.669811320754717, + "grad_norm": 2.4946820735931396, + "learning_rate": 3.214816997094341e-05, + "loss": 0.456, + "num_input_tokens_seen": 5816048, + "step": 8910 + }, + { + "epoch": 4.672431865828092, + "grad_norm": 1.0039724111557007, + "learning_rate": 3.2126253142847454e-05, + "loss": 0.4629, + "num_input_tokens_seen": 5820080, + "step": 8915 + }, + { + "epoch": 4.6750524109014675, + "grad_norm": 0.8438076972961426, + "learning_rate": 3.2104330351828374e-05, + "loss": 0.3851, + "num_input_tokens_seen": 5823344, + "step": 8920 + }, + { + "epoch": 4.677672955974843, + "grad_norm": 1.174633264541626, + "learning_rate": 3.208240161623017e-05, + "loss": 0.3377, + "num_input_tokens_seen": 5826448, + "step": 8925 + }, + { + "epoch": 4.680293501048218, + "grad_norm": 2.7194912433624268, + "learning_rate": 3.20604669544018e-05, + "loss": 0.307, + "num_input_tokens_seen": 5830000, + "step": 8930 + }, + { + "epoch": 4.682914046121593, + "grad_norm": 1.4555424451828003, + "learning_rate": 3.2038526384697204e-05, + "loss": 0.2679, + "num_input_tokens_seen": 5832464, + "step": 8935 + }, + { + "epoch": 4.685534591194968, + "grad_norm": 1.1744908094406128, + "learning_rate": 3.201657992547523e-05, + "loss": 0.3293, + "num_input_tokens_seen": 5835568, + "step": 8940 + }, + { + "epoch": 4.688155136268344, + "grad_norm": 0.9478568434715271, + "learning_rate": 3.1994627595099674e-05, + "loss": 0.3312, + "num_input_tokens_seen": 5841360, + "step": 8945 + }, + { + "epoch": 4.690775681341719, + "grad_norm": 1.3964036703109741, + "learning_rate": 3.1972669411939256e-05, + "loss": 0.2746, + "num_input_tokens_seen": 5844240, + "step": 8950 + }, + { + "epoch": 4.693396226415095, + "grad_norm": 1.3589837551116943, + "learning_rate": 3.195070539436757e-05, + "loss": 0.2707, + "num_input_tokens_seen": 5848144, + "step": 8955 + }, + { + "epoch": 4.69601677148847, + "grad_norm": 1.5758486986160278, + "learning_rate": 3.19287355607631e-05, + "loss": 0.4083, + "num_input_tokens_seen": 5850800, + "step": 8960 + }, + { + "epoch": 4.698637316561845, + "grad_norm": 1.1244548559188843, + "learning_rate": 3.190675992950921e-05, + "loss": 0.3494, + "num_input_tokens_seen": 5853712, + "step": 8965 + }, + { + "epoch": 4.70125786163522, + "grad_norm": 1.3469127416610718, + "learning_rate": 3.18847785189941e-05, + "loss": 0.3181, + "num_input_tokens_seen": 5856400, + "step": 8970 + }, + { + "epoch": 4.703878406708595, + "grad_norm": 1.560512661933899, + "learning_rate": 3.186279134761081e-05, + "loss": 0.3822, + "num_input_tokens_seen": 5859760, + "step": 8975 + }, + { + "epoch": 4.70649895178197, + "grad_norm": 0.9483230710029602, + "learning_rate": 3.18407984337572e-05, + "loss": 0.2818, + "num_input_tokens_seen": 5862672, + "step": 8980 + }, + { + "epoch": 4.709119496855346, + "grad_norm": 1.3618714809417725, + "learning_rate": 3.181879979583593e-05, + "loss": 0.3478, + "num_input_tokens_seen": 5866288, + "step": 8985 + }, + { + "epoch": 4.711740041928721, + "grad_norm": 1.1943814754486084, + "learning_rate": 3.179679545225447e-05, + "loss": 0.4217, + "num_input_tokens_seen": 5869424, + "step": 8990 + }, + { + "epoch": 4.714360587002097, + "grad_norm": 0.6716363430023193, + "learning_rate": 3.177478542142503e-05, + "loss": 0.2472, + "num_input_tokens_seen": 5872240, + "step": 8995 + }, + { + "epoch": 4.716981132075472, + "grad_norm": 1.4202327728271484, + "learning_rate": 3.175276972176462e-05, + "loss": 0.2727, + "num_input_tokens_seen": 5874672, + "step": 9000 + }, + { + "epoch": 4.719601677148847, + "grad_norm": 0.9171267151832581, + "learning_rate": 3.173074837169495e-05, + "loss": 0.2862, + "num_input_tokens_seen": 5878544, + "step": 9005 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 1.40464186668396, + "learning_rate": 3.1708721389642495e-05, + "loss": 0.3991, + "num_input_tokens_seen": 5882096, + "step": 9010 + }, + { + "epoch": 4.7248427672955975, + "grad_norm": 1.3607922792434692, + "learning_rate": 3.1686688794038436e-05, + "loss": 0.3624, + "num_input_tokens_seen": 5886512, + "step": 9015 + }, + { + "epoch": 4.727463312368973, + "grad_norm": 1.2498103380203247, + "learning_rate": 3.1664650603318616e-05, + "loss": 0.284, + "num_input_tokens_seen": 5889648, + "step": 9020 + }, + { + "epoch": 4.730083857442348, + "grad_norm": 1.4802837371826172, + "learning_rate": 3.1642606835923606e-05, + "loss": 0.3164, + "num_input_tokens_seen": 5892752, + "step": 9025 + }, + { + "epoch": 4.732704402515723, + "grad_norm": 0.9151492118835449, + "learning_rate": 3.1620557510298607e-05, + "loss": 0.4274, + "num_input_tokens_seen": 5896112, + "step": 9030 + }, + { + "epoch": 4.735324947589098, + "grad_norm": 1.162420630455017, + "learning_rate": 3.159850264489351e-05, + "loss": 0.3237, + "num_input_tokens_seen": 5899440, + "step": 9035 + }, + { + "epoch": 4.737945492662474, + "grad_norm": 1.59941565990448, + "learning_rate": 3.157644225816281e-05, + "loss": 0.4401, + "num_input_tokens_seen": 5902096, + "step": 9040 + }, + { + "epoch": 4.740566037735849, + "grad_norm": 1.116262674331665, + "learning_rate": 3.1554376368565616e-05, + "loss": 0.3314, + "num_input_tokens_seen": 5904880, + "step": 9045 + }, + { + "epoch": 4.743186582809225, + "grad_norm": 1.1998831033706665, + "learning_rate": 3.153230499456568e-05, + "loss": 0.3935, + "num_input_tokens_seen": 5907632, + "step": 9050 + }, + { + "epoch": 4.7458071278826, + "grad_norm": 1.9836959838867188, + "learning_rate": 3.15102281546313e-05, + "loss": 0.2938, + "num_input_tokens_seen": 5911120, + "step": 9055 + }, + { + "epoch": 4.748427672955975, + "grad_norm": 0.8326165080070496, + "learning_rate": 3.148814586723537e-05, + "loss": 0.3014, + "num_input_tokens_seen": 5914928, + "step": 9060 + }, + { + "epoch": 4.75104821802935, + "grad_norm": 1.0526385307312012, + "learning_rate": 3.146605815085536e-05, + "loss": 0.322, + "num_input_tokens_seen": 5918192, + "step": 9065 + }, + { + "epoch": 4.753668763102725, + "grad_norm": 1.1450408697128296, + "learning_rate": 3.1443965023973245e-05, + "loss": 0.2471, + "num_input_tokens_seen": 5921232, + "step": 9070 + }, + { + "epoch": 4.7562893081761, + "grad_norm": 1.183256983757019, + "learning_rate": 3.142186650507554e-05, + "loss": 0.2807, + "num_input_tokens_seen": 5925040, + "step": 9075 + }, + { + "epoch": 4.758909853249476, + "grad_norm": 1.6830798387527466, + "learning_rate": 3.1399762612653286e-05, + "loss": 0.3808, + "num_input_tokens_seen": 5927376, + "step": 9080 + }, + { + "epoch": 4.761530398322851, + "grad_norm": 0.8592130541801453, + "learning_rate": 3.137765336520201e-05, + "loss": 0.3265, + "num_input_tokens_seen": 5930576, + "step": 9085 + }, + { + "epoch": 4.764150943396227, + "grad_norm": 1.5636258125305176, + "learning_rate": 3.1355538781221705e-05, + "loss": 0.283, + "num_input_tokens_seen": 5933328, + "step": 9090 + }, + { + "epoch": 4.766771488469602, + "grad_norm": 0.770348846912384, + "learning_rate": 3.133341887921687e-05, + "loss": 0.3503, + "num_input_tokens_seen": 5936208, + "step": 9095 + }, + { + "epoch": 4.769392033542977, + "grad_norm": 1.145006537437439, + "learning_rate": 3.1311293677696404e-05, + "loss": 0.4296, + "num_input_tokens_seen": 5938832, + "step": 9100 + }, + { + "epoch": 4.772012578616352, + "grad_norm": 1.0398402214050293, + "learning_rate": 3.1289163195173695e-05, + "loss": 0.2595, + "num_input_tokens_seen": 5942128, + "step": 9105 + }, + { + "epoch": 4.7746331236897275, + "grad_norm": 0.9423107504844666, + "learning_rate": 3.126702745016648e-05, + "loss": 0.2159, + "num_input_tokens_seen": 5945648, + "step": 9110 + }, + { + "epoch": 4.777253668763103, + "grad_norm": 1.1239264011383057, + "learning_rate": 3.1244886461196976e-05, + "loss": 0.2893, + "num_input_tokens_seen": 5950352, + "step": 9115 + }, + { + "epoch": 4.779874213836478, + "grad_norm": 1.99076509475708, + "learning_rate": 3.1222740246791734e-05, + "loss": 0.4452, + "num_input_tokens_seen": 5953488, + "step": 9120 + }, + { + "epoch": 4.782494758909853, + "grad_norm": 1.260117769241333, + "learning_rate": 3.12005888254817e-05, + "loss": 0.3224, + "num_input_tokens_seen": 5956784, + "step": 9125 + }, + { + "epoch": 4.785115303983228, + "grad_norm": 1.5071136951446533, + "learning_rate": 3.1178432215802155e-05, + "loss": 0.3741, + "num_input_tokens_seen": 5960080, + "step": 9130 + }, + { + "epoch": 4.787735849056604, + "grad_norm": 1.2131491899490356, + "learning_rate": 3.115627043629277e-05, + "loss": 0.2644, + "num_input_tokens_seen": 5962864, + "step": 9135 + }, + { + "epoch": 4.790356394129979, + "grad_norm": 0.9818352460861206, + "learning_rate": 3.113410350549748e-05, + "loss": 0.3152, + "num_input_tokens_seen": 5966000, + "step": 9140 + }, + { + "epoch": 4.7929769392033545, + "grad_norm": 1.0544987916946411, + "learning_rate": 3.111193144196457e-05, + "loss": 0.3601, + "num_input_tokens_seen": 5969168, + "step": 9145 + }, + { + "epoch": 4.79559748427673, + "grad_norm": 0.9292876124382019, + "learning_rate": 3.1089754264246615e-05, + "loss": 0.3912, + "num_input_tokens_seen": 5972720, + "step": 9150 + }, + { + "epoch": 4.798218029350105, + "grad_norm": 0.8916581869125366, + "learning_rate": 3.106757199090046e-05, + "loss": 0.2402, + "num_input_tokens_seen": 5975760, + "step": 9155 + }, + { + "epoch": 4.80083857442348, + "grad_norm": 1.5525586605072021, + "learning_rate": 3.104538464048721e-05, + "loss": 0.2622, + "num_input_tokens_seen": 5979120, + "step": 9160 + }, + { + "epoch": 4.803459119496855, + "grad_norm": 1.1967064142227173, + "learning_rate": 3.102319223157225e-05, + "loss": 0.3829, + "num_input_tokens_seen": 5983376, + "step": 9165 + }, + { + "epoch": 4.80607966457023, + "grad_norm": 1.5103479623794556, + "learning_rate": 3.100099478272515e-05, + "loss": 0.3926, + "num_input_tokens_seen": 5986416, + "step": 9170 + }, + { + "epoch": 4.808700209643606, + "grad_norm": 2.0881009101867676, + "learning_rate": 3.097879231251973e-05, + "loss": 0.2633, + "num_input_tokens_seen": 5989136, + "step": 9175 + }, + { + "epoch": 4.811320754716981, + "grad_norm": 1.5257418155670166, + "learning_rate": 3.0956584839534006e-05, + "loss": 0.4643, + "num_input_tokens_seen": 5992112, + "step": 9180 + }, + { + "epoch": 4.813941299790356, + "grad_norm": 1.6893008947372437, + "learning_rate": 3.093437238235018e-05, + "loss": 0.2704, + "num_input_tokens_seen": 5995472, + "step": 9185 + }, + { + "epoch": 4.816561844863732, + "grad_norm": 0.8827978372573853, + "learning_rate": 3.0912154959554606e-05, + "loss": 0.292, + "num_input_tokens_seen": 5998736, + "step": 9190 + }, + { + "epoch": 4.819182389937107, + "grad_norm": 1.3918213844299316, + "learning_rate": 3.088993258973782e-05, + "loss": 0.4749, + "num_input_tokens_seen": 6001552, + "step": 9195 + }, + { + "epoch": 4.821802935010482, + "grad_norm": 1.686590552330017, + "learning_rate": 3.0867705291494486e-05, + "loss": 0.3701, + "num_input_tokens_seen": 6005520, + "step": 9200 + }, + { + "epoch": 4.8244234800838575, + "grad_norm": 1.486477255821228, + "learning_rate": 3.0845473083423395e-05, + "loss": 0.3237, + "num_input_tokens_seen": 6009200, + "step": 9205 + }, + { + "epoch": 4.827044025157233, + "grad_norm": 1.5309555530548096, + "learning_rate": 3.082323598412743e-05, + "loss": 0.3243, + "num_input_tokens_seen": 6012912, + "step": 9210 + }, + { + "epoch": 4.829664570230608, + "grad_norm": 1.2555326223373413, + "learning_rate": 3.080099401221359e-05, + "loss": 0.4159, + "num_input_tokens_seen": 6017744, + "step": 9215 + }, + { + "epoch": 4.832285115303983, + "grad_norm": 1.6810495853424072, + "learning_rate": 3.0778747186292936e-05, + "loss": 0.3233, + "num_input_tokens_seen": 6020624, + "step": 9220 + }, + { + "epoch": 4.834905660377358, + "grad_norm": 1.1008107662200928, + "learning_rate": 3.075649552498061e-05, + "loss": 0.3336, + "num_input_tokens_seen": 6023312, + "step": 9225 + }, + { + "epoch": 4.837526205450734, + "grad_norm": 1.9708564281463623, + "learning_rate": 3.073423904689577e-05, + "loss": 0.3372, + "num_input_tokens_seen": 6026032, + "step": 9230 + }, + { + "epoch": 4.840146750524109, + "grad_norm": 1.382580041885376, + "learning_rate": 3.071197777066162e-05, + "loss": 0.3056, + "num_input_tokens_seen": 6029872, + "step": 9235 + }, + { + "epoch": 4.8427672955974845, + "grad_norm": 0.9332600235939026, + "learning_rate": 3.068971171490539e-05, + "loss": 0.3415, + "num_input_tokens_seen": 6033232, + "step": 9240 + }, + { + "epoch": 4.84538784067086, + "grad_norm": 2.0201618671417236, + "learning_rate": 3.066744089825829e-05, + "loss": 0.4773, + "num_input_tokens_seen": 6035888, + "step": 9245 + }, + { + "epoch": 4.848008385744235, + "grad_norm": 0.85897296667099, + "learning_rate": 3.064516533935553e-05, + "loss": 0.3423, + "num_input_tokens_seen": 6039408, + "step": 9250 + }, + { + "epoch": 4.85062893081761, + "grad_norm": 0.9874672293663025, + "learning_rate": 3.062288505683626e-05, + "loss": 0.419, + "num_input_tokens_seen": 6042736, + "step": 9255 + }, + { + "epoch": 4.853249475890985, + "grad_norm": 1.0776275396347046, + "learning_rate": 3.060060006934363e-05, + "loss": 0.3399, + "num_input_tokens_seen": 6045296, + "step": 9260 + }, + { + "epoch": 4.85587002096436, + "grad_norm": 1.9823976755142212, + "learning_rate": 3.057831039552469e-05, + "loss": 0.3364, + "num_input_tokens_seen": 6047920, + "step": 9265 + }, + { + "epoch": 4.8584905660377355, + "grad_norm": 0.8779424428939819, + "learning_rate": 3.0556016054030416e-05, + "loss": 0.2573, + "num_input_tokens_seen": 6052432, + "step": 9270 + }, + { + "epoch": 4.861111111111111, + "grad_norm": 2.5187621116638184, + "learning_rate": 3.053371706351569e-05, + "loss": 0.2968, + "num_input_tokens_seen": 6055472, + "step": 9275 + }, + { + "epoch": 4.863731656184486, + "grad_norm": 1.9689977169036865, + "learning_rate": 3.0511413442639296e-05, + "loss": 0.4363, + "num_input_tokens_seen": 6059184, + "step": 9280 + }, + { + "epoch": 4.866352201257862, + "grad_norm": 1.4931344985961914, + "learning_rate": 3.048910521006389e-05, + "loss": 0.4224, + "num_input_tokens_seen": 6062192, + "step": 9285 + }, + { + "epoch": 4.868972746331237, + "grad_norm": 1.555403232574463, + "learning_rate": 3.046679238445598e-05, + "loss": 0.3017, + "num_input_tokens_seen": 6065264, + "step": 9290 + }, + { + "epoch": 4.871593291404612, + "grad_norm": 1.066077709197998, + "learning_rate": 3.0444474984485905e-05, + "loss": 0.4291, + "num_input_tokens_seen": 6069008, + "step": 9295 + }, + { + "epoch": 4.8742138364779874, + "grad_norm": 1.8031880855560303, + "learning_rate": 3.042215302882786e-05, + "loss": 0.286, + "num_input_tokens_seen": 6072624, + "step": 9300 + }, + { + "epoch": 4.876834381551363, + "grad_norm": 1.2062236070632935, + "learning_rate": 3.0399826536159836e-05, + "loss": 0.4268, + "num_input_tokens_seen": 6075280, + "step": 9305 + }, + { + "epoch": 4.879454926624738, + "grad_norm": 1.7205119132995605, + "learning_rate": 3.0377495525163624e-05, + "loss": 0.3797, + "num_input_tokens_seen": 6078448, + "step": 9310 + }, + { + "epoch": 4.882075471698113, + "grad_norm": 1.4781047105789185, + "learning_rate": 3.0355160014524786e-05, + "loss": 0.1847, + "num_input_tokens_seen": 6081008, + "step": 9315 + }, + { + "epoch": 4.884696016771488, + "grad_norm": 0.8573052287101746, + "learning_rate": 3.033282002293266e-05, + "loss": 0.3633, + "num_input_tokens_seen": 6085296, + "step": 9320 + }, + { + "epoch": 4.887316561844864, + "grad_norm": 1.0943922996520996, + "learning_rate": 3.0310475569080345e-05, + "loss": 0.3859, + "num_input_tokens_seen": 6087664, + "step": 9325 + }, + { + "epoch": 4.889937106918239, + "grad_norm": 1.9573922157287598, + "learning_rate": 3.0288126671664628e-05, + "loss": 0.358, + "num_input_tokens_seen": 6090832, + "step": 9330 + }, + { + "epoch": 4.8925576519916145, + "grad_norm": 1.2056328058242798, + "learning_rate": 3.0265773349386078e-05, + "loss": 0.3863, + "num_input_tokens_seen": 6093616, + "step": 9335 + }, + { + "epoch": 4.89517819706499, + "grad_norm": 1.115019679069519, + "learning_rate": 3.024341562094891e-05, + "loss": 0.2675, + "num_input_tokens_seen": 6096592, + "step": 9340 + }, + { + "epoch": 4.897798742138365, + "grad_norm": 0.8647496700286865, + "learning_rate": 3.0221053505061063e-05, + "loss": 0.2423, + "num_input_tokens_seen": 6100144, + "step": 9345 + }, + { + "epoch": 4.90041928721174, + "grad_norm": 1.1667486429214478, + "learning_rate": 3.0198687020434142e-05, + "loss": 0.3765, + "num_input_tokens_seen": 6103536, + "step": 9350 + }, + { + "epoch": 4.903039832285115, + "grad_norm": 1.1147897243499756, + "learning_rate": 3.0176316185783383e-05, + "loss": 0.3476, + "num_input_tokens_seen": 6107408, + "step": 9355 + }, + { + "epoch": 4.90566037735849, + "grad_norm": 1.8014243841171265, + "learning_rate": 3.015394101982768e-05, + "loss": 0.3561, + "num_input_tokens_seen": 6110480, + "step": 9360 + }, + { + "epoch": 4.9082809224318655, + "grad_norm": 2.5367941856384277, + "learning_rate": 3.013156154128955e-05, + "loss": 0.2804, + "num_input_tokens_seen": 6113488, + "step": 9365 + }, + { + "epoch": 4.910901467505241, + "grad_norm": 1.5987217426300049, + "learning_rate": 3.010917776889513e-05, + "loss": 0.3606, + "num_input_tokens_seen": 6117648, + "step": 9370 + }, + { + "epoch": 4.913522012578616, + "grad_norm": 1.7719882726669312, + "learning_rate": 3.0086789721374137e-05, + "loss": 0.3381, + "num_input_tokens_seen": 6120592, + "step": 9375 + }, + { + "epoch": 4.916142557651992, + "grad_norm": 1.3600304126739502, + "learning_rate": 3.006439741745985e-05, + "loss": 0.384, + "num_input_tokens_seen": 6124400, + "step": 9380 + }, + { + "epoch": 4.918763102725367, + "grad_norm": 1.1715763807296753, + "learning_rate": 3.004200087588914e-05, + "loss": 0.3637, + "num_input_tokens_seen": 6128400, + "step": 9385 + }, + { + "epoch": 4.921383647798742, + "grad_norm": 1.4236778020858765, + "learning_rate": 3.00196001154024e-05, + "loss": 0.2981, + "num_input_tokens_seen": 6131536, + "step": 9390 + }, + { + "epoch": 4.924004192872117, + "grad_norm": 1.0583385229110718, + "learning_rate": 2.999719515474358e-05, + "loss": 0.2396, + "num_input_tokens_seen": 6135280, + "step": 9395 + }, + { + "epoch": 4.926624737945493, + "grad_norm": 1.8990200757980347, + "learning_rate": 2.997478601266011e-05, + "loss": 0.3338, + "num_input_tokens_seen": 6137680, + "step": 9400 + }, + { + "epoch": 4.929245283018868, + "grad_norm": 1.3464423418045044, + "learning_rate": 2.995237270790295e-05, + "loss": 0.5502, + "num_input_tokens_seen": 6141264, + "step": 9405 + }, + { + "epoch": 4.931865828092243, + "grad_norm": 1.2551518678665161, + "learning_rate": 2.9929955259226515e-05, + "loss": 0.3084, + "num_input_tokens_seen": 6145104, + "step": 9410 + }, + { + "epoch": 4.934486373165618, + "grad_norm": 0.9798745512962341, + "learning_rate": 2.990753368538872e-05, + "loss": 0.3205, + "num_input_tokens_seen": 6148976, + "step": 9415 + }, + { + "epoch": 4.937106918238994, + "grad_norm": 1.7851706743240356, + "learning_rate": 2.9885108005150897e-05, + "loss": 0.4429, + "num_input_tokens_seen": 6152560, + "step": 9420 + }, + { + "epoch": 4.939727463312369, + "grad_norm": 1.949576735496521, + "learning_rate": 2.986267823727784e-05, + "loss": 0.3645, + "num_input_tokens_seen": 6155024, + "step": 9425 + }, + { + "epoch": 4.9423480083857445, + "grad_norm": 0.7821433544158936, + "learning_rate": 2.9840244400537754e-05, + "loss": 0.3147, + "num_input_tokens_seen": 6157584, + "step": 9430 + }, + { + "epoch": 4.94496855345912, + "grad_norm": 1.3693190813064575, + "learning_rate": 2.9817806513702244e-05, + "loss": 0.3636, + "num_input_tokens_seen": 6160240, + "step": 9435 + }, + { + "epoch": 4.947589098532495, + "grad_norm": 0.9847647547721863, + "learning_rate": 2.9795364595546315e-05, + "loss": 0.3881, + "num_input_tokens_seen": 6163696, + "step": 9440 + }, + { + "epoch": 4.95020964360587, + "grad_norm": 1.3199070692062378, + "learning_rate": 2.977291866484833e-05, + "loss": 0.2752, + "num_input_tokens_seen": 6168880, + "step": 9445 + }, + { + "epoch": 4.952830188679245, + "grad_norm": 1.7151103019714355, + "learning_rate": 2.975046874039003e-05, + "loss": 0.3489, + "num_input_tokens_seen": 6172208, + "step": 9450 + }, + { + "epoch": 4.95545073375262, + "grad_norm": 1.162812352180481, + "learning_rate": 2.9728014840956488e-05, + "loss": 0.277, + "num_input_tokens_seen": 6174928, + "step": 9455 + }, + { + "epoch": 4.9580712788259955, + "grad_norm": 1.1788779497146606, + "learning_rate": 2.9705556985336086e-05, + "loss": 0.3704, + "num_input_tokens_seen": 6178992, + "step": 9460 + }, + { + "epoch": 4.960691823899371, + "grad_norm": 1.4541935920715332, + "learning_rate": 2.968309519232053e-05, + "loss": 0.3301, + "num_input_tokens_seen": 6182128, + "step": 9465 + }, + { + "epoch": 4.963312368972746, + "grad_norm": 1.6641051769256592, + "learning_rate": 2.966062948070485e-05, + "loss": 0.295, + "num_input_tokens_seen": 6185328, + "step": 9470 + }, + { + "epoch": 4.965932914046122, + "grad_norm": 0.941203236579895, + "learning_rate": 2.9638159869287303e-05, + "loss": 0.3622, + "num_input_tokens_seen": 6188560, + "step": 9475 + }, + { + "epoch": 4.968553459119497, + "grad_norm": 0.7833370566368103, + "learning_rate": 2.9615686376869434e-05, + "loss": 0.42, + "num_input_tokens_seen": 6193136, + "step": 9480 + }, + { + "epoch": 4.971174004192872, + "grad_norm": 1.3899377584457397, + "learning_rate": 2.9593209022256046e-05, + "loss": 0.5098, + "num_input_tokens_seen": 6196816, + "step": 9485 + }, + { + "epoch": 4.973794549266247, + "grad_norm": 1.7399433851242065, + "learning_rate": 2.9570727824255163e-05, + "loss": 0.3441, + "num_input_tokens_seen": 6200272, + "step": 9490 + }, + { + "epoch": 4.976415094339623, + "grad_norm": 2.4759671688079834, + "learning_rate": 2.954824280167801e-05, + "loss": 0.27, + "num_input_tokens_seen": 6203216, + "step": 9495 + }, + { + "epoch": 4.979035639412998, + "grad_norm": 0.6462287306785583, + "learning_rate": 2.9525753973339044e-05, + "loss": 0.2726, + "num_input_tokens_seen": 6206736, + "step": 9500 + }, + { + "epoch": 4.981656184486373, + "grad_norm": 1.0060827732086182, + "learning_rate": 2.9503261358055873e-05, + "loss": 0.303, + "num_input_tokens_seen": 6209936, + "step": 9505 + }, + { + "epoch": 4.984276729559748, + "grad_norm": 0.9829003810882568, + "learning_rate": 2.9480764974649305e-05, + "loss": 0.3129, + "num_input_tokens_seen": 6213424, + "step": 9510 + }, + { + "epoch": 4.986897274633124, + "grad_norm": 1.6895394325256348, + "learning_rate": 2.9458264841943272e-05, + "loss": 0.2891, + "num_input_tokens_seen": 6217360, + "step": 9515 + }, + { + "epoch": 4.989517819706499, + "grad_norm": 2.145308494567871, + "learning_rate": 2.9435760978764874e-05, + "loss": 0.3811, + "num_input_tokens_seen": 6219984, + "step": 9520 + }, + { + "epoch": 4.9921383647798745, + "grad_norm": 0.9384110569953918, + "learning_rate": 2.9413253403944297e-05, + "loss": 0.2717, + "num_input_tokens_seen": 6223664, + "step": 9525 + }, + { + "epoch": 4.99475890985325, + "grad_norm": 1.0120168924331665, + "learning_rate": 2.9390742136314863e-05, + "loss": 0.3395, + "num_input_tokens_seen": 6227440, + "step": 9530 + }, + { + "epoch": 4.997379454926625, + "grad_norm": 1.3780900239944458, + "learning_rate": 2.9368227194712978e-05, + "loss": 0.2633, + "num_input_tokens_seen": 6230608, + "step": 9535 + }, + { + "epoch": 5.0, + "grad_norm": 1.9539482593536377, + "learning_rate": 2.9345708597978106e-05, + "loss": 0.3861, + "num_input_tokens_seen": 6233920, + "step": 9540 + }, + { + "epoch": 5.0, + "eval_loss": 0.5026046633720398, + "eval_runtime": 16.0017, + "eval_samples_per_second": 52.995, + "eval_steps_per_second": 13.249, + "num_input_tokens_seen": 6233920, + "step": 9540 + }, + { + "epoch": 5.002620545073375, + "grad_norm": 1.4936861991882324, + "learning_rate": 2.932318636495278e-05, + "loss": 0.2544, + "num_input_tokens_seen": 6237312, + "step": 9545 + }, + { + "epoch": 5.00524109014675, + "grad_norm": 1.1564022302627563, + "learning_rate": 2.930066051448258e-05, + "loss": 0.2807, + "num_input_tokens_seen": 6241344, + "step": 9550 + }, + { + "epoch": 5.0078616352201255, + "grad_norm": 1.1891398429870605, + "learning_rate": 2.927813106541611e-05, + "loss": 0.3589, + "num_input_tokens_seen": 6244064, + "step": 9555 + }, + { + "epoch": 5.010482180293501, + "grad_norm": 1.0623530149459839, + "learning_rate": 2.9255598036604982e-05, + "loss": 0.4144, + "num_input_tokens_seen": 6247008, + "step": 9560 + }, + { + "epoch": 5.013102725366877, + "grad_norm": 0.9752365946769714, + "learning_rate": 2.92330614469038e-05, + "loss": 0.3816, + "num_input_tokens_seen": 6250016, + "step": 9565 + }, + { + "epoch": 5.015723270440252, + "grad_norm": 0.9480928778648376, + "learning_rate": 2.921052131517016e-05, + "loss": 0.4039, + "num_input_tokens_seen": 6253504, + "step": 9570 + }, + { + "epoch": 5.018343815513627, + "grad_norm": 0.7977662682533264, + "learning_rate": 2.9187977660264615e-05, + "loss": 0.2994, + "num_input_tokens_seen": 6256256, + "step": 9575 + }, + { + "epoch": 5.020964360587002, + "grad_norm": 1.1466612815856934, + "learning_rate": 2.9165430501050657e-05, + "loss": 0.4472, + "num_input_tokens_seen": 6260160, + "step": 9580 + }, + { + "epoch": 5.023584905660377, + "grad_norm": 1.2105783224105835, + "learning_rate": 2.9142879856394732e-05, + "loss": 0.3107, + "num_input_tokens_seen": 6264032, + "step": 9585 + }, + { + "epoch": 5.026205450733753, + "grad_norm": 0.7917044162750244, + "learning_rate": 2.9120325745166178e-05, + "loss": 0.3168, + "num_input_tokens_seen": 6267552, + "step": 9590 + }, + { + "epoch": 5.028825995807128, + "grad_norm": 0.9196416735649109, + "learning_rate": 2.909776818623725e-05, + "loss": 0.3154, + "num_input_tokens_seen": 6270272, + "step": 9595 + }, + { + "epoch": 5.031446540880503, + "grad_norm": 0.9726985096931458, + "learning_rate": 2.9075207198483084e-05, + "loss": 0.1491, + "num_input_tokens_seen": 6274624, + "step": 9600 + }, + { + "epoch": 5.034067085953878, + "grad_norm": 1.19660484790802, + "learning_rate": 2.905264280078168e-05, + "loss": 0.3013, + "num_input_tokens_seen": 6277728, + "step": 9605 + }, + { + "epoch": 5.036687631027253, + "grad_norm": 2.5044615268707275, + "learning_rate": 2.9030075012013902e-05, + "loss": 0.253, + "num_input_tokens_seen": 6281024, + "step": 9610 + }, + { + "epoch": 5.039308176100629, + "grad_norm": 1.8291192054748535, + "learning_rate": 2.9007503851063433e-05, + "loss": 0.376, + "num_input_tokens_seen": 6284576, + "step": 9615 + }, + { + "epoch": 5.0419287211740045, + "grad_norm": 2.2918877601623535, + "learning_rate": 2.8984929336816807e-05, + "loss": 0.3543, + "num_input_tokens_seen": 6288096, + "step": 9620 + }, + { + "epoch": 5.04454926624738, + "grad_norm": 1.5902843475341797, + "learning_rate": 2.896235148816333e-05, + "loss": 0.3326, + "num_input_tokens_seen": 6292128, + "step": 9625 + }, + { + "epoch": 5.047169811320755, + "grad_norm": 1.5539497137069702, + "learning_rate": 2.893977032399512e-05, + "loss": 0.2955, + "num_input_tokens_seen": 6298912, + "step": 9630 + }, + { + "epoch": 5.04979035639413, + "grad_norm": 1.5790624618530273, + "learning_rate": 2.8917185863207062e-05, + "loss": 0.2999, + "num_input_tokens_seen": 6301536, + "step": 9635 + }, + { + "epoch": 5.052410901467505, + "grad_norm": 1.2224384546279907, + "learning_rate": 2.889459812469681e-05, + "loss": 0.2689, + "num_input_tokens_seen": 6303936, + "step": 9640 + }, + { + "epoch": 5.05503144654088, + "grad_norm": 1.731372356414795, + "learning_rate": 2.8872007127364746e-05, + "loss": 0.2859, + "num_input_tokens_seen": 6306880, + "step": 9645 + }, + { + "epoch": 5.0576519916142555, + "grad_norm": 1.320493459701538, + "learning_rate": 2.884941289011398e-05, + "loss": 0.2948, + "num_input_tokens_seen": 6309920, + "step": 9650 + }, + { + "epoch": 5.060272536687631, + "grad_norm": 1.0657271146774292, + "learning_rate": 2.882681543185034e-05, + "loss": 0.2665, + "num_input_tokens_seen": 6312480, + "step": 9655 + }, + { + "epoch": 5.062893081761007, + "grad_norm": 1.0896363258361816, + "learning_rate": 2.880421477148235e-05, + "loss": 0.2704, + "num_input_tokens_seen": 6316256, + "step": 9660 + }, + { + "epoch": 5.065513626834382, + "grad_norm": 1.5528614521026611, + "learning_rate": 2.878161092792121e-05, + "loss": 0.3014, + "num_input_tokens_seen": 6319520, + "step": 9665 + }, + { + "epoch": 5.068134171907757, + "grad_norm": 1.1187517642974854, + "learning_rate": 2.8759003920080786e-05, + "loss": 0.3564, + "num_input_tokens_seen": 6323328, + "step": 9670 + }, + { + "epoch": 5.070754716981132, + "grad_norm": 1.092139720916748, + "learning_rate": 2.8736393766877578e-05, + "loss": 0.2066, + "num_input_tokens_seen": 6326720, + "step": 9675 + }, + { + "epoch": 5.073375262054507, + "grad_norm": 1.5481607913970947, + "learning_rate": 2.871378048723074e-05, + "loss": 0.3259, + "num_input_tokens_seen": 6329280, + "step": 9680 + }, + { + "epoch": 5.075995807127883, + "grad_norm": 0.7891998887062073, + "learning_rate": 2.8691164100062034e-05, + "loss": 0.2767, + "num_input_tokens_seen": 6332384, + "step": 9685 + }, + { + "epoch": 5.078616352201258, + "grad_norm": 1.2698355913162231, + "learning_rate": 2.8668544624295814e-05, + "loss": 0.2355, + "num_input_tokens_seen": 6336096, + "step": 9690 + }, + { + "epoch": 5.081236897274633, + "grad_norm": 1.4974365234375, + "learning_rate": 2.864592207885902e-05, + "loss": 0.3268, + "num_input_tokens_seen": 6338432, + "step": 9695 + }, + { + "epoch": 5.083857442348008, + "grad_norm": 1.101672887802124, + "learning_rate": 2.8623296482681166e-05, + "loss": 0.2455, + "num_input_tokens_seen": 6342016, + "step": 9700 + }, + { + "epoch": 5.086477987421383, + "grad_norm": 1.6249593496322632, + "learning_rate": 2.8600667854694328e-05, + "loss": 0.3063, + "num_input_tokens_seen": 6344896, + "step": 9705 + }, + { + "epoch": 5.089098532494759, + "grad_norm": 1.720100998878479, + "learning_rate": 2.857803621383311e-05, + "loss": 0.2903, + "num_input_tokens_seen": 6348096, + "step": 9710 + }, + { + "epoch": 5.0917190775681345, + "grad_norm": 2.0597212314605713, + "learning_rate": 2.8555401579034607e-05, + "loss": 0.3736, + "num_input_tokens_seen": 6351488, + "step": 9715 + }, + { + "epoch": 5.09433962264151, + "grad_norm": 1.5507402420043945, + "learning_rate": 2.853276396923848e-05, + "loss": 0.328, + "num_input_tokens_seen": 6354464, + "step": 9720 + }, + { + "epoch": 5.096960167714885, + "grad_norm": 1.0558422803878784, + "learning_rate": 2.851012340338683e-05, + "loss": 0.2867, + "num_input_tokens_seen": 6358912, + "step": 9725 + }, + { + "epoch": 5.09958071278826, + "grad_norm": 1.9690996408462524, + "learning_rate": 2.8487479900424253e-05, + "loss": 0.2343, + "num_input_tokens_seen": 6362112, + "step": 9730 + }, + { + "epoch": 5.102201257861635, + "grad_norm": 1.455104112625122, + "learning_rate": 2.8464833479297794e-05, + "loss": 0.2945, + "num_input_tokens_seen": 6364896, + "step": 9735 + }, + { + "epoch": 5.10482180293501, + "grad_norm": 1.710214376449585, + "learning_rate": 2.8442184158956947e-05, + "loss": 0.4413, + "num_input_tokens_seen": 6368160, + "step": 9740 + }, + { + "epoch": 5.1074423480083855, + "grad_norm": 1.4091544151306152, + "learning_rate": 2.8419531958353635e-05, + "loss": 0.2371, + "num_input_tokens_seen": 6371264, + "step": 9745 + }, + { + "epoch": 5.110062893081761, + "grad_norm": 1.7531687021255493, + "learning_rate": 2.839687689644217e-05, + "loss": 0.3744, + "num_input_tokens_seen": 6374144, + "step": 9750 + }, + { + "epoch": 5.112683438155136, + "grad_norm": 2.2690482139587402, + "learning_rate": 2.837421899217928e-05, + "loss": 0.3086, + "num_input_tokens_seen": 6377600, + "step": 9755 + }, + { + "epoch": 5.115303983228512, + "grad_norm": 1.5581897497177124, + "learning_rate": 2.8351558264524076e-05, + "loss": 0.2761, + "num_input_tokens_seen": 6380608, + "step": 9760 + }, + { + "epoch": 5.117924528301887, + "grad_norm": 1.5691553354263306, + "learning_rate": 2.8328894732437998e-05, + "loss": 0.4444, + "num_input_tokens_seen": 6384640, + "step": 9765 + }, + { + "epoch": 5.120545073375262, + "grad_norm": 2.4313645362854004, + "learning_rate": 2.830622841488488e-05, + "loss": 0.2361, + "num_input_tokens_seen": 6387456, + "step": 9770 + }, + { + "epoch": 5.123165618448637, + "grad_norm": 1.5850837230682373, + "learning_rate": 2.8283559330830834e-05, + "loss": 0.2782, + "num_input_tokens_seen": 6390880, + "step": 9775 + }, + { + "epoch": 5.1257861635220126, + "grad_norm": 1.5746101140975952, + "learning_rate": 2.8260887499244333e-05, + "loss": 0.2365, + "num_input_tokens_seen": 6393696, + "step": 9780 + }, + { + "epoch": 5.128406708595388, + "grad_norm": 0.7907288670539856, + "learning_rate": 2.823821293909612e-05, + "loss": 0.2895, + "num_input_tokens_seen": 6397472, + "step": 9785 + }, + { + "epoch": 5.131027253668763, + "grad_norm": 1.208571195602417, + "learning_rate": 2.821553566935924e-05, + "loss": 0.2994, + "num_input_tokens_seen": 6400576, + "step": 9790 + }, + { + "epoch": 5.133647798742138, + "grad_norm": 1.4698835611343384, + "learning_rate": 2.8192855709008985e-05, + "loss": 0.3815, + "num_input_tokens_seen": 6403168, + "step": 9795 + }, + { + "epoch": 5.136268343815513, + "grad_norm": 1.2819846868515015, + "learning_rate": 2.8170173077022915e-05, + "loss": 0.306, + "num_input_tokens_seen": 6406144, + "step": 9800 + }, + { + "epoch": 5.138888888888889, + "grad_norm": 1.8654865026474, + "learning_rate": 2.8147487792380832e-05, + "loss": 0.309, + "num_input_tokens_seen": 6409344, + "step": 9805 + }, + { + "epoch": 5.1415094339622645, + "grad_norm": 1.6853727102279663, + "learning_rate": 2.8124799874064733e-05, + "loss": 0.3124, + "num_input_tokens_seen": 6411776, + "step": 9810 + }, + { + "epoch": 5.14412997903564, + "grad_norm": 1.128055453300476, + "learning_rate": 2.810210934105883e-05, + "loss": 0.262, + "num_input_tokens_seen": 6414784, + "step": 9815 + }, + { + "epoch": 5.146750524109015, + "grad_norm": 2.33241605758667, + "learning_rate": 2.8079416212349528e-05, + "loss": 0.2747, + "num_input_tokens_seen": 6417248, + "step": 9820 + }, + { + "epoch": 5.14937106918239, + "grad_norm": 1.4844012260437012, + "learning_rate": 2.805672050692541e-05, + "loss": 0.3777, + "num_input_tokens_seen": 6420416, + "step": 9825 + }, + { + "epoch": 5.151991614255765, + "grad_norm": 1.9532442092895508, + "learning_rate": 2.8034022243777197e-05, + "loss": 0.2226, + "num_input_tokens_seen": 6424288, + "step": 9830 + }, + { + "epoch": 5.15461215932914, + "grad_norm": 1.0490703582763672, + "learning_rate": 2.8011321441897754e-05, + "loss": 0.3919, + "num_input_tokens_seen": 6426784, + "step": 9835 + }, + { + "epoch": 5.1572327044025155, + "grad_norm": 1.1563125848770142, + "learning_rate": 2.7988618120282074e-05, + "loss": 0.299, + "num_input_tokens_seen": 6429760, + "step": 9840 + }, + { + "epoch": 5.159853249475891, + "grad_norm": 1.6096397638320923, + "learning_rate": 2.7965912297927277e-05, + "loss": 0.2619, + "num_input_tokens_seen": 6432480, + "step": 9845 + }, + { + "epoch": 5.162473794549266, + "grad_norm": 1.454189419746399, + "learning_rate": 2.794320399383254e-05, + "loss": 0.3239, + "num_input_tokens_seen": 6437312, + "step": 9850 + }, + { + "epoch": 5.165094339622642, + "grad_norm": 1.3485158681869507, + "learning_rate": 2.7920493226999143e-05, + "loss": 0.3706, + "num_input_tokens_seen": 6441664, + "step": 9855 + }, + { + "epoch": 5.167714884696017, + "grad_norm": 0.7968718409538269, + "learning_rate": 2.7897780016430414e-05, + "loss": 0.2518, + "num_input_tokens_seen": 6445120, + "step": 9860 + }, + { + "epoch": 5.170335429769392, + "grad_norm": 1.6640194654464722, + "learning_rate": 2.7875064381131733e-05, + "loss": 0.3054, + "num_input_tokens_seen": 6447968, + "step": 9865 + }, + { + "epoch": 5.172955974842767, + "grad_norm": 1.4375865459442139, + "learning_rate": 2.7852346340110508e-05, + "loss": 0.276, + "num_input_tokens_seen": 6450592, + "step": 9870 + }, + { + "epoch": 5.1755765199161425, + "grad_norm": 2.2090983390808105, + "learning_rate": 2.7829625912376163e-05, + "loss": 0.2818, + "num_input_tokens_seen": 6454496, + "step": 9875 + }, + { + "epoch": 5.178197064989518, + "grad_norm": 1.5788066387176514, + "learning_rate": 2.7806903116940093e-05, + "loss": 0.2254, + "num_input_tokens_seen": 6457088, + "step": 9880 + }, + { + "epoch": 5.180817610062893, + "grad_norm": 1.449162483215332, + "learning_rate": 2.778417797281571e-05, + "loss": 0.3521, + "num_input_tokens_seen": 6461088, + "step": 9885 + }, + { + "epoch": 5.183438155136268, + "grad_norm": 1.013202428817749, + "learning_rate": 2.7761450499018383e-05, + "loss": 0.3402, + "num_input_tokens_seen": 6463872, + "step": 9890 + }, + { + "epoch": 5.186058700209643, + "grad_norm": 1.4406256675720215, + "learning_rate": 2.7738720714565418e-05, + "loss": 0.3271, + "num_input_tokens_seen": 6467424, + "step": 9895 + }, + { + "epoch": 5.188679245283019, + "grad_norm": 1.063663125038147, + "learning_rate": 2.7715988638476055e-05, + "loss": 0.2718, + "num_input_tokens_seen": 6470688, + "step": 9900 + }, + { + "epoch": 5.191299790356394, + "grad_norm": 0.7530269622802734, + "learning_rate": 2.7693254289771454e-05, + "loss": 0.2834, + "num_input_tokens_seen": 6473440, + "step": 9905 + }, + { + "epoch": 5.19392033542977, + "grad_norm": 2.6351828575134277, + "learning_rate": 2.7670517687474697e-05, + "loss": 0.3434, + "num_input_tokens_seen": 6476352, + "step": 9910 + }, + { + "epoch": 5.196540880503145, + "grad_norm": 1.1618239879608154, + "learning_rate": 2.7647778850610723e-05, + "loss": 0.2508, + "num_input_tokens_seen": 6480064, + "step": 9915 + }, + { + "epoch": 5.19916142557652, + "grad_norm": 1.5626897811889648, + "learning_rate": 2.7625037798206345e-05, + "loss": 0.2616, + "num_input_tokens_seen": 6483840, + "step": 9920 + }, + { + "epoch": 5.201781970649895, + "grad_norm": 2.1951704025268555, + "learning_rate": 2.7602294549290243e-05, + "loss": 0.4979, + "num_input_tokens_seen": 6486368, + "step": 9925 + }, + { + "epoch": 5.20440251572327, + "grad_norm": 0.983921468257904, + "learning_rate": 2.757954912289294e-05, + "loss": 0.2677, + "num_input_tokens_seen": 6489152, + "step": 9930 + }, + { + "epoch": 5.2070230607966455, + "grad_norm": 1.0763611793518066, + "learning_rate": 2.755680153804675e-05, + "loss": 0.2664, + "num_input_tokens_seen": 6493088, + "step": 9935 + }, + { + "epoch": 5.209643605870021, + "grad_norm": 0.9048705101013184, + "learning_rate": 2.7534051813785834e-05, + "loss": 0.2953, + "num_input_tokens_seen": 6496256, + "step": 9940 + }, + { + "epoch": 5.212264150943396, + "grad_norm": 1.590989112854004, + "learning_rate": 2.75112999691461e-05, + "loss": 0.2602, + "num_input_tokens_seen": 6499296, + "step": 9945 + }, + { + "epoch": 5.214884696016772, + "grad_norm": 1.4732024669647217, + "learning_rate": 2.7488546023165262e-05, + "loss": 0.2692, + "num_input_tokens_seen": 6502624, + "step": 9950 + }, + { + "epoch": 5.217505241090147, + "grad_norm": 1.093816876411438, + "learning_rate": 2.7465789994882796e-05, + "loss": 0.4453, + "num_input_tokens_seen": 6506272, + "step": 9955 + }, + { + "epoch": 5.220125786163522, + "grad_norm": 1.7459266185760498, + "learning_rate": 2.7443031903339896e-05, + "loss": 0.2996, + "num_input_tokens_seen": 6509504, + "step": 9960 + }, + { + "epoch": 5.222746331236897, + "grad_norm": 3.843048095703125, + "learning_rate": 2.742027176757948e-05, + "loss": 0.3298, + "num_input_tokens_seen": 6512512, + "step": 9965 + }, + { + "epoch": 5.2253668763102725, + "grad_norm": 0.6961658596992493, + "learning_rate": 2.7397509606646204e-05, + "loss": 0.3355, + "num_input_tokens_seen": 6515456, + "step": 9970 + }, + { + "epoch": 5.227987421383648, + "grad_norm": 2.4165444374084473, + "learning_rate": 2.7374745439586414e-05, + "loss": 0.2538, + "num_input_tokens_seen": 6518880, + "step": 9975 + }, + { + "epoch": 5.230607966457023, + "grad_norm": 1.3571380376815796, + "learning_rate": 2.735197928544811e-05, + "loss": 0.2825, + "num_input_tokens_seen": 6521664, + "step": 9980 + }, + { + "epoch": 5.233228511530398, + "grad_norm": 1.8352069854736328, + "learning_rate": 2.7329211163280972e-05, + "loss": 0.2757, + "num_input_tokens_seen": 6524160, + "step": 9985 + }, + { + "epoch": 5.235849056603773, + "grad_norm": 1.902510404586792, + "learning_rate": 2.730644109213632e-05, + "loss": 0.2083, + "num_input_tokens_seen": 6527904, + "step": 9990 + }, + { + "epoch": 5.238469601677149, + "grad_norm": 3.9842188358306885, + "learning_rate": 2.7283669091067127e-05, + "loss": 0.4727, + "num_input_tokens_seen": 6530560, + "step": 9995 + }, + { + "epoch": 5.241090146750524, + "grad_norm": 1.7861855030059814, + "learning_rate": 2.7260895179127944e-05, + "loss": 0.3178, + "num_input_tokens_seen": 6534048, + "step": 10000 + }, + { + "epoch": 5.2437106918239, + "grad_norm": 1.7066043615341187, + "learning_rate": 2.7238119375374954e-05, + "loss": 0.2754, + "num_input_tokens_seen": 6536864, + "step": 10005 + }, + { + "epoch": 5.246331236897275, + "grad_norm": 1.867722511291504, + "learning_rate": 2.7215341698865904e-05, + "loss": 0.3526, + "num_input_tokens_seen": 6540608, + "step": 10010 + }, + { + "epoch": 5.24895178197065, + "grad_norm": 1.665394902229309, + "learning_rate": 2.7192562168660113e-05, + "loss": 0.332, + "num_input_tokens_seen": 6543744, + "step": 10015 + }, + { + "epoch": 5.251572327044025, + "grad_norm": 1.4945679903030396, + "learning_rate": 2.7169780803818445e-05, + "loss": 0.2885, + "num_input_tokens_seen": 6546656, + "step": 10020 + }, + { + "epoch": 5.2541928721174, + "grad_norm": 1.8533202409744263, + "learning_rate": 2.714699762340332e-05, + "loss": 0.3512, + "num_input_tokens_seen": 6549152, + "step": 10025 + }, + { + "epoch": 5.256813417190775, + "grad_norm": 1.7284371852874756, + "learning_rate": 2.7124212646478652e-05, + "loss": 0.3014, + "num_input_tokens_seen": 6552096, + "step": 10030 + }, + { + "epoch": 5.259433962264151, + "grad_norm": 1.7629505395889282, + "learning_rate": 2.7101425892109865e-05, + "loss": 0.3407, + "num_input_tokens_seen": 6555232, + "step": 10035 + }, + { + "epoch": 5.262054507337526, + "grad_norm": 1.562330722808838, + "learning_rate": 2.707863737936389e-05, + "loss": 0.278, + "num_input_tokens_seen": 6557920, + "step": 10040 + }, + { + "epoch": 5.264675052410902, + "grad_norm": 1.8599263429641724, + "learning_rate": 2.7055847127309107e-05, + "loss": 0.2661, + "num_input_tokens_seen": 6560928, + "step": 10045 + }, + { + "epoch": 5.267295597484277, + "grad_norm": 1.9006868600845337, + "learning_rate": 2.703305515501534e-05, + "loss": 0.2635, + "num_input_tokens_seen": 6563488, + "step": 10050 + }, + { + "epoch": 5.269916142557652, + "grad_norm": 1.2214319705963135, + "learning_rate": 2.70102614815539e-05, + "loss": 0.2933, + "num_input_tokens_seen": 6567520, + "step": 10055 + }, + { + "epoch": 5.272536687631027, + "grad_norm": 1.147352933883667, + "learning_rate": 2.6987466125997475e-05, + "loss": 0.336, + "num_input_tokens_seen": 6570528, + "step": 10060 + }, + { + "epoch": 5.2751572327044025, + "grad_norm": 1.307580590248108, + "learning_rate": 2.696466910742018e-05, + "loss": 0.2495, + "num_input_tokens_seen": 6573824, + "step": 10065 + }, + { + "epoch": 5.277777777777778, + "grad_norm": 1.1940315961837769, + "learning_rate": 2.694187044489751e-05, + "loss": 0.3324, + "num_input_tokens_seen": 6576896, + "step": 10070 + }, + { + "epoch": 5.280398322851153, + "grad_norm": 3.026355266571045, + "learning_rate": 2.691907015750636e-05, + "loss": 0.3321, + "num_input_tokens_seen": 6579136, + "step": 10075 + }, + { + "epoch": 5.283018867924528, + "grad_norm": 1.4834192991256714, + "learning_rate": 2.6896268264324964e-05, + "loss": 0.2554, + "num_input_tokens_seen": 6582080, + "step": 10080 + }, + { + "epoch": 5.285639412997903, + "grad_norm": 1.7224262952804565, + "learning_rate": 2.6873464784432894e-05, + "loss": 0.275, + "num_input_tokens_seen": 6584896, + "step": 10085 + }, + { + "epoch": 5.288259958071279, + "grad_norm": 2.634972333908081, + "learning_rate": 2.6850659736911073e-05, + "loss": 0.3076, + "num_input_tokens_seen": 6588448, + "step": 10090 + }, + { + "epoch": 5.290880503144654, + "grad_norm": 1.5458506345748901, + "learning_rate": 2.682785314084172e-05, + "loss": 0.289, + "num_input_tokens_seen": 6592704, + "step": 10095 + }, + { + "epoch": 5.29350104821803, + "grad_norm": 2.224030017852783, + "learning_rate": 2.680504501530835e-05, + "loss": 0.292, + "num_input_tokens_seen": 6596256, + "step": 10100 + }, + { + "epoch": 5.296121593291405, + "grad_norm": 1.4395005702972412, + "learning_rate": 2.6782235379395766e-05, + "loss": 0.3285, + "num_input_tokens_seen": 6598656, + "step": 10105 + }, + { + "epoch": 5.29874213836478, + "grad_norm": 1.3234714269638062, + "learning_rate": 2.675942425219002e-05, + "loss": 0.308, + "num_input_tokens_seen": 6601504, + "step": 10110 + }, + { + "epoch": 5.301362683438155, + "grad_norm": 2.2083494663238525, + "learning_rate": 2.673661165277843e-05, + "loss": 0.2933, + "num_input_tokens_seen": 6604640, + "step": 10115 + }, + { + "epoch": 5.30398322851153, + "grad_norm": 1.1980255842208862, + "learning_rate": 2.6713797600249536e-05, + "loss": 0.3277, + "num_input_tokens_seen": 6607520, + "step": 10120 + }, + { + "epoch": 5.306603773584905, + "grad_norm": 0.7903380393981934, + "learning_rate": 2.6690982113693092e-05, + "loss": 0.2325, + "num_input_tokens_seen": 6611360, + "step": 10125 + }, + { + "epoch": 5.309224318658281, + "grad_norm": 1.1785119771957397, + "learning_rate": 2.6668165212200057e-05, + "loss": 0.2297, + "num_input_tokens_seen": 6616672, + "step": 10130 + }, + { + "epoch": 5.311844863731656, + "grad_norm": 1.210551381111145, + "learning_rate": 2.664534691486257e-05, + "loss": 0.3888, + "num_input_tokens_seen": 6619424, + "step": 10135 + }, + { + "epoch": 5.314465408805032, + "grad_norm": 1.311276912689209, + "learning_rate": 2.6622527240773942e-05, + "loss": 0.2021, + "num_input_tokens_seen": 6622656, + "step": 10140 + }, + { + "epoch": 5.317085953878407, + "grad_norm": 1.6555570363998413, + "learning_rate": 2.6599706209028634e-05, + "loss": 0.236, + "num_input_tokens_seen": 6625568, + "step": 10145 + }, + { + "epoch": 5.319706498951782, + "grad_norm": 2.8383896350860596, + "learning_rate": 2.657688383872224e-05, + "loss": 0.4279, + "num_input_tokens_seen": 6628896, + "step": 10150 + }, + { + "epoch": 5.322327044025157, + "grad_norm": 1.6347856521606445, + "learning_rate": 2.655406014895147e-05, + "loss": 0.2578, + "num_input_tokens_seen": 6632672, + "step": 10155 + }, + { + "epoch": 5.3249475890985325, + "grad_norm": 1.7432290315628052, + "learning_rate": 2.653123515881417e-05, + "loss": 0.2673, + "num_input_tokens_seen": 6637344, + "step": 10160 + }, + { + "epoch": 5.327568134171908, + "grad_norm": 1.8260486125946045, + "learning_rate": 2.650840888740923e-05, + "loss": 0.3619, + "num_input_tokens_seen": 6640416, + "step": 10165 + }, + { + "epoch": 5.330188679245283, + "grad_norm": 1.3224385976791382, + "learning_rate": 2.6485581353836624e-05, + "loss": 0.2253, + "num_input_tokens_seen": 6642688, + "step": 10170 + }, + { + "epoch": 5.332809224318658, + "grad_norm": 1.8604865074157715, + "learning_rate": 2.6462752577197407e-05, + "loss": 0.2744, + "num_input_tokens_seen": 6645952, + "step": 10175 + }, + { + "epoch": 5.335429769392033, + "grad_norm": 1.4389704465866089, + "learning_rate": 2.643992257659365e-05, + "loss": 0.3222, + "num_input_tokens_seen": 6648576, + "step": 10180 + }, + { + "epoch": 5.338050314465409, + "grad_norm": 2.529261350631714, + "learning_rate": 2.641709137112845e-05, + "loss": 0.3374, + "num_input_tokens_seen": 6651840, + "step": 10185 + }, + { + "epoch": 5.340670859538784, + "grad_norm": 2.312906265258789, + "learning_rate": 2.639425897990593e-05, + "loss": 0.4116, + "num_input_tokens_seen": 6654336, + "step": 10190 + }, + { + "epoch": 5.34329140461216, + "grad_norm": 1.8639981746673584, + "learning_rate": 2.6371425422031172e-05, + "loss": 0.3428, + "num_input_tokens_seen": 6657600, + "step": 10195 + }, + { + "epoch": 5.345911949685535, + "grad_norm": 1.1588916778564453, + "learning_rate": 2.6348590716610273e-05, + "loss": 0.2419, + "num_input_tokens_seen": 6660768, + "step": 10200 + }, + { + "epoch": 5.34853249475891, + "grad_norm": 2.540006399154663, + "learning_rate": 2.6325754882750252e-05, + "loss": 0.3147, + "num_input_tokens_seen": 6663808, + "step": 10205 + }, + { + "epoch": 5.351153039832285, + "grad_norm": 0.7889065146446228, + "learning_rate": 2.630291793955911e-05, + "loss": 0.3233, + "num_input_tokens_seen": 6666048, + "step": 10210 + }, + { + "epoch": 5.35377358490566, + "grad_norm": 1.3966227769851685, + "learning_rate": 2.6280079906145756e-05, + "loss": 0.2685, + "num_input_tokens_seen": 6668672, + "step": 10215 + }, + { + "epoch": 5.356394129979035, + "grad_norm": 1.569060206413269, + "learning_rate": 2.6257240801620004e-05, + "loss": 0.3981, + "num_input_tokens_seen": 6671680, + "step": 10220 + }, + { + "epoch": 5.359014675052411, + "grad_norm": 1.1697046756744385, + "learning_rate": 2.623440064509258e-05, + "loss": 0.2577, + "num_input_tokens_seen": 6674592, + "step": 10225 + }, + { + "epoch": 5.361635220125786, + "grad_norm": 1.5640220642089844, + "learning_rate": 2.621155945567508e-05, + "loss": 0.2424, + "num_input_tokens_seen": 6677632, + "step": 10230 + }, + { + "epoch": 5.364255765199162, + "grad_norm": 1.1419621706008911, + "learning_rate": 2.6188717252479968e-05, + "loss": 0.2151, + "num_input_tokens_seen": 6680768, + "step": 10235 + }, + { + "epoch": 5.366876310272537, + "grad_norm": 2.0608773231506348, + "learning_rate": 2.6165874054620552e-05, + "loss": 0.3222, + "num_input_tokens_seen": 6683712, + "step": 10240 + }, + { + "epoch": 5.369496855345912, + "grad_norm": 1.565932035446167, + "learning_rate": 2.614302988121099e-05, + "loss": 0.3265, + "num_input_tokens_seen": 6688512, + "step": 10245 + }, + { + "epoch": 5.372117400419287, + "grad_norm": 1.2253667116165161, + "learning_rate": 2.6120184751366238e-05, + "loss": 0.2623, + "num_input_tokens_seen": 6691520, + "step": 10250 + }, + { + "epoch": 5.3747379454926625, + "grad_norm": 0.9191533923149109, + "learning_rate": 2.6097338684202043e-05, + "loss": 0.3268, + "num_input_tokens_seen": 6695360, + "step": 10255 + }, + { + "epoch": 5.377358490566038, + "grad_norm": 1.102532982826233, + "learning_rate": 2.607449169883497e-05, + "loss": 0.277, + "num_input_tokens_seen": 6698688, + "step": 10260 + }, + { + "epoch": 5.379979035639413, + "grad_norm": 1.5865212678909302, + "learning_rate": 2.605164381438232e-05, + "loss": 0.3095, + "num_input_tokens_seen": 6701312, + "step": 10265 + }, + { + "epoch": 5.382599580712788, + "grad_norm": 1.544390320777893, + "learning_rate": 2.6028795049962167e-05, + "loss": 0.3119, + "num_input_tokens_seen": 6703840, + "step": 10270 + }, + { + "epoch": 5.385220125786163, + "grad_norm": 2.1069939136505127, + "learning_rate": 2.600594542469331e-05, + "loss": 0.2651, + "num_input_tokens_seen": 6706720, + "step": 10275 + }, + { + "epoch": 5.387840670859539, + "grad_norm": 1.6490042209625244, + "learning_rate": 2.5983094957695263e-05, + "loss": 0.2717, + "num_input_tokens_seen": 6710240, + "step": 10280 + }, + { + "epoch": 5.390461215932914, + "grad_norm": 1.5272318124771118, + "learning_rate": 2.596024366808827e-05, + "loss": 0.4386, + "num_input_tokens_seen": 6713632, + "step": 10285 + }, + { + "epoch": 5.3930817610062896, + "grad_norm": 1.116688847541809, + "learning_rate": 2.5937391574993238e-05, + "loss": 0.2715, + "num_input_tokens_seen": 6716032, + "step": 10290 + }, + { + "epoch": 5.395702306079665, + "grad_norm": 1.8351441621780396, + "learning_rate": 2.5914538697531755e-05, + "loss": 0.2848, + "num_input_tokens_seen": 6719136, + "step": 10295 + }, + { + "epoch": 5.39832285115304, + "grad_norm": 2.6148946285247803, + "learning_rate": 2.5891685054826054e-05, + "loss": 0.2901, + "num_input_tokens_seen": 6721824, + "step": 10300 + }, + { + "epoch": 5.400943396226415, + "grad_norm": 1.1907955408096313, + "learning_rate": 2.586883066599904e-05, + "loss": 0.2808, + "num_input_tokens_seen": 6724864, + "step": 10305 + }, + { + "epoch": 5.40356394129979, + "grad_norm": 1.8429936170578003, + "learning_rate": 2.5845975550174206e-05, + "loss": 0.2196, + "num_input_tokens_seen": 6728064, + "step": 10310 + }, + { + "epoch": 5.406184486373165, + "grad_norm": 1.6748179197311401, + "learning_rate": 2.5823119726475682e-05, + "loss": 0.3909, + "num_input_tokens_seen": 6730784, + "step": 10315 + }, + { + "epoch": 5.408805031446541, + "grad_norm": 1.3120073080062866, + "learning_rate": 2.5800263214028153e-05, + "loss": 0.3567, + "num_input_tokens_seen": 6733408, + "step": 10320 + }, + { + "epoch": 5.411425576519916, + "grad_norm": 0.2461719959974289, + "learning_rate": 2.5777406031956935e-05, + "loss": 0.3091, + "num_input_tokens_seen": 6738912, + "step": 10325 + }, + { + "epoch": 5.414046121593292, + "grad_norm": 1.5097438097000122, + "learning_rate": 2.5754548199387863e-05, + "loss": 0.3353, + "num_input_tokens_seen": 6742784, + "step": 10330 + }, + { + "epoch": 5.416666666666667, + "grad_norm": 0.9000470638275146, + "learning_rate": 2.5731689735447317e-05, + "loss": 0.2552, + "num_input_tokens_seen": 6746080, + "step": 10335 + }, + { + "epoch": 5.419287211740042, + "grad_norm": 1.253502368927002, + "learning_rate": 2.5708830659262218e-05, + "loss": 0.3695, + "num_input_tokens_seen": 6750368, + "step": 10340 + }, + { + "epoch": 5.421907756813417, + "grad_norm": 1.2341094017028809, + "learning_rate": 2.5685970989960005e-05, + "loss": 0.3133, + "num_input_tokens_seen": 6753984, + "step": 10345 + }, + { + "epoch": 5.4245283018867925, + "grad_norm": 1.614012360572815, + "learning_rate": 2.5663110746668612e-05, + "loss": 0.3676, + "num_input_tokens_seen": 6756576, + "step": 10350 + }, + { + "epoch": 5.427148846960168, + "grad_norm": 1.6587220430374146, + "learning_rate": 2.564024994851642e-05, + "loss": 0.2767, + "num_input_tokens_seen": 6759904, + "step": 10355 + }, + { + "epoch": 5.429769392033543, + "grad_norm": 1.5984411239624023, + "learning_rate": 2.561738861463232e-05, + "loss": 0.2674, + "num_input_tokens_seen": 6762880, + "step": 10360 + }, + { + "epoch": 5.432389937106918, + "grad_norm": 1.8521273136138916, + "learning_rate": 2.559452676414564e-05, + "loss": 0.4797, + "num_input_tokens_seen": 6765728, + "step": 10365 + }, + { + "epoch": 5.435010482180293, + "grad_norm": 2.9887044429779053, + "learning_rate": 2.5571664416186108e-05, + "loss": 0.2469, + "num_input_tokens_seen": 6768704, + "step": 10370 + }, + { + "epoch": 5.437631027253669, + "grad_norm": 1.1034015417099, + "learning_rate": 2.5548801589883913e-05, + "loss": 0.2169, + "num_input_tokens_seen": 6772256, + "step": 10375 + }, + { + "epoch": 5.440251572327044, + "grad_norm": 1.4357777833938599, + "learning_rate": 2.5525938304369614e-05, + "loss": 0.3369, + "num_input_tokens_seen": 6775552, + "step": 10380 + }, + { + "epoch": 5.4428721174004195, + "grad_norm": 1.569899559020996, + "learning_rate": 2.5503074578774166e-05, + "loss": 0.257, + "num_input_tokens_seen": 6778176, + "step": 10385 + }, + { + "epoch": 5.445492662473795, + "grad_norm": 1.4927079677581787, + "learning_rate": 2.5480210432228886e-05, + "loss": 0.2949, + "num_input_tokens_seen": 6781888, + "step": 10390 + }, + { + "epoch": 5.44811320754717, + "grad_norm": 1.5143654346466064, + "learning_rate": 2.5457345883865457e-05, + "loss": 0.3082, + "num_input_tokens_seen": 6785056, + "step": 10395 + }, + { + "epoch": 5.450733752620545, + "grad_norm": 1.663265347480774, + "learning_rate": 2.5434480952815877e-05, + "loss": 0.291, + "num_input_tokens_seen": 6788832, + "step": 10400 + }, + { + "epoch": 5.45335429769392, + "grad_norm": 1.8114901781082153, + "learning_rate": 2.5411615658212478e-05, + "loss": 0.3194, + "num_input_tokens_seen": 6792320, + "step": 10405 + }, + { + "epoch": 5.455974842767295, + "grad_norm": 0.7383809685707092, + "learning_rate": 2.5388750019187912e-05, + "loss": 0.3595, + "num_input_tokens_seen": 6796000, + "step": 10410 + }, + { + "epoch": 5.4585953878406706, + "grad_norm": 1.1850756406784058, + "learning_rate": 2.5365884054875084e-05, + "loss": 0.282, + "num_input_tokens_seen": 6798400, + "step": 10415 + }, + { + "epoch": 5.461215932914046, + "grad_norm": 2.216820240020752, + "learning_rate": 2.5343017784407184e-05, + "loss": 0.3324, + "num_input_tokens_seen": 6801440, + "step": 10420 + }, + { + "epoch": 5.463836477987422, + "grad_norm": 1.4877219200134277, + "learning_rate": 2.532015122691767e-05, + "loss": 0.2818, + "num_input_tokens_seen": 6807072, + "step": 10425 + }, + { + "epoch": 5.466457023060797, + "grad_norm": 1.3214092254638672, + "learning_rate": 2.5297284401540243e-05, + "loss": 0.2869, + "num_input_tokens_seen": 6810272, + "step": 10430 + }, + { + "epoch": 5.469077568134172, + "grad_norm": 1.28770112991333, + "learning_rate": 2.5274417327408805e-05, + "loss": 0.2761, + "num_input_tokens_seen": 6813184, + "step": 10435 + }, + { + "epoch": 5.471698113207547, + "grad_norm": 2.2417471408843994, + "learning_rate": 2.5251550023657478e-05, + "loss": 0.3149, + "num_input_tokens_seen": 6817824, + "step": 10440 + }, + { + "epoch": 5.4743186582809225, + "grad_norm": 1.3860909938812256, + "learning_rate": 2.5228682509420582e-05, + "loss": 0.2651, + "num_input_tokens_seen": 6820480, + "step": 10445 + }, + { + "epoch": 5.476939203354298, + "grad_norm": 1.7986016273498535, + "learning_rate": 2.5205814803832617e-05, + "loss": 0.3449, + "num_input_tokens_seen": 6823360, + "step": 10450 + }, + { + "epoch": 5.479559748427673, + "grad_norm": 1.338629126548767, + "learning_rate": 2.518294692602821e-05, + "loss": 0.3432, + "num_input_tokens_seen": 6825856, + "step": 10455 + }, + { + "epoch": 5.482180293501048, + "grad_norm": 1.968843936920166, + "learning_rate": 2.5160078895142186e-05, + "loss": 0.3333, + "num_input_tokens_seen": 6828896, + "step": 10460 + }, + { + "epoch": 5.484800838574423, + "grad_norm": 1.1637799739837646, + "learning_rate": 2.5137210730309447e-05, + "loss": 0.3223, + "num_input_tokens_seen": 6832032, + "step": 10465 + }, + { + "epoch": 5.487421383647799, + "grad_norm": 3.7179155349731445, + "learning_rate": 2.5114342450665034e-05, + "loss": 0.3322, + "num_input_tokens_seen": 6834624, + "step": 10470 + }, + { + "epoch": 5.490041928721174, + "grad_norm": 1.3310115337371826, + "learning_rate": 2.509147407534409e-05, + "loss": 0.4047, + "num_input_tokens_seen": 6837504, + "step": 10475 + }, + { + "epoch": 5.4926624737945495, + "grad_norm": 1.0037142038345337, + "learning_rate": 2.5068605623481816e-05, + "loss": 0.3016, + "num_input_tokens_seen": 6840224, + "step": 10480 + }, + { + "epoch": 5.495283018867925, + "grad_norm": 1.2554866075515747, + "learning_rate": 2.5045737114213487e-05, + "loss": 0.2046, + "num_input_tokens_seen": 6843520, + "step": 10485 + }, + { + "epoch": 5.4979035639413, + "grad_norm": 1.0959641933441162, + "learning_rate": 2.502286856667443e-05, + "loss": 0.3046, + "num_input_tokens_seen": 6846464, + "step": 10490 + }, + { + "epoch": 5.5, + "eval_loss": 0.536855936050415, + "eval_runtime": 15.9928, + "eval_samples_per_second": 53.024, + "eval_steps_per_second": 13.256, + "num_input_tokens_seen": 6849184, + "step": 10494 + }, + { + "epoch": 5.500524109014675, + "grad_norm": 1.6931992769241333, + "learning_rate": 2.5e-05, + "loss": 0.3478, + "num_input_tokens_seen": 6849696, + "step": 10495 + }, + { + "epoch": 5.50314465408805, + "grad_norm": 1.6266348361968994, + "learning_rate": 2.497713143332557e-05, + "loss": 0.3954, + "num_input_tokens_seen": 6853216, + "step": 10500 + }, + { + "epoch": 5.505765199161425, + "grad_norm": 2.38529634475708, + "learning_rate": 2.495426288578652e-05, + "loss": 0.3069, + "num_input_tokens_seen": 6855872, + "step": 10505 + }, + { + "epoch": 5.5083857442348005, + "grad_norm": 2.0661675930023193, + "learning_rate": 2.493139437651819e-05, + "loss": 0.2498, + "num_input_tokens_seen": 6858688, + "step": 10510 + }, + { + "epoch": 5.511006289308176, + "grad_norm": 1.9192407131195068, + "learning_rate": 2.490852592465591e-05, + "loss": 0.2789, + "num_input_tokens_seen": 6861632, + "step": 10515 + }, + { + "epoch": 5.513626834381551, + "grad_norm": 1.631698727607727, + "learning_rate": 2.488565754933497e-05, + "loss": 0.436, + "num_input_tokens_seen": 6865696, + "step": 10520 + }, + { + "epoch": 5.516247379454927, + "grad_norm": 1.744831919670105, + "learning_rate": 2.486278926969056e-05, + "loss": 0.2859, + "num_input_tokens_seen": 6869056, + "step": 10525 + }, + { + "epoch": 5.518867924528302, + "grad_norm": 2.928347587585449, + "learning_rate": 2.483992110485782e-05, + "loss": 0.2791, + "num_input_tokens_seen": 6871488, + "step": 10530 + }, + { + "epoch": 5.521488469601677, + "grad_norm": 1.1097009181976318, + "learning_rate": 2.4817053073971792e-05, + "loss": 0.332, + "num_input_tokens_seen": 6875008, + "step": 10535 + }, + { + "epoch": 5.524109014675052, + "grad_norm": 3.6025993824005127, + "learning_rate": 2.4794185196167392e-05, + "loss": 0.427, + "num_input_tokens_seen": 6878336, + "step": 10540 + }, + { + "epoch": 5.526729559748428, + "grad_norm": 1.0030460357666016, + "learning_rate": 2.477131749057942e-05, + "loss": 0.2911, + "num_input_tokens_seen": 6882080, + "step": 10545 + }, + { + "epoch": 5.529350104821803, + "grad_norm": 1.253927230834961, + "learning_rate": 2.4748449976342524e-05, + "loss": 0.2436, + "num_input_tokens_seen": 6885504, + "step": 10550 + }, + { + "epoch": 5.531970649895178, + "grad_norm": 1.1760814189910889, + "learning_rate": 2.47255826725912e-05, + "loss": 0.338, + "num_input_tokens_seen": 6888704, + "step": 10555 + }, + { + "epoch": 5.534591194968553, + "grad_norm": 1.3661428689956665, + "learning_rate": 2.4702715598459766e-05, + "loss": 0.3519, + "num_input_tokens_seen": 6891936, + "step": 10560 + }, + { + "epoch": 5.537211740041929, + "grad_norm": 1.8131790161132812, + "learning_rate": 2.467984877308233e-05, + "loss": 0.3522, + "num_input_tokens_seen": 6894848, + "step": 10565 + }, + { + "epoch": 5.539832285115304, + "grad_norm": 1.7170110940933228, + "learning_rate": 2.4656982215592818e-05, + "loss": 0.2947, + "num_input_tokens_seen": 6897728, + "step": 10570 + }, + { + "epoch": 5.5424528301886795, + "grad_norm": 1.6416997909545898, + "learning_rate": 2.463411594512493e-05, + "loss": 0.2364, + "num_input_tokens_seen": 6900672, + "step": 10575 + }, + { + "epoch": 5.545073375262055, + "grad_norm": 1.6862335205078125, + "learning_rate": 2.4611249980812094e-05, + "loss": 0.2752, + "num_input_tokens_seen": 6905920, + "step": 10580 + }, + { + "epoch": 5.54769392033543, + "grad_norm": 0.7630168199539185, + "learning_rate": 2.4588384341787518e-05, + "loss": 0.3102, + "num_input_tokens_seen": 6909408, + "step": 10585 + }, + { + "epoch": 5.550314465408805, + "grad_norm": 1.5025696754455566, + "learning_rate": 2.456551904718413e-05, + "loss": 0.2585, + "num_input_tokens_seen": 6912128, + "step": 10590 + }, + { + "epoch": 5.55293501048218, + "grad_norm": 1.66398286819458, + "learning_rate": 2.454265411613455e-05, + "loss": 0.3502, + "num_input_tokens_seen": 6915136, + "step": 10595 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 1.7460919618606567, + "learning_rate": 2.4519789567771116e-05, + "loss": 0.3156, + "num_input_tokens_seen": 6917856, + "step": 10600 + }, + { + "epoch": 5.5581761006289305, + "grad_norm": 1.5610179901123047, + "learning_rate": 2.4496925421225847e-05, + "loss": 0.2934, + "num_input_tokens_seen": 6920864, + "step": 10605 + }, + { + "epoch": 5.560796645702306, + "grad_norm": 1.434424638748169, + "learning_rate": 2.4474061695630395e-05, + "loss": 0.4519, + "num_input_tokens_seen": 6924384, + "step": 10610 + }, + { + "epoch": 5.563417190775681, + "grad_norm": 1.7475148439407349, + "learning_rate": 2.4451198410116086e-05, + "loss": 0.3209, + "num_input_tokens_seen": 6926848, + "step": 10615 + }, + { + "epoch": 5.566037735849057, + "grad_norm": 1.2978439331054688, + "learning_rate": 2.4428335583813898e-05, + "loss": 0.2269, + "num_input_tokens_seen": 6929536, + "step": 10620 + }, + { + "epoch": 5.568658280922432, + "grad_norm": 2.071075677871704, + "learning_rate": 2.4405473235854367e-05, + "loss": 0.3689, + "num_input_tokens_seen": 6932064, + "step": 10625 + }, + { + "epoch": 5.571278825995807, + "grad_norm": 1.707824468612671, + "learning_rate": 2.4382611385367678e-05, + "loss": 0.3016, + "num_input_tokens_seen": 6935296, + "step": 10630 + }, + { + "epoch": 5.573899371069182, + "grad_norm": 1.2196133136749268, + "learning_rate": 2.4359750051483584e-05, + "loss": 0.2531, + "num_input_tokens_seen": 6937824, + "step": 10635 + }, + { + "epoch": 5.576519916142558, + "grad_norm": 1.5418543815612793, + "learning_rate": 2.4336889253331397e-05, + "loss": 0.317, + "num_input_tokens_seen": 6941600, + "step": 10640 + }, + { + "epoch": 5.579140461215933, + "grad_norm": 1.4642380475997925, + "learning_rate": 2.4314029010040004e-05, + "loss": 0.3109, + "num_input_tokens_seen": 6944608, + "step": 10645 + }, + { + "epoch": 5.581761006289308, + "grad_norm": 1.9674766063690186, + "learning_rate": 2.429116934073779e-05, + "loss": 0.3521, + "num_input_tokens_seen": 6948000, + "step": 10650 + }, + { + "epoch": 5.584381551362683, + "grad_norm": 1.3576979637145996, + "learning_rate": 2.426831026455269e-05, + "loss": 0.2272, + "num_input_tokens_seen": 6951424, + "step": 10655 + }, + { + "epoch": 5.587002096436059, + "grad_norm": 1.326974868774414, + "learning_rate": 2.424545180061215e-05, + "loss": 0.2797, + "num_input_tokens_seen": 6956000, + "step": 10660 + }, + { + "epoch": 5.589622641509434, + "grad_norm": 1.463066577911377, + "learning_rate": 2.422259396804307e-05, + "loss": 0.3084, + "num_input_tokens_seen": 6958944, + "step": 10665 + }, + { + "epoch": 5.5922431865828095, + "grad_norm": 1.1540006399154663, + "learning_rate": 2.4199736785971846e-05, + "loss": 0.2723, + "num_input_tokens_seen": 6962304, + "step": 10670 + }, + { + "epoch": 5.594863731656185, + "grad_norm": 1.7980889081954956, + "learning_rate": 2.417688027352433e-05, + "loss": 0.352, + "num_input_tokens_seen": 6965632, + "step": 10675 + }, + { + "epoch": 5.59748427672956, + "grad_norm": 1.0041598081588745, + "learning_rate": 2.41540244498258e-05, + "loss": 0.3503, + "num_input_tokens_seen": 6969312, + "step": 10680 + }, + { + "epoch": 5.600104821802935, + "grad_norm": 0.9418330192565918, + "learning_rate": 2.4131169334000963e-05, + "loss": 0.2307, + "num_input_tokens_seen": 6972576, + "step": 10685 + }, + { + "epoch": 5.60272536687631, + "grad_norm": 1.2036410570144653, + "learning_rate": 2.4108314945173955e-05, + "loss": 0.2087, + "num_input_tokens_seen": 6975872, + "step": 10690 + }, + { + "epoch": 5.605345911949685, + "grad_norm": 1.122182011604309, + "learning_rate": 2.4085461302468254e-05, + "loss": 0.2492, + "num_input_tokens_seen": 6978784, + "step": 10695 + }, + { + "epoch": 5.6079664570230605, + "grad_norm": 1.6034501791000366, + "learning_rate": 2.4062608425006765e-05, + "loss": 0.435, + "num_input_tokens_seen": 6981408, + "step": 10700 + }, + { + "epoch": 5.610587002096436, + "grad_norm": 1.695624828338623, + "learning_rate": 2.4039756331911737e-05, + "loss": 0.2838, + "num_input_tokens_seen": 6984800, + "step": 10705 + }, + { + "epoch": 5.613207547169811, + "grad_norm": 1.227729320526123, + "learning_rate": 2.401690504230474e-05, + "loss": 0.3121, + "num_input_tokens_seen": 6988192, + "step": 10710 + }, + { + "epoch": 5.615828092243187, + "grad_norm": 1.3587455749511719, + "learning_rate": 2.3994054575306698e-05, + "loss": 0.2355, + "num_input_tokens_seen": 6992416, + "step": 10715 + }, + { + "epoch": 5.618448637316562, + "grad_norm": 1.6825109720230103, + "learning_rate": 2.397120495003784e-05, + "loss": 0.2757, + "num_input_tokens_seen": 6995520, + "step": 10720 + }, + { + "epoch": 5.621069182389937, + "grad_norm": 1.1350255012512207, + "learning_rate": 2.394835618561768e-05, + "loss": 0.2338, + "num_input_tokens_seen": 6998464, + "step": 10725 + }, + { + "epoch": 5.623689727463312, + "grad_norm": 1.8160197734832764, + "learning_rate": 2.3925508301165043e-05, + "loss": 0.2989, + "num_input_tokens_seen": 7001600, + "step": 10730 + }, + { + "epoch": 5.626310272536688, + "grad_norm": 1.653826355934143, + "learning_rate": 2.390266131579796e-05, + "loss": 0.2975, + "num_input_tokens_seen": 7004704, + "step": 10735 + }, + { + "epoch": 5.628930817610063, + "grad_norm": 1.8042151927947998, + "learning_rate": 2.3879815248633768e-05, + "loss": 0.3862, + "num_input_tokens_seen": 7007424, + "step": 10740 + }, + { + "epoch": 5.631551362683438, + "grad_norm": 0.8002895712852478, + "learning_rate": 2.385697011878902e-05, + "loss": 0.2334, + "num_input_tokens_seen": 7010688, + "step": 10745 + }, + { + "epoch": 5.634171907756813, + "grad_norm": 1.033745527267456, + "learning_rate": 2.383412594537945e-05, + "loss": 0.2162, + "num_input_tokens_seen": 7014048, + "step": 10750 + }, + { + "epoch": 5.636792452830189, + "grad_norm": 1.5148131847381592, + "learning_rate": 2.3811282747520038e-05, + "loss": 0.2717, + "num_input_tokens_seen": 7017856, + "step": 10755 + }, + { + "epoch": 5.639412997903564, + "grad_norm": 1.4131299257278442, + "learning_rate": 2.378844054432493e-05, + "loss": 0.236, + "num_input_tokens_seen": 7021248, + "step": 10760 + }, + { + "epoch": 5.6420335429769395, + "grad_norm": 1.5122156143188477, + "learning_rate": 2.3765599354907427e-05, + "loss": 0.3137, + "num_input_tokens_seen": 7024320, + "step": 10765 + }, + { + "epoch": 5.644654088050315, + "grad_norm": 3.0795578956604004, + "learning_rate": 2.374275919838e-05, + "loss": 0.3176, + "num_input_tokens_seen": 7027296, + "step": 10770 + }, + { + "epoch": 5.64727463312369, + "grad_norm": 0.9495642185211182, + "learning_rate": 2.371992009385425e-05, + "loss": 0.2889, + "num_input_tokens_seen": 7030784, + "step": 10775 + }, + { + "epoch": 5.649895178197065, + "grad_norm": 1.136631727218628, + "learning_rate": 2.369708206044089e-05, + "loss": 0.4798, + "num_input_tokens_seen": 7034112, + "step": 10780 + }, + { + "epoch": 5.65251572327044, + "grad_norm": 1.711442470550537, + "learning_rate": 2.3674245117249747e-05, + "loss": 0.2388, + "num_input_tokens_seen": 7039072, + "step": 10785 + }, + { + "epoch": 5.655136268343815, + "grad_norm": 1.45369553565979, + "learning_rate": 2.3651409283389743e-05, + "loss": 0.2912, + "num_input_tokens_seen": 7042304, + "step": 10790 + }, + { + "epoch": 5.6577568134171905, + "grad_norm": 1.4934300184249878, + "learning_rate": 2.3628574577968834e-05, + "loss": 0.3547, + "num_input_tokens_seen": 7046400, + "step": 10795 + }, + { + "epoch": 5.660377358490566, + "grad_norm": 1.317301869392395, + "learning_rate": 2.360574102009408e-05, + "loss": 0.2939, + "num_input_tokens_seen": 7050656, + "step": 10800 + }, + { + "epoch": 5.662997903563941, + "grad_norm": 1.587763786315918, + "learning_rate": 2.3582908628871554e-05, + "loss": 0.2767, + "num_input_tokens_seen": 7053824, + "step": 10805 + }, + { + "epoch": 5.665618448637317, + "grad_norm": 2.588754415512085, + "learning_rate": 2.3560077423406355e-05, + "loss": 0.3162, + "num_input_tokens_seen": 7057760, + "step": 10810 + }, + { + "epoch": 5.668238993710692, + "grad_norm": 1.233951210975647, + "learning_rate": 2.3537247422802595e-05, + "loss": 0.2615, + "num_input_tokens_seen": 7061888, + "step": 10815 + }, + { + "epoch": 5.670859538784067, + "grad_norm": 0.6391374468803406, + "learning_rate": 2.351441864616338e-05, + "loss": 0.2295, + "num_input_tokens_seen": 7064960, + "step": 10820 + }, + { + "epoch": 5.673480083857442, + "grad_norm": 1.1321889162063599, + "learning_rate": 2.3491591112590776e-05, + "loss": 0.2681, + "num_input_tokens_seen": 7067968, + "step": 10825 + }, + { + "epoch": 5.676100628930818, + "grad_norm": 1.5624566078186035, + "learning_rate": 2.346876484118584e-05, + "loss": 0.3377, + "num_input_tokens_seen": 7071424, + "step": 10830 + }, + { + "epoch": 5.678721174004193, + "grad_norm": 1.8861722946166992, + "learning_rate": 2.3445939851048533e-05, + "loss": 0.3386, + "num_input_tokens_seen": 7074656, + "step": 10835 + }, + { + "epoch": 5.681341719077568, + "grad_norm": 1.753158688545227, + "learning_rate": 2.342311616127777e-05, + "loss": 0.3318, + "num_input_tokens_seen": 7077952, + "step": 10840 + }, + { + "epoch": 5.683962264150943, + "grad_norm": 2.3471720218658447, + "learning_rate": 2.3400293790971378e-05, + "loss": 0.2315, + "num_input_tokens_seen": 7080512, + "step": 10845 + }, + { + "epoch": 5.686582809224319, + "grad_norm": 2.118250608444214, + "learning_rate": 2.3377472759226064e-05, + "loss": 0.2912, + "num_input_tokens_seen": 7083904, + "step": 10850 + }, + { + "epoch": 5.689203354297694, + "grad_norm": 2.0518412590026855, + "learning_rate": 2.3354653085137433e-05, + "loss": 0.2927, + "num_input_tokens_seen": 7086144, + "step": 10855 + }, + { + "epoch": 5.6918238993710695, + "grad_norm": 1.5196763277053833, + "learning_rate": 2.333183478779995e-05, + "loss": 0.3052, + "num_input_tokens_seen": 7088608, + "step": 10860 + }, + { + "epoch": 5.694444444444445, + "grad_norm": 0.9430700540542603, + "learning_rate": 2.330901788630691e-05, + "loss": 0.2278, + "num_input_tokens_seen": 7091296, + "step": 10865 + }, + { + "epoch": 5.69706498951782, + "grad_norm": 2.0101895332336426, + "learning_rate": 2.3286202399750463e-05, + "loss": 0.2591, + "num_input_tokens_seen": 7095008, + "step": 10870 + }, + { + "epoch": 5.699685534591195, + "grad_norm": 1.6852608919143677, + "learning_rate": 2.3263388347221575e-05, + "loss": 0.3014, + "num_input_tokens_seen": 7098016, + "step": 10875 + }, + { + "epoch": 5.70230607966457, + "grad_norm": 1.0314075946807861, + "learning_rate": 2.3240575747809984e-05, + "loss": 0.2432, + "num_input_tokens_seen": 7101056, + "step": 10880 + }, + { + "epoch": 5.704926624737945, + "grad_norm": 1.6371474266052246, + "learning_rate": 2.3217764620604233e-05, + "loss": 0.2194, + "num_input_tokens_seen": 7103488, + "step": 10885 + }, + { + "epoch": 5.7075471698113205, + "grad_norm": 1.2762826681137085, + "learning_rate": 2.3194954984691656e-05, + "loss": 0.2637, + "num_input_tokens_seen": 7107072, + "step": 10890 + }, + { + "epoch": 5.710167714884696, + "grad_norm": 1.4565401077270508, + "learning_rate": 2.3172146859158282e-05, + "loss": 0.3178, + "num_input_tokens_seen": 7110336, + "step": 10895 + }, + { + "epoch": 5.712788259958071, + "grad_norm": 1.304860234260559, + "learning_rate": 2.314934026308893e-05, + "loss": 0.3445, + "num_input_tokens_seen": 7114240, + "step": 10900 + }, + { + "epoch": 5.715408805031447, + "grad_norm": 1.2572346925735474, + "learning_rate": 2.3126535215567112e-05, + "loss": 0.342, + "num_input_tokens_seen": 7121216, + "step": 10905 + }, + { + "epoch": 5.718029350104822, + "grad_norm": 2.0347084999084473, + "learning_rate": 2.3103731735675045e-05, + "loss": 0.2805, + "num_input_tokens_seen": 7124800, + "step": 10910 + }, + { + "epoch": 5.720649895178197, + "grad_norm": 1.6089833974838257, + "learning_rate": 2.308092984249365e-05, + "loss": 0.3769, + "num_input_tokens_seen": 7128224, + "step": 10915 + }, + { + "epoch": 5.723270440251572, + "grad_norm": 1.642440676689148, + "learning_rate": 2.3058129555102498e-05, + "loss": 0.2377, + "num_input_tokens_seen": 7131456, + "step": 10920 + }, + { + "epoch": 5.725890985324948, + "grad_norm": 1.6446024179458618, + "learning_rate": 2.3035330892579825e-05, + "loss": 0.3131, + "num_input_tokens_seen": 7134944, + "step": 10925 + }, + { + "epoch": 5.728511530398323, + "grad_norm": 1.0204812288284302, + "learning_rate": 2.3012533874002534e-05, + "loss": 0.4228, + "num_input_tokens_seen": 7139200, + "step": 10930 + }, + { + "epoch": 5.731132075471698, + "grad_norm": 1.8705710172653198, + "learning_rate": 2.2989738518446104e-05, + "loss": 0.2426, + "num_input_tokens_seen": 7141792, + "step": 10935 + }, + { + "epoch": 5.733752620545073, + "grad_norm": 0.7263554930686951, + "learning_rate": 2.2966944844984658e-05, + "loss": 0.2776, + "num_input_tokens_seen": 7145152, + "step": 10940 + }, + { + "epoch": 5.736373165618449, + "grad_norm": 1.5773160457611084, + "learning_rate": 2.29441528726909e-05, + "loss": 0.4254, + "num_input_tokens_seen": 7148192, + "step": 10945 + }, + { + "epoch": 5.738993710691824, + "grad_norm": 1.3763858079910278, + "learning_rate": 2.292136262063611e-05, + "loss": 0.2538, + "num_input_tokens_seen": 7151520, + "step": 10950 + }, + { + "epoch": 5.7416142557651995, + "grad_norm": 2.2206859588623047, + "learning_rate": 2.289857410789013e-05, + "loss": 0.2488, + "num_input_tokens_seen": 7154496, + "step": 10955 + }, + { + "epoch": 5.744234800838575, + "grad_norm": 1.1672108173370361, + "learning_rate": 2.287578735352136e-05, + "loss": 0.33, + "num_input_tokens_seen": 7157600, + "step": 10960 + }, + { + "epoch": 5.74685534591195, + "grad_norm": 2.2323615550994873, + "learning_rate": 2.285300237659668e-05, + "loss": 0.3144, + "num_input_tokens_seen": 7160224, + "step": 10965 + }, + { + "epoch": 5.749475890985325, + "grad_norm": 1.081146240234375, + "learning_rate": 2.283021919618155e-05, + "loss": 0.2307, + "num_input_tokens_seen": 7162720, + "step": 10970 + }, + { + "epoch": 5.7520964360587, + "grad_norm": 1.1632752418518066, + "learning_rate": 2.28074378313399e-05, + "loss": 0.4011, + "num_input_tokens_seen": 7166240, + "step": 10975 + }, + { + "epoch": 5.754716981132075, + "grad_norm": 3.498849868774414, + "learning_rate": 2.2784658301134105e-05, + "loss": 0.3469, + "num_input_tokens_seen": 7172480, + "step": 10980 + }, + { + "epoch": 5.7573375262054505, + "grad_norm": 1.2528836727142334, + "learning_rate": 2.2761880624625048e-05, + "loss": 0.3445, + "num_input_tokens_seen": 7176416, + "step": 10985 + }, + { + "epoch": 5.759958071278826, + "grad_norm": 1.5371856689453125, + "learning_rate": 2.2739104820872062e-05, + "loss": 0.3918, + "num_input_tokens_seen": 7180832, + "step": 10990 + }, + { + "epoch": 5.762578616352201, + "grad_norm": 2.2767114639282227, + "learning_rate": 2.271633090893288e-05, + "loss": 0.3047, + "num_input_tokens_seen": 7184416, + "step": 10995 + }, + { + "epoch": 5.765199161425577, + "grad_norm": 1.8045638799667358, + "learning_rate": 2.269355890786368e-05, + "loss": 0.2726, + "num_input_tokens_seen": 7186816, + "step": 11000 + }, + { + "epoch": 5.767819706498952, + "grad_norm": 1.7524943351745605, + "learning_rate": 2.2670788836719037e-05, + "loss": 0.3829, + "num_input_tokens_seen": 7189824, + "step": 11005 + }, + { + "epoch": 5.770440251572327, + "grad_norm": 2.4163718223571777, + "learning_rate": 2.2648020714551897e-05, + "loss": 0.3618, + "num_input_tokens_seen": 7192640, + "step": 11010 + }, + { + "epoch": 5.773060796645702, + "grad_norm": 1.7317065000534058, + "learning_rate": 2.26252545604136e-05, + "loss": 0.3361, + "num_input_tokens_seen": 7195328, + "step": 11015 + }, + { + "epoch": 5.7756813417190775, + "grad_norm": 2.4882237911224365, + "learning_rate": 2.2602490393353798e-05, + "loss": 0.4698, + "num_input_tokens_seen": 7198464, + "step": 11020 + }, + { + "epoch": 5.778301886792453, + "grad_norm": 1.8632382154464722, + "learning_rate": 2.2579728232420525e-05, + "loss": 0.2619, + "num_input_tokens_seen": 7202080, + "step": 11025 + }, + { + "epoch": 5.780922431865828, + "grad_norm": 1.3826314210891724, + "learning_rate": 2.255696809666012e-05, + "loss": 0.3086, + "num_input_tokens_seen": 7204832, + "step": 11030 + }, + { + "epoch": 5.783542976939203, + "grad_norm": 1.4365938901901245, + "learning_rate": 2.253421000511721e-05, + "loss": 0.2706, + "num_input_tokens_seen": 7208128, + "step": 11035 + }, + { + "epoch": 5.786163522012579, + "grad_norm": 1.1301137208938599, + "learning_rate": 2.2511453976834733e-05, + "loss": 0.2581, + "num_input_tokens_seen": 7211232, + "step": 11040 + }, + { + "epoch": 5.788784067085954, + "grad_norm": 2.2140986919403076, + "learning_rate": 2.2488700030853907e-05, + "loss": 0.2738, + "num_input_tokens_seen": 7214720, + "step": 11045 + }, + { + "epoch": 5.7914046121593294, + "grad_norm": 1.8359766006469727, + "learning_rate": 2.2465948186214175e-05, + "loss": 0.3792, + "num_input_tokens_seen": 7219200, + "step": 11050 + }, + { + "epoch": 5.794025157232705, + "grad_norm": 1.1771100759506226, + "learning_rate": 2.244319846195325e-05, + "loss": 0.2927, + "num_input_tokens_seen": 7223808, + "step": 11055 + }, + { + "epoch": 5.79664570230608, + "grad_norm": 2.5643551349639893, + "learning_rate": 2.2420450877107075e-05, + "loss": 0.3262, + "num_input_tokens_seen": 7227264, + "step": 11060 + }, + { + "epoch": 5.799266247379455, + "grad_norm": 1.378183364868164, + "learning_rate": 2.2397705450709763e-05, + "loss": 0.347, + "num_input_tokens_seen": 7232672, + "step": 11065 + }, + { + "epoch": 5.80188679245283, + "grad_norm": 1.9163074493408203, + "learning_rate": 2.237496220179366e-05, + "loss": 0.2691, + "num_input_tokens_seen": 7235584, + "step": 11070 + }, + { + "epoch": 5.804507337526205, + "grad_norm": 1.8348073959350586, + "learning_rate": 2.235222114938929e-05, + "loss": 0.3331, + "num_input_tokens_seen": 7238592, + "step": 11075 + }, + { + "epoch": 5.8071278825995805, + "grad_norm": 1.526652455329895, + "learning_rate": 2.232948231252531e-05, + "loss": 0.3164, + "num_input_tokens_seen": 7241952, + "step": 11080 + }, + { + "epoch": 5.809748427672956, + "grad_norm": 1.349829912185669, + "learning_rate": 2.2306745710228545e-05, + "loss": 0.2977, + "num_input_tokens_seen": 7245056, + "step": 11085 + }, + { + "epoch": 5.812368972746331, + "grad_norm": 1.4606393575668335, + "learning_rate": 2.2284011361523954e-05, + "loss": 0.3304, + "num_input_tokens_seen": 7248896, + "step": 11090 + }, + { + "epoch": 5.814989517819707, + "grad_norm": 1.6900333166122437, + "learning_rate": 2.2261279285434588e-05, + "loss": 0.3871, + "num_input_tokens_seen": 7251680, + "step": 11095 + }, + { + "epoch": 5.817610062893082, + "grad_norm": 1.3500412702560425, + "learning_rate": 2.2238549500981626e-05, + "loss": 0.3801, + "num_input_tokens_seen": 7254432, + "step": 11100 + }, + { + "epoch": 5.820230607966457, + "grad_norm": 4.054520130157471, + "learning_rate": 2.2215822027184294e-05, + "loss": 0.2976, + "num_input_tokens_seen": 7257280, + "step": 11105 + }, + { + "epoch": 5.822851153039832, + "grad_norm": 1.1890689134597778, + "learning_rate": 2.2193096883059913e-05, + "loss": 0.262, + "num_input_tokens_seen": 7259840, + "step": 11110 + }, + { + "epoch": 5.8254716981132075, + "grad_norm": 1.4350340366363525, + "learning_rate": 2.2170374087623853e-05, + "loss": 0.2725, + "num_input_tokens_seen": 7262624, + "step": 11115 + }, + { + "epoch": 5.828092243186583, + "grad_norm": 1.1100026369094849, + "learning_rate": 2.2147653659889494e-05, + "loss": 0.2894, + "num_input_tokens_seen": 7265568, + "step": 11120 + }, + { + "epoch": 5.830712788259958, + "grad_norm": 1.4940029382705688, + "learning_rate": 2.2124935618868266e-05, + "loss": 0.2634, + "num_input_tokens_seen": 7268896, + "step": 11125 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 2.1425297260284424, + "learning_rate": 2.210221998356959e-05, + "loss": 0.2665, + "num_input_tokens_seen": 7271328, + "step": 11130 + }, + { + "epoch": 5.835953878406709, + "grad_norm": 1.5910600423812866, + "learning_rate": 2.2079506773000862e-05, + "loss": 0.3462, + "num_input_tokens_seen": 7274752, + "step": 11135 + }, + { + "epoch": 5.838574423480084, + "grad_norm": 1.2377848625183105, + "learning_rate": 2.205679600616746e-05, + "loss": 0.4082, + "num_input_tokens_seen": 7279168, + "step": 11140 + }, + { + "epoch": 5.841194968553459, + "grad_norm": 1.8677080869674683, + "learning_rate": 2.2034087702072736e-05, + "loss": 0.301, + "num_input_tokens_seen": 7282944, + "step": 11145 + }, + { + "epoch": 5.843815513626835, + "grad_norm": 2.9494314193725586, + "learning_rate": 2.2011381879717928e-05, + "loss": 0.3685, + "num_input_tokens_seen": 7285824, + "step": 11150 + }, + { + "epoch": 5.84643605870021, + "grad_norm": 4.219709396362305, + "learning_rate": 2.1988678558102255e-05, + "loss": 0.2983, + "num_input_tokens_seen": 7288384, + "step": 11155 + }, + { + "epoch": 5.849056603773585, + "grad_norm": 1.8136125802993774, + "learning_rate": 2.1965977756222816e-05, + "loss": 0.3486, + "num_input_tokens_seen": 7291840, + "step": 11160 + }, + { + "epoch": 5.85167714884696, + "grad_norm": 1.1853052377700806, + "learning_rate": 2.1943279493074595e-05, + "loss": 0.2915, + "num_input_tokens_seen": 7296032, + "step": 11165 + }, + { + "epoch": 5.854297693920335, + "grad_norm": 1.4001506567001343, + "learning_rate": 2.192058378765047e-05, + "loss": 0.4307, + "num_input_tokens_seen": 7298560, + "step": 11170 + }, + { + "epoch": 5.8569182389937104, + "grad_norm": 1.5030360221862793, + "learning_rate": 2.1897890658941175e-05, + "loss": 0.2672, + "num_input_tokens_seen": 7301376, + "step": 11175 + }, + { + "epoch": 5.859538784067086, + "grad_norm": 2.0272531509399414, + "learning_rate": 2.1875200125935273e-05, + "loss": 0.3551, + "num_input_tokens_seen": 7304896, + "step": 11180 + }, + { + "epoch": 5.862159329140461, + "grad_norm": 1.8083481788635254, + "learning_rate": 2.185251220761917e-05, + "loss": 0.3122, + "num_input_tokens_seen": 7307968, + "step": 11185 + }, + { + "epoch": 5.864779874213837, + "grad_norm": 1.601516842842102, + "learning_rate": 2.182982692297709e-05, + "loss": 0.3317, + "num_input_tokens_seen": 7311040, + "step": 11190 + }, + { + "epoch": 5.867400419287212, + "grad_norm": 1.3975056409835815, + "learning_rate": 2.180714429099102e-05, + "loss": 0.2645, + "num_input_tokens_seen": 7314848, + "step": 11195 + }, + { + "epoch": 5.870020964360587, + "grad_norm": 1.5992439985275269, + "learning_rate": 2.1784464330640774e-05, + "loss": 0.3166, + "num_input_tokens_seen": 7317440, + "step": 11200 + }, + { + "epoch": 5.872641509433962, + "grad_norm": 1.5235990285873413, + "learning_rate": 2.1761787060903888e-05, + "loss": 0.3284, + "num_input_tokens_seen": 7320288, + "step": 11205 + }, + { + "epoch": 5.8752620545073375, + "grad_norm": 1.722038984298706, + "learning_rate": 2.1739112500755673e-05, + "loss": 0.3472, + "num_input_tokens_seen": 7323168, + "step": 11210 + }, + { + "epoch": 5.877882599580713, + "grad_norm": 1.2175216674804688, + "learning_rate": 2.1716440669169175e-05, + "loss": 0.2762, + "num_input_tokens_seen": 7326048, + "step": 11215 + }, + { + "epoch": 5.880503144654088, + "grad_norm": 1.6307625770568848, + "learning_rate": 2.169377158511513e-05, + "loss": 0.3026, + "num_input_tokens_seen": 7328288, + "step": 11220 + }, + { + "epoch": 5.883123689727463, + "grad_norm": 3.451291084289551, + "learning_rate": 2.1671105267562e-05, + "loss": 0.3224, + "num_input_tokens_seen": 7331328, + "step": 11225 + }, + { + "epoch": 5.885744234800838, + "grad_norm": 1.0780895948410034, + "learning_rate": 2.1648441735475936e-05, + "loss": 0.2617, + "num_input_tokens_seen": 7334720, + "step": 11230 + }, + { + "epoch": 5.888364779874214, + "grad_norm": 1.6970548629760742, + "learning_rate": 2.1625781007820723e-05, + "loss": 0.2511, + "num_input_tokens_seen": 7337312, + "step": 11235 + }, + { + "epoch": 5.890985324947589, + "grad_norm": 1.8641880750656128, + "learning_rate": 2.160312310355783e-05, + "loss": 0.3659, + "num_input_tokens_seen": 7340384, + "step": 11240 + }, + { + "epoch": 5.893605870020965, + "grad_norm": 0.9328728318214417, + "learning_rate": 2.1580468041646378e-05, + "loss": 0.3424, + "num_input_tokens_seen": 7343936, + "step": 11245 + }, + { + "epoch": 5.89622641509434, + "grad_norm": 1.3599082231521606, + "learning_rate": 2.155781584104306e-05, + "loss": 0.295, + "num_input_tokens_seen": 7346848, + "step": 11250 + }, + { + "epoch": 5.898846960167715, + "grad_norm": 2.0308847427368164, + "learning_rate": 2.153516652070221e-05, + "loss": 0.3667, + "num_input_tokens_seen": 7350112, + "step": 11255 + }, + { + "epoch": 5.90146750524109, + "grad_norm": 2.245769739151001, + "learning_rate": 2.1512520099575756e-05, + "loss": 0.2732, + "num_input_tokens_seen": 7353952, + "step": 11260 + }, + { + "epoch": 5.904088050314465, + "grad_norm": 2.394798517227173, + "learning_rate": 2.1489876596613176e-05, + "loss": 0.3735, + "num_input_tokens_seen": 7357440, + "step": 11265 + }, + { + "epoch": 5.90670859538784, + "grad_norm": 1.588474988937378, + "learning_rate": 2.146723603076152e-05, + "loss": 0.3605, + "num_input_tokens_seen": 7360800, + "step": 11270 + }, + { + "epoch": 5.909329140461216, + "grad_norm": 2.0924081802368164, + "learning_rate": 2.14445984209654e-05, + "loss": 0.3685, + "num_input_tokens_seen": 7364032, + "step": 11275 + }, + { + "epoch": 5.911949685534591, + "grad_norm": 1.9490742683410645, + "learning_rate": 2.14219637861669e-05, + "loss": 0.3395, + "num_input_tokens_seen": 7366880, + "step": 11280 + }, + { + "epoch": 5.914570230607967, + "grad_norm": 1.1018174886703491, + "learning_rate": 2.1399332145305678e-05, + "loss": 0.2665, + "num_input_tokens_seen": 7369664, + "step": 11285 + }, + { + "epoch": 5.917190775681342, + "grad_norm": 1.401915192604065, + "learning_rate": 2.1376703517318837e-05, + "loss": 0.3918, + "num_input_tokens_seen": 7372192, + "step": 11290 + }, + { + "epoch": 5.919811320754717, + "grad_norm": 2.6432907581329346, + "learning_rate": 2.1354077921140984e-05, + "loss": 0.2747, + "num_input_tokens_seen": 7374720, + "step": 11295 + }, + { + "epoch": 5.922431865828092, + "grad_norm": 1.5218701362609863, + "learning_rate": 2.1331455375704195e-05, + "loss": 0.2885, + "num_input_tokens_seen": 7378112, + "step": 11300 + }, + { + "epoch": 5.9250524109014675, + "grad_norm": 1.9604367017745972, + "learning_rate": 2.1308835899937972e-05, + "loss": 0.3291, + "num_input_tokens_seen": 7381728, + "step": 11305 + }, + { + "epoch": 5.927672955974843, + "grad_norm": 2.0925943851470947, + "learning_rate": 2.128621951276926e-05, + "loss": 0.3337, + "num_input_tokens_seen": 7385504, + "step": 11310 + }, + { + "epoch": 5.930293501048218, + "grad_norm": 0.9930068254470825, + "learning_rate": 2.126360623312243e-05, + "loss": 0.2568, + "num_input_tokens_seen": 7389600, + "step": 11315 + }, + { + "epoch": 5.932914046121593, + "grad_norm": 2.1949737071990967, + "learning_rate": 2.124099607991922e-05, + "loss": 0.2868, + "num_input_tokens_seen": 7394240, + "step": 11320 + }, + { + "epoch": 5.935534591194968, + "grad_norm": 1.0724313259124756, + "learning_rate": 2.121838907207879e-05, + "loss": 0.3074, + "num_input_tokens_seen": 7396704, + "step": 11325 + }, + { + "epoch": 5.938155136268344, + "grad_norm": 1.0054652690887451, + "learning_rate": 2.1195785228517658e-05, + "loss": 0.2698, + "num_input_tokens_seen": 7400352, + "step": 11330 + }, + { + "epoch": 5.940775681341719, + "grad_norm": 1.1702011823654175, + "learning_rate": 2.117318456814967e-05, + "loss": 0.2477, + "num_input_tokens_seen": 7402816, + "step": 11335 + }, + { + "epoch": 5.943396226415095, + "grad_norm": 1.6418579816818237, + "learning_rate": 2.1150587109886026e-05, + "loss": 0.2729, + "num_input_tokens_seen": 7405568, + "step": 11340 + }, + { + "epoch": 5.94601677148847, + "grad_norm": 1.316508173942566, + "learning_rate": 2.1127992872635263e-05, + "loss": 0.326, + "num_input_tokens_seen": 7408608, + "step": 11345 + }, + { + "epoch": 5.948637316561845, + "grad_norm": 1.4878672361373901, + "learning_rate": 2.1105401875303193e-05, + "loss": 0.3315, + "num_input_tokens_seen": 7412448, + "step": 11350 + }, + { + "epoch": 5.95125786163522, + "grad_norm": 3.036696195602417, + "learning_rate": 2.1082814136792937e-05, + "loss": 0.2423, + "num_input_tokens_seen": 7415104, + "step": 11355 + }, + { + "epoch": 5.953878406708595, + "grad_norm": 1.7637921571731567, + "learning_rate": 2.1060229676004887e-05, + "loss": 0.2725, + "num_input_tokens_seen": 7418944, + "step": 11360 + }, + { + "epoch": 5.95649895178197, + "grad_norm": 0.9687924981117249, + "learning_rate": 2.1037648511836675e-05, + "loss": 0.255, + "num_input_tokens_seen": 7422816, + "step": 11365 + }, + { + "epoch": 5.959119496855346, + "grad_norm": 1.678527593612671, + "learning_rate": 2.1015070663183195e-05, + "loss": 0.2152, + "num_input_tokens_seen": 7425824, + "step": 11370 + }, + { + "epoch": 5.961740041928721, + "grad_norm": 1.0100524425506592, + "learning_rate": 2.0992496148936573e-05, + "loss": 0.3689, + "num_input_tokens_seen": 7429088, + "step": 11375 + }, + { + "epoch": 5.964360587002097, + "grad_norm": 1.1391823291778564, + "learning_rate": 2.0969924987986107e-05, + "loss": 0.3684, + "num_input_tokens_seen": 7432736, + "step": 11380 + }, + { + "epoch": 5.966981132075472, + "grad_norm": 1.505523681640625, + "learning_rate": 2.0947357199218325e-05, + "loss": 0.3618, + "num_input_tokens_seen": 7435840, + "step": 11385 + }, + { + "epoch": 5.969601677148847, + "grad_norm": 2.466850996017456, + "learning_rate": 2.0924792801516922e-05, + "loss": 0.2553, + "num_input_tokens_seen": 7441056, + "step": 11390 + }, + { + "epoch": 5.972222222222222, + "grad_norm": 0.9482998847961426, + "learning_rate": 2.0902231813762753e-05, + "loss": 0.3494, + "num_input_tokens_seen": 7444640, + "step": 11395 + }, + { + "epoch": 5.9748427672955975, + "grad_norm": 1.6494003534317017, + "learning_rate": 2.0879674254833828e-05, + "loss": 0.3165, + "num_input_tokens_seen": 7448096, + "step": 11400 + }, + { + "epoch": 5.977463312368973, + "grad_norm": 0.9905501008033752, + "learning_rate": 2.085712014360527e-05, + "loss": 0.2725, + "num_input_tokens_seen": 7451520, + "step": 11405 + }, + { + "epoch": 5.980083857442348, + "grad_norm": 2.0314829349517822, + "learning_rate": 2.0834569498949342e-05, + "loss": 0.2834, + "num_input_tokens_seen": 7454048, + "step": 11410 + }, + { + "epoch": 5.982704402515723, + "grad_norm": 1.4458974599838257, + "learning_rate": 2.0812022339735395e-05, + "loss": 0.239, + "num_input_tokens_seen": 7457216, + "step": 11415 + }, + { + "epoch": 5.985324947589098, + "grad_norm": 1.0548394918441772, + "learning_rate": 2.0789478684829846e-05, + "loss": 0.3113, + "num_input_tokens_seen": 7460992, + "step": 11420 + }, + { + "epoch": 5.987945492662474, + "grad_norm": 1.8784269094467163, + "learning_rate": 2.0766938553096204e-05, + "loss": 0.2578, + "num_input_tokens_seen": 7463488, + "step": 11425 + }, + { + "epoch": 5.990566037735849, + "grad_norm": 1.2264389991760254, + "learning_rate": 2.0744401963395027e-05, + "loss": 0.3137, + "num_input_tokens_seen": 7466304, + "step": 11430 + }, + { + "epoch": 5.993186582809225, + "grad_norm": 1.1381311416625977, + "learning_rate": 2.0721868934583897e-05, + "loss": 0.3285, + "num_input_tokens_seen": 7470176, + "step": 11435 + }, + { + "epoch": 5.9958071278826, + "grad_norm": 1.6372565031051636, + "learning_rate": 2.0699339485517422e-05, + "loss": 0.2623, + "num_input_tokens_seen": 7473376, + "step": 11440 + }, + { + "epoch": 5.998427672955975, + "grad_norm": 2.589959144592285, + "learning_rate": 2.0676813635047225e-05, + "loss": 0.3087, + "num_input_tokens_seen": 7476864, + "step": 11445 + }, + { + "epoch": 6.0, + "eval_loss": 0.5362622141838074, + "eval_runtime": 15.9787, + "eval_samples_per_second": 53.071, + "eval_steps_per_second": 13.268, + "num_input_tokens_seen": 7478504, + "step": 11448 + }, + { + "epoch": 6.00104821802935, + "grad_norm": 1.1252886056900024, + "learning_rate": 2.0654291402021896e-05, + "loss": 0.2559, + "num_input_tokens_seen": 7479688, + "step": 11450 + }, + { + "epoch": 6.003668763102725, + "grad_norm": 0.9952239394187927, + "learning_rate": 2.063177280528702e-05, + "loss": 0.2381, + "num_input_tokens_seen": 7483176, + "step": 11455 + }, + { + "epoch": 6.0062893081761, + "grad_norm": 1.321679711341858, + "learning_rate": 2.0609257863685142e-05, + "loss": 0.259, + "num_input_tokens_seen": 7486120, + "step": 11460 + }, + { + "epoch": 6.008909853249476, + "grad_norm": 1.6771317720413208, + "learning_rate": 2.0586746596055706e-05, + "loss": 0.2618, + "num_input_tokens_seen": 7488872, + "step": 11465 + }, + { + "epoch": 6.011530398322851, + "grad_norm": 2.1230344772338867, + "learning_rate": 2.0564239021235128e-05, + "loss": 0.284, + "num_input_tokens_seen": 7491880, + "step": 11470 + }, + { + "epoch": 6.014150943396227, + "grad_norm": 1.7935186624526978, + "learning_rate": 2.0541735158056733e-05, + "loss": 0.2154, + "num_input_tokens_seen": 7494952, + "step": 11475 + }, + { + "epoch": 6.016771488469602, + "grad_norm": 1.0849055051803589, + "learning_rate": 2.0519235025350704e-05, + "loss": 0.2987, + "num_input_tokens_seen": 7498600, + "step": 11480 + }, + { + "epoch": 6.019392033542977, + "grad_norm": 1.2185665369033813, + "learning_rate": 2.0496738641944133e-05, + "loss": 0.2858, + "num_input_tokens_seen": 7502696, + "step": 11485 + }, + { + "epoch": 6.022012578616352, + "grad_norm": 1.3220360279083252, + "learning_rate": 2.0474246026660966e-05, + "loss": 0.2465, + "num_input_tokens_seen": 7505800, + "step": 11490 + }, + { + "epoch": 6.0246331236897275, + "grad_norm": 2.3902218341827393, + "learning_rate": 2.0451757198321992e-05, + "loss": 0.2678, + "num_input_tokens_seen": 7508712, + "step": 11495 + }, + { + "epoch": 6.027253668763103, + "grad_norm": 1.217427134513855, + "learning_rate": 2.042927217574485e-05, + "loss": 0.2096, + "num_input_tokens_seen": 7511400, + "step": 11500 + }, + { + "epoch": 6.029874213836478, + "grad_norm": 1.546868085861206, + "learning_rate": 2.040679097774396e-05, + "loss": 0.2796, + "num_input_tokens_seen": 7515208, + "step": 11505 + }, + { + "epoch": 6.032494758909853, + "grad_norm": 1.944570541381836, + "learning_rate": 2.0384313623130565e-05, + "loss": 0.1444, + "num_input_tokens_seen": 7520712, + "step": 11510 + }, + { + "epoch": 6.035115303983228, + "grad_norm": 1.0628665685653687, + "learning_rate": 2.0361840130712706e-05, + "loss": 0.2173, + "num_input_tokens_seen": 7522984, + "step": 11515 + }, + { + "epoch": 6.037735849056604, + "grad_norm": 1.059002161026001, + "learning_rate": 2.033937051929516e-05, + "loss": 0.2719, + "num_input_tokens_seen": 7526024, + "step": 11520 + }, + { + "epoch": 6.040356394129979, + "grad_norm": 2.9365551471710205, + "learning_rate": 2.0316904807679464e-05, + "loss": 0.2327, + "num_input_tokens_seen": 7529288, + "step": 11525 + }, + { + "epoch": 6.0429769392033545, + "grad_norm": 1.6005911827087402, + "learning_rate": 2.0294443014663923e-05, + "loss": 0.2245, + "num_input_tokens_seen": 7531752, + "step": 11530 + }, + { + "epoch": 6.04559748427673, + "grad_norm": 2.24049973487854, + "learning_rate": 2.0271985159043518e-05, + "loss": 0.2833, + "num_input_tokens_seen": 7535112, + "step": 11535 + }, + { + "epoch": 6.048218029350105, + "grad_norm": 2.9492459297180176, + "learning_rate": 2.0249531259609965e-05, + "loss": 0.2197, + "num_input_tokens_seen": 7538344, + "step": 11540 + }, + { + "epoch": 6.05083857442348, + "grad_norm": 1.9047576189041138, + "learning_rate": 2.0227081335151675e-05, + "loss": 0.2337, + "num_input_tokens_seen": 7541064, + "step": 11545 + }, + { + "epoch": 6.053459119496855, + "grad_norm": 1.543198823928833, + "learning_rate": 2.0204635404453688e-05, + "loss": 0.186, + "num_input_tokens_seen": 7544808, + "step": 11550 + }, + { + "epoch": 6.05607966457023, + "grad_norm": 1.7935700416564941, + "learning_rate": 2.0182193486297755e-05, + "loss": 0.2041, + "num_input_tokens_seen": 7548872, + "step": 11555 + }, + { + "epoch": 6.058700209643606, + "grad_norm": 1.7921264171600342, + "learning_rate": 2.0159755599462256e-05, + "loss": 0.3074, + "num_input_tokens_seen": 7551912, + "step": 11560 + }, + { + "epoch": 6.061320754716981, + "grad_norm": 1.6353380680084229, + "learning_rate": 2.0137321762722166e-05, + "loss": 0.2048, + "num_input_tokens_seen": 7554856, + "step": 11565 + }, + { + "epoch": 6.063941299790357, + "grad_norm": 3.2239859104156494, + "learning_rate": 2.0114891994849112e-05, + "loss": 0.3189, + "num_input_tokens_seen": 7558344, + "step": 11570 + }, + { + "epoch": 6.066561844863732, + "grad_norm": 1.9188010692596436, + "learning_rate": 2.0092466314611287e-05, + "loss": 0.2728, + "num_input_tokens_seen": 7561704, + "step": 11575 + }, + { + "epoch": 6.069182389937107, + "grad_norm": 2.3723671436309814, + "learning_rate": 2.0070044740773487e-05, + "loss": 0.2814, + "num_input_tokens_seen": 7563816, + "step": 11580 + }, + { + "epoch": 6.071802935010482, + "grad_norm": 1.2071486711502075, + "learning_rate": 2.0047627292097067e-05, + "loss": 0.1986, + "num_input_tokens_seen": 7566408, + "step": 11585 + }, + { + "epoch": 6.0744234800838575, + "grad_norm": 1.4221431016921997, + "learning_rate": 2.002521398733989e-05, + "loss": 0.2198, + "num_input_tokens_seen": 7569960, + "step": 11590 + }, + { + "epoch": 6.077044025157233, + "grad_norm": 2.2883737087249756, + "learning_rate": 2.0002804845256423e-05, + "loss": 0.2178, + "num_input_tokens_seen": 7572872, + "step": 11595 + }, + { + "epoch": 6.079664570230608, + "grad_norm": 2.7291321754455566, + "learning_rate": 1.9980399884597605e-05, + "loss": 0.3179, + "num_input_tokens_seen": 7576104, + "step": 11600 + }, + { + "epoch": 6.082285115303983, + "grad_norm": 1.132606863975525, + "learning_rate": 1.995799912411087e-05, + "loss": 0.199, + "num_input_tokens_seen": 7578856, + "step": 11605 + }, + { + "epoch": 6.084905660377358, + "grad_norm": 1.5992707014083862, + "learning_rate": 1.993560258254016e-05, + "loss": 0.2777, + "num_input_tokens_seen": 7582408, + "step": 11610 + }, + { + "epoch": 6.087526205450734, + "grad_norm": 1.3933792114257812, + "learning_rate": 1.9913210278625876e-05, + "loss": 0.2373, + "num_input_tokens_seen": 7585224, + "step": 11615 + }, + { + "epoch": 6.090146750524109, + "grad_norm": 2.0800514221191406, + "learning_rate": 1.9890822231104872e-05, + "loss": 0.2401, + "num_input_tokens_seen": 7587880, + "step": 11620 + }, + { + "epoch": 6.0927672955974845, + "grad_norm": 2.3412861824035645, + "learning_rate": 1.9868438458710447e-05, + "loss": 0.3311, + "num_input_tokens_seen": 7591784, + "step": 11625 + }, + { + "epoch": 6.09538784067086, + "grad_norm": 2.0664916038513184, + "learning_rate": 1.984605898017233e-05, + "loss": 0.2545, + "num_input_tokens_seen": 7594920, + "step": 11630 + }, + { + "epoch": 6.098008385744235, + "grad_norm": 2.9148876667022705, + "learning_rate": 1.9823683814216622e-05, + "loss": 0.3848, + "num_input_tokens_seen": 7598504, + "step": 11635 + }, + { + "epoch": 6.10062893081761, + "grad_norm": 1.1644994020462036, + "learning_rate": 1.980131297956586e-05, + "loss": 0.3215, + "num_input_tokens_seen": 7601640, + "step": 11640 + }, + { + "epoch": 6.103249475890985, + "grad_norm": 1.4788013696670532, + "learning_rate": 1.977894649493894e-05, + "loss": 0.2892, + "num_input_tokens_seen": 7604520, + "step": 11645 + }, + { + "epoch": 6.10587002096436, + "grad_norm": 1.1646517515182495, + "learning_rate": 1.9756584379051092e-05, + "loss": 0.1712, + "num_input_tokens_seen": 7608136, + "step": 11650 + }, + { + "epoch": 6.1084905660377355, + "grad_norm": 2.5735394954681396, + "learning_rate": 1.9734226650613928e-05, + "loss": 0.3147, + "num_input_tokens_seen": 7611592, + "step": 11655 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 2.4284942150115967, + "learning_rate": 1.9711873328335374e-05, + "loss": 0.2699, + "num_input_tokens_seen": 7614184, + "step": 11660 + }, + { + "epoch": 6.113731656184487, + "grad_norm": 2.0420773029327393, + "learning_rate": 1.9689524430919664e-05, + "loss": 0.1831, + "num_input_tokens_seen": 7617064, + "step": 11665 + }, + { + "epoch": 6.116352201257862, + "grad_norm": 1.206039309501648, + "learning_rate": 1.9667179977067344e-05, + "loss": 0.2344, + "num_input_tokens_seen": 7620680, + "step": 11670 + }, + { + "epoch": 6.118972746331237, + "grad_norm": 1.465491771697998, + "learning_rate": 1.9644839985475216e-05, + "loss": 0.1901, + "num_input_tokens_seen": 7623944, + "step": 11675 + }, + { + "epoch": 6.121593291404612, + "grad_norm": 4.6063313484191895, + "learning_rate": 1.962250447483638e-05, + "loss": 0.2247, + "num_input_tokens_seen": 7627144, + "step": 11680 + }, + { + "epoch": 6.1242138364779874, + "grad_norm": 1.7620121240615845, + "learning_rate": 1.960017346384017e-05, + "loss": 0.2792, + "num_input_tokens_seen": 7630216, + "step": 11685 + }, + { + "epoch": 6.126834381551363, + "grad_norm": 2.2679522037506104, + "learning_rate": 1.9577846971172144e-05, + "loss": 0.2387, + "num_input_tokens_seen": 7633000, + "step": 11690 + }, + { + "epoch": 6.129454926624738, + "grad_norm": 2.5539512634277344, + "learning_rate": 1.9555525015514097e-05, + "loss": 0.1901, + "num_input_tokens_seen": 7636104, + "step": 11695 + }, + { + "epoch": 6.132075471698113, + "grad_norm": 2.0879766941070557, + "learning_rate": 1.9533207615544034e-05, + "loss": 0.3774, + "num_input_tokens_seen": 7639208, + "step": 11700 + }, + { + "epoch": 6.134696016771488, + "grad_norm": 0.9438270330429077, + "learning_rate": 1.9510894789936113e-05, + "loss": 0.2565, + "num_input_tokens_seen": 7642184, + "step": 11705 + }, + { + "epoch": 6.137316561844864, + "grad_norm": 2.9627065658569336, + "learning_rate": 1.9488586557360703e-05, + "loss": 0.303, + "num_input_tokens_seen": 7645480, + "step": 11710 + }, + { + "epoch": 6.139937106918239, + "grad_norm": 1.1047650575637817, + "learning_rate": 1.9466282936484313e-05, + "loss": 0.2442, + "num_input_tokens_seen": 7648936, + "step": 11715 + }, + { + "epoch": 6.1425576519916145, + "grad_norm": 1.5424915552139282, + "learning_rate": 1.944398394596959e-05, + "loss": 0.2317, + "num_input_tokens_seen": 7651816, + "step": 11720 + }, + { + "epoch": 6.14517819706499, + "grad_norm": 1.5531866550445557, + "learning_rate": 1.942168960447531e-05, + "loss": 0.2964, + "num_input_tokens_seen": 7654568, + "step": 11725 + }, + { + "epoch": 6.147798742138365, + "grad_norm": 1.2254117727279663, + "learning_rate": 1.9399399930656377e-05, + "loss": 0.1615, + "num_input_tokens_seen": 7657544, + "step": 11730 + }, + { + "epoch": 6.15041928721174, + "grad_norm": 2.0252749919891357, + "learning_rate": 1.937711494316374e-05, + "loss": 0.3449, + "num_input_tokens_seen": 7660936, + "step": 11735 + }, + { + "epoch": 6.153039832285115, + "grad_norm": 1.7485922574996948, + "learning_rate": 1.9354834660644478e-05, + "loss": 0.2991, + "num_input_tokens_seen": 7663336, + "step": 11740 + }, + { + "epoch": 6.15566037735849, + "grad_norm": 0.9728035926818848, + "learning_rate": 1.9332559101741715e-05, + "loss": 0.242, + "num_input_tokens_seen": 7666312, + "step": 11745 + }, + { + "epoch": 6.1582809224318655, + "grad_norm": 1.442870020866394, + "learning_rate": 1.9310288285094615e-05, + "loss": 0.3122, + "num_input_tokens_seen": 7669224, + "step": 11750 + }, + { + "epoch": 6.160901467505241, + "grad_norm": 2.4640865325927734, + "learning_rate": 1.9288022229338384e-05, + "loss": 0.2454, + "num_input_tokens_seen": 7672264, + "step": 11755 + }, + { + "epoch": 6.163522012578617, + "grad_norm": 3.027601480484009, + "learning_rate": 1.9265760953104235e-05, + "loss": 0.3137, + "num_input_tokens_seen": 7674824, + "step": 11760 + }, + { + "epoch": 6.166142557651992, + "grad_norm": 1.9031015634536743, + "learning_rate": 1.924350447501939e-05, + "loss": 0.1955, + "num_input_tokens_seen": 7677608, + "step": 11765 + }, + { + "epoch": 6.168763102725367, + "grad_norm": 1.6966614723205566, + "learning_rate": 1.922125281370707e-05, + "loss": 0.2832, + "num_input_tokens_seen": 7680680, + "step": 11770 + }, + { + "epoch": 6.171383647798742, + "grad_norm": 1.5963120460510254, + "learning_rate": 1.919900598778642e-05, + "loss": 0.2817, + "num_input_tokens_seen": 7684072, + "step": 11775 + }, + { + "epoch": 6.174004192872117, + "grad_norm": 1.2199673652648926, + "learning_rate": 1.9176764015872578e-05, + "loss": 0.2681, + "num_input_tokens_seen": 7687624, + "step": 11780 + }, + { + "epoch": 6.176624737945493, + "grad_norm": 1.3708301782608032, + "learning_rate": 1.9154526916576618e-05, + "loss": 0.3556, + "num_input_tokens_seen": 7691656, + "step": 11785 + }, + { + "epoch": 6.179245283018868, + "grad_norm": 1.4344536066055298, + "learning_rate": 1.913229470850552e-05, + "loss": 0.2744, + "num_input_tokens_seen": 7694568, + "step": 11790 + }, + { + "epoch": 6.181865828092243, + "grad_norm": 1.0118110179901123, + "learning_rate": 1.9110067410262185e-05, + "loss": 0.2459, + "num_input_tokens_seen": 7697544, + "step": 11795 + }, + { + "epoch": 6.184486373165618, + "grad_norm": 2.9606986045837402, + "learning_rate": 1.90878450404454e-05, + "loss": 0.5069, + "num_input_tokens_seen": 7700456, + "step": 11800 + }, + { + "epoch": 6.187106918238993, + "grad_norm": 1.7445012331008911, + "learning_rate": 1.9065627617649828e-05, + "loss": 0.2426, + "num_input_tokens_seen": 7703112, + "step": 11805 + }, + { + "epoch": 6.189727463312369, + "grad_norm": 2.9059855937957764, + "learning_rate": 1.9043415160465993e-05, + "loss": 0.2278, + "num_input_tokens_seen": 7705992, + "step": 11810 + }, + { + "epoch": 6.1923480083857445, + "grad_norm": 1.062615156173706, + "learning_rate": 1.9021207687480278e-05, + "loss": 0.261, + "num_input_tokens_seen": 7709576, + "step": 11815 + }, + { + "epoch": 6.19496855345912, + "grad_norm": 1.7245144844055176, + "learning_rate": 1.8999005217274857e-05, + "loss": 0.2605, + "num_input_tokens_seen": 7712712, + "step": 11820 + }, + { + "epoch": 6.197589098532495, + "grad_norm": 2.6237597465515137, + "learning_rate": 1.897680776842775e-05, + "loss": 0.2413, + "num_input_tokens_seen": 7715432, + "step": 11825 + }, + { + "epoch": 6.20020964360587, + "grad_norm": 1.8688325881958008, + "learning_rate": 1.895461535951279e-05, + "loss": 0.2076, + "num_input_tokens_seen": 7718216, + "step": 11830 + }, + { + "epoch": 6.202830188679245, + "grad_norm": 1.536496877670288, + "learning_rate": 1.8932428009099545e-05, + "loss": 0.2596, + "num_input_tokens_seen": 7721096, + "step": 11835 + }, + { + "epoch": 6.20545073375262, + "grad_norm": 1.74091637134552, + "learning_rate": 1.891024573575339e-05, + "loss": 0.2593, + "num_input_tokens_seen": 7724264, + "step": 11840 + }, + { + "epoch": 6.2080712788259955, + "grad_norm": 1.711727261543274, + "learning_rate": 1.8888068558035435e-05, + "loss": 0.2202, + "num_input_tokens_seen": 7727176, + "step": 11845 + }, + { + "epoch": 6.210691823899371, + "grad_norm": 1.6277484893798828, + "learning_rate": 1.8865896494502525e-05, + "loss": 0.2794, + "num_input_tokens_seen": 7729544, + "step": 11850 + }, + { + "epoch": 6.213312368972747, + "grad_norm": 0.9970278739929199, + "learning_rate": 1.8843729563707247e-05, + "loss": 0.3442, + "num_input_tokens_seen": 7733288, + "step": 11855 + }, + { + "epoch": 6.215932914046122, + "grad_norm": 1.9448025226593018, + "learning_rate": 1.8821567784197847e-05, + "loss": 0.2457, + "num_input_tokens_seen": 7737096, + "step": 11860 + }, + { + "epoch": 6.218553459119497, + "grad_norm": 3.0458767414093018, + "learning_rate": 1.8799411174518306e-05, + "loss": 0.2667, + "num_input_tokens_seen": 7741512, + "step": 11865 + }, + { + "epoch": 6.221174004192872, + "grad_norm": 1.3721001148223877, + "learning_rate": 1.8777259753208275e-05, + "loss": 0.2456, + "num_input_tokens_seen": 7744648, + "step": 11870 + }, + { + "epoch": 6.223794549266247, + "grad_norm": 2.407688617706299, + "learning_rate": 1.8755113538803026e-05, + "loss": 0.2028, + "num_input_tokens_seen": 7748392, + "step": 11875 + }, + { + "epoch": 6.226415094339623, + "grad_norm": 1.379409670829773, + "learning_rate": 1.8732972549833516e-05, + "loss": 0.2318, + "num_input_tokens_seen": 7752360, + "step": 11880 + }, + { + "epoch": 6.229035639412998, + "grad_norm": 1.5538350343704224, + "learning_rate": 1.8710836804826314e-05, + "loss": 0.2006, + "num_input_tokens_seen": 7755016, + "step": 11885 + }, + { + "epoch": 6.231656184486373, + "grad_norm": 2.0426886081695557, + "learning_rate": 1.8688706322303595e-05, + "loss": 0.2958, + "num_input_tokens_seen": 7758120, + "step": 11890 + }, + { + "epoch": 6.234276729559748, + "grad_norm": 2.0337796211242676, + "learning_rate": 1.8666581120783134e-05, + "loss": 0.358, + "num_input_tokens_seen": 7761032, + "step": 11895 + }, + { + "epoch": 6.236897274633123, + "grad_norm": 2.0040876865386963, + "learning_rate": 1.8644461218778304e-05, + "loss": 0.2265, + "num_input_tokens_seen": 7763240, + "step": 11900 + }, + { + "epoch": 6.239517819706499, + "grad_norm": 1.4647881984710693, + "learning_rate": 1.8622346634798e-05, + "loss": 0.2333, + "num_input_tokens_seen": 7766664, + "step": 11905 + }, + { + "epoch": 6.2421383647798745, + "grad_norm": 2.0868232250213623, + "learning_rate": 1.8600237387346716e-05, + "loss": 0.3153, + "num_input_tokens_seen": 7769448, + "step": 11910 + }, + { + "epoch": 6.24475890985325, + "grad_norm": 4.156392574310303, + "learning_rate": 1.8578133494924473e-05, + "loss": 0.3245, + "num_input_tokens_seen": 7772296, + "step": 11915 + }, + { + "epoch": 6.247379454926625, + "grad_norm": 1.8407942056655884, + "learning_rate": 1.8556034976026764e-05, + "loss": 0.1729, + "num_input_tokens_seen": 7775912, + "step": 11920 + }, + { + "epoch": 6.25, + "grad_norm": 0.8866196274757385, + "learning_rate": 1.8533941849144642e-05, + "loss": 0.3072, + "num_input_tokens_seen": 7779784, + "step": 11925 + }, + { + "epoch": 6.252620545073375, + "grad_norm": 1.7711373567581177, + "learning_rate": 1.8511854132764627e-05, + "loss": 0.201, + "num_input_tokens_seen": 7782504, + "step": 11930 + }, + { + "epoch": 6.25524109014675, + "grad_norm": 2.1362147331237793, + "learning_rate": 1.84897718453687e-05, + "loss": 0.174, + "num_input_tokens_seen": 7785864, + "step": 11935 + }, + { + "epoch": 6.2578616352201255, + "grad_norm": 2.2670788764953613, + "learning_rate": 1.846769500543434e-05, + "loss": 0.2405, + "num_input_tokens_seen": 7788808, + "step": 11940 + }, + { + "epoch": 6.260482180293501, + "grad_norm": 1.7642759084701538, + "learning_rate": 1.844562363143439e-05, + "loss": 0.2615, + "num_input_tokens_seen": 7792232, + "step": 11945 + }, + { + "epoch": 6.263102725366876, + "grad_norm": 1.8023638725280762, + "learning_rate": 1.8423557741837198e-05, + "loss": 0.2462, + "num_input_tokens_seen": 7794856, + "step": 11950 + }, + { + "epoch": 6.265723270440252, + "grad_norm": 1.7595170736312866, + "learning_rate": 1.84014973551065e-05, + "loss": 0.3547, + "num_input_tokens_seen": 7797800, + "step": 11955 + }, + { + "epoch": 6.268343815513627, + "grad_norm": 1.400741457939148, + "learning_rate": 1.8379442489701396e-05, + "loss": 0.2164, + "num_input_tokens_seen": 7801256, + "step": 11960 + }, + { + "epoch": 6.270964360587002, + "grad_norm": 2.409156322479248, + "learning_rate": 1.8357393164076403e-05, + "loss": 0.2267, + "num_input_tokens_seen": 7804296, + "step": 11965 + }, + { + "epoch": 6.273584905660377, + "grad_norm": 1.436583399772644, + "learning_rate": 1.8335349396681394e-05, + "loss": 0.2442, + "num_input_tokens_seen": 7807464, + "step": 11970 + }, + { + "epoch": 6.276205450733753, + "grad_norm": 0.9483934044837952, + "learning_rate": 1.8313311205961577e-05, + "loss": 0.2056, + "num_input_tokens_seen": 7812008, + "step": 11975 + }, + { + "epoch": 6.278825995807128, + "grad_norm": 2.259737730026245, + "learning_rate": 1.82912786103575e-05, + "loss": 0.2247, + "num_input_tokens_seen": 7815336, + "step": 11980 + }, + { + "epoch": 6.281446540880503, + "grad_norm": 0.9413652420043945, + "learning_rate": 1.826925162830505e-05, + "loss": 0.1782, + "num_input_tokens_seen": 7818856, + "step": 11985 + }, + { + "epoch": 6.284067085953878, + "grad_norm": 1.7321817874908447, + "learning_rate": 1.8247230278235384e-05, + "loss": 0.3346, + "num_input_tokens_seen": 7822024, + "step": 11990 + }, + { + "epoch": 6.286687631027253, + "grad_norm": 1.4052562713623047, + "learning_rate": 1.8225214578574967e-05, + "loss": 0.3504, + "num_input_tokens_seen": 7825640, + "step": 11995 + }, + { + "epoch": 6.289308176100629, + "grad_norm": 3.9926533699035645, + "learning_rate": 1.820320454774554e-05, + "loss": 0.2221, + "num_input_tokens_seen": 7827944, + "step": 12000 + }, + { + "epoch": 6.2919287211740045, + "grad_norm": 1.356492280960083, + "learning_rate": 1.8181200204164073e-05, + "loss": 0.2794, + "num_input_tokens_seen": 7830600, + "step": 12005 + }, + { + "epoch": 6.29454926624738, + "grad_norm": 0.9302386045455933, + "learning_rate": 1.8159201566242806e-05, + "loss": 0.2044, + "num_input_tokens_seen": 7834056, + "step": 12010 + }, + { + "epoch": 6.297169811320755, + "grad_norm": 2.0940585136413574, + "learning_rate": 1.81372086523892e-05, + "loss": 0.2782, + "num_input_tokens_seen": 7836904, + "step": 12015 + }, + { + "epoch": 6.29979035639413, + "grad_norm": 3.8928005695343018, + "learning_rate": 1.8115221481005904e-05, + "loss": 0.2666, + "num_input_tokens_seen": 7839880, + "step": 12020 + }, + { + "epoch": 6.302410901467505, + "grad_norm": 1.788697361946106, + "learning_rate": 1.809324007049079e-05, + "loss": 0.2607, + "num_input_tokens_seen": 7843432, + "step": 12025 + }, + { + "epoch": 6.30503144654088, + "grad_norm": 1.734206199645996, + "learning_rate": 1.8071264439236903e-05, + "loss": 0.2374, + "num_input_tokens_seen": 7846216, + "step": 12030 + }, + { + "epoch": 6.3076519916142555, + "grad_norm": 7.956557273864746, + "learning_rate": 1.8049294605632434e-05, + "loss": 0.2904, + "num_input_tokens_seen": 7849544, + "step": 12035 + }, + { + "epoch": 6.310272536687631, + "grad_norm": 1.5615049600601196, + "learning_rate": 1.8027330588060757e-05, + "loss": 0.2807, + "num_input_tokens_seen": 7852552, + "step": 12040 + }, + { + "epoch": 6.312893081761006, + "grad_norm": 2.118574857711792, + "learning_rate": 1.8005372404900335e-05, + "loss": 0.3055, + "num_input_tokens_seen": 7855336, + "step": 12045 + }, + { + "epoch": 6.315513626834382, + "grad_norm": 1.5415072441101074, + "learning_rate": 1.7983420074524777e-05, + "loss": 0.2852, + "num_input_tokens_seen": 7858184, + "step": 12050 + }, + { + "epoch": 6.318134171907757, + "grad_norm": 3.2148852348327637, + "learning_rate": 1.7961473615302805e-05, + "loss": 0.2433, + "num_input_tokens_seen": 7861192, + "step": 12055 + }, + { + "epoch": 6.320754716981132, + "grad_norm": 1.4511171579360962, + "learning_rate": 1.79395330455982e-05, + "loss": 0.2848, + "num_input_tokens_seen": 7864520, + "step": 12060 + }, + { + "epoch": 6.323375262054507, + "grad_norm": 2.4186179637908936, + "learning_rate": 1.7917598383769836e-05, + "loss": 0.305, + "num_input_tokens_seen": 7867240, + "step": 12065 + }, + { + "epoch": 6.325995807127883, + "grad_norm": 2.220956802368164, + "learning_rate": 1.789566964817163e-05, + "loss": 0.2722, + "num_input_tokens_seen": 7870120, + "step": 12070 + }, + { + "epoch": 6.328616352201258, + "grad_norm": 2.1023809909820557, + "learning_rate": 1.7873746857152552e-05, + "loss": 0.2339, + "num_input_tokens_seen": 7873736, + "step": 12075 + }, + { + "epoch": 6.331236897274633, + "grad_norm": 1.668831467628479, + "learning_rate": 1.7851830029056587e-05, + "loss": 0.2832, + "num_input_tokens_seen": 7877000, + "step": 12080 + }, + { + "epoch": 6.333857442348008, + "grad_norm": 2.099139451980591, + "learning_rate": 1.7829919182222752e-05, + "loss": 0.1762, + "num_input_tokens_seen": 7879784, + "step": 12085 + }, + { + "epoch": 6.336477987421383, + "grad_norm": 3.3115439414978027, + "learning_rate": 1.780801433498501e-05, + "loss": 0.2328, + "num_input_tokens_seen": 7882792, + "step": 12090 + }, + { + "epoch": 6.339098532494759, + "grad_norm": 2.218196392059326, + "learning_rate": 1.7786115505672364e-05, + "loss": 0.2959, + "num_input_tokens_seen": 7885416, + "step": 12095 + }, + { + "epoch": 6.3417190775681345, + "grad_norm": 1.437225103378296, + "learning_rate": 1.7764222712608753e-05, + "loss": 0.2792, + "num_input_tokens_seen": 7888936, + "step": 12100 + }, + { + "epoch": 6.34433962264151, + "grad_norm": 2.0400726795196533, + "learning_rate": 1.7742335974113046e-05, + "loss": 0.3333, + "num_input_tokens_seen": 7891976, + "step": 12105 + }, + { + "epoch": 6.346960167714885, + "grad_norm": 2.4216983318328857, + "learning_rate": 1.7720455308499084e-05, + "loss": 0.3036, + "num_input_tokens_seen": 7895848, + "step": 12110 + }, + { + "epoch": 6.34958071278826, + "grad_norm": 1.8124538660049438, + "learning_rate": 1.769858073407561e-05, + "loss": 0.3671, + "num_input_tokens_seen": 7898760, + "step": 12115 + }, + { + "epoch": 6.352201257861635, + "grad_norm": 1.5658555030822754, + "learning_rate": 1.767671226914625e-05, + "loss": 0.2749, + "num_input_tokens_seen": 7902120, + "step": 12120 + }, + { + "epoch": 6.35482180293501, + "grad_norm": 2.8276453018188477, + "learning_rate": 1.7654849932009566e-05, + "loss": 0.419, + "num_input_tokens_seen": 7905224, + "step": 12125 + }, + { + "epoch": 6.3574423480083855, + "grad_norm": 1.6814545392990112, + "learning_rate": 1.763299374095893e-05, + "loss": 0.3183, + "num_input_tokens_seen": 7908424, + "step": 12130 + }, + { + "epoch": 6.360062893081761, + "grad_norm": 0.7039715051651001, + "learning_rate": 1.761114371428262e-05, + "loss": 0.2585, + "num_input_tokens_seen": 7911336, + "step": 12135 + }, + { + "epoch": 6.362683438155136, + "grad_norm": 1.116101622581482, + "learning_rate": 1.7589299870263753e-05, + "loss": 0.1856, + "num_input_tokens_seen": 7913704, + "step": 12140 + }, + { + "epoch": 6.365303983228512, + "grad_norm": 2.247493267059326, + "learning_rate": 1.756746222718024e-05, + "loss": 0.2851, + "num_input_tokens_seen": 7916584, + "step": 12145 + }, + { + "epoch": 6.367924528301887, + "grad_norm": 1.4738693237304688, + "learning_rate": 1.7545630803304826e-05, + "loss": 0.2954, + "num_input_tokens_seen": 7919880, + "step": 12150 + }, + { + "epoch": 6.370545073375262, + "grad_norm": 5.828043460845947, + "learning_rate": 1.7523805616905063e-05, + "loss": 0.2972, + "num_input_tokens_seen": 7923752, + "step": 12155 + }, + { + "epoch": 6.373165618448637, + "grad_norm": 1.3725018501281738, + "learning_rate": 1.7501986686243256e-05, + "loss": 0.3452, + "num_input_tokens_seen": 7927464, + "step": 12160 + }, + { + "epoch": 6.3757861635220126, + "grad_norm": 1.3766758441925049, + "learning_rate": 1.748017402957649e-05, + "loss": 0.1766, + "num_input_tokens_seen": 7930792, + "step": 12165 + }, + { + "epoch": 6.378406708595388, + "grad_norm": 1.1983824968338013, + "learning_rate": 1.7458367665156615e-05, + "loss": 0.3142, + "num_input_tokens_seen": 7935112, + "step": 12170 + }, + { + "epoch": 6.381027253668763, + "grad_norm": 1.1723613739013672, + "learning_rate": 1.743656761123018e-05, + "loss": 0.2621, + "num_input_tokens_seen": 7938152, + "step": 12175 + }, + { + "epoch": 6.383647798742138, + "grad_norm": 1.1620721817016602, + "learning_rate": 1.7414773886038487e-05, + "loss": 0.2679, + "num_input_tokens_seen": 7942536, + "step": 12180 + }, + { + "epoch": 6.386268343815513, + "grad_norm": 1.7049720287322998, + "learning_rate": 1.7392986507817532e-05, + "loss": 0.196, + "num_input_tokens_seen": 7945224, + "step": 12185 + }, + { + "epoch": 6.388888888888889, + "grad_norm": 1.2651934623718262, + "learning_rate": 1.7371205494797987e-05, + "loss": 0.2334, + "num_input_tokens_seen": 7948520, + "step": 12190 + }, + { + "epoch": 6.3915094339622645, + "grad_norm": 2.207848310470581, + "learning_rate": 1.7349430865205215e-05, + "loss": 0.3207, + "num_input_tokens_seen": 7951368, + "step": 12195 + }, + { + "epoch": 6.39412997903564, + "grad_norm": 1.491644024848938, + "learning_rate": 1.7327662637259234e-05, + "loss": 0.3617, + "num_input_tokens_seen": 7954472, + "step": 12200 + }, + { + "epoch": 6.396750524109015, + "grad_norm": 1.8521127700805664, + "learning_rate": 1.7305900829174697e-05, + "loss": 0.2314, + "num_input_tokens_seen": 7957000, + "step": 12205 + }, + { + "epoch": 6.39937106918239, + "grad_norm": 2.4715309143066406, + "learning_rate": 1.7284145459160893e-05, + "loss": 0.2151, + "num_input_tokens_seen": 7959400, + "step": 12210 + }, + { + "epoch": 6.401991614255765, + "grad_norm": 3.9261746406555176, + "learning_rate": 1.7262396545421728e-05, + "loss": 0.3133, + "num_input_tokens_seen": 7962312, + "step": 12215 + }, + { + "epoch": 6.40461215932914, + "grad_norm": 2.772233486175537, + "learning_rate": 1.7240654106155688e-05, + "loss": 0.3308, + "num_input_tokens_seen": 7965768, + "step": 12220 + }, + { + "epoch": 6.4072327044025155, + "grad_norm": 1.1709213256835938, + "learning_rate": 1.721891815955587e-05, + "loss": 0.2194, + "num_input_tokens_seen": 7972840, + "step": 12225 + }, + { + "epoch": 6.409853249475891, + "grad_norm": 2.389655590057373, + "learning_rate": 1.71971887238099e-05, + "loss": 0.2863, + "num_input_tokens_seen": 7975304, + "step": 12230 + }, + { + "epoch": 6.412473794549266, + "grad_norm": 1.3025864362716675, + "learning_rate": 1.7175465817099988e-05, + "loss": 0.2265, + "num_input_tokens_seen": 7977928, + "step": 12235 + }, + { + "epoch": 6.415094339622642, + "grad_norm": 2.3628733158111572, + "learning_rate": 1.7153749457602874e-05, + "loss": 0.2663, + "num_input_tokens_seen": 7982120, + "step": 12240 + }, + { + "epoch": 6.417714884696017, + "grad_norm": 2.225254535675049, + "learning_rate": 1.7132039663489806e-05, + "loss": 0.2687, + "num_input_tokens_seen": 7985032, + "step": 12245 + }, + { + "epoch": 6.420335429769392, + "grad_norm": 1.9270530939102173, + "learning_rate": 1.7110336452926555e-05, + "loss": 0.2278, + "num_input_tokens_seen": 7987272, + "step": 12250 + }, + { + "epoch": 6.422955974842767, + "grad_norm": 1.3575942516326904, + "learning_rate": 1.708863984407338e-05, + "loss": 0.3132, + "num_input_tokens_seen": 7990152, + "step": 12255 + }, + { + "epoch": 6.4255765199161425, + "grad_norm": 1.6311551332473755, + "learning_rate": 1.7066949855085e-05, + "loss": 0.2826, + "num_input_tokens_seen": 7992648, + "step": 12260 + }, + { + "epoch": 6.428197064989518, + "grad_norm": 1.6151758432388306, + "learning_rate": 1.704526650411062e-05, + "loss": 0.2438, + "num_input_tokens_seen": 7995880, + "step": 12265 + }, + { + "epoch": 6.430817610062893, + "grad_norm": 1.8315584659576416, + "learning_rate": 1.7023589809293876e-05, + "loss": 0.2477, + "num_input_tokens_seen": 7998312, + "step": 12270 + }, + { + "epoch": 6.433438155136268, + "grad_norm": 1.4034457206726074, + "learning_rate": 1.7001919788772824e-05, + "loss": 0.277, + "num_input_tokens_seen": 8001768, + "step": 12275 + }, + { + "epoch": 6.436058700209643, + "grad_norm": 1.5832607746124268, + "learning_rate": 1.6980256460679953e-05, + "loss": 0.2346, + "num_input_tokens_seen": 8004392, + "step": 12280 + }, + { + "epoch": 6.438679245283019, + "grad_norm": 1.6987919807434082, + "learning_rate": 1.6958599843142153e-05, + "loss": 0.2459, + "num_input_tokens_seen": 8006984, + "step": 12285 + }, + { + "epoch": 6.441299790356394, + "grad_norm": 2.814215660095215, + "learning_rate": 1.6936949954280686e-05, + "loss": 0.4025, + "num_input_tokens_seen": 8010920, + "step": 12290 + }, + { + "epoch": 6.44392033542977, + "grad_norm": 1.7837235927581787, + "learning_rate": 1.691530681221119e-05, + "loss": 0.3278, + "num_input_tokens_seen": 8013800, + "step": 12295 + }, + { + "epoch": 6.446540880503145, + "grad_norm": 1.4107568264007568, + "learning_rate": 1.6893670435043666e-05, + "loss": 0.2103, + "num_input_tokens_seen": 8016616, + "step": 12300 + }, + { + "epoch": 6.44916142557652, + "grad_norm": 0.9530507326126099, + "learning_rate": 1.6872040840882434e-05, + "loss": 0.2037, + "num_input_tokens_seen": 8020040, + "step": 12305 + }, + { + "epoch": 6.451781970649895, + "grad_norm": 1.3296623229980469, + "learning_rate": 1.6850418047826167e-05, + "loss": 0.2304, + "num_input_tokens_seen": 8024008, + "step": 12310 + }, + { + "epoch": 6.45440251572327, + "grad_norm": 1.9739508628845215, + "learning_rate": 1.6828802073967805e-05, + "loss": 0.282, + "num_input_tokens_seen": 8027528, + "step": 12315 + }, + { + "epoch": 6.4570230607966455, + "grad_norm": 2.2736854553222656, + "learning_rate": 1.6807192937394624e-05, + "loss": 0.277, + "num_input_tokens_seen": 8030312, + "step": 12320 + }, + { + "epoch": 6.459643605870021, + "grad_norm": 1.9037047624588013, + "learning_rate": 1.6785590656188167e-05, + "loss": 0.2185, + "num_input_tokens_seen": 8034280, + "step": 12325 + }, + { + "epoch": 6.462264150943396, + "grad_norm": 0.9646826386451721, + "learning_rate": 1.6763995248424223e-05, + "loss": 0.2889, + "num_input_tokens_seen": 8038408, + "step": 12330 + }, + { + "epoch": 6.464884696016772, + "grad_norm": 2.2917516231536865, + "learning_rate": 1.6742406732172854e-05, + "loss": 0.2996, + "num_input_tokens_seen": 8041544, + "step": 12335 + }, + { + "epoch": 6.467505241090147, + "grad_norm": 1.0054724216461182, + "learning_rate": 1.6720825125498342e-05, + "loss": 0.1675, + "num_input_tokens_seen": 8045096, + "step": 12340 + }, + { + "epoch": 6.470125786163522, + "grad_norm": 0.9639859199523926, + "learning_rate": 1.6699250446459182e-05, + "loss": 0.2653, + "num_input_tokens_seen": 8049064, + "step": 12345 + }, + { + "epoch": 6.472746331236897, + "grad_norm": 1.880847454071045, + "learning_rate": 1.6677682713108082e-05, + "loss": 0.2776, + "num_input_tokens_seen": 8051816, + "step": 12350 + }, + { + "epoch": 6.4753668763102725, + "grad_norm": 1.8046821355819702, + "learning_rate": 1.6656121943491954e-05, + "loss": 0.2631, + "num_input_tokens_seen": 8055528, + "step": 12355 + }, + { + "epoch": 6.477987421383648, + "grad_norm": 1.3113499879837036, + "learning_rate": 1.6634568155651842e-05, + "loss": 0.2448, + "num_input_tokens_seen": 8058472, + "step": 12360 + }, + { + "epoch": 6.480607966457023, + "grad_norm": 1.6949057579040527, + "learning_rate": 1.6613021367622978e-05, + "loss": 0.4276, + "num_input_tokens_seen": 8061608, + "step": 12365 + }, + { + "epoch": 6.483228511530398, + "grad_norm": 0.8797181248664856, + "learning_rate": 1.6591481597434733e-05, + "loss": 0.298, + "num_input_tokens_seen": 8064648, + "step": 12370 + }, + { + "epoch": 6.485849056603773, + "grad_norm": 1.5672030448913574, + "learning_rate": 1.65699488631106e-05, + "loss": 0.3602, + "num_input_tokens_seen": 8067496, + "step": 12375 + }, + { + "epoch": 6.488469601677149, + "grad_norm": 1.720110297203064, + "learning_rate": 1.6548423182668186e-05, + "loss": 0.2332, + "num_input_tokens_seen": 8069864, + "step": 12380 + }, + { + "epoch": 6.491090146750524, + "grad_norm": 1.866997480392456, + "learning_rate": 1.6526904574119213e-05, + "loss": 0.3142, + "num_input_tokens_seen": 8072936, + "step": 12385 + }, + { + "epoch": 6.4937106918239, + "grad_norm": 3.062676191329956, + "learning_rate": 1.6505393055469444e-05, + "loss": 0.2865, + "num_input_tokens_seen": 8075784, + "step": 12390 + }, + { + "epoch": 6.496331236897275, + "grad_norm": 7.087163925170898, + "learning_rate": 1.648388864471875e-05, + "loss": 0.3531, + "num_input_tokens_seen": 8078280, + "step": 12395 + }, + { + "epoch": 6.49895178197065, + "grad_norm": 2.4925692081451416, + "learning_rate": 1.646239135986105e-05, + "loss": 0.1964, + "num_input_tokens_seen": 8081992, + "step": 12400 + }, + { + "epoch": 6.5, + "eval_loss": 0.5859813094139099, + "eval_runtime": 15.9891, + "eval_samples_per_second": 53.036, + "eval_steps_per_second": 13.259, + "num_input_tokens_seen": 8083560, + "step": 12402 + }, + { + "epoch": 6.501572327044025, + "grad_norm": 1.6883546113967896, + "learning_rate": 1.6440901218884264e-05, + "loss": 0.2555, + "num_input_tokens_seen": 8085672, + "step": 12405 + }, + { + "epoch": 6.5041928721174, + "grad_norm": 1.2608355283737183, + "learning_rate": 1.641941823977038e-05, + "loss": 0.2419, + "num_input_tokens_seen": 8089000, + "step": 12410 + }, + { + "epoch": 6.506813417190775, + "grad_norm": 1.3271030187606812, + "learning_rate": 1.6397942440495363e-05, + "loss": 0.236, + "num_input_tokens_seen": 8092168, + "step": 12415 + }, + { + "epoch": 6.509433962264151, + "grad_norm": 1.3291306495666504, + "learning_rate": 1.6376473839029188e-05, + "loss": 0.2968, + "num_input_tokens_seen": 8094728, + "step": 12420 + }, + { + "epoch": 6.512054507337526, + "grad_norm": 2.140132427215576, + "learning_rate": 1.63550124533358e-05, + "loss": 0.2955, + "num_input_tokens_seen": 8098088, + "step": 12425 + }, + { + "epoch": 6.514675052410902, + "grad_norm": 1.7024279832839966, + "learning_rate": 1.63335583013731e-05, + "loss": 0.3204, + "num_input_tokens_seen": 8100904, + "step": 12430 + }, + { + "epoch": 6.517295597484277, + "grad_norm": 1.7791649103164673, + "learning_rate": 1.6312111401092946e-05, + "loss": 0.2161, + "num_input_tokens_seen": 8104264, + "step": 12435 + }, + { + "epoch": 6.519916142557652, + "grad_norm": 1.2050609588623047, + "learning_rate": 1.6290671770441135e-05, + "loss": 0.2326, + "num_input_tokens_seen": 8107144, + "step": 12440 + }, + { + "epoch": 6.522536687631027, + "grad_norm": 1.8756892681121826, + "learning_rate": 1.6269239427357348e-05, + "loss": 0.2187, + "num_input_tokens_seen": 8111304, + "step": 12445 + }, + { + "epoch": 6.5251572327044025, + "grad_norm": 1.537737250328064, + "learning_rate": 1.62478143897752e-05, + "loss": 0.3014, + "num_input_tokens_seen": 8114056, + "step": 12450 + }, + { + "epoch": 6.527777777777778, + "grad_norm": 2.112191677093506, + "learning_rate": 1.6226396675622203e-05, + "loss": 0.3432, + "num_input_tokens_seen": 8116808, + "step": 12455 + }, + { + "epoch": 6.530398322851153, + "grad_norm": 2.4885411262512207, + "learning_rate": 1.6204986302819693e-05, + "loss": 0.2529, + "num_input_tokens_seen": 8119496, + "step": 12460 + }, + { + "epoch": 6.533018867924528, + "grad_norm": 2.9520339965820312, + "learning_rate": 1.6183583289282906e-05, + "loss": 0.2866, + "num_input_tokens_seen": 8122728, + "step": 12465 + }, + { + "epoch": 6.535639412997903, + "grad_norm": 1.95371675491333, + "learning_rate": 1.616218765292091e-05, + "loss": 0.3781, + "num_input_tokens_seen": 8126504, + "step": 12470 + }, + { + "epoch": 6.538259958071279, + "grad_norm": 3.4323458671569824, + "learning_rate": 1.6140799411636586e-05, + "loss": 0.3559, + "num_input_tokens_seen": 8128776, + "step": 12475 + }, + { + "epoch": 6.540880503144654, + "grad_norm": 1.1894211769104004, + "learning_rate": 1.611941858332664e-05, + "loss": 0.344, + "num_input_tokens_seen": 8131720, + "step": 12480 + }, + { + "epoch": 6.54350104821803, + "grad_norm": 2.5521364212036133, + "learning_rate": 1.6098045185881587e-05, + "loss": 0.3218, + "num_input_tokens_seen": 8135624, + "step": 12485 + }, + { + "epoch": 6.546121593291405, + "grad_norm": 1.1170042753219604, + "learning_rate": 1.6076679237185682e-05, + "loss": 0.2334, + "num_input_tokens_seen": 8138632, + "step": 12490 + }, + { + "epoch": 6.54874213836478, + "grad_norm": 2.0113213062286377, + "learning_rate": 1.6055320755117004e-05, + "loss": 0.2773, + "num_input_tokens_seen": 8142184, + "step": 12495 + }, + { + "epoch": 6.551362683438155, + "grad_norm": 1.2647958993911743, + "learning_rate": 1.6033969757547336e-05, + "loss": 0.3048, + "num_input_tokens_seen": 8146792, + "step": 12500 + }, + { + "epoch": 6.55398322851153, + "grad_norm": 1.6147233247756958, + "learning_rate": 1.601262626234222e-05, + "loss": 0.1912, + "num_input_tokens_seen": 8150504, + "step": 12505 + }, + { + "epoch": 6.556603773584905, + "grad_norm": 1.9921218156814575, + "learning_rate": 1.5991290287360925e-05, + "loss": 0.2781, + "num_input_tokens_seen": 8153000, + "step": 12510 + }, + { + "epoch": 6.559224318658281, + "grad_norm": 2.7952208518981934, + "learning_rate": 1.5969961850456412e-05, + "loss": 0.1795, + "num_input_tokens_seen": 8155720, + "step": 12515 + }, + { + "epoch": 6.561844863731656, + "grad_norm": 2.1635258197784424, + "learning_rate": 1.5948640969475346e-05, + "loss": 0.2093, + "num_input_tokens_seen": 8159400, + "step": 12520 + }, + { + "epoch": 6.564465408805032, + "grad_norm": 1.628792643547058, + "learning_rate": 1.592732766225808e-05, + "loss": 0.1847, + "num_input_tokens_seen": 8162952, + "step": 12525 + }, + { + "epoch": 6.567085953878407, + "grad_norm": 1.3725554943084717, + "learning_rate": 1.5906021946638585e-05, + "loss": 0.244, + "num_input_tokens_seen": 8166376, + "step": 12530 + }, + { + "epoch": 6.569706498951782, + "grad_norm": 2.7819035053253174, + "learning_rate": 1.5884723840444532e-05, + "loss": 0.3056, + "num_input_tokens_seen": 8169064, + "step": 12535 + }, + { + "epoch": 6.572327044025157, + "grad_norm": 1.1739121675491333, + "learning_rate": 1.5863433361497214e-05, + "loss": 0.3315, + "num_input_tokens_seen": 8171976, + "step": 12540 + }, + { + "epoch": 6.5749475890985325, + "grad_norm": 1.8769489526748657, + "learning_rate": 1.5842150527611506e-05, + "loss": 0.2472, + "num_input_tokens_seen": 8176200, + "step": 12545 + }, + { + "epoch": 6.577568134171908, + "grad_norm": 2.7107460498809814, + "learning_rate": 1.5820875356595925e-05, + "loss": 0.2763, + "num_input_tokens_seen": 8179368, + "step": 12550 + }, + { + "epoch": 6.580188679245283, + "grad_norm": 2.543771266937256, + "learning_rate": 1.579960786625256e-05, + "loss": 0.2023, + "num_input_tokens_seen": 8182248, + "step": 12555 + }, + { + "epoch": 6.582809224318658, + "grad_norm": 1.3281869888305664, + "learning_rate": 1.5778348074377074e-05, + "loss": 0.3261, + "num_input_tokens_seen": 8184776, + "step": 12560 + }, + { + "epoch": 6.585429769392033, + "grad_norm": 1.566015601158142, + "learning_rate": 1.575709599875869e-05, + "loss": 0.2033, + "num_input_tokens_seen": 8187976, + "step": 12565 + }, + { + "epoch": 6.588050314465409, + "grad_norm": 1.937162160873413, + "learning_rate": 1.5735851657180184e-05, + "loss": 0.2496, + "num_input_tokens_seen": 8190984, + "step": 12570 + }, + { + "epoch": 6.590670859538784, + "grad_norm": 2.086411714553833, + "learning_rate": 1.571461506741783e-05, + "loss": 0.2248, + "num_input_tokens_seen": 8194568, + "step": 12575 + }, + { + "epoch": 6.59329140461216, + "grad_norm": 1.625921607017517, + "learning_rate": 1.5693386247241453e-05, + "loss": 0.3299, + "num_input_tokens_seen": 8197512, + "step": 12580 + }, + { + "epoch": 6.595911949685535, + "grad_norm": 1.7329317331314087, + "learning_rate": 1.5672165214414362e-05, + "loss": 0.2345, + "num_input_tokens_seen": 8201128, + "step": 12585 + }, + { + "epoch": 6.59853249475891, + "grad_norm": 1.4993127584457397, + "learning_rate": 1.5650951986693334e-05, + "loss": 0.1506, + "num_input_tokens_seen": 8204392, + "step": 12590 + }, + { + "epoch": 6.601153039832285, + "grad_norm": 6.336995601654053, + "learning_rate": 1.5629746581828642e-05, + "loss": 0.2492, + "num_input_tokens_seen": 8206952, + "step": 12595 + }, + { + "epoch": 6.60377358490566, + "grad_norm": 1.90379798412323, + "learning_rate": 1.560854901756399e-05, + "loss": 0.1892, + "num_input_tokens_seen": 8209768, + "step": 12600 + }, + { + "epoch": 6.606394129979035, + "grad_norm": 1.5854160785675049, + "learning_rate": 1.558735931163653e-05, + "loss": 0.2685, + "num_input_tokens_seen": 8216680, + "step": 12605 + }, + { + "epoch": 6.609014675052411, + "grad_norm": 4.109755992889404, + "learning_rate": 1.5566177481776857e-05, + "loss": 0.257, + "num_input_tokens_seen": 8220552, + "step": 12610 + }, + { + "epoch": 6.611635220125786, + "grad_norm": 1.3275561332702637, + "learning_rate": 1.554500354570894e-05, + "loss": 0.3011, + "num_input_tokens_seen": 8225064, + "step": 12615 + }, + { + "epoch": 6.614255765199162, + "grad_norm": 1.4680341482162476, + "learning_rate": 1.552383752115017e-05, + "loss": 0.3214, + "num_input_tokens_seen": 8228168, + "step": 12620 + }, + { + "epoch": 6.616876310272537, + "grad_norm": 1.9214447736740112, + "learning_rate": 1.550267942581132e-05, + "loss": 0.2171, + "num_input_tokens_seen": 8231944, + "step": 12625 + }, + { + "epoch": 6.619496855345912, + "grad_norm": 2.570204496383667, + "learning_rate": 1.548152927739649e-05, + "loss": 0.2746, + "num_input_tokens_seen": 8235848, + "step": 12630 + }, + { + "epoch": 6.622117400419287, + "grad_norm": 1.2856791019439697, + "learning_rate": 1.5460387093603178e-05, + "loss": 0.3832, + "num_input_tokens_seen": 8238792, + "step": 12635 + }, + { + "epoch": 6.6247379454926625, + "grad_norm": 2.043464183807373, + "learning_rate": 1.5439252892122197e-05, + "loss": 0.2684, + "num_input_tokens_seen": 8242088, + "step": 12640 + }, + { + "epoch": 6.627358490566038, + "grad_norm": 2.0727591514587402, + "learning_rate": 1.5418126690637673e-05, + "loss": 0.3061, + "num_input_tokens_seen": 8244616, + "step": 12645 + }, + { + "epoch": 6.629979035639413, + "grad_norm": 4.003169536590576, + "learning_rate": 1.5397008506827057e-05, + "loss": 0.2384, + "num_input_tokens_seen": 8247912, + "step": 12650 + }, + { + "epoch": 6.632599580712788, + "grad_norm": 1.5596928596496582, + "learning_rate": 1.537589835836108e-05, + "loss": 0.3069, + "num_input_tokens_seen": 8251816, + "step": 12655 + }, + { + "epoch": 6.635220125786163, + "grad_norm": 1.9107252359390259, + "learning_rate": 1.5354796262903736e-05, + "loss": 0.3156, + "num_input_tokens_seen": 8255432, + "step": 12660 + }, + { + "epoch": 6.637840670859539, + "grad_norm": 0.900353729724884, + "learning_rate": 1.5333702238112306e-05, + "loss": 0.2033, + "num_input_tokens_seen": 8258824, + "step": 12665 + }, + { + "epoch": 6.640461215932914, + "grad_norm": 1.6756856441497803, + "learning_rate": 1.5312616301637313e-05, + "loss": 0.2927, + "num_input_tokens_seen": 8262888, + "step": 12670 + }, + { + "epoch": 6.6430817610062896, + "grad_norm": 0.9775093793869019, + "learning_rate": 1.5291538471122488e-05, + "loss": 0.1966, + "num_input_tokens_seen": 8266568, + "step": 12675 + }, + { + "epoch": 6.645702306079665, + "grad_norm": 1.7148720026016235, + "learning_rate": 1.527046876420481e-05, + "loss": 0.3234, + "num_input_tokens_seen": 8269800, + "step": 12680 + }, + { + "epoch": 6.64832285115304, + "grad_norm": 1.508944034576416, + "learning_rate": 1.524940719851444e-05, + "loss": 0.2248, + "num_input_tokens_seen": 8272936, + "step": 12685 + }, + { + "epoch": 6.650943396226415, + "grad_norm": 1.4226903915405273, + "learning_rate": 1.5228353791674734e-05, + "loss": 0.2761, + "num_input_tokens_seen": 8276104, + "step": 12690 + }, + { + "epoch": 6.65356394129979, + "grad_norm": 0.8299527764320374, + "learning_rate": 1.520730856130223e-05, + "loss": 0.2134, + "num_input_tokens_seen": 8278728, + "step": 12695 + }, + { + "epoch": 6.656184486373165, + "grad_norm": 1.4762593507766724, + "learning_rate": 1.5186271525006607e-05, + "loss": 0.2529, + "num_input_tokens_seen": 8282312, + "step": 12700 + }, + { + "epoch": 6.658805031446541, + "grad_norm": 1.8549680709838867, + "learning_rate": 1.5165242700390697e-05, + "loss": 0.2998, + "num_input_tokens_seen": 8285800, + "step": 12705 + }, + { + "epoch": 6.661425576519916, + "grad_norm": 1.6331050395965576, + "learning_rate": 1.5144222105050471e-05, + "loss": 0.3055, + "num_input_tokens_seen": 8288648, + "step": 12710 + }, + { + "epoch": 6.664046121593291, + "grad_norm": 1.6113396883010864, + "learning_rate": 1.5123209756574986e-05, + "loss": 0.2213, + "num_input_tokens_seen": 8291688, + "step": 12715 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 2.3572442531585693, + "learning_rate": 1.5102205672546416e-05, + "loss": 0.3025, + "num_input_tokens_seen": 8295400, + "step": 12720 + }, + { + "epoch": 6.669287211740042, + "grad_norm": 1.6396994590759277, + "learning_rate": 1.508120987054004e-05, + "loss": 0.2685, + "num_input_tokens_seen": 8297704, + "step": 12725 + }, + { + "epoch": 6.671907756813417, + "grad_norm": 2.302988290786743, + "learning_rate": 1.5060222368124163e-05, + "loss": 0.3659, + "num_input_tokens_seen": 8300808, + "step": 12730 + }, + { + "epoch": 6.6745283018867925, + "grad_norm": 2.64300537109375, + "learning_rate": 1.5039243182860177e-05, + "loss": 0.3362, + "num_input_tokens_seen": 8304104, + "step": 12735 + }, + { + "epoch": 6.677148846960168, + "grad_norm": 2.593535900115967, + "learning_rate": 1.5018272332302513e-05, + "loss": 0.311, + "num_input_tokens_seen": 8307784, + "step": 12740 + }, + { + "epoch": 6.679769392033543, + "grad_norm": 1.0343259572982788, + "learning_rate": 1.4997309833998607e-05, + "loss": 0.2428, + "num_input_tokens_seen": 8311848, + "step": 12745 + }, + { + "epoch": 6.682389937106918, + "grad_norm": 1.4649369716644287, + "learning_rate": 1.4976355705488932e-05, + "loss": 0.2479, + "num_input_tokens_seen": 8315272, + "step": 12750 + }, + { + "epoch": 6.685010482180293, + "grad_norm": 2.433157205581665, + "learning_rate": 1.4955409964306946e-05, + "loss": 0.2855, + "num_input_tokens_seen": 8318376, + "step": 12755 + }, + { + "epoch": 6.687631027253669, + "grad_norm": 2.058140277862549, + "learning_rate": 1.4934472627979067e-05, + "loss": 0.2357, + "num_input_tokens_seen": 8322248, + "step": 12760 + }, + { + "epoch": 6.690251572327044, + "grad_norm": 0.8801541924476624, + "learning_rate": 1.491354371402473e-05, + "loss": 0.1754, + "num_input_tokens_seen": 8326056, + "step": 12765 + }, + { + "epoch": 6.6928721174004195, + "grad_norm": 1.5142908096313477, + "learning_rate": 1.4892623239956289e-05, + "loss": 0.3408, + "num_input_tokens_seen": 8329608, + "step": 12770 + }, + { + "epoch": 6.695492662473795, + "grad_norm": 1.3883342742919922, + "learning_rate": 1.4871711223279022e-05, + "loss": 0.223, + "num_input_tokens_seen": 8332072, + "step": 12775 + }, + { + "epoch": 6.69811320754717, + "grad_norm": 2.7733047008514404, + "learning_rate": 1.4850807681491169e-05, + "loss": 0.2719, + "num_input_tokens_seen": 8334760, + "step": 12780 + }, + { + "epoch": 6.700733752620545, + "grad_norm": 1.5172420740127563, + "learning_rate": 1.4829912632083845e-05, + "loss": 0.1913, + "num_input_tokens_seen": 8337736, + "step": 12785 + }, + { + "epoch": 6.70335429769392, + "grad_norm": 2.3827834129333496, + "learning_rate": 1.4809026092541078e-05, + "loss": 0.1969, + "num_input_tokens_seen": 8340488, + "step": 12790 + }, + { + "epoch": 6.705974842767295, + "grad_norm": 2.108814239501953, + "learning_rate": 1.4788148080339787e-05, + "loss": 0.2937, + "num_input_tokens_seen": 8343176, + "step": 12795 + }, + { + "epoch": 6.7085953878406706, + "grad_norm": 2.076021671295166, + "learning_rate": 1.4767278612949703e-05, + "loss": 0.3316, + "num_input_tokens_seen": 8346376, + "step": 12800 + }, + { + "epoch": 6.711215932914046, + "grad_norm": 1.1902464628219604, + "learning_rate": 1.474641770783347e-05, + "loss": 0.2328, + "num_input_tokens_seen": 8349672, + "step": 12805 + }, + { + "epoch": 6.713836477987421, + "grad_norm": 1.559134840965271, + "learning_rate": 1.4725565382446549e-05, + "loss": 0.2417, + "num_input_tokens_seen": 8352232, + "step": 12810 + }, + { + "epoch": 6.716457023060797, + "grad_norm": 1.3198843002319336, + "learning_rate": 1.4704721654237185e-05, + "loss": 0.2066, + "num_input_tokens_seen": 8355592, + "step": 12815 + }, + { + "epoch": 6.719077568134172, + "grad_norm": 1.3008294105529785, + "learning_rate": 1.4683886540646468e-05, + "loss": 0.178, + "num_input_tokens_seen": 8357736, + "step": 12820 + }, + { + "epoch": 6.721698113207547, + "grad_norm": 1.3216506242752075, + "learning_rate": 1.4663060059108282e-05, + "loss": 0.2019, + "num_input_tokens_seen": 8362568, + "step": 12825 + }, + { + "epoch": 6.7243186582809225, + "grad_norm": 1.4118742942810059, + "learning_rate": 1.464224222704926e-05, + "loss": 0.3148, + "num_input_tokens_seen": 8366152, + "step": 12830 + }, + { + "epoch": 6.726939203354298, + "grad_norm": 2.597742795944214, + "learning_rate": 1.462143306188882e-05, + "loss": 0.2415, + "num_input_tokens_seen": 8368456, + "step": 12835 + }, + { + "epoch": 6.729559748427673, + "grad_norm": 1.9159129858016968, + "learning_rate": 1.4600632581039123e-05, + "loss": 0.2858, + "num_input_tokens_seen": 8371560, + "step": 12840 + }, + { + "epoch": 6.732180293501048, + "grad_norm": 2.2582833766937256, + "learning_rate": 1.457984080190506e-05, + "loss": 0.2288, + "num_input_tokens_seen": 8374792, + "step": 12845 + }, + { + "epoch": 6.734800838574423, + "grad_norm": 2.670609951019287, + "learning_rate": 1.4559057741884227e-05, + "loss": 0.2585, + "num_input_tokens_seen": 8377832, + "step": 12850 + }, + { + "epoch": 6.737421383647799, + "grad_norm": 3.3610754013061523, + "learning_rate": 1.4538283418366965e-05, + "loss": 0.3016, + "num_input_tokens_seen": 8380872, + "step": 12855 + }, + { + "epoch": 6.740041928721174, + "grad_norm": 1.6860570907592773, + "learning_rate": 1.4517517848736267e-05, + "loss": 0.283, + "num_input_tokens_seen": 8384264, + "step": 12860 + }, + { + "epoch": 6.7426624737945495, + "grad_norm": 1.2907449007034302, + "learning_rate": 1.449676105036781e-05, + "loss": 0.3048, + "num_input_tokens_seen": 8389064, + "step": 12865 + }, + { + "epoch": 6.745283018867925, + "grad_norm": 1.359747052192688, + "learning_rate": 1.4476013040629938e-05, + "loss": 0.3172, + "num_input_tokens_seen": 8391976, + "step": 12870 + }, + { + "epoch": 6.7479035639413, + "grad_norm": 1.692556381225586, + "learning_rate": 1.4455273836883629e-05, + "loss": 0.1831, + "num_input_tokens_seen": 8395176, + "step": 12875 + }, + { + "epoch": 6.750524109014675, + "grad_norm": 2.1168079376220703, + "learning_rate": 1.443454345648252e-05, + "loss": 0.3009, + "num_input_tokens_seen": 8398216, + "step": 12880 + }, + { + "epoch": 6.75314465408805, + "grad_norm": 1.50838041305542, + "learning_rate": 1.4413821916772832e-05, + "loss": 0.3124, + "num_input_tokens_seen": 8402120, + "step": 12885 + }, + { + "epoch": 6.755765199161425, + "grad_norm": 1.6216214895248413, + "learning_rate": 1.4393109235093399e-05, + "loss": 0.3278, + "num_input_tokens_seen": 8405736, + "step": 12890 + }, + { + "epoch": 6.7583857442348005, + "grad_norm": 3.7149834632873535, + "learning_rate": 1.4372405428775664e-05, + "loss": 0.2518, + "num_input_tokens_seen": 8408808, + "step": 12895 + }, + { + "epoch": 6.761006289308176, + "grad_norm": 0.7348297834396362, + "learning_rate": 1.4351710515143618e-05, + "loss": 0.1907, + "num_input_tokens_seen": 8411400, + "step": 12900 + }, + { + "epoch": 6.763626834381551, + "grad_norm": 1.3080559968948364, + "learning_rate": 1.4331024511513808e-05, + "loss": 0.1903, + "num_input_tokens_seen": 8414632, + "step": 12905 + }, + { + "epoch": 6.766247379454927, + "grad_norm": 1.7936948537826538, + "learning_rate": 1.4310347435195368e-05, + "loss": 0.263, + "num_input_tokens_seen": 8417256, + "step": 12910 + }, + { + "epoch": 6.768867924528302, + "grad_norm": 1.2016165256500244, + "learning_rate": 1.428967930348989e-05, + "loss": 0.2401, + "num_input_tokens_seen": 8420584, + "step": 12915 + }, + { + "epoch": 6.771488469601677, + "grad_norm": 1.4496650695800781, + "learning_rate": 1.4269020133691542e-05, + "loss": 0.3443, + "num_input_tokens_seen": 8423912, + "step": 12920 + }, + { + "epoch": 6.774109014675052, + "grad_norm": 1.5732132196426392, + "learning_rate": 1.4248369943086998e-05, + "loss": 0.2322, + "num_input_tokens_seen": 8427368, + "step": 12925 + }, + { + "epoch": 6.776729559748428, + "grad_norm": 1.411317229270935, + "learning_rate": 1.4227728748955345e-05, + "loss": 0.2719, + "num_input_tokens_seen": 8431176, + "step": 12930 + }, + { + "epoch": 6.779350104821803, + "grad_norm": 1.3454132080078125, + "learning_rate": 1.4207096568568232e-05, + "loss": 0.2477, + "num_input_tokens_seen": 8433768, + "step": 12935 + }, + { + "epoch": 6.781970649895178, + "grad_norm": 2.0250256061553955, + "learning_rate": 1.418647341918971e-05, + "loss": 0.233, + "num_input_tokens_seen": 8437384, + "step": 12940 + }, + { + "epoch": 6.784591194968553, + "grad_norm": 1.515677571296692, + "learning_rate": 1.4165859318076276e-05, + "loss": 0.2413, + "num_input_tokens_seen": 8441192, + "step": 12945 + }, + { + "epoch": 6.787211740041929, + "grad_norm": 1.7375632524490356, + "learning_rate": 1.4145254282476895e-05, + "loss": 0.403, + "num_input_tokens_seen": 8446504, + "step": 12950 + }, + { + "epoch": 6.789832285115304, + "grad_norm": 1.265319585800171, + "learning_rate": 1.4124658329632901e-05, + "loss": 0.1804, + "num_input_tokens_seen": 8449384, + "step": 12955 + }, + { + "epoch": 6.7924528301886795, + "grad_norm": 2.6198785305023193, + "learning_rate": 1.4104071476778044e-05, + "loss": 0.2526, + "num_input_tokens_seen": 8452456, + "step": 12960 + }, + { + "epoch": 6.795073375262055, + "grad_norm": 1.7192237377166748, + "learning_rate": 1.4083493741138486e-05, + "loss": 0.3211, + "num_input_tokens_seen": 8457000, + "step": 12965 + }, + { + "epoch": 6.79769392033543, + "grad_norm": 2.2308335304260254, + "learning_rate": 1.4062925139932703e-05, + "loss": 0.2609, + "num_input_tokens_seen": 8461576, + "step": 12970 + }, + { + "epoch": 6.800314465408805, + "grad_norm": 4.064630508422852, + "learning_rate": 1.4042365690371587e-05, + "loss": 0.2553, + "num_input_tokens_seen": 8464712, + "step": 12975 + }, + { + "epoch": 6.80293501048218, + "grad_norm": 1.2103077173233032, + "learning_rate": 1.4021815409658335e-05, + "loss": 0.1947, + "num_input_tokens_seen": 8467880, + "step": 12980 + }, + { + "epoch": 6.805555555555555, + "grad_norm": 1.7691709995269775, + "learning_rate": 1.4001274314988475e-05, + "loss": 0.1951, + "num_input_tokens_seen": 8471208, + "step": 12985 + }, + { + "epoch": 6.8081761006289305, + "grad_norm": 1.4674121141433716, + "learning_rate": 1.3980742423549875e-05, + "loss": 0.2395, + "num_input_tokens_seen": 8474248, + "step": 12990 + }, + { + "epoch": 6.810796645702306, + "grad_norm": 2.3957455158233643, + "learning_rate": 1.3960219752522679e-05, + "loss": 0.2414, + "num_input_tokens_seen": 8477064, + "step": 12995 + }, + { + "epoch": 6.813417190775681, + "grad_norm": 1.7861123085021973, + "learning_rate": 1.3939706319079305e-05, + "loss": 0.2264, + "num_input_tokens_seen": 8479528, + "step": 13000 + }, + { + "epoch": 6.816037735849057, + "grad_norm": 3.4157166481018066, + "learning_rate": 1.391920214038448e-05, + "loss": 0.1853, + "num_input_tokens_seen": 8482856, + "step": 13005 + }, + { + "epoch": 6.818658280922432, + "grad_norm": 1.596688985824585, + "learning_rate": 1.3898707233595153e-05, + "loss": 0.2875, + "num_input_tokens_seen": 8486312, + "step": 13010 + }, + { + "epoch": 6.821278825995807, + "grad_norm": 1.5668946504592896, + "learning_rate": 1.3878221615860527e-05, + "loss": 0.2778, + "num_input_tokens_seen": 8489096, + "step": 13015 + }, + { + "epoch": 6.823899371069182, + "grad_norm": 1.4767284393310547, + "learning_rate": 1.3857745304322017e-05, + "loss": 0.3043, + "num_input_tokens_seen": 8493448, + "step": 13020 + }, + { + "epoch": 6.826519916142558, + "grad_norm": 1.5348654985427856, + "learning_rate": 1.3837278316113293e-05, + "loss": 0.1773, + "num_input_tokens_seen": 8496968, + "step": 13025 + }, + { + "epoch": 6.829140461215933, + "grad_norm": 1.681921124458313, + "learning_rate": 1.3816820668360177e-05, + "loss": 0.196, + "num_input_tokens_seen": 8500616, + "step": 13030 + }, + { + "epoch": 6.831761006289308, + "grad_norm": 1.8020164966583252, + "learning_rate": 1.3796372378180691e-05, + "loss": 0.2881, + "num_input_tokens_seen": 8504136, + "step": 13035 + }, + { + "epoch": 6.834381551362683, + "grad_norm": 2.3553788661956787, + "learning_rate": 1.3775933462685047e-05, + "loss": 0.29, + "num_input_tokens_seen": 8507464, + "step": 13040 + }, + { + "epoch": 6.837002096436059, + "grad_norm": 2.3074238300323486, + "learning_rate": 1.375550393897559e-05, + "loss": 0.3076, + "num_input_tokens_seen": 8510216, + "step": 13045 + }, + { + "epoch": 6.839622641509434, + "grad_norm": 1.5302139520645142, + "learning_rate": 1.3735083824146793e-05, + "loss": 0.2903, + "num_input_tokens_seen": 8513960, + "step": 13050 + }, + { + "epoch": 6.8422431865828095, + "grad_norm": 1.566707968711853, + "learning_rate": 1.3714673135285316e-05, + "loss": 0.2628, + "num_input_tokens_seen": 8517768, + "step": 13055 + }, + { + "epoch": 6.844863731656185, + "grad_norm": 1.4743281602859497, + "learning_rate": 1.3694271889469844e-05, + "loss": 0.3487, + "num_input_tokens_seen": 8521640, + "step": 13060 + }, + { + "epoch": 6.84748427672956, + "grad_norm": 1.6817585229873657, + "learning_rate": 1.3673880103771241e-05, + "loss": 0.2382, + "num_input_tokens_seen": 8524968, + "step": 13065 + }, + { + "epoch": 6.850104821802935, + "grad_norm": 2.306931734085083, + "learning_rate": 1.365349779525241e-05, + "loss": 0.4017, + "num_input_tokens_seen": 8528200, + "step": 13070 + }, + { + "epoch": 6.85272536687631, + "grad_norm": 1.5429435968399048, + "learning_rate": 1.3633124980968327e-05, + "loss": 0.2469, + "num_input_tokens_seen": 8531144, + "step": 13075 + }, + { + "epoch": 6.855345911949685, + "grad_norm": 2.0534563064575195, + "learning_rate": 1.3612761677966051e-05, + "loss": 0.279, + "num_input_tokens_seen": 8535240, + "step": 13080 + }, + { + "epoch": 6.8579664570230605, + "grad_norm": 1.6095243692398071, + "learning_rate": 1.3592407903284654e-05, + "loss": 0.2777, + "num_input_tokens_seen": 8537352, + "step": 13085 + }, + { + "epoch": 6.860587002096436, + "grad_norm": 2.2034800052642822, + "learning_rate": 1.3572063673955238e-05, + "loss": 0.2601, + "num_input_tokens_seen": 8540776, + "step": 13090 + }, + { + "epoch": 6.863207547169811, + "grad_norm": 1.6925619840621948, + "learning_rate": 1.355172900700095e-05, + "loss": 0.2952, + "num_input_tokens_seen": 8543944, + "step": 13095 + }, + { + "epoch": 6.865828092243187, + "grad_norm": 2.7255196571350098, + "learning_rate": 1.3531403919436875e-05, + "loss": 0.2946, + "num_input_tokens_seen": 8546984, + "step": 13100 + }, + { + "epoch": 6.868448637316562, + "grad_norm": 4.533089637756348, + "learning_rate": 1.3511088428270142e-05, + "loss": 0.5922, + "num_input_tokens_seen": 8550376, + "step": 13105 + }, + { + "epoch": 6.871069182389937, + "grad_norm": 2.2692296504974365, + "learning_rate": 1.3490782550499823e-05, + "loss": 0.2754, + "num_input_tokens_seen": 8553512, + "step": 13110 + }, + { + "epoch": 6.873689727463312, + "grad_norm": 1.5050545930862427, + "learning_rate": 1.3470486303116936e-05, + "loss": 0.2592, + "num_input_tokens_seen": 8557320, + "step": 13115 + }, + { + "epoch": 6.876310272536688, + "grad_norm": 1.6262577772140503, + "learning_rate": 1.3450199703104471e-05, + "loss": 0.2238, + "num_input_tokens_seen": 8559848, + "step": 13120 + }, + { + "epoch": 6.878930817610063, + "grad_norm": 1.0808504819869995, + "learning_rate": 1.3429922767437319e-05, + "loss": 0.1813, + "num_input_tokens_seen": 8563400, + "step": 13125 + }, + { + "epoch": 6.881551362683438, + "grad_norm": 1.998358130455017, + "learning_rate": 1.3409655513082291e-05, + "loss": 0.2723, + "num_input_tokens_seen": 8566728, + "step": 13130 + }, + { + "epoch": 6.884171907756813, + "grad_norm": 2.1453781127929688, + "learning_rate": 1.3389397956998111e-05, + "loss": 0.2777, + "num_input_tokens_seen": 8570056, + "step": 13135 + }, + { + "epoch": 6.886792452830189, + "grad_norm": 1.5957955121994019, + "learning_rate": 1.336915011613537e-05, + "loss": 0.2468, + "num_input_tokens_seen": 8573064, + "step": 13140 + }, + { + "epoch": 6.889412997903564, + "grad_norm": 1.8406596183776855, + "learning_rate": 1.3348912007436537e-05, + "loss": 0.3454, + "num_input_tokens_seen": 8577640, + "step": 13145 + }, + { + "epoch": 6.8920335429769395, + "grad_norm": 2.162998676300049, + "learning_rate": 1.3328683647835933e-05, + "loss": 0.3387, + "num_input_tokens_seen": 8581256, + "step": 13150 + }, + { + "epoch": 6.894654088050315, + "grad_norm": 1.7255654335021973, + "learning_rate": 1.330846505425972e-05, + "loss": 0.2171, + "num_input_tokens_seen": 8584168, + "step": 13155 + }, + { + "epoch": 6.89727463312369, + "grad_norm": 1.1394784450531006, + "learning_rate": 1.3288256243625911e-05, + "loss": 0.3158, + "num_input_tokens_seen": 8587432, + "step": 13160 + }, + { + "epoch": 6.899895178197065, + "grad_norm": 1.3700053691864014, + "learning_rate": 1.3268057232844305e-05, + "loss": 0.2749, + "num_input_tokens_seen": 8590728, + "step": 13165 + }, + { + "epoch": 6.90251572327044, + "grad_norm": 1.7068361043930054, + "learning_rate": 1.3247868038816504e-05, + "loss": 0.2117, + "num_input_tokens_seen": 8594120, + "step": 13170 + }, + { + "epoch": 6.905136268343815, + "grad_norm": 1.0701546669006348, + "learning_rate": 1.3227688678435924e-05, + "loss": 0.3051, + "num_input_tokens_seen": 8596904, + "step": 13175 + }, + { + "epoch": 6.9077568134171905, + "grad_norm": 1.4255445003509521, + "learning_rate": 1.3207519168587717e-05, + "loss": 0.2514, + "num_input_tokens_seen": 8599848, + "step": 13180 + }, + { + "epoch": 6.910377358490566, + "grad_norm": 1.3703550100326538, + "learning_rate": 1.3187359526148813e-05, + "loss": 0.2684, + "num_input_tokens_seen": 8603272, + "step": 13185 + }, + { + "epoch": 6.912997903563941, + "grad_norm": 1.3654152154922485, + "learning_rate": 1.3167209767987868e-05, + "loss": 0.2782, + "num_input_tokens_seen": 8607240, + "step": 13190 + }, + { + "epoch": 6.915618448637317, + "grad_norm": 6.644425392150879, + "learning_rate": 1.3147069910965298e-05, + "loss": 0.3342, + "num_input_tokens_seen": 8613160, + "step": 13195 + }, + { + "epoch": 6.918238993710692, + "grad_norm": 1.2713948488235474, + "learning_rate": 1.3126939971933205e-05, + "loss": 0.278, + "num_input_tokens_seen": 8617480, + "step": 13200 + }, + { + "epoch": 6.920859538784067, + "grad_norm": 1.7662363052368164, + "learning_rate": 1.3106819967735395e-05, + "loss": 0.2108, + "num_input_tokens_seen": 8620520, + "step": 13205 + }, + { + "epoch": 6.923480083857442, + "grad_norm": 1.4171288013458252, + "learning_rate": 1.3086709915207388e-05, + "loss": 0.1676, + "num_input_tokens_seen": 8626568, + "step": 13210 + }, + { + "epoch": 6.926100628930818, + "grad_norm": 1.6605417728424072, + "learning_rate": 1.3066609831176346e-05, + "loss": 0.275, + "num_input_tokens_seen": 8630184, + "step": 13215 + }, + { + "epoch": 6.928721174004193, + "grad_norm": 1.835072636604309, + "learning_rate": 1.3046519732461094e-05, + "loss": 0.2322, + "num_input_tokens_seen": 8633576, + "step": 13220 + }, + { + "epoch": 6.931341719077568, + "grad_norm": 1.2948498725891113, + "learning_rate": 1.302643963587213e-05, + "loss": 0.2732, + "num_input_tokens_seen": 8636520, + "step": 13225 + }, + { + "epoch": 6.933962264150943, + "grad_norm": 1.6800092458724976, + "learning_rate": 1.3006369558211534e-05, + "loss": 0.1968, + "num_input_tokens_seen": 8639816, + "step": 13230 + }, + { + "epoch": 6.936582809224319, + "grad_norm": 1.730649471282959, + "learning_rate": 1.2986309516273043e-05, + "loss": 0.2957, + "num_input_tokens_seen": 8642888, + "step": 13235 + }, + { + "epoch": 6.939203354297694, + "grad_norm": 1.1990939378738403, + "learning_rate": 1.2966259526842006e-05, + "loss": 0.292, + "num_input_tokens_seen": 8645704, + "step": 13240 + }, + { + "epoch": 6.9418238993710695, + "grad_norm": 1.7116172313690186, + "learning_rate": 1.2946219606695297e-05, + "loss": 0.2311, + "num_input_tokens_seen": 8648648, + "step": 13245 + }, + { + "epoch": 6.944444444444445, + "grad_norm": 1.8319426774978638, + "learning_rate": 1.2926189772601438e-05, + "loss": 0.2336, + "num_input_tokens_seen": 8651560, + "step": 13250 + }, + { + "epoch": 6.94706498951782, + "grad_norm": 1.022766351699829, + "learning_rate": 1.2906170041320468e-05, + "loss": 0.3147, + "num_input_tokens_seen": 8655208, + "step": 13255 + }, + { + "epoch": 6.949685534591195, + "grad_norm": 1.5704644918441772, + "learning_rate": 1.2886160429603972e-05, + "loss": 0.1921, + "num_input_tokens_seen": 8657576, + "step": 13260 + }, + { + "epoch": 6.95230607966457, + "grad_norm": 1.8592151403427124, + "learning_rate": 1.2866160954195112e-05, + "loss": 0.4246, + "num_input_tokens_seen": 8660872, + "step": 13265 + }, + { + "epoch": 6.954926624737945, + "grad_norm": 1.3657832145690918, + "learning_rate": 1.284617163182849e-05, + "loss": 0.249, + "num_input_tokens_seen": 8665192, + "step": 13270 + }, + { + "epoch": 6.9575471698113205, + "grad_norm": 0.8552311658859253, + "learning_rate": 1.2826192479230287e-05, + "loss": 0.2494, + "num_input_tokens_seen": 8670088, + "step": 13275 + }, + { + "epoch": 6.960167714884696, + "grad_norm": 3.63655948638916, + "learning_rate": 1.2806223513118154e-05, + "loss": 0.2563, + "num_input_tokens_seen": 8673576, + "step": 13280 + }, + { + "epoch": 6.962788259958071, + "grad_norm": 1.5391684770584106, + "learning_rate": 1.2786264750201182e-05, + "loss": 0.3908, + "num_input_tokens_seen": 8676424, + "step": 13285 + }, + { + "epoch": 6.965408805031447, + "grad_norm": 2.395314931869507, + "learning_rate": 1.2766316207179973e-05, + "loss": 0.2539, + "num_input_tokens_seen": 8679912, + "step": 13290 + }, + { + "epoch": 6.968029350104822, + "grad_norm": 1.2237796783447266, + "learning_rate": 1.2746377900746548e-05, + "loss": 0.4889, + "num_input_tokens_seen": 8683880, + "step": 13295 + }, + { + "epoch": 6.970649895178197, + "grad_norm": 1.3379462957382202, + "learning_rate": 1.2726449847584365e-05, + "loss": 0.3536, + "num_input_tokens_seen": 8687784, + "step": 13300 + }, + { + "epoch": 6.973270440251572, + "grad_norm": 1.7921833992004395, + "learning_rate": 1.2706532064368326e-05, + "loss": 0.2638, + "num_input_tokens_seen": 8690920, + "step": 13305 + }, + { + "epoch": 6.975890985324948, + "grad_norm": 1.4342195987701416, + "learning_rate": 1.268662456776471e-05, + "loss": 0.206, + "num_input_tokens_seen": 8693960, + "step": 13310 + }, + { + "epoch": 6.978511530398323, + "grad_norm": 3.6345055103302, + "learning_rate": 1.2666727374431198e-05, + "loss": 0.2536, + "num_input_tokens_seen": 8697640, + "step": 13315 + }, + { + "epoch": 6.981132075471698, + "grad_norm": 1.7242456674575806, + "learning_rate": 1.2646840501016863e-05, + "loss": 0.4112, + "num_input_tokens_seen": 8699912, + "step": 13320 + }, + { + "epoch": 6.983752620545073, + "grad_norm": 1.960410475730896, + "learning_rate": 1.262696396416213e-05, + "loss": 0.3555, + "num_input_tokens_seen": 8703240, + "step": 13325 + }, + { + "epoch": 6.986373165618449, + "grad_norm": 1.3457996845245361, + "learning_rate": 1.2607097780498772e-05, + "loss": 0.2033, + "num_input_tokens_seen": 8706376, + "step": 13330 + }, + { + "epoch": 6.988993710691824, + "grad_norm": 1.5688949823379517, + "learning_rate": 1.2587241966649908e-05, + "loss": 0.3375, + "num_input_tokens_seen": 8709224, + "step": 13335 + }, + { + "epoch": 6.9916142557651995, + "grad_norm": 2.704617500305176, + "learning_rate": 1.2567396539229965e-05, + "loss": 0.3022, + "num_input_tokens_seen": 8711816, + "step": 13340 + }, + { + "epoch": 6.994234800838575, + "grad_norm": 3.2293365001678467, + "learning_rate": 1.2547561514844704e-05, + "loss": 0.2622, + "num_input_tokens_seen": 8715496, + "step": 13345 + }, + { + "epoch": 6.99685534591195, + "grad_norm": 1.4486802816390991, + "learning_rate": 1.2527736910091168e-05, + "loss": 0.2655, + "num_input_tokens_seen": 8718312, + "step": 13350 + }, + { + "epoch": 6.999475890985325, + "grad_norm": 2.092552423477173, + "learning_rate": 1.2507922741557665e-05, + "loss": 0.2524, + "num_input_tokens_seen": 8722632, + "step": 13355 + }, + { + "epoch": 7.0, + "eval_loss": 0.5758277773857117, + "eval_runtime": 15.9707, + "eval_samples_per_second": 53.097, + "eval_steps_per_second": 13.274, + "num_input_tokens_seen": 8722744, + "step": 13356 + }, + { + "epoch": 7.0020964360587, + "grad_norm": 2.3001608848571777, + "learning_rate": 1.2488119025823802e-05, + "loss": 0.219, + "num_input_tokens_seen": 8725944, + "step": 13360 + }, + { + "epoch": 7.004716981132075, + "grad_norm": 2.1127822399139404, + "learning_rate": 1.2468325779460424e-05, + "loss": 0.2501, + "num_input_tokens_seen": 8729592, + "step": 13365 + }, + { + "epoch": 7.0073375262054505, + "grad_norm": 1.3587021827697754, + "learning_rate": 1.2448543019029607e-05, + "loss": 0.2089, + "num_input_tokens_seen": 8734264, + "step": 13370 + }, + { + "epoch": 7.009958071278826, + "grad_norm": 1.2044236660003662, + "learning_rate": 1.2428770761084655e-05, + "loss": 0.2021, + "num_input_tokens_seen": 8738744, + "step": 13375 + }, + { + "epoch": 7.012578616352202, + "grad_norm": 1.2784289121627808, + "learning_rate": 1.2409009022170109e-05, + "loss": 0.1968, + "num_input_tokens_seen": 8741720, + "step": 13380 + }, + { + "epoch": 7.015199161425577, + "grad_norm": 1.7683532238006592, + "learning_rate": 1.2389257818821679e-05, + "loss": 0.2267, + "num_input_tokens_seen": 8745112, + "step": 13385 + }, + { + "epoch": 7.017819706498952, + "grad_norm": 3.2522521018981934, + "learning_rate": 1.236951716756626e-05, + "loss": 0.2023, + "num_input_tokens_seen": 8747960, + "step": 13390 + }, + { + "epoch": 7.020440251572327, + "grad_norm": 1.5093135833740234, + "learning_rate": 1.2349787084921952e-05, + "loss": 0.2716, + "num_input_tokens_seen": 8750808, + "step": 13395 + }, + { + "epoch": 7.023060796645702, + "grad_norm": 12.189131736755371, + "learning_rate": 1.233006758739797e-05, + "loss": 0.2397, + "num_input_tokens_seen": 8753368, + "step": 13400 + }, + { + "epoch": 7.0256813417190775, + "grad_norm": 2.4385814666748047, + "learning_rate": 1.2310358691494681e-05, + "loss": 0.2199, + "num_input_tokens_seen": 8756056, + "step": 13405 + }, + { + "epoch": 7.028301886792453, + "grad_norm": 2.1352012157440186, + "learning_rate": 1.229066041370362e-05, + "loss": 0.2535, + "num_input_tokens_seen": 8759256, + "step": 13410 + }, + { + "epoch": 7.030922431865828, + "grad_norm": 2.1002278327941895, + "learning_rate": 1.2270972770507364e-05, + "loss": 0.1767, + "num_input_tokens_seen": 8763032, + "step": 13415 + }, + { + "epoch": 7.033542976939203, + "grad_norm": 3.5407724380493164, + "learning_rate": 1.2251295778379657e-05, + "loss": 0.1379, + "num_input_tokens_seen": 8766072, + "step": 13420 + }, + { + "epoch": 7.036163522012578, + "grad_norm": 2.922335386276245, + "learning_rate": 1.2231629453785324e-05, + "loss": 0.2105, + "num_input_tokens_seen": 8769976, + "step": 13425 + }, + { + "epoch": 7.038784067085954, + "grad_norm": 1.6956243515014648, + "learning_rate": 1.2211973813180209e-05, + "loss": 0.2681, + "num_input_tokens_seen": 8773528, + "step": 13430 + }, + { + "epoch": 7.0414046121593294, + "grad_norm": 3.157125949859619, + "learning_rate": 1.2192328873011283e-05, + "loss": 0.2309, + "num_input_tokens_seen": 8776440, + "step": 13435 + }, + { + "epoch": 7.044025157232705, + "grad_norm": 1.8394790887832642, + "learning_rate": 1.2172694649716524e-05, + "loss": 0.1957, + "num_input_tokens_seen": 8780408, + "step": 13440 + }, + { + "epoch": 7.04664570230608, + "grad_norm": 2.011281967163086, + "learning_rate": 1.2153071159724947e-05, + "loss": 0.2221, + "num_input_tokens_seen": 8782872, + "step": 13445 + }, + { + "epoch": 7.049266247379455, + "grad_norm": 1.3316665887832642, + "learning_rate": 1.2133458419456614e-05, + "loss": 0.2557, + "num_input_tokens_seen": 8786776, + "step": 13450 + }, + { + "epoch": 7.05188679245283, + "grad_norm": 1.5268054008483887, + "learning_rate": 1.2113856445322541e-05, + "loss": 0.2246, + "num_input_tokens_seen": 8791576, + "step": 13455 + }, + { + "epoch": 7.054507337526205, + "grad_norm": 1.7863059043884277, + "learning_rate": 1.2094265253724777e-05, + "loss": 0.1734, + "num_input_tokens_seen": 8794072, + "step": 13460 + }, + { + "epoch": 7.0571278825995805, + "grad_norm": 1.036874532699585, + "learning_rate": 1.207468486105636e-05, + "loss": 0.2571, + "num_input_tokens_seen": 8798424, + "step": 13465 + }, + { + "epoch": 7.059748427672956, + "grad_norm": 1.9720077514648438, + "learning_rate": 1.2055115283701224e-05, + "loss": 0.1893, + "num_input_tokens_seen": 8801208, + "step": 13470 + }, + { + "epoch": 7.062368972746331, + "grad_norm": 1.7736713886260986, + "learning_rate": 1.2035556538034332e-05, + "loss": 0.2431, + "num_input_tokens_seen": 8804472, + "step": 13475 + }, + { + "epoch": 7.064989517819707, + "grad_norm": 1.9962832927703857, + "learning_rate": 1.2016008640421533e-05, + "loss": 0.2932, + "num_input_tokens_seen": 8808152, + "step": 13480 + }, + { + "epoch": 7.067610062893082, + "grad_norm": 1.4793835878372192, + "learning_rate": 1.1996471607219612e-05, + "loss": 0.2055, + "num_input_tokens_seen": 8811352, + "step": 13485 + }, + { + "epoch": 7.070230607966457, + "grad_norm": 2.5994668006896973, + "learning_rate": 1.1976945454776284e-05, + "loss": 0.1911, + "num_input_tokens_seen": 8814328, + "step": 13490 + }, + { + "epoch": 7.072851153039832, + "grad_norm": 1.470081090927124, + "learning_rate": 1.1957430199430128e-05, + "loss": 0.2598, + "num_input_tokens_seen": 8817976, + "step": 13495 + }, + { + "epoch": 7.0754716981132075, + "grad_norm": 1.1828618049621582, + "learning_rate": 1.1937925857510609e-05, + "loss": 0.1713, + "num_input_tokens_seen": 8820856, + "step": 13500 + }, + { + "epoch": 7.078092243186583, + "grad_norm": 2.6379308700561523, + "learning_rate": 1.1918432445338092e-05, + "loss": 0.3403, + "num_input_tokens_seen": 8824120, + "step": 13505 + }, + { + "epoch": 7.080712788259958, + "grad_norm": 1.535631775856018, + "learning_rate": 1.1898949979223765e-05, + "loss": 0.2135, + "num_input_tokens_seen": 8827192, + "step": 13510 + }, + { + "epoch": 7.083333333333333, + "grad_norm": 1.8702434301376343, + "learning_rate": 1.187947847546966e-05, + "loss": 0.1911, + "num_input_tokens_seen": 8831064, + "step": 13515 + }, + { + "epoch": 7.085953878406708, + "grad_norm": 1.5607421398162842, + "learning_rate": 1.1860017950368646e-05, + "loss": 0.2925, + "num_input_tokens_seen": 8834200, + "step": 13520 + }, + { + "epoch": 7.088574423480084, + "grad_norm": 1.425235629081726, + "learning_rate": 1.1840568420204392e-05, + "loss": 0.1694, + "num_input_tokens_seen": 8837240, + "step": 13525 + }, + { + "epoch": 7.091194968553459, + "grad_norm": 1.414704442024231, + "learning_rate": 1.1821129901251396e-05, + "loss": 0.3288, + "num_input_tokens_seen": 8841976, + "step": 13530 + }, + { + "epoch": 7.093815513626835, + "grad_norm": 2.8193037509918213, + "learning_rate": 1.1801702409774909e-05, + "loss": 0.184, + "num_input_tokens_seen": 8845336, + "step": 13535 + }, + { + "epoch": 7.09643605870021, + "grad_norm": 2.0196049213409424, + "learning_rate": 1.1782285962030965e-05, + "loss": 0.2175, + "num_input_tokens_seen": 8848184, + "step": 13540 + }, + { + "epoch": 7.099056603773585, + "grad_norm": 3.513864040374756, + "learning_rate": 1.1762880574266374e-05, + "loss": 0.2272, + "num_input_tokens_seen": 8850840, + "step": 13545 + }, + { + "epoch": 7.10167714884696, + "grad_norm": 8.313050270080566, + "learning_rate": 1.1743486262718673e-05, + "loss": 0.3258, + "num_input_tokens_seen": 8854296, + "step": 13550 + }, + { + "epoch": 7.104297693920335, + "grad_norm": 2.0631215572357178, + "learning_rate": 1.1724103043616134e-05, + "loss": 0.184, + "num_input_tokens_seen": 8857560, + "step": 13555 + }, + { + "epoch": 7.1069182389937104, + "grad_norm": 1.2310103178024292, + "learning_rate": 1.1704730933177738e-05, + "loss": 0.2146, + "num_input_tokens_seen": 8861496, + "step": 13560 + }, + { + "epoch": 7.109538784067086, + "grad_norm": 1.7559232711791992, + "learning_rate": 1.1685369947613204e-05, + "loss": 0.2205, + "num_input_tokens_seen": 8864344, + "step": 13565 + }, + { + "epoch": 7.112159329140461, + "grad_norm": 1.2440334558486938, + "learning_rate": 1.1666020103122907e-05, + "loss": 0.3153, + "num_input_tokens_seen": 8868152, + "step": 13570 + }, + { + "epoch": 7.114779874213837, + "grad_norm": 2.427042007446289, + "learning_rate": 1.1646681415897912e-05, + "loss": 0.1826, + "num_input_tokens_seen": 8870776, + "step": 13575 + }, + { + "epoch": 7.117400419287212, + "grad_norm": 1.8634577989578247, + "learning_rate": 1.1627353902119958e-05, + "loss": 0.2598, + "num_input_tokens_seen": 8873944, + "step": 13580 + }, + { + "epoch": 7.120020964360587, + "grad_norm": 3.164578437805176, + "learning_rate": 1.1608037577961423e-05, + "loss": 0.2215, + "num_input_tokens_seen": 8876792, + "step": 13585 + }, + { + "epoch": 7.122641509433962, + "grad_norm": 1.6248865127563477, + "learning_rate": 1.158873245958531e-05, + "loss": 0.2525, + "num_input_tokens_seen": 8879672, + "step": 13590 + }, + { + "epoch": 7.1252620545073375, + "grad_norm": 3.177460193634033, + "learning_rate": 1.1569438563145297e-05, + "loss": 0.1689, + "num_input_tokens_seen": 8882904, + "step": 13595 + }, + { + "epoch": 7.127882599580713, + "grad_norm": 1.6751197576522827, + "learning_rate": 1.1550155904785587e-05, + "loss": 0.1776, + "num_input_tokens_seen": 8886328, + "step": 13600 + }, + { + "epoch": 7.130503144654088, + "grad_norm": 2.9571640491485596, + "learning_rate": 1.1530884500641063e-05, + "loss": 0.1834, + "num_input_tokens_seen": 8888888, + "step": 13605 + }, + { + "epoch": 7.133123689727463, + "grad_norm": 2.0531914234161377, + "learning_rate": 1.1511624366837143e-05, + "loss": 0.2092, + "num_input_tokens_seen": 8892536, + "step": 13610 + }, + { + "epoch": 7.135744234800838, + "grad_norm": 1.3483970165252686, + "learning_rate": 1.149237551948982e-05, + "loss": 0.2106, + "num_input_tokens_seen": 8895768, + "step": 13615 + }, + { + "epoch": 7.138364779874214, + "grad_norm": 2.129277467727661, + "learning_rate": 1.147313797470567e-05, + "loss": 0.2071, + "num_input_tokens_seen": 8899320, + "step": 13620 + }, + { + "epoch": 7.140985324947589, + "grad_norm": 2.1186089515686035, + "learning_rate": 1.1453911748581778e-05, + "loss": 0.2532, + "num_input_tokens_seen": 8901816, + "step": 13625 + }, + { + "epoch": 7.143605870020965, + "grad_norm": 1.3478468656539917, + "learning_rate": 1.1434696857205765e-05, + "loss": 0.1748, + "num_input_tokens_seen": 8904984, + "step": 13630 + }, + { + "epoch": 7.14622641509434, + "grad_norm": 1.4618072509765625, + "learning_rate": 1.1415493316655804e-05, + "loss": 0.2545, + "num_input_tokens_seen": 8909272, + "step": 13635 + }, + { + "epoch": 7.148846960167715, + "grad_norm": 1.7802257537841797, + "learning_rate": 1.1396301143000499e-05, + "loss": 0.226, + "num_input_tokens_seen": 8912088, + "step": 13640 + }, + { + "epoch": 7.15146750524109, + "grad_norm": 1.6934123039245605, + "learning_rate": 1.1377120352299014e-05, + "loss": 0.2636, + "num_input_tokens_seen": 8915160, + "step": 13645 + }, + { + "epoch": 7.154088050314465, + "grad_norm": 1.5942950248718262, + "learning_rate": 1.1357950960600955e-05, + "loss": 0.2147, + "num_input_tokens_seen": 8918200, + "step": 13650 + }, + { + "epoch": 7.15670859538784, + "grad_norm": 1.7299551963806152, + "learning_rate": 1.1338792983946376e-05, + "loss": 0.1564, + "num_input_tokens_seen": 8921464, + "step": 13655 + }, + { + "epoch": 7.159329140461216, + "grad_norm": 3.036987781524658, + "learning_rate": 1.1319646438365817e-05, + "loss": 0.2066, + "num_input_tokens_seen": 8924344, + "step": 13660 + }, + { + "epoch": 7.161949685534591, + "grad_norm": 2.5489280223846436, + "learning_rate": 1.1300511339880227e-05, + "loss": 0.1724, + "num_input_tokens_seen": 8927352, + "step": 13665 + }, + { + "epoch": 7.164570230607967, + "grad_norm": 1.9462271928787231, + "learning_rate": 1.128138770450097e-05, + "loss": 0.2554, + "num_input_tokens_seen": 8930936, + "step": 13670 + }, + { + "epoch": 7.167190775681342, + "grad_norm": 16.061914443969727, + "learning_rate": 1.126227554822985e-05, + "loss": 0.2985, + "num_input_tokens_seen": 8934296, + "step": 13675 + }, + { + "epoch": 7.169811320754717, + "grad_norm": 0.882878303527832, + "learning_rate": 1.1243174887059038e-05, + "loss": 0.1586, + "num_input_tokens_seen": 8937016, + "step": 13680 + }, + { + "epoch": 7.172431865828092, + "grad_norm": 2.070096492767334, + "learning_rate": 1.1224085736971093e-05, + "loss": 0.1678, + "num_input_tokens_seen": 8940792, + "step": 13685 + }, + { + "epoch": 7.1750524109014675, + "grad_norm": 1.7648677825927734, + "learning_rate": 1.1205008113938934e-05, + "loss": 0.2842, + "num_input_tokens_seen": 8944024, + "step": 13690 + }, + { + "epoch": 7.177672955974843, + "grad_norm": 3.1623260974884033, + "learning_rate": 1.1185942033925867e-05, + "loss": 0.2377, + "num_input_tokens_seen": 8946872, + "step": 13695 + }, + { + "epoch": 7.180293501048218, + "grad_norm": 1.5460522174835205, + "learning_rate": 1.1166887512885505e-05, + "loss": 0.2116, + "num_input_tokens_seen": 8950168, + "step": 13700 + }, + { + "epoch": 7.182914046121593, + "grad_norm": 5.3068342208862305, + "learning_rate": 1.11478445667618e-05, + "loss": 0.2368, + "num_input_tokens_seen": 8953112, + "step": 13705 + }, + { + "epoch": 7.185534591194968, + "grad_norm": 2.3596906661987305, + "learning_rate": 1.1128813211489012e-05, + "loss": 0.1892, + "num_input_tokens_seen": 8956312, + "step": 13710 + }, + { + "epoch": 7.188155136268344, + "grad_norm": 4.089278697967529, + "learning_rate": 1.1109793462991725e-05, + "loss": 0.1995, + "num_input_tokens_seen": 8959032, + "step": 13715 + }, + { + "epoch": 7.190775681341719, + "grad_norm": 2.6336753368377686, + "learning_rate": 1.109078533718479e-05, + "loss": 0.3119, + "num_input_tokens_seen": 8962744, + "step": 13720 + }, + { + "epoch": 7.193396226415095, + "grad_norm": 1.6725581884384155, + "learning_rate": 1.107178884997334e-05, + "loss": 0.2687, + "num_input_tokens_seen": 8966520, + "step": 13725 + }, + { + "epoch": 7.19601677148847, + "grad_norm": 1.9628137350082397, + "learning_rate": 1.1052804017252751e-05, + "loss": 0.2367, + "num_input_tokens_seen": 8969304, + "step": 13730 + }, + { + "epoch": 7.198637316561845, + "grad_norm": 1.3592787981033325, + "learning_rate": 1.1033830854908691e-05, + "loss": 0.3163, + "num_input_tokens_seen": 8973432, + "step": 13735 + }, + { + "epoch": 7.20125786163522, + "grad_norm": 2.711794376373291, + "learning_rate": 1.1014869378817022e-05, + "loss": 0.2272, + "num_input_tokens_seen": 8976248, + "step": 13740 + }, + { + "epoch": 7.203878406708595, + "grad_norm": 1.4492549896240234, + "learning_rate": 1.0995919604843832e-05, + "loss": 0.2402, + "num_input_tokens_seen": 8980792, + "step": 13745 + }, + { + "epoch": 7.20649895178197, + "grad_norm": 1.6394906044006348, + "learning_rate": 1.0976981548845444e-05, + "loss": 0.2661, + "num_input_tokens_seen": 8983480, + "step": 13750 + }, + { + "epoch": 7.209119496855346, + "grad_norm": 1.6296336650848389, + "learning_rate": 1.095805522666835e-05, + "loss": 0.203, + "num_input_tokens_seen": 8986200, + "step": 13755 + }, + { + "epoch": 7.211740041928721, + "grad_norm": 2.239989995956421, + "learning_rate": 1.0939140654149225e-05, + "loss": 0.2238, + "num_input_tokens_seen": 8989816, + "step": 13760 + }, + { + "epoch": 7.214360587002097, + "grad_norm": 2.120915412902832, + "learning_rate": 1.0920237847114944e-05, + "loss": 0.1221, + "num_input_tokens_seen": 8992600, + "step": 13765 + }, + { + "epoch": 7.216981132075472, + "grad_norm": 2.041830062866211, + "learning_rate": 1.0901346821382476e-05, + "loss": 0.236, + "num_input_tokens_seen": 8997432, + "step": 13770 + }, + { + "epoch": 7.219601677148847, + "grad_norm": 2.747438907623291, + "learning_rate": 1.0882467592758989e-05, + "loss": 0.2221, + "num_input_tokens_seen": 8999864, + "step": 13775 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 1.0594990253448486, + "learning_rate": 1.0863600177041772e-05, + "loss": 0.1987, + "num_input_tokens_seen": 9003160, + "step": 13780 + }, + { + "epoch": 7.2248427672955975, + "grad_norm": 2.3293116092681885, + "learning_rate": 1.0844744590018186e-05, + "loss": 0.1959, + "num_input_tokens_seen": 9006040, + "step": 13785 + }, + { + "epoch": 7.227463312368973, + "grad_norm": 2.1205575466156006, + "learning_rate": 1.0825900847465748e-05, + "loss": 0.2257, + "num_input_tokens_seen": 9009624, + "step": 13790 + }, + { + "epoch": 7.230083857442348, + "grad_norm": 1.2028599977493286, + "learning_rate": 1.0807068965152033e-05, + "loss": 0.2179, + "num_input_tokens_seen": 9013912, + "step": 13795 + }, + { + "epoch": 7.232704402515723, + "grad_norm": 2.086912155151367, + "learning_rate": 1.0788248958834695e-05, + "loss": 0.2408, + "num_input_tokens_seen": 9017016, + "step": 13800 + }, + { + "epoch": 7.235324947589098, + "grad_norm": 3.3249478340148926, + "learning_rate": 1.0769440844261481e-05, + "loss": 0.2081, + "num_input_tokens_seen": 9019000, + "step": 13805 + }, + { + "epoch": 7.237945492662474, + "grad_norm": 2.4838709831237793, + "learning_rate": 1.0750644637170122e-05, + "loss": 0.3195, + "num_input_tokens_seen": 9022520, + "step": 13810 + }, + { + "epoch": 7.240566037735849, + "grad_norm": 2.468867301940918, + "learning_rate": 1.0731860353288445e-05, + "loss": 0.2152, + "num_input_tokens_seen": 9025496, + "step": 13815 + }, + { + "epoch": 7.243186582809225, + "grad_norm": 1.7562633752822876, + "learning_rate": 1.0713088008334302e-05, + "loss": 0.2139, + "num_input_tokens_seen": 9029464, + "step": 13820 + }, + { + "epoch": 7.2458071278826, + "grad_norm": 2.6214685440063477, + "learning_rate": 1.0694327618015493e-05, + "loss": 0.2706, + "num_input_tokens_seen": 9032088, + "step": 13825 + }, + { + "epoch": 7.248427672955975, + "grad_norm": 1.2956594228744507, + "learning_rate": 1.0675579198029887e-05, + "loss": 0.2212, + "num_input_tokens_seen": 9035768, + "step": 13830 + }, + { + "epoch": 7.25104821802935, + "grad_norm": 1.8509280681610107, + "learning_rate": 1.0656842764065295e-05, + "loss": 0.2237, + "num_input_tokens_seen": 9038840, + "step": 13835 + }, + { + "epoch": 7.253668763102725, + "grad_norm": 1.9637784957885742, + "learning_rate": 1.0638118331799499e-05, + "loss": 0.1715, + "num_input_tokens_seen": 9042328, + "step": 13840 + }, + { + "epoch": 7.2562893081761, + "grad_norm": 1.7094391584396362, + "learning_rate": 1.061940591690027e-05, + "loss": 0.1769, + "num_input_tokens_seen": 9045336, + "step": 13845 + }, + { + "epoch": 7.258909853249476, + "grad_norm": 1.8281512260437012, + "learning_rate": 1.0600705535025285e-05, + "loss": 0.2673, + "num_input_tokens_seen": 9048312, + "step": 13850 + }, + { + "epoch": 7.261530398322851, + "grad_norm": 1.5726357698440552, + "learning_rate": 1.058201720182217e-05, + "loss": 0.2973, + "num_input_tokens_seen": 9052760, + "step": 13855 + }, + { + "epoch": 7.264150943396227, + "grad_norm": 2.9404163360595703, + "learning_rate": 1.056334093292848e-05, + "loss": 0.3835, + "num_input_tokens_seen": 9055864, + "step": 13860 + }, + { + "epoch": 7.266771488469602, + "grad_norm": 1.5494771003723145, + "learning_rate": 1.054467674397166e-05, + "loss": 0.1898, + "num_input_tokens_seen": 9058776, + "step": 13865 + }, + { + "epoch": 7.269392033542977, + "grad_norm": 2.3538897037506104, + "learning_rate": 1.0526024650569047e-05, + "loss": 0.2986, + "num_input_tokens_seen": 9062360, + "step": 13870 + }, + { + "epoch": 7.272012578616352, + "grad_norm": 1.520703911781311, + "learning_rate": 1.0507384668327852e-05, + "loss": 0.3207, + "num_input_tokens_seen": 9065016, + "step": 13875 + }, + { + "epoch": 7.2746331236897275, + "grad_norm": 1.8793237209320068, + "learning_rate": 1.048875681284518e-05, + "loss": 0.2465, + "num_input_tokens_seen": 9071992, + "step": 13880 + }, + { + "epoch": 7.277253668763103, + "grad_norm": 1.5580955743789673, + "learning_rate": 1.0470141099707959e-05, + "loss": 0.2551, + "num_input_tokens_seen": 9075672, + "step": 13885 + }, + { + "epoch": 7.279874213836478, + "grad_norm": 1.8462719917297363, + "learning_rate": 1.0451537544492968e-05, + "loss": 0.2233, + "num_input_tokens_seen": 9078936, + "step": 13890 + }, + { + "epoch": 7.282494758909853, + "grad_norm": 3.0202417373657227, + "learning_rate": 1.0432946162766805e-05, + "loss": 0.3187, + "num_input_tokens_seen": 9082168, + "step": 13895 + }, + { + "epoch": 7.285115303983228, + "grad_norm": 2.8277249336242676, + "learning_rate": 1.0414366970085906e-05, + "loss": 0.3281, + "num_input_tokens_seen": 9084664, + "step": 13900 + }, + { + "epoch": 7.287735849056604, + "grad_norm": 1.2061214447021484, + "learning_rate": 1.0395799981996479e-05, + "loss": 0.2246, + "num_input_tokens_seen": 9087864, + "step": 13905 + }, + { + "epoch": 7.290356394129979, + "grad_norm": 1.684157371520996, + "learning_rate": 1.0377245214034537e-05, + "loss": 0.2843, + "num_input_tokens_seen": 9091416, + "step": 13910 + }, + { + "epoch": 7.2929769392033545, + "grad_norm": 1.9428437948226929, + "learning_rate": 1.0358702681725848e-05, + "loss": 0.2321, + "num_input_tokens_seen": 9094296, + "step": 13915 + }, + { + "epoch": 7.29559748427673, + "grad_norm": 1.5246329307556152, + "learning_rate": 1.0340172400585977e-05, + "loss": 0.2507, + "num_input_tokens_seen": 9097112, + "step": 13920 + }, + { + "epoch": 7.298218029350105, + "grad_norm": 1.9910820722579956, + "learning_rate": 1.0321654386120205e-05, + "loss": 0.2214, + "num_input_tokens_seen": 9100632, + "step": 13925 + }, + { + "epoch": 7.30083857442348, + "grad_norm": 2.73954701423645, + "learning_rate": 1.0303148653823557e-05, + "loss": 0.2274, + "num_input_tokens_seen": 9103864, + "step": 13930 + }, + { + "epoch": 7.303459119496855, + "grad_norm": 2.4947926998138428, + "learning_rate": 1.0284655219180797e-05, + "loss": 0.2345, + "num_input_tokens_seen": 9107928, + "step": 13935 + }, + { + "epoch": 7.30607966457023, + "grad_norm": 1.6416081190109253, + "learning_rate": 1.026617409766638e-05, + "loss": 0.1884, + "num_input_tokens_seen": 9111384, + "step": 13940 + }, + { + "epoch": 7.308700209643606, + "grad_norm": 2.289095640182495, + "learning_rate": 1.0247705304744457e-05, + "loss": 0.2119, + "num_input_tokens_seen": 9114488, + "step": 13945 + }, + { + "epoch": 7.311320754716981, + "grad_norm": 1.406084418296814, + "learning_rate": 1.0229248855868892e-05, + "loss": 0.1961, + "num_input_tokens_seen": 9117208, + "step": 13950 + }, + { + "epoch": 7.313941299790357, + "grad_norm": 3.8961777687072754, + "learning_rate": 1.0210804766483168e-05, + "loss": 0.2198, + "num_input_tokens_seen": 9121176, + "step": 13955 + }, + { + "epoch": 7.316561844863732, + "grad_norm": 2.7497928142547607, + "learning_rate": 1.019237305202048e-05, + "loss": 0.2953, + "num_input_tokens_seen": 9124216, + "step": 13960 + }, + { + "epoch": 7.319182389937107, + "grad_norm": 2.275792121887207, + "learning_rate": 1.0173953727903634e-05, + "loss": 0.2429, + "num_input_tokens_seen": 9126936, + "step": 13965 + }, + { + "epoch": 7.321802935010482, + "grad_norm": 1.8836416006088257, + "learning_rate": 1.0155546809545077e-05, + "loss": 0.2641, + "num_input_tokens_seen": 9129688, + "step": 13970 + }, + { + "epoch": 7.3244234800838575, + "grad_norm": 1.7572382688522339, + "learning_rate": 1.013715231234689e-05, + "loss": 0.158, + "num_input_tokens_seen": 9132152, + "step": 13975 + }, + { + "epoch": 7.327044025157233, + "grad_norm": 2.158669948577881, + "learning_rate": 1.0118770251700741e-05, + "loss": 0.3088, + "num_input_tokens_seen": 9135352, + "step": 13980 + }, + { + "epoch": 7.329664570230608, + "grad_norm": 4.26284646987915, + "learning_rate": 1.0100400642987886e-05, + "loss": 0.309, + "num_input_tokens_seen": 9139064, + "step": 13985 + }, + { + "epoch": 7.332285115303983, + "grad_norm": 1.9328830242156982, + "learning_rate": 1.0082043501579205e-05, + "loss": 0.2079, + "num_input_tokens_seen": 9141624, + "step": 13990 + }, + { + "epoch": 7.334905660377358, + "grad_norm": 1.599593997001648, + "learning_rate": 1.0063698842835082e-05, + "loss": 0.1691, + "num_input_tokens_seen": 9144120, + "step": 13995 + }, + { + "epoch": 7.337526205450734, + "grad_norm": 1.7296916246414185, + "learning_rate": 1.0045366682105511e-05, + "loss": 0.1957, + "num_input_tokens_seen": 9147384, + "step": 14000 + }, + { + "epoch": 7.340146750524109, + "grad_norm": 1.2465999126434326, + "learning_rate": 1.002704703473e-05, + "loss": 0.2532, + "num_input_tokens_seen": 9150744, + "step": 14005 + }, + { + "epoch": 7.3427672955974845, + "grad_norm": 1.7737622261047363, + "learning_rate": 1.0008739916037585e-05, + "loss": 0.2242, + "num_input_tokens_seen": 9153560, + "step": 14010 + }, + { + "epoch": 7.34538784067086, + "grad_norm": 1.6900724172592163, + "learning_rate": 9.990445341346846e-06, + "loss": 0.169, + "num_input_tokens_seen": 9156696, + "step": 14015 + }, + { + "epoch": 7.348008385744235, + "grad_norm": 1.5204064846038818, + "learning_rate": 9.972163325965833e-06, + "loss": 0.1704, + "num_input_tokens_seen": 9159416, + "step": 14020 + }, + { + "epoch": 7.35062893081761, + "grad_norm": 1.919115662574768, + "learning_rate": 9.953893885192097e-06, + "loss": 0.2243, + "num_input_tokens_seen": 9162584, + "step": 14025 + }, + { + "epoch": 7.353249475890985, + "grad_norm": 2.067452907562256, + "learning_rate": 9.93563703431269e-06, + "loss": 0.2184, + "num_input_tokens_seen": 9165848, + "step": 14030 + }, + { + "epoch": 7.35587002096436, + "grad_norm": 2.7281055450439453, + "learning_rate": 9.917392788604097e-06, + "loss": 0.2478, + "num_input_tokens_seen": 9168600, + "step": 14035 + }, + { + "epoch": 7.3584905660377355, + "grad_norm": 1.652585744857788, + "learning_rate": 9.899161163332274e-06, + "loss": 0.202, + "num_input_tokens_seen": 9173336, + "step": 14040 + }, + { + "epoch": 7.361111111111111, + "grad_norm": 1.6154171228408813, + "learning_rate": 9.880942173752602e-06, + "loss": 0.2193, + "num_input_tokens_seen": 9176376, + "step": 14045 + }, + { + "epoch": 7.363731656184487, + "grad_norm": 2.177863121032715, + "learning_rate": 9.862735835109915e-06, + "loss": 0.2117, + "num_input_tokens_seen": 9180088, + "step": 14050 + }, + { + "epoch": 7.366352201257862, + "grad_norm": 2.7955217361450195, + "learning_rate": 9.844542162638442e-06, + "loss": 0.1717, + "num_input_tokens_seen": 9183064, + "step": 14055 + }, + { + "epoch": 7.368972746331237, + "grad_norm": 2.5332114696502686, + "learning_rate": 9.826361171561802e-06, + "loss": 0.2263, + "num_input_tokens_seen": 9186040, + "step": 14060 + }, + { + "epoch": 7.371593291404612, + "grad_norm": 1.9945125579833984, + "learning_rate": 9.808192877093039e-06, + "loss": 0.2595, + "num_input_tokens_seen": 9189080, + "step": 14065 + }, + { + "epoch": 7.3742138364779874, + "grad_norm": 1.4229518175125122, + "learning_rate": 9.790037294434545e-06, + "loss": 0.358, + "num_input_tokens_seen": 9191992, + "step": 14070 + }, + { + "epoch": 7.376834381551363, + "grad_norm": 1.987026572227478, + "learning_rate": 9.771894438778075e-06, + "loss": 0.2183, + "num_input_tokens_seen": 9194520, + "step": 14075 + }, + { + "epoch": 7.379454926624738, + "grad_norm": 3.38374924659729, + "learning_rate": 9.753764325304751e-06, + "loss": 0.2615, + "num_input_tokens_seen": 9196632, + "step": 14080 + }, + { + "epoch": 7.382075471698113, + "grad_norm": 2.836859703063965, + "learning_rate": 9.735646969185008e-06, + "loss": 0.2662, + "num_input_tokens_seen": 9199832, + "step": 14085 + }, + { + "epoch": 7.384696016771488, + "grad_norm": 1.8199838399887085, + "learning_rate": 9.717542385578645e-06, + "loss": 0.2527, + "num_input_tokens_seen": 9203384, + "step": 14090 + }, + { + "epoch": 7.387316561844864, + "grad_norm": 2.1033935546875, + "learning_rate": 9.699450589634736e-06, + "loss": 0.206, + "num_input_tokens_seen": 9206840, + "step": 14095 + }, + { + "epoch": 7.389937106918239, + "grad_norm": 2.1317977905273438, + "learning_rate": 9.681371596491665e-06, + "loss": 0.2834, + "num_input_tokens_seen": 9209432, + "step": 14100 + }, + { + "epoch": 7.3925576519916145, + "grad_norm": 1.8817282915115356, + "learning_rate": 9.663305421277125e-06, + "loss": 0.2215, + "num_input_tokens_seen": 9212056, + "step": 14105 + }, + { + "epoch": 7.39517819706499, + "grad_norm": 1.6570310592651367, + "learning_rate": 9.645252079108055e-06, + "loss": 0.1583, + "num_input_tokens_seen": 9215224, + "step": 14110 + }, + { + "epoch": 7.397798742138365, + "grad_norm": 1.595542073249817, + "learning_rate": 9.62721158509066e-06, + "loss": 0.2145, + "num_input_tokens_seen": 9219672, + "step": 14115 + }, + { + "epoch": 7.40041928721174, + "grad_norm": 2.5902297496795654, + "learning_rate": 9.609183954320425e-06, + "loss": 0.2131, + "num_input_tokens_seen": 9222616, + "step": 14120 + }, + { + "epoch": 7.403039832285115, + "grad_norm": 0.9838438630104065, + "learning_rate": 9.59116920188202e-06, + "loss": 0.3921, + "num_input_tokens_seen": 9227032, + "step": 14125 + }, + { + "epoch": 7.40566037735849, + "grad_norm": 3.8878896236419678, + "learning_rate": 9.573167342849375e-06, + "loss": 0.2597, + "num_input_tokens_seen": 9229688, + "step": 14130 + }, + { + "epoch": 7.4082809224318655, + "grad_norm": 1.8204970359802246, + "learning_rate": 9.555178392285647e-06, + "loss": 0.2735, + "num_input_tokens_seen": 9233016, + "step": 14135 + }, + { + "epoch": 7.410901467505241, + "grad_norm": 1.6912283897399902, + "learning_rate": 9.53720236524313e-06, + "loss": 0.2355, + "num_input_tokens_seen": 9236504, + "step": 14140 + }, + { + "epoch": 7.413522012578617, + "grad_norm": 1.3672579526901245, + "learning_rate": 9.519239276763376e-06, + "loss": 0.2312, + "num_input_tokens_seen": 9240344, + "step": 14145 + }, + { + "epoch": 7.416142557651992, + "grad_norm": 2.3656930923461914, + "learning_rate": 9.501289141877056e-06, + "loss": 0.1862, + "num_input_tokens_seen": 9242904, + "step": 14150 + }, + { + "epoch": 7.418763102725367, + "grad_norm": 2.7520811557769775, + "learning_rate": 9.483351975604025e-06, + "loss": 0.3045, + "num_input_tokens_seen": 9246072, + "step": 14155 + }, + { + "epoch": 7.421383647798742, + "grad_norm": 3.6793289184570312, + "learning_rate": 9.465427792953293e-06, + "loss": 0.2217, + "num_input_tokens_seen": 9248696, + "step": 14160 + }, + { + "epoch": 7.424004192872117, + "grad_norm": 3.043703556060791, + "learning_rate": 9.447516608922996e-06, + "loss": 0.3832, + "num_input_tokens_seen": 9251160, + "step": 14165 + }, + { + "epoch": 7.426624737945493, + "grad_norm": 2.061483383178711, + "learning_rate": 9.429618438500381e-06, + "loss": 0.259, + "num_input_tokens_seen": 9254040, + "step": 14170 + }, + { + "epoch": 7.429245283018868, + "grad_norm": 1.4450173377990723, + "learning_rate": 9.411733296661852e-06, + "loss": 0.2374, + "num_input_tokens_seen": 9257816, + "step": 14175 + }, + { + "epoch": 7.431865828092243, + "grad_norm": 2.6914355754852295, + "learning_rate": 9.393861198372836e-06, + "loss": 0.1989, + "num_input_tokens_seen": 9260184, + "step": 14180 + }, + { + "epoch": 7.434486373165618, + "grad_norm": 1.866809606552124, + "learning_rate": 9.376002158587915e-06, + "loss": 0.3214, + "num_input_tokens_seen": 9263032, + "step": 14185 + }, + { + "epoch": 7.437106918238994, + "grad_norm": 1.5914703607559204, + "learning_rate": 9.358156192250717e-06, + "loss": 0.3032, + "num_input_tokens_seen": 9267448, + "step": 14190 + }, + { + "epoch": 7.439727463312369, + "grad_norm": 1.9883204698562622, + "learning_rate": 9.340323314293917e-06, + "loss": 0.1982, + "num_input_tokens_seen": 9270200, + "step": 14195 + }, + { + "epoch": 7.4423480083857445, + "grad_norm": 2.194624900817871, + "learning_rate": 9.322503539639269e-06, + "loss": 0.2303, + "num_input_tokens_seen": 9272856, + "step": 14200 + }, + { + "epoch": 7.44496855345912, + "grad_norm": 1.0585918426513672, + "learning_rate": 9.304696883197542e-06, + "loss": 0.2227, + "num_input_tokens_seen": 9276504, + "step": 14205 + }, + { + "epoch": 7.447589098532495, + "grad_norm": 2.3365957736968994, + "learning_rate": 9.286903359868518e-06, + "loss": 0.2839, + "num_input_tokens_seen": 9279736, + "step": 14210 + }, + { + "epoch": 7.45020964360587, + "grad_norm": 1.9157354831695557, + "learning_rate": 9.269122984541029e-06, + "loss": 0.2323, + "num_input_tokens_seen": 9282680, + "step": 14215 + }, + { + "epoch": 7.452830188679245, + "grad_norm": 2.9500672817230225, + "learning_rate": 9.251355772092867e-06, + "loss": 0.1877, + "num_input_tokens_seen": 9285912, + "step": 14220 + }, + { + "epoch": 7.45545073375262, + "grad_norm": 1.5911558866500854, + "learning_rate": 9.233601737390826e-06, + "loss": 0.1652, + "num_input_tokens_seen": 9289176, + "step": 14225 + }, + { + "epoch": 7.4580712788259955, + "grad_norm": 2.6496734619140625, + "learning_rate": 9.215860895290662e-06, + "loss": 0.207, + "num_input_tokens_seen": 9291736, + "step": 14230 + }, + { + "epoch": 7.460691823899371, + "grad_norm": 1.5802274942398071, + "learning_rate": 9.198133260637121e-06, + "loss": 0.2096, + "num_input_tokens_seen": 9296312, + "step": 14235 + }, + { + "epoch": 7.463312368972747, + "grad_norm": 2.7461063861846924, + "learning_rate": 9.180418848263866e-06, + "loss": 0.2323, + "num_input_tokens_seen": 9298904, + "step": 14240 + }, + { + "epoch": 7.465932914046122, + "grad_norm": 2.435826301574707, + "learning_rate": 9.162717672993499e-06, + "loss": 0.2103, + "num_input_tokens_seen": 9301432, + "step": 14245 + }, + { + "epoch": 7.468553459119497, + "grad_norm": 2.532421827316284, + "learning_rate": 9.145029749637576e-06, + "loss": 0.2505, + "num_input_tokens_seen": 9304376, + "step": 14250 + }, + { + "epoch": 7.471174004192872, + "grad_norm": 2.4649946689605713, + "learning_rate": 9.127355092996532e-06, + "loss": 0.2566, + "num_input_tokens_seen": 9307864, + "step": 14255 + }, + { + "epoch": 7.473794549266247, + "grad_norm": 2.273007869720459, + "learning_rate": 9.10969371785971e-06, + "loss": 0.277, + "num_input_tokens_seen": 9311032, + "step": 14260 + }, + { + "epoch": 7.476415094339623, + "grad_norm": 1.8767508268356323, + "learning_rate": 9.092045639005347e-06, + "loss": 0.2058, + "num_input_tokens_seen": 9314264, + "step": 14265 + }, + { + "epoch": 7.479035639412998, + "grad_norm": 1.9367843866348267, + "learning_rate": 9.07441087120054e-06, + "loss": 0.215, + "num_input_tokens_seen": 9317272, + "step": 14270 + }, + { + "epoch": 7.481656184486373, + "grad_norm": 3.140422821044922, + "learning_rate": 9.05678942920127e-06, + "loss": 0.24, + "num_input_tokens_seen": 9320504, + "step": 14275 + }, + { + "epoch": 7.484276729559748, + "grad_norm": 2.142961025238037, + "learning_rate": 9.03918132775235e-06, + "loss": 0.212, + "num_input_tokens_seen": 9324792, + "step": 14280 + }, + { + "epoch": 7.486897274633124, + "grad_norm": 1.209787368774414, + "learning_rate": 9.021586581587425e-06, + "loss": 0.1722, + "num_input_tokens_seen": 9327544, + "step": 14285 + }, + { + "epoch": 7.489517819706499, + "grad_norm": 2.453937530517578, + "learning_rate": 9.004005205428992e-06, + "loss": 0.2514, + "num_input_tokens_seen": 9330488, + "step": 14290 + }, + { + "epoch": 7.4921383647798745, + "grad_norm": 2.275536060333252, + "learning_rate": 8.986437213988336e-06, + "loss": 0.3044, + "num_input_tokens_seen": 9335288, + "step": 14295 + }, + { + "epoch": 7.49475890985325, + "grad_norm": 1.2258293628692627, + "learning_rate": 8.968882621965542e-06, + "loss": 0.1906, + "num_input_tokens_seen": 9339384, + "step": 14300 + }, + { + "epoch": 7.497379454926625, + "grad_norm": 1.7657229900360107, + "learning_rate": 8.951341444049513e-06, + "loss": 0.2725, + "num_input_tokens_seen": 9342872, + "step": 14305 + }, + { + "epoch": 7.5, + "grad_norm": 1.919360876083374, + "learning_rate": 8.933813694917873e-06, + "loss": 0.1868, + "num_input_tokens_seen": 9345976, + "step": 14310 + }, + { + "epoch": 7.5, + "eval_loss": 0.6236981153488159, + "eval_runtime": 15.9963, + "eval_samples_per_second": 53.012, + "eval_steps_per_second": 13.253, + "num_input_tokens_seen": 9345976, + "step": 14310 + }, + { + "epoch": 7.502620545073375, + "grad_norm": 1.662032961845398, + "learning_rate": 8.916299389237067e-06, + "loss": 0.2104, + "num_input_tokens_seen": 9349208, + "step": 14315 + }, + { + "epoch": 7.50524109014675, + "grad_norm": 2.235090970993042, + "learning_rate": 8.898798541662259e-06, + "loss": 0.2036, + "num_input_tokens_seen": 9352184, + "step": 14320 + }, + { + "epoch": 7.5078616352201255, + "grad_norm": 1.76005220413208, + "learning_rate": 8.88131116683735e-06, + "loss": 0.2402, + "num_input_tokens_seen": 9356216, + "step": 14325 + }, + { + "epoch": 7.510482180293501, + "grad_norm": 2.1392526626586914, + "learning_rate": 8.863837279394993e-06, + "loss": 0.2308, + "num_input_tokens_seen": 9359544, + "step": 14330 + }, + { + "epoch": 7.513102725366876, + "grad_norm": 1.810613751411438, + "learning_rate": 8.84637689395653e-06, + "loss": 0.2111, + "num_input_tokens_seen": 9362200, + "step": 14335 + }, + { + "epoch": 7.515723270440252, + "grad_norm": 2.161484718322754, + "learning_rate": 8.828930025132006e-06, + "loss": 0.1959, + "num_input_tokens_seen": 9366520, + "step": 14340 + }, + { + "epoch": 7.518343815513627, + "grad_norm": 1.2317489385604858, + "learning_rate": 8.81149668752018e-06, + "loss": 0.2399, + "num_input_tokens_seen": 9372440, + "step": 14345 + }, + { + "epoch": 7.520964360587002, + "grad_norm": 2.2906486988067627, + "learning_rate": 8.794076895708463e-06, + "loss": 0.1733, + "num_input_tokens_seen": 9375064, + "step": 14350 + }, + { + "epoch": 7.523584905660377, + "grad_norm": 2.6109049320220947, + "learning_rate": 8.776670664272946e-06, + "loss": 0.2045, + "num_input_tokens_seen": 9377720, + "step": 14355 + }, + { + "epoch": 7.526205450733753, + "grad_norm": 1.482316017150879, + "learning_rate": 8.759278007778362e-06, + "loss": 0.2217, + "num_input_tokens_seen": 9380984, + "step": 14360 + }, + { + "epoch": 7.528825995807128, + "grad_norm": 2.174329996109009, + "learning_rate": 8.741898940778088e-06, + "loss": 0.2009, + "num_input_tokens_seen": 9383992, + "step": 14365 + }, + { + "epoch": 7.531446540880503, + "grad_norm": 2.471519708633423, + "learning_rate": 8.724533477814148e-06, + "loss": 0.2052, + "num_input_tokens_seen": 9386360, + "step": 14370 + }, + { + "epoch": 7.534067085953878, + "grad_norm": 2.2879574298858643, + "learning_rate": 8.707181633417159e-06, + "loss": 0.3046, + "num_input_tokens_seen": 9389144, + "step": 14375 + }, + { + "epoch": 7.536687631027254, + "grad_norm": 1.3613113164901733, + "learning_rate": 8.689843422106345e-06, + "loss": 0.165, + "num_input_tokens_seen": 9392024, + "step": 14380 + }, + { + "epoch": 7.539308176100629, + "grad_norm": 2.8971643447875977, + "learning_rate": 8.672518858389548e-06, + "loss": 0.2632, + "num_input_tokens_seen": 9394872, + "step": 14385 + }, + { + "epoch": 7.5419287211740045, + "grad_norm": 1.577438235282898, + "learning_rate": 8.655207956763159e-06, + "loss": 0.2847, + "num_input_tokens_seen": 9397304, + "step": 14390 + }, + { + "epoch": 7.54454926624738, + "grad_norm": 2.759469985961914, + "learning_rate": 8.63791073171215e-06, + "loss": 0.2513, + "num_input_tokens_seen": 9400664, + "step": 14395 + }, + { + "epoch": 7.547169811320755, + "grad_norm": 1.5404325723648071, + "learning_rate": 8.620627197710044e-06, + "loss": 0.295, + "num_input_tokens_seen": 9403864, + "step": 14400 + }, + { + "epoch": 7.54979035639413, + "grad_norm": 2.1506919860839844, + "learning_rate": 8.603357369218928e-06, + "loss": 0.2101, + "num_input_tokens_seen": 9407544, + "step": 14405 + }, + { + "epoch": 7.552410901467505, + "grad_norm": 1.0795432329177856, + "learning_rate": 8.586101260689397e-06, + "loss": 0.193, + "num_input_tokens_seen": 9410968, + "step": 14410 + }, + { + "epoch": 7.55503144654088, + "grad_norm": 3.289491653442383, + "learning_rate": 8.568858886560563e-06, + "loss": 0.2375, + "num_input_tokens_seen": 9413400, + "step": 14415 + }, + { + "epoch": 7.5576519916142555, + "grad_norm": 1.818740963935852, + "learning_rate": 8.551630261260079e-06, + "loss": 0.1936, + "num_input_tokens_seen": 9415832, + "step": 14420 + }, + { + "epoch": 7.560272536687631, + "grad_norm": 1.9564696550369263, + "learning_rate": 8.53441539920406e-06, + "loss": 0.3053, + "num_input_tokens_seen": 9419064, + "step": 14425 + }, + { + "epoch": 7.562893081761006, + "grad_norm": 2.136024236679077, + "learning_rate": 8.517214314797108e-06, + "loss": 0.279, + "num_input_tokens_seen": 9421752, + "step": 14430 + }, + { + "epoch": 7.565513626834382, + "grad_norm": 1.8282382488250732, + "learning_rate": 8.500027022432333e-06, + "loss": 0.2373, + "num_input_tokens_seen": 9424984, + "step": 14435 + }, + { + "epoch": 7.568134171907757, + "grad_norm": 1.7186704874038696, + "learning_rate": 8.482853536491239e-06, + "loss": 0.1963, + "num_input_tokens_seen": 9428216, + "step": 14440 + }, + { + "epoch": 7.570754716981132, + "grad_norm": 1.9514596462249756, + "learning_rate": 8.465693871343842e-06, + "loss": 0.2201, + "num_input_tokens_seen": 9431576, + "step": 14445 + }, + { + "epoch": 7.573375262054507, + "grad_norm": 3.769378423690796, + "learning_rate": 8.448548041348552e-06, + "loss": 0.2906, + "num_input_tokens_seen": 9434680, + "step": 14450 + }, + { + "epoch": 7.575995807127883, + "grad_norm": 2.6484298706054688, + "learning_rate": 8.431416060852218e-06, + "loss": 0.1278, + "num_input_tokens_seen": 9437432, + "step": 14455 + }, + { + "epoch": 7.578616352201258, + "grad_norm": 1.2562941312789917, + "learning_rate": 8.414297944190108e-06, + "loss": 0.2002, + "num_input_tokens_seen": 9441208, + "step": 14460 + }, + { + "epoch": 7.581236897274633, + "grad_norm": 2.5358762741088867, + "learning_rate": 8.397193705685873e-06, + "loss": 0.2294, + "num_input_tokens_seen": 9444056, + "step": 14465 + }, + { + "epoch": 7.583857442348008, + "grad_norm": 4.995631694793701, + "learning_rate": 8.380103359651553e-06, + "loss": 0.2905, + "num_input_tokens_seen": 9446968, + "step": 14470 + }, + { + "epoch": 7.586477987421384, + "grad_norm": 1.4915084838867188, + "learning_rate": 8.36302692038759e-06, + "loss": 0.2077, + "num_input_tokens_seen": 9450712, + "step": 14475 + }, + { + "epoch": 7.589098532494759, + "grad_norm": 2.241638660430908, + "learning_rate": 8.345964402182739e-06, + "loss": 0.1801, + "num_input_tokens_seen": 9453240, + "step": 14480 + }, + { + "epoch": 7.5917190775681345, + "grad_norm": 1.4718079566955566, + "learning_rate": 8.328915819314148e-06, + "loss": 0.1811, + "num_input_tokens_seen": 9456728, + "step": 14485 + }, + { + "epoch": 7.59433962264151, + "grad_norm": 2.9626729488372803, + "learning_rate": 8.31188118604731e-06, + "loss": 0.273, + "num_input_tokens_seen": 9459800, + "step": 14490 + }, + { + "epoch": 7.596960167714885, + "grad_norm": 1.6215986013412476, + "learning_rate": 8.294860516636e-06, + "loss": 0.191, + "num_input_tokens_seen": 9463384, + "step": 14495 + }, + { + "epoch": 7.59958071278826, + "grad_norm": 2.6395010948181152, + "learning_rate": 8.277853825322355e-06, + "loss": 0.184, + "num_input_tokens_seen": 9466520, + "step": 14500 + }, + { + "epoch": 7.602201257861635, + "grad_norm": 2.12701678276062, + "learning_rate": 8.260861126336794e-06, + "loss": 0.1733, + "num_input_tokens_seen": 9469944, + "step": 14505 + }, + { + "epoch": 7.60482180293501, + "grad_norm": 2.641530990600586, + "learning_rate": 8.243882433898018e-06, + "loss": 0.1729, + "num_input_tokens_seen": 9472280, + "step": 14510 + }, + { + "epoch": 7.6074423480083855, + "grad_norm": 2.9443931579589844, + "learning_rate": 8.226917762213044e-06, + "loss": 0.2417, + "num_input_tokens_seen": 9474808, + "step": 14515 + }, + { + "epoch": 7.610062893081761, + "grad_norm": 1.5779521465301514, + "learning_rate": 8.209967125477119e-06, + "loss": 0.2617, + "num_input_tokens_seen": 9479064, + "step": 14520 + }, + { + "epoch": 7.612683438155136, + "grad_norm": 1.8209656476974487, + "learning_rate": 8.193030537873761e-06, + "loss": 0.2782, + "num_input_tokens_seen": 9483128, + "step": 14525 + }, + { + "epoch": 7.615303983228512, + "grad_norm": 2.064784288406372, + "learning_rate": 8.176108013574743e-06, + "loss": 0.2026, + "num_input_tokens_seen": 9486136, + "step": 14530 + }, + { + "epoch": 7.617924528301887, + "grad_norm": 2.204210042953491, + "learning_rate": 8.159199566740055e-06, + "loss": 0.4206, + "num_input_tokens_seen": 9488568, + "step": 14535 + }, + { + "epoch": 7.620545073375262, + "grad_norm": 2.0450518131256104, + "learning_rate": 8.142305211517914e-06, + "loss": 0.221, + "num_input_tokens_seen": 9492440, + "step": 14540 + }, + { + "epoch": 7.623165618448637, + "grad_norm": 1.7150040864944458, + "learning_rate": 8.125424962044742e-06, + "loss": 0.1912, + "num_input_tokens_seen": 9495192, + "step": 14545 + }, + { + "epoch": 7.6257861635220126, + "grad_norm": 2.5726852416992188, + "learning_rate": 8.108558832445157e-06, + "loss": 0.26, + "num_input_tokens_seen": 9497688, + "step": 14550 + }, + { + "epoch": 7.628406708595388, + "grad_norm": 1.6831401586532593, + "learning_rate": 8.091706836831974e-06, + "loss": 0.2407, + "num_input_tokens_seen": 9500984, + "step": 14555 + }, + { + "epoch": 7.631027253668763, + "grad_norm": 1.8277997970581055, + "learning_rate": 8.074868989306173e-06, + "loss": 0.2272, + "num_input_tokens_seen": 9504152, + "step": 14560 + }, + { + "epoch": 7.633647798742138, + "grad_norm": 1.6091017723083496, + "learning_rate": 8.058045303956885e-06, + "loss": 0.247, + "num_input_tokens_seen": 9506840, + "step": 14565 + }, + { + "epoch": 7.636268343815514, + "grad_norm": 1.3330003023147583, + "learning_rate": 8.041235794861416e-06, + "loss": 0.2557, + "num_input_tokens_seen": 9509880, + "step": 14570 + }, + { + "epoch": 7.638888888888889, + "grad_norm": 2.556184768676758, + "learning_rate": 8.024440476085188e-06, + "loss": 0.2057, + "num_input_tokens_seen": 9513464, + "step": 14575 + }, + { + "epoch": 7.6415094339622645, + "grad_norm": 3.303401231765747, + "learning_rate": 8.007659361681758e-06, + "loss": 0.2438, + "num_input_tokens_seen": 9516216, + "step": 14580 + }, + { + "epoch": 7.64412997903564, + "grad_norm": 2.162243366241455, + "learning_rate": 7.990892465692787e-06, + "loss": 0.1982, + "num_input_tokens_seen": 9519032, + "step": 14585 + }, + { + "epoch": 7.646750524109015, + "grad_norm": 1.1501082181930542, + "learning_rate": 7.974139802148065e-06, + "loss": 0.1914, + "num_input_tokens_seen": 9523640, + "step": 14590 + }, + { + "epoch": 7.64937106918239, + "grad_norm": 2.1429126262664795, + "learning_rate": 7.957401385065444e-06, + "loss": 0.2356, + "num_input_tokens_seen": 9526616, + "step": 14595 + }, + { + "epoch": 7.651991614255765, + "grad_norm": 1.939954161643982, + "learning_rate": 7.94067722845086e-06, + "loss": 0.2191, + "num_input_tokens_seen": 9529656, + "step": 14600 + }, + { + "epoch": 7.65461215932914, + "grad_norm": 1.3818461894989014, + "learning_rate": 7.923967346298345e-06, + "loss": 0.217, + "num_input_tokens_seen": 9532984, + "step": 14605 + }, + { + "epoch": 7.6572327044025155, + "grad_norm": 2.478694438934326, + "learning_rate": 7.907271752589951e-06, + "loss": 0.221, + "num_input_tokens_seen": 9535928, + "step": 14610 + }, + { + "epoch": 7.659853249475891, + "grad_norm": 1.741974949836731, + "learning_rate": 7.89059046129578e-06, + "loss": 0.2769, + "num_input_tokens_seen": 9539256, + "step": 14615 + }, + { + "epoch": 7.662473794549266, + "grad_norm": 1.6909282207489014, + "learning_rate": 7.873923486374001e-06, + "loss": 0.2671, + "num_input_tokens_seen": 9542296, + "step": 14620 + }, + { + "epoch": 7.665094339622642, + "grad_norm": 1.0316325426101685, + "learning_rate": 7.857270841770745e-06, + "loss": 0.1999, + "num_input_tokens_seen": 9545368, + "step": 14625 + }, + { + "epoch": 7.667714884696017, + "grad_norm": 1.3982822895050049, + "learning_rate": 7.8406325414202e-06, + "loss": 0.2196, + "num_input_tokens_seen": 9548472, + "step": 14630 + }, + { + "epoch": 7.670335429769392, + "grad_norm": 1.272775411605835, + "learning_rate": 7.824008599244553e-06, + "loss": 0.205, + "num_input_tokens_seen": 9552248, + "step": 14635 + }, + { + "epoch": 7.672955974842767, + "grad_norm": 1.7751154899597168, + "learning_rate": 7.807399029153925e-06, + "loss": 0.435, + "num_input_tokens_seen": 9555320, + "step": 14640 + }, + { + "epoch": 7.6755765199161425, + "grad_norm": 2.0464375019073486, + "learning_rate": 7.790803845046474e-06, + "loss": 0.2062, + "num_input_tokens_seen": 9558904, + "step": 14645 + }, + { + "epoch": 7.678197064989518, + "grad_norm": 2.9340274333953857, + "learning_rate": 7.774223060808277e-06, + "loss": 0.2267, + "num_input_tokens_seen": 9561624, + "step": 14650 + }, + { + "epoch": 7.680817610062893, + "grad_norm": 3.335193634033203, + "learning_rate": 7.757656690313375e-06, + "loss": 0.3155, + "num_input_tokens_seen": 9564568, + "step": 14655 + }, + { + "epoch": 7.683438155136268, + "grad_norm": 1.4169280529022217, + "learning_rate": 7.741104747423769e-06, + "loss": 0.2656, + "num_input_tokens_seen": 9567704, + "step": 14660 + }, + { + "epoch": 7.686058700209644, + "grad_norm": 1.8434770107269287, + "learning_rate": 7.724567245989342e-06, + "loss": 0.196, + "num_input_tokens_seen": 9571512, + "step": 14665 + }, + { + "epoch": 7.688679245283019, + "grad_norm": 2.036295175552368, + "learning_rate": 7.708044199847934e-06, + "loss": 0.324, + "num_input_tokens_seen": 9575448, + "step": 14670 + }, + { + "epoch": 7.691299790356394, + "grad_norm": 1.3357584476470947, + "learning_rate": 7.691535622825288e-06, + "loss": 0.1525, + "num_input_tokens_seen": 9583704, + "step": 14675 + }, + { + "epoch": 7.69392033542977, + "grad_norm": 1.9677075147628784, + "learning_rate": 7.675041528735e-06, + "loss": 0.213, + "num_input_tokens_seen": 9586392, + "step": 14680 + }, + { + "epoch": 7.696540880503145, + "grad_norm": 2.8646328449249268, + "learning_rate": 7.658561931378594e-06, + "loss": 0.3283, + "num_input_tokens_seen": 9589336, + "step": 14685 + }, + { + "epoch": 7.69916142557652, + "grad_norm": 1.0995250940322876, + "learning_rate": 7.64209684454544e-06, + "loss": 0.217, + "num_input_tokens_seen": 9593016, + "step": 14690 + }, + { + "epoch": 7.701781970649895, + "grad_norm": 5.220744609832764, + "learning_rate": 7.625646282012763e-06, + "loss": 0.2319, + "num_input_tokens_seen": 9595512, + "step": 14695 + }, + { + "epoch": 7.70440251572327, + "grad_norm": 1.994441032409668, + "learning_rate": 7.6092102575456546e-06, + "loss": 0.2339, + "num_input_tokens_seen": 9598904, + "step": 14700 + }, + { + "epoch": 7.7070230607966455, + "grad_norm": 1.3244322538375854, + "learning_rate": 7.592788784897023e-06, + "loss": 0.1381, + "num_input_tokens_seen": 9602712, + "step": 14705 + }, + { + "epoch": 7.709643605870021, + "grad_norm": 1.3699042797088623, + "learning_rate": 7.576381877807598e-06, + "loss": 0.1964, + "num_input_tokens_seen": 9605560, + "step": 14710 + }, + { + "epoch": 7.712264150943396, + "grad_norm": 1.5619258880615234, + "learning_rate": 7.559989550005947e-06, + "loss": 0.337, + "num_input_tokens_seen": 9609048, + "step": 14715 + }, + { + "epoch": 7.714884696016772, + "grad_norm": 1.7010018825531006, + "learning_rate": 7.543611815208415e-06, + "loss": 0.2438, + "num_input_tokens_seen": 9612824, + "step": 14720 + }, + { + "epoch": 7.717505241090147, + "grad_norm": 1.037163257598877, + "learning_rate": 7.5272486871191375e-06, + "loss": 0.2153, + "num_input_tokens_seen": 9615704, + "step": 14725 + }, + { + "epoch": 7.720125786163522, + "grad_norm": 1.6925773620605469, + "learning_rate": 7.510900179430036e-06, + "loss": 0.2247, + "num_input_tokens_seen": 9618840, + "step": 14730 + }, + { + "epoch": 7.722746331236897, + "grad_norm": 1.1778713464736938, + "learning_rate": 7.494566305820788e-06, + "loss": 0.1776, + "num_input_tokens_seen": 9622552, + "step": 14735 + }, + { + "epoch": 7.7253668763102725, + "grad_norm": 0.9091711044311523, + "learning_rate": 7.478247079958845e-06, + "loss": 0.2762, + "num_input_tokens_seen": 9625720, + "step": 14740 + }, + { + "epoch": 7.727987421383648, + "grad_norm": 2.8053231239318848, + "learning_rate": 7.461942515499384e-06, + "loss": 0.2194, + "num_input_tokens_seen": 9628664, + "step": 14745 + }, + { + "epoch": 7.730607966457023, + "grad_norm": 1.8665549755096436, + "learning_rate": 7.445652626085312e-06, + "loss": 0.2062, + "num_input_tokens_seen": 9632088, + "step": 14750 + }, + { + "epoch": 7.733228511530398, + "grad_norm": 2.5972917079925537, + "learning_rate": 7.429377425347281e-06, + "loss": 0.237, + "num_input_tokens_seen": 9635160, + "step": 14755 + }, + { + "epoch": 7.735849056603773, + "grad_norm": 2.0725152492523193, + "learning_rate": 7.413116926903624e-06, + "loss": 0.2505, + "num_input_tokens_seen": 9638776, + "step": 14760 + }, + { + "epoch": 7.738469601677149, + "grad_norm": 1.99451744556427, + "learning_rate": 7.396871144360387e-06, + "loss": 0.2501, + "num_input_tokens_seen": 9641624, + "step": 14765 + }, + { + "epoch": 7.741090146750524, + "grad_norm": 1.588271975517273, + "learning_rate": 7.380640091311291e-06, + "loss": 0.2775, + "num_input_tokens_seen": 9649176, + "step": 14770 + }, + { + "epoch": 7.7437106918239, + "grad_norm": 4.215014934539795, + "learning_rate": 7.3644237813377535e-06, + "loss": 0.3455, + "num_input_tokens_seen": 9651768, + "step": 14775 + }, + { + "epoch": 7.746331236897275, + "grad_norm": 1.927949070930481, + "learning_rate": 7.348222228008836e-06, + "loss": 0.1533, + "num_input_tokens_seen": 9654552, + "step": 14780 + }, + { + "epoch": 7.74895178197065, + "grad_norm": 1.6269893646240234, + "learning_rate": 7.332035444881247e-06, + "loss": 0.2177, + "num_input_tokens_seen": 9657112, + "step": 14785 + }, + { + "epoch": 7.751572327044025, + "grad_norm": 1.1047288179397583, + "learning_rate": 7.315863445499366e-06, + "loss": 0.2154, + "num_input_tokens_seen": 9661176, + "step": 14790 + }, + { + "epoch": 7.7541928721174, + "grad_norm": 2.0795867443084717, + "learning_rate": 7.299706243395177e-06, + "loss": 0.2334, + "num_input_tokens_seen": 9663480, + "step": 14795 + }, + { + "epoch": 7.756813417190775, + "grad_norm": 1.6864101886749268, + "learning_rate": 7.283563852088277e-06, + "loss": 0.2411, + "num_input_tokens_seen": 9666808, + "step": 14800 + }, + { + "epoch": 7.759433962264151, + "grad_norm": 1.787536859512329, + "learning_rate": 7.267436285085905e-06, + "loss": 0.2949, + "num_input_tokens_seen": 9669336, + "step": 14805 + }, + { + "epoch": 7.762054507337526, + "grad_norm": 1.4159404039382935, + "learning_rate": 7.251323555882844e-06, + "loss": 0.2151, + "num_input_tokens_seen": 9671928, + "step": 14810 + }, + { + "epoch": 7.764675052410902, + "grad_norm": 1.3617085218429565, + "learning_rate": 7.235225677961513e-06, + "loss": 0.3022, + "num_input_tokens_seen": 9676120, + "step": 14815 + }, + { + "epoch": 7.767295597484277, + "grad_norm": 2.4477362632751465, + "learning_rate": 7.219142664791872e-06, + "loss": 0.2204, + "num_input_tokens_seen": 9679032, + "step": 14820 + }, + { + "epoch": 7.769916142557652, + "grad_norm": 1.776781678199768, + "learning_rate": 7.203074529831444e-06, + "loss": 0.2022, + "num_input_tokens_seen": 9682744, + "step": 14825 + }, + { + "epoch": 7.772536687631027, + "grad_norm": 2.630666494369507, + "learning_rate": 7.187021286525328e-06, + "loss": 0.3389, + "num_input_tokens_seen": 9686040, + "step": 14830 + }, + { + "epoch": 7.7751572327044025, + "grad_norm": 2.0952839851379395, + "learning_rate": 7.170982948306135e-06, + "loss": 0.2418, + "num_input_tokens_seen": 9690168, + "step": 14835 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 3.090125799179077, + "learning_rate": 7.154959528594002e-06, + "loss": 0.2301, + "num_input_tokens_seen": 9693496, + "step": 14840 + }, + { + "epoch": 7.780398322851153, + "grad_norm": 5.710425853729248, + "learning_rate": 7.138951040796627e-06, + "loss": 0.2398, + "num_input_tokens_seen": 9696760, + "step": 14845 + }, + { + "epoch": 7.783018867924528, + "grad_norm": 1.2033238410949707, + "learning_rate": 7.122957498309143e-06, + "loss": 0.2981, + "num_input_tokens_seen": 9699672, + "step": 14850 + }, + { + "epoch": 7.785639412997903, + "grad_norm": 1.8604650497436523, + "learning_rate": 7.1069789145142424e-06, + "loss": 0.2073, + "num_input_tokens_seen": 9703384, + "step": 14855 + }, + { + "epoch": 7.788259958071279, + "grad_norm": 1.396855115890503, + "learning_rate": 7.091015302782064e-06, + "loss": 0.287, + "num_input_tokens_seen": 9706328, + "step": 14860 + }, + { + "epoch": 7.790880503144654, + "grad_norm": 2.9554035663604736, + "learning_rate": 7.075066676470224e-06, + "loss": 0.2604, + "num_input_tokens_seen": 9709432, + "step": 14865 + }, + { + "epoch": 7.79350104821803, + "grad_norm": 3.409730911254883, + "learning_rate": 7.0591330489238185e-06, + "loss": 0.3159, + "num_input_tokens_seen": 9712696, + "step": 14870 + }, + { + "epoch": 7.796121593291405, + "grad_norm": 2.140523672103882, + "learning_rate": 7.0432144334753734e-06, + "loss": 0.2425, + "num_input_tokens_seen": 9715704, + "step": 14875 + }, + { + "epoch": 7.79874213836478, + "grad_norm": 2.163120746612549, + "learning_rate": 7.027310843444846e-06, + "loss": 0.4468, + "num_input_tokens_seen": 9720344, + "step": 14880 + }, + { + "epoch": 7.801362683438155, + "grad_norm": 1.3685312271118164, + "learning_rate": 7.011422292139655e-06, + "loss": 0.2611, + "num_input_tokens_seen": 9723544, + "step": 14885 + }, + { + "epoch": 7.80398322851153, + "grad_norm": 1.8228625059127808, + "learning_rate": 6.995548792854606e-06, + "loss": 0.4037, + "num_input_tokens_seen": 9726424, + "step": 14890 + }, + { + "epoch": 7.806603773584905, + "grad_norm": 2.4528262615203857, + "learning_rate": 6.979690358871912e-06, + "loss": 0.1833, + "num_input_tokens_seen": 9729144, + "step": 14895 + }, + { + "epoch": 7.809224318658281, + "grad_norm": 1.0927399396896362, + "learning_rate": 6.963847003461188e-06, + "loss": 0.2117, + "num_input_tokens_seen": 9732504, + "step": 14900 + }, + { + "epoch": 7.811844863731656, + "grad_norm": 3.4283909797668457, + "learning_rate": 6.948018739879439e-06, + "loss": 0.2058, + "num_input_tokens_seen": 9737048, + "step": 14905 + }, + { + "epoch": 7.814465408805032, + "grad_norm": 3.2061166763305664, + "learning_rate": 6.932205581371026e-06, + "loss": 0.2996, + "num_input_tokens_seen": 9739960, + "step": 14910 + }, + { + "epoch": 7.817085953878407, + "grad_norm": 1.5212992429733276, + "learning_rate": 6.91640754116768e-06, + "loss": 0.1601, + "num_input_tokens_seen": 9743288, + "step": 14915 + }, + { + "epoch": 7.819706498951782, + "grad_norm": 2.5228216648101807, + "learning_rate": 6.9006246324884695e-06, + "loss": 0.3448, + "num_input_tokens_seen": 9746072, + "step": 14920 + }, + { + "epoch": 7.822327044025157, + "grad_norm": 2.1740825176239014, + "learning_rate": 6.88485686853983e-06, + "loss": 0.301, + "num_input_tokens_seen": 9748920, + "step": 14925 + }, + { + "epoch": 7.8249475890985325, + "grad_norm": 1.422896146774292, + "learning_rate": 6.869104262515494e-06, + "loss": 0.1816, + "num_input_tokens_seen": 9751864, + "step": 14930 + }, + { + "epoch": 7.827568134171908, + "grad_norm": 2.6843278408050537, + "learning_rate": 6.8533668275965276e-06, + "loss": 0.2776, + "num_input_tokens_seen": 9755128, + "step": 14935 + }, + { + "epoch": 7.830188679245283, + "grad_norm": 2.308321714401245, + "learning_rate": 6.837644576951283e-06, + "loss": 0.1687, + "num_input_tokens_seen": 9757976, + "step": 14940 + }, + { + "epoch": 7.832809224318658, + "grad_norm": 1.2789236307144165, + "learning_rate": 6.8219375237354445e-06, + "loss": 0.193, + "num_input_tokens_seen": 9761464, + "step": 14945 + }, + { + "epoch": 7.835429769392033, + "grad_norm": 2.0611624717712402, + "learning_rate": 6.806245681091944e-06, + "loss": 0.2385, + "num_input_tokens_seen": 9768408, + "step": 14950 + }, + { + "epoch": 7.838050314465409, + "grad_norm": 1.5301892757415771, + "learning_rate": 6.790569062150992e-06, + "loss": 0.2587, + "num_input_tokens_seen": 9771832, + "step": 14955 + }, + { + "epoch": 7.840670859538784, + "grad_norm": 2.5941874980926514, + "learning_rate": 6.774907680030085e-06, + "loss": 0.2772, + "num_input_tokens_seen": 9775736, + "step": 14960 + }, + { + "epoch": 7.84329140461216, + "grad_norm": 1.375044345855713, + "learning_rate": 6.75926154783394e-06, + "loss": 0.2873, + "num_input_tokens_seen": 9779096, + "step": 14965 + }, + { + "epoch": 7.845911949685535, + "grad_norm": 3.071828603744507, + "learning_rate": 6.743630678654519e-06, + "loss": 0.4185, + "num_input_tokens_seen": 9782264, + "step": 14970 + }, + { + "epoch": 7.84853249475891, + "grad_norm": 1.8443059921264648, + "learning_rate": 6.728015085571049e-06, + "loss": 0.1967, + "num_input_tokens_seen": 9785368, + "step": 14975 + }, + { + "epoch": 7.851153039832285, + "grad_norm": 1.5369137525558472, + "learning_rate": 6.712414781649906e-06, + "loss": 0.1906, + "num_input_tokens_seen": 9789304, + "step": 14980 + }, + { + "epoch": 7.85377358490566, + "grad_norm": 2.1378297805786133, + "learning_rate": 6.69682977994473e-06, + "loss": 0.3205, + "num_input_tokens_seen": 9792184, + "step": 14985 + }, + { + "epoch": 7.856394129979035, + "grad_norm": 1.7859604358673096, + "learning_rate": 6.681260093496355e-06, + "loss": 0.2888, + "num_input_tokens_seen": 9795928, + "step": 14990 + }, + { + "epoch": 7.859014675052411, + "grad_norm": 1.057578444480896, + "learning_rate": 6.665705735332753e-06, + "loss": 0.2189, + "num_input_tokens_seen": 9798776, + "step": 14995 + }, + { + "epoch": 7.861635220125786, + "grad_norm": 1.8541970252990723, + "learning_rate": 6.65016671846912e-06, + "loss": 0.1909, + "num_input_tokens_seen": 9802008, + "step": 15000 + }, + { + "epoch": 7.864255765199162, + "grad_norm": 3.9026448726654053, + "learning_rate": 6.634643055907791e-06, + "loss": 0.3259, + "num_input_tokens_seen": 9805912, + "step": 15005 + }, + { + "epoch": 7.866876310272537, + "grad_norm": 2.0119364261627197, + "learning_rate": 6.619134760638248e-06, + "loss": 0.2417, + "num_input_tokens_seen": 9808664, + "step": 15010 + }, + { + "epoch": 7.869496855345912, + "grad_norm": 1.1766445636749268, + "learning_rate": 6.6036418456371516e-06, + "loss": 0.2017, + "num_input_tokens_seen": 9811256, + "step": 15015 + }, + { + "epoch": 7.872117400419287, + "grad_norm": 1.5119521617889404, + "learning_rate": 6.588164323868229e-06, + "loss": 0.2747, + "num_input_tokens_seen": 9816440, + "step": 15020 + }, + { + "epoch": 7.8747379454926625, + "grad_norm": 2.176548480987549, + "learning_rate": 6.572702208282381e-06, + "loss": 0.2263, + "num_input_tokens_seen": 9818776, + "step": 15025 + }, + { + "epoch": 7.877358490566038, + "grad_norm": 1.932707667350769, + "learning_rate": 6.557255511817617e-06, + "loss": 0.2462, + "num_input_tokens_seen": 9822520, + "step": 15030 + }, + { + "epoch": 7.879979035639413, + "grad_norm": 1.9028520584106445, + "learning_rate": 6.5418242473989925e-06, + "loss": 0.1578, + "num_input_tokens_seen": 9825496, + "step": 15035 + }, + { + "epoch": 7.882599580712788, + "grad_norm": 1.4662197828292847, + "learning_rate": 6.52640842793871e-06, + "loss": 0.3287, + "num_input_tokens_seen": 9829080, + "step": 15040 + }, + { + "epoch": 7.885220125786163, + "grad_norm": 2.1470322608947754, + "learning_rate": 6.5110080663360165e-06, + "loss": 0.2281, + "num_input_tokens_seen": 9832056, + "step": 15045 + }, + { + "epoch": 7.887840670859539, + "grad_norm": 2.7878754138946533, + "learning_rate": 6.495623175477223e-06, + "loss": 0.2269, + "num_input_tokens_seen": 9834680, + "step": 15050 + }, + { + "epoch": 7.890461215932914, + "grad_norm": 1.3855258226394653, + "learning_rate": 6.480253768235714e-06, + "loss": 0.3025, + "num_input_tokens_seen": 9838008, + "step": 15055 + }, + { + "epoch": 7.8930817610062896, + "grad_norm": 3.216792106628418, + "learning_rate": 6.464899857471907e-06, + "loss": 0.2872, + "num_input_tokens_seen": 9841528, + "step": 15060 + }, + { + "epoch": 7.895702306079665, + "grad_norm": 3.6438422203063965, + "learning_rate": 6.44956145603324e-06, + "loss": 0.3555, + "num_input_tokens_seen": 9844984, + "step": 15065 + }, + { + "epoch": 7.89832285115304, + "grad_norm": 1.421022891998291, + "learning_rate": 6.4342385767542036e-06, + "loss": 0.2094, + "num_input_tokens_seen": 9848568, + "step": 15070 + }, + { + "epoch": 7.900943396226415, + "grad_norm": 1.3058531284332275, + "learning_rate": 6.418931232456279e-06, + "loss": 0.1368, + "num_input_tokens_seen": 9851640, + "step": 15075 + }, + { + "epoch": 7.90356394129979, + "grad_norm": 1.7572451829910278, + "learning_rate": 6.403639435947948e-06, + "loss": 0.1862, + "num_input_tokens_seen": 9855192, + "step": 15080 + }, + { + "epoch": 7.906184486373165, + "grad_norm": 1.45749831199646, + "learning_rate": 6.38836320002468e-06, + "loss": 0.2579, + "num_input_tokens_seen": 9858072, + "step": 15085 + }, + { + "epoch": 7.908805031446541, + "grad_norm": 1.6602416038513184, + "learning_rate": 6.37310253746895e-06, + "loss": 0.2002, + "num_input_tokens_seen": 9861784, + "step": 15090 + }, + { + "epoch": 7.911425576519916, + "grad_norm": 2.432267189025879, + "learning_rate": 6.357857461050176e-06, + "loss": 0.2991, + "num_input_tokens_seen": 9866104, + "step": 15095 + }, + { + "epoch": 7.914046121593291, + "grad_norm": 4.1623992919921875, + "learning_rate": 6.342627983524737e-06, + "loss": 0.3061, + "num_input_tokens_seen": 9869816, + "step": 15100 + }, + { + "epoch": 7.916666666666667, + "grad_norm": 1.1080108880996704, + "learning_rate": 6.32741411763596e-06, + "loss": 0.1689, + "num_input_tokens_seen": 9872504, + "step": 15105 + }, + { + "epoch": 7.919287211740042, + "grad_norm": 0.776348888874054, + "learning_rate": 6.312215876114127e-06, + "loss": 0.2406, + "num_input_tokens_seen": 9875320, + "step": 15110 + }, + { + "epoch": 7.921907756813417, + "grad_norm": 1.9512699842453003, + "learning_rate": 6.297033271676425e-06, + "loss": 0.2129, + "num_input_tokens_seen": 9878136, + "step": 15115 + }, + { + "epoch": 7.9245283018867925, + "grad_norm": 2.290444850921631, + "learning_rate": 6.281866317026966e-06, + "loss": 0.2154, + "num_input_tokens_seen": 9880792, + "step": 15120 + }, + { + "epoch": 7.927148846960168, + "grad_norm": 1.7342568635940552, + "learning_rate": 6.2667150248567534e-06, + "loss": 0.23, + "num_input_tokens_seen": 9884312, + "step": 15125 + }, + { + "epoch": 7.929769392033543, + "grad_norm": 1.6773602962493896, + "learning_rate": 6.251579407843713e-06, + "loss": 0.2462, + "num_input_tokens_seen": 9887416, + "step": 15130 + }, + { + "epoch": 7.932389937106918, + "grad_norm": 1.668723702430725, + "learning_rate": 6.236459478652629e-06, + "loss": 0.1436, + "num_input_tokens_seen": 9890200, + "step": 15135 + }, + { + "epoch": 7.935010482180293, + "grad_norm": 2.788862705230713, + "learning_rate": 6.221355249935165e-06, + "loss": 0.2224, + "num_input_tokens_seen": 9893368, + "step": 15140 + }, + { + "epoch": 7.937631027253669, + "grad_norm": 3.1393611431121826, + "learning_rate": 6.20626673432986e-06, + "loss": 0.2732, + "num_input_tokens_seen": 9899224, + "step": 15145 + }, + { + "epoch": 7.940251572327044, + "grad_norm": 2.623046398162842, + "learning_rate": 6.191193944462087e-06, + "loss": 0.2634, + "num_input_tokens_seen": 9901944, + "step": 15150 + }, + { + "epoch": 7.9428721174004195, + "grad_norm": 1.4107537269592285, + "learning_rate": 6.176136892944062e-06, + "loss": 0.3293, + "num_input_tokens_seen": 9905720, + "step": 15155 + }, + { + "epoch": 7.945492662473795, + "grad_norm": 1.9240643978118896, + "learning_rate": 6.161095592374863e-06, + "loss": 0.3346, + "num_input_tokens_seen": 9908440, + "step": 15160 + }, + { + "epoch": 7.94811320754717, + "grad_norm": 1.8707492351531982, + "learning_rate": 6.1460700553403275e-06, + "loss": 0.2795, + "num_input_tokens_seen": 9911256, + "step": 15165 + }, + { + "epoch": 7.950733752620545, + "grad_norm": 2.0611135959625244, + "learning_rate": 6.1310602944131655e-06, + "loss": 0.2526, + "num_input_tokens_seen": 9913336, + "step": 15170 + }, + { + "epoch": 7.95335429769392, + "grad_norm": 0.9019869565963745, + "learning_rate": 6.11606632215285e-06, + "loss": 0.1807, + "num_input_tokens_seen": 9917048, + "step": 15175 + }, + { + "epoch": 7.955974842767295, + "grad_norm": 3.0048203468322754, + "learning_rate": 6.101088151105647e-06, + "loss": 0.1993, + "num_input_tokens_seen": 9920952, + "step": 15180 + }, + { + "epoch": 7.9585953878406706, + "grad_norm": 2.2076737880706787, + "learning_rate": 6.086125793804618e-06, + "loss": 0.1734, + "num_input_tokens_seen": 9924376, + "step": 15185 + }, + { + "epoch": 7.961215932914046, + "grad_norm": 1.3324902057647705, + "learning_rate": 6.071179262769572e-06, + "loss": 0.261, + "num_input_tokens_seen": 9927160, + "step": 15190 + }, + { + "epoch": 7.963836477987421, + "grad_norm": 2.2089483737945557, + "learning_rate": 6.056248570507078e-06, + "loss": 0.2635, + "num_input_tokens_seen": 9930840, + "step": 15195 + }, + { + "epoch": 7.966457023060797, + "grad_norm": 2.1402268409729004, + "learning_rate": 6.041333729510479e-06, + "loss": 0.3444, + "num_input_tokens_seen": 9934104, + "step": 15200 + }, + { + "epoch": 7.969077568134172, + "grad_norm": 2.517821788787842, + "learning_rate": 6.026434752259802e-06, + "loss": 0.2049, + "num_input_tokens_seen": 9937016, + "step": 15205 + }, + { + "epoch": 7.971698113207547, + "grad_norm": 3.62532114982605, + "learning_rate": 6.011551651221856e-06, + "loss": 0.2608, + "num_input_tokens_seen": 9939736, + "step": 15210 + }, + { + "epoch": 7.9743186582809225, + "grad_norm": 1.795918583869934, + "learning_rate": 5.996684438850131e-06, + "loss": 0.2483, + "num_input_tokens_seen": 9942744, + "step": 15215 + }, + { + "epoch": 7.976939203354298, + "grad_norm": 5.22670841217041, + "learning_rate": 5.981833127584824e-06, + "loss": 0.3015, + "num_input_tokens_seen": 9946776, + "step": 15220 + }, + { + "epoch": 7.979559748427673, + "grad_norm": 2.885375738143921, + "learning_rate": 5.966997729852844e-06, + "loss": 0.2191, + "num_input_tokens_seen": 9949304, + "step": 15225 + }, + { + "epoch": 7.982180293501048, + "grad_norm": 1.9987646341323853, + "learning_rate": 5.952178258067775e-06, + "loss": 0.2217, + "num_input_tokens_seen": 9953112, + "step": 15230 + }, + { + "epoch": 7.984800838574423, + "grad_norm": 1.8202807903289795, + "learning_rate": 5.93737472462986e-06, + "loss": 0.2775, + "num_input_tokens_seen": 9956056, + "step": 15235 + }, + { + "epoch": 7.987421383647799, + "grad_norm": 3.694580554962158, + "learning_rate": 5.92258714192604e-06, + "loss": 0.2252, + "num_input_tokens_seen": 9959416, + "step": 15240 + }, + { + "epoch": 7.990041928721174, + "grad_norm": 1.6233999729156494, + "learning_rate": 5.907815522329877e-06, + "loss": 0.1856, + "num_input_tokens_seen": 9962680, + "step": 15245 + }, + { + "epoch": 7.9926624737945495, + "grad_norm": 1.4521355628967285, + "learning_rate": 5.893059878201587e-06, + "loss": 0.3674, + "num_input_tokens_seen": 9966616, + "step": 15250 + }, + { + "epoch": 7.995283018867925, + "grad_norm": 1.5159826278686523, + "learning_rate": 5.878320221888015e-06, + "loss": 0.2512, + "num_input_tokens_seen": 9971864, + "step": 15255 + }, + { + "epoch": 7.9979035639413, + "grad_norm": 2.362492799758911, + "learning_rate": 5.8635965657226455e-06, + "loss": 0.5188, + "num_input_tokens_seen": 9975864, + "step": 15260 + }, + { + "epoch": 8.0, + "eval_loss": 0.6233822107315063, + "eval_runtime": 15.973, + "eval_samples_per_second": 53.089, + "eval_steps_per_second": 13.272, + "num_input_tokens_seen": 9977520, + "step": 15264 + }, + { + "epoch": 8.000524109014675, + "grad_norm": 2.122880458831787, + "learning_rate": 5.848888922025553e-06, + "loss": 0.2368, + "num_input_tokens_seen": 9978032, + "step": 15265 + }, + { + "epoch": 8.00314465408805, + "grad_norm": 1.9967584609985352, + "learning_rate": 5.834197303103414e-06, + "loss": 0.3312, + "num_input_tokens_seen": 9981616, + "step": 15270 + }, + { + "epoch": 8.005765199161425, + "grad_norm": 1.4154040813446045, + "learning_rate": 5.819521721249524e-06, + "loss": 0.1737, + "num_input_tokens_seen": 9984624, + "step": 15275 + }, + { + "epoch": 8.0083857442348, + "grad_norm": 1.337449312210083, + "learning_rate": 5.804862188743726e-06, + "loss": 0.2198, + "num_input_tokens_seen": 9987568, + "step": 15280 + }, + { + "epoch": 8.011006289308176, + "grad_norm": 2.104844808578491, + "learning_rate": 5.79021871785245e-06, + "loss": 0.1982, + "num_input_tokens_seen": 9991280, + "step": 15285 + }, + { + "epoch": 8.01362683438155, + "grad_norm": 1.678449273109436, + "learning_rate": 5.775591320828683e-06, + "loss": 0.2094, + "num_input_tokens_seen": 9994288, + "step": 15290 + }, + { + "epoch": 8.016247379454926, + "grad_norm": 1.9129987955093384, + "learning_rate": 5.7609800099119565e-06, + "loss": 0.1872, + "num_input_tokens_seen": 9997200, + "step": 15295 + }, + { + "epoch": 8.018867924528301, + "grad_norm": 3.493428945541382, + "learning_rate": 5.746384797328361e-06, + "loss": 0.2719, + "num_input_tokens_seen": 10000176, + "step": 15300 + }, + { + "epoch": 8.021488469601676, + "grad_norm": 1.4801591634750366, + "learning_rate": 5.731805695290498e-06, + "loss": 0.2171, + "num_input_tokens_seen": 10003504, + "step": 15305 + }, + { + "epoch": 8.024109014675052, + "grad_norm": 1.8154691457748413, + "learning_rate": 5.7172427159974865e-06, + "loss": 0.2196, + "num_input_tokens_seen": 10008688, + "step": 15310 + }, + { + "epoch": 8.026729559748428, + "grad_norm": 2.0892887115478516, + "learning_rate": 5.702695871634975e-06, + "loss": 0.2851, + "num_input_tokens_seen": 10011536, + "step": 15315 + }, + { + "epoch": 8.029350104821804, + "grad_norm": 2.141425609588623, + "learning_rate": 5.688165174375093e-06, + "loss": 0.2049, + "num_input_tokens_seen": 10014096, + "step": 15320 + }, + { + "epoch": 8.031970649895179, + "grad_norm": 1.0506095886230469, + "learning_rate": 5.673650636376457e-06, + "loss": 0.2428, + "num_input_tokens_seen": 10017360, + "step": 15325 + }, + { + "epoch": 8.034591194968554, + "grad_norm": 1.2900941371917725, + "learning_rate": 5.659152269784188e-06, + "loss": 0.2748, + "num_input_tokens_seen": 10020560, + "step": 15330 + }, + { + "epoch": 8.03721174004193, + "grad_norm": 2.5428707599639893, + "learning_rate": 5.644670086729834e-06, + "loss": 0.2312, + "num_input_tokens_seen": 10024176, + "step": 15335 + }, + { + "epoch": 8.039832285115304, + "grad_norm": 1.131831407546997, + "learning_rate": 5.630204099331432e-06, + "loss": 0.0934, + "num_input_tokens_seen": 10028336, + "step": 15340 + }, + { + "epoch": 8.04245283018868, + "grad_norm": 2.1905667781829834, + "learning_rate": 5.615754319693481e-06, + "loss": 0.2283, + "num_input_tokens_seen": 10031376, + "step": 15345 + }, + { + "epoch": 8.045073375262055, + "grad_norm": 2.2882800102233887, + "learning_rate": 5.601320759906861e-06, + "loss": 0.1815, + "num_input_tokens_seen": 10033904, + "step": 15350 + }, + { + "epoch": 8.04769392033543, + "grad_norm": 1.2991883754730225, + "learning_rate": 5.586903432048943e-06, + "loss": 0.194, + "num_input_tokens_seen": 10036720, + "step": 15355 + }, + { + "epoch": 8.050314465408805, + "grad_norm": 2.32063889503479, + "learning_rate": 5.572502348183475e-06, + "loss": 0.2114, + "num_input_tokens_seen": 10040016, + "step": 15360 + }, + { + "epoch": 8.05293501048218, + "grad_norm": 1.9875839948654175, + "learning_rate": 5.558117520360623e-06, + "loss": 0.2716, + "num_input_tokens_seen": 10043088, + "step": 15365 + }, + { + "epoch": 8.055555555555555, + "grad_norm": 2.3257603645324707, + "learning_rate": 5.543748960616971e-06, + "loss": 0.2614, + "num_input_tokens_seen": 10046160, + "step": 15370 + }, + { + "epoch": 8.05817610062893, + "grad_norm": 2.1124746799468994, + "learning_rate": 5.529396680975457e-06, + "loss": 0.2037, + "num_input_tokens_seen": 10048592, + "step": 15375 + }, + { + "epoch": 8.060796645702306, + "grad_norm": 1.1249449253082275, + "learning_rate": 5.515060693445418e-06, + "loss": 0.1815, + "num_input_tokens_seen": 10051536, + "step": 15380 + }, + { + "epoch": 8.06341719077568, + "grad_norm": 1.3184138536453247, + "learning_rate": 5.500741010022564e-06, + "loss": 0.2819, + "num_input_tokens_seen": 10055632, + "step": 15385 + }, + { + "epoch": 8.066037735849056, + "grad_norm": 1.5923373699188232, + "learning_rate": 5.48643764268893e-06, + "loss": 0.2421, + "num_input_tokens_seen": 10060528, + "step": 15390 + }, + { + "epoch": 8.068658280922431, + "grad_norm": 2.41573429107666, + "learning_rate": 5.472150603412937e-06, + "loss": 0.1261, + "num_input_tokens_seen": 10063664, + "step": 15395 + }, + { + "epoch": 8.071278825995806, + "grad_norm": 3.042710781097412, + "learning_rate": 5.457879904149327e-06, + "loss": 0.2349, + "num_input_tokens_seen": 10066288, + "step": 15400 + }, + { + "epoch": 8.073899371069182, + "grad_norm": 2.9256694316864014, + "learning_rate": 5.4436255568391545e-06, + "loss": 0.2417, + "num_input_tokens_seen": 10070064, + "step": 15405 + }, + { + "epoch": 8.076519916142558, + "grad_norm": 2.1134703159332275, + "learning_rate": 5.429387573409825e-06, + "loss": 0.1679, + "num_input_tokens_seen": 10074128, + "step": 15410 + }, + { + "epoch": 8.079140461215934, + "grad_norm": 2.1702685356140137, + "learning_rate": 5.415165965775024e-06, + "loss": 0.1497, + "num_input_tokens_seen": 10077936, + "step": 15415 + }, + { + "epoch": 8.081761006289309, + "grad_norm": 1.6153109073638916, + "learning_rate": 5.400960745834735e-06, + "loss": 0.179, + "num_input_tokens_seen": 10080976, + "step": 15420 + }, + { + "epoch": 8.084381551362684, + "grad_norm": 1.890679121017456, + "learning_rate": 5.386771925475256e-06, + "loss": 0.2218, + "num_input_tokens_seen": 10083984, + "step": 15425 + }, + { + "epoch": 8.08700209643606, + "grad_norm": 1.8666423559188843, + "learning_rate": 5.3725995165691294e-06, + "loss": 0.2336, + "num_input_tokens_seen": 10086800, + "step": 15430 + }, + { + "epoch": 8.089622641509434, + "grad_norm": 1.4425777196884155, + "learning_rate": 5.358443530975188e-06, + "loss": 0.1726, + "num_input_tokens_seen": 10090544, + "step": 15435 + }, + { + "epoch": 8.09224318658281, + "grad_norm": 2.9380953311920166, + "learning_rate": 5.344303980538498e-06, + "loss": 0.1747, + "num_input_tokens_seen": 10093360, + "step": 15440 + }, + { + "epoch": 8.094863731656185, + "grad_norm": 2.2969534397125244, + "learning_rate": 5.33018087709041e-06, + "loss": 0.3009, + "num_input_tokens_seen": 10097520, + "step": 15445 + }, + { + "epoch": 8.09748427672956, + "grad_norm": 1.2050544023513794, + "learning_rate": 5.316074232448484e-06, + "loss": 0.1581, + "num_input_tokens_seen": 10100240, + "step": 15450 + }, + { + "epoch": 8.100104821802935, + "grad_norm": 1.8964056968688965, + "learning_rate": 5.301984058416506e-06, + "loss": 0.2171, + "num_input_tokens_seen": 10103280, + "step": 15455 + }, + { + "epoch": 8.10272536687631, + "grad_norm": 1.8219149112701416, + "learning_rate": 5.2879103667845045e-06, + "loss": 0.1785, + "num_input_tokens_seen": 10106288, + "step": 15460 + }, + { + "epoch": 8.105345911949685, + "grad_norm": 1.975645661354065, + "learning_rate": 5.2738531693286965e-06, + "loss": 0.1569, + "num_input_tokens_seen": 10110064, + "step": 15465 + }, + { + "epoch": 8.10796645702306, + "grad_norm": 1.1730492115020752, + "learning_rate": 5.2598124778115044e-06, + "loss": 0.1604, + "num_input_tokens_seen": 10113360, + "step": 15470 + }, + { + "epoch": 8.110587002096436, + "grad_norm": 2.097931385040283, + "learning_rate": 5.245788303981533e-06, + "loss": 0.2441, + "num_input_tokens_seen": 10119760, + "step": 15475 + }, + { + "epoch": 8.11320754716981, + "grad_norm": 1.3218662738800049, + "learning_rate": 5.231780659573565e-06, + "loss": 0.126, + "num_input_tokens_seen": 10126864, + "step": 15480 + }, + { + "epoch": 8.115828092243186, + "grad_norm": 2.043421745300293, + "learning_rate": 5.2177895563085725e-06, + "loss": 0.1486, + "num_input_tokens_seen": 10129776, + "step": 15485 + }, + { + "epoch": 8.118448637316561, + "grad_norm": 2.0959644317626953, + "learning_rate": 5.203815005893664e-06, + "loss": 0.1876, + "num_input_tokens_seen": 10132816, + "step": 15490 + }, + { + "epoch": 8.121069182389936, + "grad_norm": 2.0985076427459717, + "learning_rate": 5.189857020022099e-06, + "loss": 0.1442, + "num_input_tokens_seen": 10136112, + "step": 15495 + }, + { + "epoch": 8.123689727463312, + "grad_norm": 1.5257999897003174, + "learning_rate": 5.1759156103732946e-06, + "loss": 0.2106, + "num_input_tokens_seen": 10138896, + "step": 15500 + }, + { + "epoch": 8.126310272536688, + "grad_norm": 2.4403913021087646, + "learning_rate": 5.161990788612781e-06, + "loss": 0.2707, + "num_input_tokens_seen": 10141456, + "step": 15505 + }, + { + "epoch": 8.128930817610064, + "grad_norm": 1.9802451133728027, + "learning_rate": 5.148082566392204e-06, + "loss": 0.2718, + "num_input_tokens_seen": 10144944, + "step": 15510 + }, + { + "epoch": 8.131551362683439, + "grad_norm": 2.097147226333618, + "learning_rate": 5.13419095534935e-06, + "loss": 0.1891, + "num_input_tokens_seen": 10148016, + "step": 15515 + }, + { + "epoch": 8.134171907756814, + "grad_norm": 2.0128681659698486, + "learning_rate": 5.120315967108055e-06, + "loss": 0.2544, + "num_input_tokens_seen": 10152048, + "step": 15520 + }, + { + "epoch": 8.13679245283019, + "grad_norm": 1.4809749126434326, + "learning_rate": 5.106457613278298e-06, + "loss": 0.2136, + "num_input_tokens_seen": 10155600, + "step": 15525 + }, + { + "epoch": 8.139412997903564, + "grad_norm": 2.5468695163726807, + "learning_rate": 5.092615905456111e-06, + "loss": 0.2346, + "num_input_tokens_seen": 10158160, + "step": 15530 + }, + { + "epoch": 8.14203354297694, + "grad_norm": 2.163407564163208, + "learning_rate": 5.078790855223595e-06, + "loss": 0.2285, + "num_input_tokens_seen": 10161296, + "step": 15535 + }, + { + "epoch": 8.144654088050315, + "grad_norm": 1.2918097972869873, + "learning_rate": 5.0649824741489325e-06, + "loss": 0.2324, + "num_input_tokens_seen": 10164112, + "step": 15540 + }, + { + "epoch": 8.14727463312369, + "grad_norm": 2.8904552459716797, + "learning_rate": 5.051190773786341e-06, + "loss": 0.208, + "num_input_tokens_seen": 10167248, + "step": 15545 + }, + { + "epoch": 8.149895178197065, + "grad_norm": 1.5888020992279053, + "learning_rate": 5.0374157656760786e-06, + "loss": 0.1886, + "num_input_tokens_seen": 10171088, + "step": 15550 + }, + { + "epoch": 8.15251572327044, + "grad_norm": 1.4584145545959473, + "learning_rate": 5.023657461344456e-06, + "loss": 0.1757, + "num_input_tokens_seen": 10174192, + "step": 15555 + }, + { + "epoch": 8.155136268343815, + "grad_norm": 2.4944839477539062, + "learning_rate": 5.009915872303786e-06, + "loss": 0.2097, + "num_input_tokens_seen": 10177264, + "step": 15560 + }, + { + "epoch": 8.15775681341719, + "grad_norm": 2.346853256225586, + "learning_rate": 4.996191010052403e-06, + "loss": 0.2266, + "num_input_tokens_seen": 10179824, + "step": 15565 + }, + { + "epoch": 8.160377358490566, + "grad_norm": 2.486290693283081, + "learning_rate": 4.982482886074647e-06, + "loss": 0.208, + "num_input_tokens_seen": 10183024, + "step": 15570 + }, + { + "epoch": 8.16299790356394, + "grad_norm": 1.627577543258667, + "learning_rate": 4.968791511840842e-06, + "loss": 0.1969, + "num_input_tokens_seen": 10186832, + "step": 15575 + }, + { + "epoch": 8.165618448637316, + "grad_norm": 2.085735559463501, + "learning_rate": 4.955116898807316e-06, + "loss": 0.2618, + "num_input_tokens_seen": 10189360, + "step": 15580 + }, + { + "epoch": 8.168238993710691, + "grad_norm": 1.5127828121185303, + "learning_rate": 4.941459058416356e-06, + "loss": 0.2124, + "num_input_tokens_seen": 10193360, + "step": 15585 + }, + { + "epoch": 8.170859538784066, + "grad_norm": 3.0005764961242676, + "learning_rate": 4.927818002096213e-06, + "loss": 0.2254, + "num_input_tokens_seen": 10197136, + "step": 15590 + }, + { + "epoch": 8.173480083857442, + "grad_norm": 2.719977378845215, + "learning_rate": 4.9141937412611084e-06, + "loss": 0.2157, + "num_input_tokens_seen": 10200592, + "step": 15595 + }, + { + "epoch": 8.176100628930818, + "grad_norm": 1.837620735168457, + "learning_rate": 4.900586287311202e-06, + "loss": 0.1937, + "num_input_tokens_seen": 10203088, + "step": 15600 + }, + { + "epoch": 8.178721174004194, + "grad_norm": 1.9975608587265015, + "learning_rate": 4.886995651632584e-06, + "loss": 0.3605, + "num_input_tokens_seen": 10205840, + "step": 15605 + }, + { + "epoch": 8.181341719077569, + "grad_norm": 1.7735763788223267, + "learning_rate": 4.873421845597273e-06, + "loss": 0.2044, + "num_input_tokens_seen": 10209040, + "step": 15610 + }, + { + "epoch": 8.183962264150944, + "grad_norm": 1.7526071071624756, + "learning_rate": 4.859864880563222e-06, + "loss": 0.1649, + "num_input_tokens_seen": 10212048, + "step": 15615 + }, + { + "epoch": 8.18658280922432, + "grad_norm": 3.3389647006988525, + "learning_rate": 4.846324767874277e-06, + "loss": 0.181, + "num_input_tokens_seen": 10214896, + "step": 15620 + }, + { + "epoch": 8.189203354297694, + "grad_norm": 3.2080295085906982, + "learning_rate": 4.832801518860175e-06, + "loss": 0.238, + "num_input_tokens_seen": 10217872, + "step": 15625 + }, + { + "epoch": 8.19182389937107, + "grad_norm": 1.579833745956421, + "learning_rate": 4.819295144836566e-06, + "loss": 0.1788, + "num_input_tokens_seen": 10221488, + "step": 15630 + }, + { + "epoch": 8.194444444444445, + "grad_norm": 2.010639190673828, + "learning_rate": 4.805805657104965e-06, + "loss": 0.1231, + "num_input_tokens_seen": 10224048, + "step": 15635 + }, + { + "epoch": 8.19706498951782, + "grad_norm": 1.0442692041397095, + "learning_rate": 4.792333066952748e-06, + "loss": 0.1313, + "num_input_tokens_seen": 10228368, + "step": 15640 + }, + { + "epoch": 8.199685534591195, + "grad_norm": 1.0508406162261963, + "learning_rate": 4.778877385653186e-06, + "loss": 0.2114, + "num_input_tokens_seen": 10231728, + "step": 15645 + }, + { + "epoch": 8.20230607966457, + "grad_norm": 1.8090085983276367, + "learning_rate": 4.7654386244653485e-06, + "loss": 0.2442, + "num_input_tokens_seen": 10234960, + "step": 15650 + }, + { + "epoch": 8.204926624737945, + "grad_norm": 1.2101269960403442, + "learning_rate": 4.752016794634201e-06, + "loss": 0.2221, + "num_input_tokens_seen": 10238448, + "step": 15655 + }, + { + "epoch": 8.20754716981132, + "grad_norm": 2.2408721446990967, + "learning_rate": 4.738611907390508e-06, + "loss": 0.1466, + "num_input_tokens_seen": 10240912, + "step": 15660 + }, + { + "epoch": 8.210167714884696, + "grad_norm": 1.3749018907546997, + "learning_rate": 4.725223973950863e-06, + "loss": 0.2402, + "num_input_tokens_seen": 10244816, + "step": 15665 + }, + { + "epoch": 8.21278825995807, + "grad_norm": 1.5109889507293701, + "learning_rate": 4.711853005517686e-06, + "loss": 0.1891, + "num_input_tokens_seen": 10247728, + "step": 15670 + }, + { + "epoch": 8.215408805031446, + "grad_norm": 2.2067008018493652, + "learning_rate": 4.698499013279189e-06, + "loss": 0.193, + "num_input_tokens_seen": 10251248, + "step": 15675 + }, + { + "epoch": 8.218029350104821, + "grad_norm": 1.5973143577575684, + "learning_rate": 4.685162008409374e-06, + "loss": 0.236, + "num_input_tokens_seen": 10254160, + "step": 15680 + }, + { + "epoch": 8.220649895178196, + "grad_norm": 1.824124813079834, + "learning_rate": 4.671842002068061e-06, + "loss": 0.2034, + "num_input_tokens_seen": 10256944, + "step": 15685 + }, + { + "epoch": 8.223270440251572, + "grad_norm": 2.17039155960083, + "learning_rate": 4.658539005400794e-06, + "loss": 0.2364, + "num_input_tokens_seen": 10259888, + "step": 15690 + }, + { + "epoch": 8.225890985324948, + "grad_norm": 2.388314962387085, + "learning_rate": 4.645253029538926e-06, + "loss": 0.2657, + "num_input_tokens_seen": 10262640, + "step": 15695 + }, + { + "epoch": 8.228511530398324, + "grad_norm": 2.099378824234009, + "learning_rate": 4.631984085599569e-06, + "loss": 0.243, + "num_input_tokens_seen": 10265424, + "step": 15700 + }, + { + "epoch": 8.231132075471699, + "grad_norm": 1.5802191495895386, + "learning_rate": 4.618732184685542e-06, + "loss": 0.2189, + "num_input_tokens_seen": 10267888, + "step": 15705 + }, + { + "epoch": 8.233752620545074, + "grad_norm": 1.1556522846221924, + "learning_rate": 4.60549733788545e-06, + "loss": 0.1534, + "num_input_tokens_seen": 10270992, + "step": 15710 + }, + { + "epoch": 8.23637316561845, + "grad_norm": 1.425577163696289, + "learning_rate": 4.592279556273604e-06, + "loss": 0.2257, + "num_input_tokens_seen": 10274128, + "step": 15715 + }, + { + "epoch": 8.238993710691824, + "grad_norm": 1.8197227716445923, + "learning_rate": 4.579078850910032e-06, + "loss": 0.1978, + "num_input_tokens_seen": 10278128, + "step": 15720 + }, + { + "epoch": 8.2416142557652, + "grad_norm": 1.6118172407150269, + "learning_rate": 4.565895232840489e-06, + "loss": 0.1655, + "num_input_tokens_seen": 10280720, + "step": 15725 + }, + { + "epoch": 8.244234800838575, + "grad_norm": 1.569267749786377, + "learning_rate": 4.552728713096427e-06, + "loss": 0.1812, + "num_input_tokens_seen": 10283280, + "step": 15730 + }, + { + "epoch": 8.24685534591195, + "grad_norm": 1.9941498041152954, + "learning_rate": 4.539579302694977e-06, + "loss": 0.2063, + "num_input_tokens_seen": 10287600, + "step": 15735 + }, + { + "epoch": 8.249475890985325, + "grad_norm": 2.108248710632324, + "learning_rate": 4.5264470126389765e-06, + "loss": 0.1738, + "num_input_tokens_seen": 10289808, + "step": 15740 + }, + { + "epoch": 8.2520964360587, + "grad_norm": 6.1407856941223145, + "learning_rate": 4.5133318539169215e-06, + "loss": 0.2855, + "num_input_tokens_seen": 10292656, + "step": 15745 + }, + { + "epoch": 8.254716981132075, + "grad_norm": 1.9383361339569092, + "learning_rate": 4.500233837502979e-06, + "loss": 0.2021, + "num_input_tokens_seen": 10295664, + "step": 15750 + }, + { + "epoch": 8.25733752620545, + "grad_norm": 1.9189033508300781, + "learning_rate": 4.4871529743569675e-06, + "loss": 0.193, + "num_input_tokens_seen": 10299248, + "step": 15755 + }, + { + "epoch": 8.259958071278826, + "grad_norm": 2.5081663131713867, + "learning_rate": 4.474089275424351e-06, + "loss": 0.1764, + "num_input_tokens_seen": 10302832, + "step": 15760 + }, + { + "epoch": 8.2625786163522, + "grad_norm": 1.5773767232894897, + "learning_rate": 4.461042751636252e-06, + "loss": 0.2441, + "num_input_tokens_seen": 10307888, + "step": 15765 + }, + { + "epoch": 8.265199161425576, + "grad_norm": 3.3166451454162598, + "learning_rate": 4.448013413909394e-06, + "loss": 0.2119, + "num_input_tokens_seen": 10311248, + "step": 15770 + }, + { + "epoch": 8.267819706498951, + "grad_norm": 1.9590643644332886, + "learning_rate": 4.435001273146127e-06, + "loss": 0.228, + "num_input_tokens_seen": 10314320, + "step": 15775 + }, + { + "epoch": 8.270440251572326, + "grad_norm": 2.06846022605896, + "learning_rate": 4.422006340234433e-06, + "loss": 0.2061, + "num_input_tokens_seen": 10317936, + "step": 15780 + }, + { + "epoch": 8.273060796645701, + "grad_norm": 1.6364986896514893, + "learning_rate": 4.4090286260478674e-06, + "loss": 0.2377, + "num_input_tokens_seen": 10321136, + "step": 15785 + }, + { + "epoch": 8.275681341719078, + "grad_norm": 2.7836272716522217, + "learning_rate": 4.3960681414455864e-06, + "loss": 0.3191, + "num_input_tokens_seen": 10323760, + "step": 15790 + }, + { + "epoch": 8.278301886792454, + "grad_norm": 2.471673011779785, + "learning_rate": 4.383124897272331e-06, + "loss": 0.2276, + "num_input_tokens_seen": 10327088, + "step": 15795 + }, + { + "epoch": 8.280922431865829, + "grad_norm": 1.6638505458831787, + "learning_rate": 4.3701989043584274e-06, + "loss": 0.2771, + "num_input_tokens_seen": 10330448, + "step": 15800 + }, + { + "epoch": 8.283542976939204, + "grad_norm": 2.264122724533081, + "learning_rate": 4.357290173519746e-06, + "loss": 0.1917, + "num_input_tokens_seen": 10334672, + "step": 15805 + }, + { + "epoch": 8.286163522012579, + "grad_norm": 1.7570475339889526, + "learning_rate": 4.344398715557724e-06, + "loss": 0.2419, + "num_input_tokens_seen": 10337904, + "step": 15810 + }, + { + "epoch": 8.288784067085954, + "grad_norm": 2.622154951095581, + "learning_rate": 4.3315245412593496e-06, + "loss": 0.227, + "num_input_tokens_seen": 10341488, + "step": 15815 + }, + { + "epoch": 8.29140461215933, + "grad_norm": 2.8663125038146973, + "learning_rate": 4.318667661397141e-06, + "loss": 0.1977, + "num_input_tokens_seen": 10344944, + "step": 15820 + }, + { + "epoch": 8.294025157232705, + "grad_norm": 2.8940067291259766, + "learning_rate": 4.305828086729144e-06, + "loss": 0.2575, + "num_input_tokens_seen": 10347248, + "step": 15825 + }, + { + "epoch": 8.29664570230608, + "grad_norm": 3.3608319759368896, + "learning_rate": 4.293005827998942e-06, + "loss": 0.3027, + "num_input_tokens_seen": 10349712, + "step": 15830 + }, + { + "epoch": 8.299266247379455, + "grad_norm": 1.2380883693695068, + "learning_rate": 4.280200895935593e-06, + "loss": 0.2096, + "num_input_tokens_seen": 10353936, + "step": 15835 + }, + { + "epoch": 8.30188679245283, + "grad_norm": 3.11025071144104, + "learning_rate": 4.267413301253701e-06, + "loss": 0.1763, + "num_input_tokens_seen": 10356432, + "step": 15840 + }, + { + "epoch": 8.304507337526205, + "grad_norm": 2.740325450897217, + "learning_rate": 4.254643054653329e-06, + "loss": 0.3984, + "num_input_tokens_seen": 10360624, + "step": 15845 + }, + { + "epoch": 8.30712788259958, + "grad_norm": 1.8593332767486572, + "learning_rate": 4.241890166820034e-06, + "loss": 0.2328, + "num_input_tokens_seen": 10364432, + "step": 15850 + }, + { + "epoch": 8.309748427672956, + "grad_norm": 2.91762375831604, + "learning_rate": 4.22915464842486e-06, + "loss": 0.1954, + "num_input_tokens_seen": 10367184, + "step": 15855 + }, + { + "epoch": 8.31236897274633, + "grad_norm": 2.4265716075897217, + "learning_rate": 4.216436510124303e-06, + "loss": 0.1918, + "num_input_tokens_seen": 10370192, + "step": 15860 + }, + { + "epoch": 8.314989517819706, + "grad_norm": 1.737524151802063, + "learning_rate": 4.203735762560312e-06, + "loss": 0.3184, + "num_input_tokens_seen": 10373744, + "step": 15865 + }, + { + "epoch": 8.317610062893081, + "grad_norm": 2.068523645401001, + "learning_rate": 4.191052416360314e-06, + "loss": 0.248, + "num_input_tokens_seen": 10376400, + "step": 15870 + }, + { + "epoch": 8.320230607966456, + "grad_norm": 1.4508658647537231, + "learning_rate": 4.178386482137126e-06, + "loss": 0.1656, + "num_input_tokens_seen": 10379600, + "step": 15875 + }, + { + "epoch": 8.322851153039831, + "grad_norm": 1.5786696672439575, + "learning_rate": 4.165737970489036e-06, + "loss": 0.252, + "num_input_tokens_seen": 10383088, + "step": 15880 + }, + { + "epoch": 8.325471698113208, + "grad_norm": 2.989948034286499, + "learning_rate": 4.153106891999753e-06, + "loss": 0.2611, + "num_input_tokens_seen": 10386288, + "step": 15885 + }, + { + "epoch": 8.328092243186584, + "grad_norm": 2.1671946048736572, + "learning_rate": 4.140493257238362e-06, + "loss": 0.3197, + "num_input_tokens_seen": 10389744, + "step": 15890 + }, + { + "epoch": 8.330712788259959, + "grad_norm": 1.4811434745788574, + "learning_rate": 4.127897076759399e-06, + "loss": 0.1907, + "num_input_tokens_seen": 10392880, + "step": 15895 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 1.9265919923782349, + "learning_rate": 4.11531836110276e-06, + "loss": 0.2767, + "num_input_tokens_seen": 10397168, + "step": 15900 + }, + { + "epoch": 8.335953878406709, + "grad_norm": 2.413651466369629, + "learning_rate": 4.1027571207937345e-06, + "loss": 0.2103, + "num_input_tokens_seen": 10400784, + "step": 15905 + }, + { + "epoch": 8.338574423480084, + "grad_norm": 1.4681862592697144, + "learning_rate": 4.09021336634301e-06, + "loss": 0.1122, + "num_input_tokens_seen": 10403568, + "step": 15910 + }, + { + "epoch": 8.34119496855346, + "grad_norm": 2.2278504371643066, + "learning_rate": 4.077687108246622e-06, + "loss": 0.1897, + "num_input_tokens_seen": 10406800, + "step": 15915 + }, + { + "epoch": 8.343815513626835, + "grad_norm": 1.3939968347549438, + "learning_rate": 4.06517835698596e-06, + "loss": 0.2242, + "num_input_tokens_seen": 10409744, + "step": 15920 + }, + { + "epoch": 8.34643605870021, + "grad_norm": 2.8614728450775146, + "learning_rate": 4.0526871230277905e-06, + "loss": 0.1838, + "num_input_tokens_seen": 10412880, + "step": 15925 + }, + { + "epoch": 8.349056603773585, + "grad_norm": 3.2292728424072266, + "learning_rate": 4.040213416824204e-06, + "loss": 0.2469, + "num_input_tokens_seen": 10416432, + "step": 15930 + }, + { + "epoch": 8.35167714884696, + "grad_norm": 3.20052170753479, + "learning_rate": 4.027757248812622e-06, + "loss": 0.2183, + "num_input_tokens_seen": 10419824, + "step": 15935 + }, + { + "epoch": 8.354297693920335, + "grad_norm": 1.7413853406906128, + "learning_rate": 4.015318629415804e-06, + "loss": 0.2178, + "num_input_tokens_seen": 10422800, + "step": 15940 + }, + { + "epoch": 8.35691823899371, + "grad_norm": 2.0911426544189453, + "learning_rate": 4.002897569041808e-06, + "loss": 0.1641, + "num_input_tokens_seen": 10425808, + "step": 15945 + }, + { + "epoch": 8.359538784067086, + "grad_norm": 1.7957043647766113, + "learning_rate": 3.990494078084022e-06, + "loss": 0.2126, + "num_input_tokens_seen": 10428944, + "step": 15950 + }, + { + "epoch": 8.36215932914046, + "grad_norm": 2.758812427520752, + "learning_rate": 3.9781081669211156e-06, + "loss": 0.2761, + "num_input_tokens_seen": 10433648, + "step": 15955 + }, + { + "epoch": 8.364779874213836, + "grad_norm": 1.2398111820220947, + "learning_rate": 3.965739845917049e-06, + "loss": 0.216, + "num_input_tokens_seen": 10436784, + "step": 15960 + }, + { + "epoch": 8.367400419287211, + "grad_norm": 3.003438949584961, + "learning_rate": 3.953389125421078e-06, + "loss": 0.1838, + "num_input_tokens_seen": 10439280, + "step": 15965 + }, + { + "epoch": 8.370020964360586, + "grad_norm": 1.6299625635147095, + "learning_rate": 3.941056015767713e-06, + "loss": 0.1811, + "num_input_tokens_seen": 10441744, + "step": 15970 + }, + { + "epoch": 8.372641509433961, + "grad_norm": 2.1858959197998047, + "learning_rate": 3.928740527276745e-06, + "loss": 0.2138, + "num_input_tokens_seen": 10445424, + "step": 15975 + }, + { + "epoch": 8.375262054507338, + "grad_norm": 2.5055432319641113, + "learning_rate": 3.916442670253198e-06, + "loss": 0.2982, + "num_input_tokens_seen": 10448688, + "step": 15980 + }, + { + "epoch": 8.377882599580714, + "grad_norm": 1.7496691942214966, + "learning_rate": 3.904162454987373e-06, + "loss": 0.2527, + "num_input_tokens_seen": 10453040, + "step": 15985 + }, + { + "epoch": 8.380503144654089, + "grad_norm": 2.0306153297424316, + "learning_rate": 3.891899891754788e-06, + "loss": 0.2588, + "num_input_tokens_seen": 10457072, + "step": 15990 + }, + { + "epoch": 8.383123689727464, + "grad_norm": 3.75079345703125, + "learning_rate": 3.8796549908161864e-06, + "loss": 0.1829, + "num_input_tokens_seen": 10459280, + "step": 15995 + }, + { + "epoch": 8.385744234800839, + "grad_norm": 3.5263781547546387, + "learning_rate": 3.867427762417555e-06, + "loss": 0.1955, + "num_input_tokens_seen": 10462192, + "step": 16000 + }, + { + "epoch": 8.388364779874214, + "grad_norm": 1.6664104461669922, + "learning_rate": 3.855218216790077e-06, + "loss": 0.2477, + "num_input_tokens_seen": 10466672, + "step": 16005 + }, + { + "epoch": 8.39098532494759, + "grad_norm": 1.7596068382263184, + "learning_rate": 3.843026364150132e-06, + "loss": 0.204, + "num_input_tokens_seen": 10469776, + "step": 16010 + }, + { + "epoch": 8.393605870020965, + "grad_norm": 3.637803077697754, + "learning_rate": 3.830852214699326e-06, + "loss": 0.1495, + "num_input_tokens_seen": 10472848, + "step": 16015 + }, + { + "epoch": 8.39622641509434, + "grad_norm": 6.257242202758789, + "learning_rate": 3.818695778624409e-06, + "loss": 0.215, + "num_input_tokens_seen": 10476368, + "step": 16020 + }, + { + "epoch": 8.398846960167715, + "grad_norm": 1.6031386852264404, + "learning_rate": 3.8065570660973436e-06, + "loss": 0.1903, + "num_input_tokens_seen": 10479472, + "step": 16025 + }, + { + "epoch": 8.40146750524109, + "grad_norm": 1.8291006088256836, + "learning_rate": 3.7944360872752495e-06, + "loss": 0.2708, + "num_input_tokens_seen": 10482000, + "step": 16030 + }, + { + "epoch": 8.404088050314465, + "grad_norm": 2.4917149543762207, + "learning_rate": 3.782332852300402e-06, + "loss": 0.2664, + "num_input_tokens_seen": 10484592, + "step": 16035 + }, + { + "epoch": 8.40670859538784, + "grad_norm": 4.219763278961182, + "learning_rate": 3.770247371300242e-06, + "loss": 0.1877, + "num_input_tokens_seen": 10487216, + "step": 16040 + }, + { + "epoch": 8.409329140461216, + "grad_norm": 1.6134847402572632, + "learning_rate": 3.7581796543873477e-06, + "loss": 0.2518, + "num_input_tokens_seen": 10490512, + "step": 16045 + }, + { + "epoch": 8.41194968553459, + "grad_norm": 2.872598171234131, + "learning_rate": 3.746129711659424e-06, + "loss": 0.1541, + "num_input_tokens_seen": 10493552, + "step": 16050 + }, + { + "epoch": 8.414570230607966, + "grad_norm": 3.876091480255127, + "learning_rate": 3.7340975531993313e-06, + "loss": 0.2525, + "num_input_tokens_seen": 10497264, + "step": 16055 + }, + { + "epoch": 8.417190775681341, + "grad_norm": 1.1175938844680786, + "learning_rate": 3.7220831890750067e-06, + "loss": 0.1578, + "num_input_tokens_seen": 10499984, + "step": 16060 + }, + { + "epoch": 8.419811320754716, + "grad_norm": 2.3669850826263428, + "learning_rate": 3.7100866293395403e-06, + "loss": 0.1838, + "num_input_tokens_seen": 10503248, + "step": 16065 + }, + { + "epoch": 8.422431865828091, + "grad_norm": 1.497626543045044, + "learning_rate": 3.698107884031099e-06, + "loss": 0.2161, + "num_input_tokens_seen": 10507088, + "step": 16070 + }, + { + "epoch": 8.425052410901468, + "grad_norm": 2.1538968086242676, + "learning_rate": 3.68614696317294e-06, + "loss": 0.2232, + "num_input_tokens_seen": 10510416, + "step": 16075 + }, + { + "epoch": 8.427672955974844, + "grad_norm": 2.3048434257507324, + "learning_rate": 3.6742038767734326e-06, + "loss": 0.2837, + "num_input_tokens_seen": 10514384, + "step": 16080 + }, + { + "epoch": 8.430293501048219, + "grad_norm": 1.4965038299560547, + "learning_rate": 3.6622786348259967e-06, + "loss": 0.2669, + "num_input_tokens_seen": 10517872, + "step": 16085 + }, + { + "epoch": 8.432914046121594, + "grad_norm": 2.040341377258301, + "learning_rate": 3.6503712473091257e-06, + "loss": 0.1735, + "num_input_tokens_seen": 10521200, + "step": 16090 + }, + { + "epoch": 8.435534591194969, + "grad_norm": 2.0178604125976562, + "learning_rate": 3.6384817241863877e-06, + "loss": 0.2257, + "num_input_tokens_seen": 10523856, + "step": 16095 + }, + { + "epoch": 8.438155136268344, + "grad_norm": 1.325342059135437, + "learning_rate": 3.626610075406389e-06, + "loss": 0.3003, + "num_input_tokens_seen": 10527728, + "step": 16100 + }, + { + "epoch": 8.44077568134172, + "grad_norm": 1.729012131690979, + "learning_rate": 3.614756310902781e-06, + "loss": 0.2306, + "num_input_tokens_seen": 10531536, + "step": 16105 + }, + { + "epoch": 8.443396226415095, + "grad_norm": 1.383952260017395, + "learning_rate": 3.6029204405942485e-06, + "loss": 0.195, + "num_input_tokens_seen": 10534352, + "step": 16110 + }, + { + "epoch": 8.44601677148847, + "grad_norm": 1.641570806503296, + "learning_rate": 3.5911024743845166e-06, + "loss": 0.2381, + "num_input_tokens_seen": 10537552, + "step": 16115 + }, + { + "epoch": 8.448637316561845, + "grad_norm": 1.6002353429794312, + "learning_rate": 3.5793024221623147e-06, + "loss": 0.1216, + "num_input_tokens_seen": 10539952, + "step": 16120 + }, + { + "epoch": 8.45125786163522, + "grad_norm": 1.7235292196273804, + "learning_rate": 3.567520293801388e-06, + "loss": 0.1971, + "num_input_tokens_seen": 10543408, + "step": 16125 + }, + { + "epoch": 8.453878406708595, + "grad_norm": 1.0593161582946777, + "learning_rate": 3.555756099160476e-06, + "loss": 0.1148, + "num_input_tokens_seen": 10546352, + "step": 16130 + }, + { + "epoch": 8.45649895178197, + "grad_norm": 1.5654257535934448, + "learning_rate": 3.544009848083335e-06, + "loss": 0.187, + "num_input_tokens_seen": 10550352, + "step": 16135 + }, + { + "epoch": 8.459119496855346, + "grad_norm": 1.7348153591156006, + "learning_rate": 3.5322815503986804e-06, + "loss": 0.3455, + "num_input_tokens_seen": 10554864, + "step": 16140 + }, + { + "epoch": 8.46174004192872, + "grad_norm": 1.8648916482925415, + "learning_rate": 3.520571215920218e-06, + "loss": 0.1343, + "num_input_tokens_seen": 10558960, + "step": 16145 + }, + { + "epoch": 8.464360587002096, + "grad_norm": 1.6053950786590576, + "learning_rate": 3.5088788544466177e-06, + "loss": 0.2198, + "num_input_tokens_seen": 10563600, + "step": 16150 + }, + { + "epoch": 8.466981132075471, + "grad_norm": 1.9093505144119263, + "learning_rate": 3.4972044757615203e-06, + "loss": 0.2256, + "num_input_tokens_seen": 10566480, + "step": 16155 + }, + { + "epoch": 8.469601677148846, + "grad_norm": 2.0788748264312744, + "learning_rate": 3.4855480896335084e-06, + "loss": 0.165, + "num_input_tokens_seen": 10569616, + "step": 16160 + }, + { + "epoch": 8.472222222222221, + "grad_norm": 2.2029871940612793, + "learning_rate": 3.4739097058161114e-06, + "loss": 0.2664, + "num_input_tokens_seen": 10572336, + "step": 16165 + }, + { + "epoch": 8.474842767295598, + "grad_norm": 4.737011432647705, + "learning_rate": 3.462289334047805e-06, + "loss": 0.262, + "num_input_tokens_seen": 10574576, + "step": 16170 + }, + { + "epoch": 8.477463312368974, + "grad_norm": 1.4215182065963745, + "learning_rate": 3.450686984051979e-06, + "loss": 0.1937, + "num_input_tokens_seen": 10578640, + "step": 16175 + }, + { + "epoch": 8.480083857442349, + "grad_norm": 1.1345049142837524, + "learning_rate": 3.4391026655369474e-06, + "loss": 0.1574, + "num_input_tokens_seen": 10581392, + "step": 16180 + }, + { + "epoch": 8.482704402515724, + "grad_norm": 2.3062729835510254, + "learning_rate": 3.427536388195954e-06, + "loss": 0.2502, + "num_input_tokens_seen": 10584144, + "step": 16185 + }, + { + "epoch": 8.485324947589099, + "grad_norm": 3.549147844314575, + "learning_rate": 3.415988161707109e-06, + "loss": 0.2471, + "num_input_tokens_seen": 10587600, + "step": 16190 + }, + { + "epoch": 8.487945492662474, + "grad_norm": 3.5212626457214355, + "learning_rate": 3.404457995733451e-06, + "loss": 0.2531, + "num_input_tokens_seen": 10590832, + "step": 16195 + }, + { + "epoch": 8.49056603773585, + "grad_norm": 2.643057346343994, + "learning_rate": 3.3929458999229113e-06, + "loss": 0.2259, + "num_input_tokens_seen": 10593712, + "step": 16200 + }, + { + "epoch": 8.493186582809225, + "grad_norm": 2.1015326976776123, + "learning_rate": 3.381451883908257e-06, + "loss": 0.2386, + "num_input_tokens_seen": 10596336, + "step": 16205 + }, + { + "epoch": 8.4958071278826, + "grad_norm": 2.2369236946105957, + "learning_rate": 3.369975957307178e-06, + "loss": 0.2459, + "num_input_tokens_seen": 10599536, + "step": 16210 + }, + { + "epoch": 8.498427672955975, + "grad_norm": 1.5426753759384155, + "learning_rate": 3.358518129722199e-06, + "loss": 0.1183, + "num_input_tokens_seen": 10602576, + "step": 16215 + }, + { + "epoch": 8.5, + "eval_loss": 0.665528416633606, + "eval_runtime": 15.9692, + "eval_samples_per_second": 53.102, + "eval_steps_per_second": 13.276, + "num_input_tokens_seen": 10604656, + "step": 16218 + }, + { + "epoch": 8.50104821802935, + "grad_norm": 2.1325361728668213, + "learning_rate": 3.3470784107406976e-06, + "loss": 0.1919, + "num_input_tokens_seen": 10605616, + "step": 16220 + }, + { + "epoch": 8.503668763102725, + "grad_norm": 2.4963784217834473, + "learning_rate": 3.3356568099349283e-06, + "loss": 0.3326, + "num_input_tokens_seen": 10609040, + "step": 16225 + }, + { + "epoch": 8.5062893081761, + "grad_norm": 2.291613817214966, + "learning_rate": 3.3242533368619435e-06, + "loss": 0.28, + "num_input_tokens_seen": 10612304, + "step": 16230 + }, + { + "epoch": 8.508909853249476, + "grad_norm": 1.3222531080245972, + "learning_rate": 3.312868001063654e-06, + "loss": 0.191, + "num_input_tokens_seen": 10614928, + "step": 16235 + }, + { + "epoch": 8.51153039832285, + "grad_norm": 2.1052582263946533, + "learning_rate": 3.3015008120668072e-06, + "loss": 0.2706, + "num_input_tokens_seen": 10618320, + "step": 16240 + }, + { + "epoch": 8.514150943396226, + "grad_norm": 3.5516388416290283, + "learning_rate": 3.290151779382922e-06, + "loss": 0.2456, + "num_input_tokens_seen": 10621296, + "step": 16245 + }, + { + "epoch": 8.516771488469601, + "grad_norm": 2.0192151069641113, + "learning_rate": 3.2788209125083654e-06, + "loss": 0.2513, + "num_input_tokens_seen": 10623984, + "step": 16250 + }, + { + "epoch": 8.519392033542976, + "grad_norm": 2.3489580154418945, + "learning_rate": 3.267508220924287e-06, + "loss": 0.1983, + "num_input_tokens_seen": 10628560, + "step": 16255 + }, + { + "epoch": 8.522012578616351, + "grad_norm": 1.702325463294983, + "learning_rate": 3.256213714096623e-06, + "loss": 0.2134, + "num_input_tokens_seen": 10631856, + "step": 16260 + }, + { + "epoch": 8.524633123689728, + "grad_norm": 1.8491883277893066, + "learning_rate": 3.2449374014761114e-06, + "loss": 0.2104, + "num_input_tokens_seen": 10635696, + "step": 16265 + }, + { + "epoch": 8.527253668763104, + "grad_norm": 1.841442584991455, + "learning_rate": 3.2336792924982514e-06, + "loss": 0.2086, + "num_input_tokens_seen": 10638384, + "step": 16270 + }, + { + "epoch": 8.529874213836479, + "grad_norm": 2.6251232624053955, + "learning_rate": 3.222439396583307e-06, + "loss": 0.256, + "num_input_tokens_seen": 10641456, + "step": 16275 + }, + { + "epoch": 8.532494758909854, + "grad_norm": 3.8331613540649414, + "learning_rate": 3.2112177231363226e-06, + "loss": 0.1851, + "num_input_tokens_seen": 10644336, + "step": 16280 + }, + { + "epoch": 8.535115303983229, + "grad_norm": 1.3445430994033813, + "learning_rate": 3.2000142815470756e-06, + "loss": 0.188, + "num_input_tokens_seen": 10647440, + "step": 16285 + }, + { + "epoch": 8.537735849056604, + "grad_norm": 2.6681430339813232, + "learning_rate": 3.188829081190095e-06, + "loss": 0.2078, + "num_input_tokens_seen": 10649872, + "step": 16290 + }, + { + "epoch": 8.54035639412998, + "grad_norm": 1.7257565259933472, + "learning_rate": 3.1776621314246384e-06, + "loss": 0.2115, + "num_input_tokens_seen": 10652976, + "step": 16295 + }, + { + "epoch": 8.542976939203355, + "grad_norm": 1.7582314014434814, + "learning_rate": 3.1665134415947125e-06, + "loss": 0.1562, + "num_input_tokens_seen": 10656016, + "step": 16300 + }, + { + "epoch": 8.54559748427673, + "grad_norm": 2.649343252182007, + "learning_rate": 3.1553830210290236e-06, + "loss": 0.3158, + "num_input_tokens_seen": 10659536, + "step": 16305 + }, + { + "epoch": 8.548218029350105, + "grad_norm": 1.6967990398406982, + "learning_rate": 3.1442708790410002e-06, + "loss": 0.1693, + "num_input_tokens_seen": 10662416, + "step": 16310 + }, + { + "epoch": 8.55083857442348, + "grad_norm": 1.9257503747940063, + "learning_rate": 3.133177024928771e-06, + "loss": 0.1778, + "num_input_tokens_seen": 10666320, + "step": 16315 + }, + { + "epoch": 8.553459119496855, + "grad_norm": 2.14182186126709, + "learning_rate": 3.1221014679751777e-06, + "loss": 0.2989, + "num_input_tokens_seen": 10669136, + "step": 16320 + }, + { + "epoch": 8.55607966457023, + "grad_norm": 1.2161049842834473, + "learning_rate": 3.111044217447731e-06, + "loss": 0.1971, + "num_input_tokens_seen": 10672240, + "step": 16325 + }, + { + "epoch": 8.558700209643606, + "grad_norm": 2.0378899574279785, + "learning_rate": 3.1000052825986366e-06, + "loss": 0.2158, + "num_input_tokens_seen": 10674992, + "step": 16330 + }, + { + "epoch": 8.56132075471698, + "grad_norm": 2.2701220512390137, + "learning_rate": 3.0889846726647657e-06, + "loss": 0.2185, + "num_input_tokens_seen": 10677616, + "step": 16335 + }, + { + "epoch": 8.563941299790356, + "grad_norm": 1.6805914640426636, + "learning_rate": 3.077982396867668e-06, + "loss": 0.1976, + "num_input_tokens_seen": 10681136, + "step": 16340 + }, + { + "epoch": 8.566561844863731, + "grad_norm": 2.262209415435791, + "learning_rate": 3.066998464413545e-06, + "loss": 0.2977, + "num_input_tokens_seen": 10684368, + "step": 16345 + }, + { + "epoch": 8.569182389937106, + "grad_norm": 1.5375224351882935, + "learning_rate": 3.056032884493243e-06, + "loss": 0.2747, + "num_input_tokens_seen": 10687632, + "step": 16350 + }, + { + "epoch": 8.571802935010481, + "grad_norm": 1.6073650121688843, + "learning_rate": 3.045085666282266e-06, + "loss": 0.2258, + "num_input_tokens_seen": 10692336, + "step": 16355 + }, + { + "epoch": 8.574423480083858, + "grad_norm": 1.1373623609542847, + "learning_rate": 3.034156818940745e-06, + "loss": 0.2219, + "num_input_tokens_seen": 10696048, + "step": 16360 + }, + { + "epoch": 8.577044025157234, + "grad_norm": 4.165998458862305, + "learning_rate": 3.0232463516134317e-06, + "loss": 0.1744, + "num_input_tokens_seen": 10700304, + "step": 16365 + }, + { + "epoch": 8.579664570230609, + "grad_norm": 4.66044807434082, + "learning_rate": 3.0123542734297267e-06, + "loss": 0.1773, + "num_input_tokens_seen": 10703120, + "step": 16370 + }, + { + "epoch": 8.582285115303984, + "grad_norm": 1.200256109237671, + "learning_rate": 3.0014805935035973e-06, + "loss": 0.2062, + "num_input_tokens_seen": 10707536, + "step": 16375 + }, + { + "epoch": 8.584905660377359, + "grad_norm": 1.4345089197158813, + "learning_rate": 2.99062532093366e-06, + "loss": 0.2787, + "num_input_tokens_seen": 10710384, + "step": 16380 + }, + { + "epoch": 8.587526205450734, + "grad_norm": 2.2634449005126953, + "learning_rate": 2.979788464803107e-06, + "loss": 0.1888, + "num_input_tokens_seen": 10713840, + "step": 16385 + }, + { + "epoch": 8.59014675052411, + "grad_norm": 1.9975359439849854, + "learning_rate": 2.968970034179719e-06, + "loss": 0.1922, + "num_input_tokens_seen": 10716720, + "step": 16390 + }, + { + "epoch": 8.592767295597485, + "grad_norm": 1.778852939605713, + "learning_rate": 2.9581700381158735e-06, + "loss": 0.23, + "num_input_tokens_seen": 10720720, + "step": 16395 + }, + { + "epoch": 8.59538784067086, + "grad_norm": 5.388294219970703, + "learning_rate": 2.9473884856485113e-06, + "loss": 0.1848, + "num_input_tokens_seen": 10723952, + "step": 16400 + }, + { + "epoch": 8.598008385744235, + "grad_norm": 3.1189491748809814, + "learning_rate": 2.936625385799133e-06, + "loss": 0.2939, + "num_input_tokens_seen": 10726096, + "step": 16405 + }, + { + "epoch": 8.60062893081761, + "grad_norm": 1.0633440017700195, + "learning_rate": 2.925880747573831e-06, + "loss": 0.1837, + "num_input_tokens_seen": 10729456, + "step": 16410 + }, + { + "epoch": 8.603249475890985, + "grad_norm": 1.7706722021102905, + "learning_rate": 2.9151545799632003e-06, + "loss": 0.186, + "num_input_tokens_seen": 10731824, + "step": 16415 + }, + { + "epoch": 8.60587002096436, + "grad_norm": 1.6758328676223755, + "learning_rate": 2.9044468919424305e-06, + "loss": 0.1916, + "num_input_tokens_seen": 10734864, + "step": 16420 + }, + { + "epoch": 8.608490566037736, + "grad_norm": 8.338798522949219, + "learning_rate": 2.8937576924712133e-06, + "loss": 0.1915, + "num_input_tokens_seen": 10737744, + "step": 16425 + }, + { + "epoch": 8.61111111111111, + "grad_norm": 2.3632593154907227, + "learning_rate": 2.883086990493783e-06, + "loss": 0.1458, + "num_input_tokens_seen": 10740656, + "step": 16430 + }, + { + "epoch": 8.613731656184486, + "grad_norm": 3.3665173053741455, + "learning_rate": 2.872434794938905e-06, + "loss": 0.3675, + "num_input_tokens_seen": 10744688, + "step": 16435 + }, + { + "epoch": 8.616352201257861, + "grad_norm": 2.323087692260742, + "learning_rate": 2.861801114719842e-06, + "loss": 0.2388, + "num_input_tokens_seen": 10747184, + "step": 16440 + }, + { + "epoch": 8.618972746331236, + "grad_norm": 2.726550817489624, + "learning_rate": 2.8511859587343704e-06, + "loss": 0.2486, + "num_input_tokens_seen": 10750320, + "step": 16445 + }, + { + "epoch": 8.621593291404611, + "grad_norm": 2.09293794631958, + "learning_rate": 2.840589335864774e-06, + "loss": 0.1614, + "num_input_tokens_seen": 10753328, + "step": 16450 + }, + { + "epoch": 8.624213836477988, + "grad_norm": 1.732020378112793, + "learning_rate": 2.830011254977821e-06, + "loss": 0.2083, + "num_input_tokens_seen": 10756208, + "step": 16455 + }, + { + "epoch": 8.626834381551364, + "grad_norm": 3.3110690116882324, + "learning_rate": 2.819451724924768e-06, + "loss": 0.203, + "num_input_tokens_seen": 10759888, + "step": 16460 + }, + { + "epoch": 8.629454926624739, + "grad_norm": 2.6450371742248535, + "learning_rate": 2.8089107545413355e-06, + "loss": 0.1741, + "num_input_tokens_seen": 10763152, + "step": 16465 + }, + { + "epoch": 8.632075471698114, + "grad_norm": 2.8529253005981445, + "learning_rate": 2.7983883526477433e-06, + "loss": 0.2372, + "num_input_tokens_seen": 10767024, + "step": 16470 + }, + { + "epoch": 8.634696016771489, + "grad_norm": 2.0056312084198, + "learning_rate": 2.7878845280486453e-06, + "loss": 0.1842, + "num_input_tokens_seen": 10770096, + "step": 16475 + }, + { + "epoch": 8.637316561844864, + "grad_norm": 2.587787389755249, + "learning_rate": 2.777399289533164e-06, + "loss": 0.2215, + "num_input_tokens_seen": 10774576, + "step": 16480 + }, + { + "epoch": 8.63993710691824, + "grad_norm": 1.9621399641036987, + "learning_rate": 2.766932645874873e-06, + "loss": 0.1895, + "num_input_tokens_seen": 10777296, + "step": 16485 + }, + { + "epoch": 8.642557651991615, + "grad_norm": 1.8217699527740479, + "learning_rate": 2.756484605831777e-06, + "loss": 0.2062, + "num_input_tokens_seen": 10779696, + "step": 16490 + }, + { + "epoch": 8.64517819706499, + "grad_norm": 4.418690204620361, + "learning_rate": 2.74605517814632e-06, + "loss": 0.2209, + "num_input_tokens_seen": 10783184, + "step": 16495 + }, + { + "epoch": 8.647798742138365, + "grad_norm": 2.4709742069244385, + "learning_rate": 2.7356443715453705e-06, + "loss": 0.3353, + "num_input_tokens_seen": 10786512, + "step": 16500 + }, + { + "epoch": 8.65041928721174, + "grad_norm": 2.2014882564544678, + "learning_rate": 2.725252194740213e-06, + "loss": 0.1798, + "num_input_tokens_seen": 10789872, + "step": 16505 + }, + { + "epoch": 8.653039832285115, + "grad_norm": 1.6410679817199707, + "learning_rate": 2.714878656426553e-06, + "loss": 0.1893, + "num_input_tokens_seen": 10792944, + "step": 16510 + }, + { + "epoch": 8.65566037735849, + "grad_norm": 2.937817096710205, + "learning_rate": 2.704523765284489e-06, + "loss": 0.2053, + "num_input_tokens_seen": 10796624, + "step": 16515 + }, + { + "epoch": 8.658280922431866, + "grad_norm": 1.672340989112854, + "learning_rate": 2.6941875299785174e-06, + "loss": 0.2192, + "num_input_tokens_seen": 10800144, + "step": 16520 + }, + { + "epoch": 8.66090146750524, + "grad_norm": 3.258357286453247, + "learning_rate": 2.683869959157534e-06, + "loss": 0.1803, + "num_input_tokens_seen": 10803152, + "step": 16525 + }, + { + "epoch": 8.663522012578616, + "grad_norm": 4.520423412322998, + "learning_rate": 2.673571061454813e-06, + "loss": 0.2963, + "num_input_tokens_seen": 10805552, + "step": 16530 + }, + { + "epoch": 8.666142557651991, + "grad_norm": 3.6295149326324463, + "learning_rate": 2.6632908454879898e-06, + "loss": 0.2044, + "num_input_tokens_seen": 10808880, + "step": 16535 + }, + { + "epoch": 8.668763102725366, + "grad_norm": 1.6710398197174072, + "learning_rate": 2.653029319859096e-06, + "loss": 0.2055, + "num_input_tokens_seen": 10812240, + "step": 16540 + }, + { + "epoch": 8.671383647798741, + "grad_norm": 5.310831069946289, + "learning_rate": 2.642786493154492e-06, + "loss": 0.1915, + "num_input_tokens_seen": 10815600, + "step": 16545 + }, + { + "epoch": 8.674004192872118, + "grad_norm": 2.145265817642212, + "learning_rate": 2.6325623739449108e-06, + "loss": 0.1996, + "num_input_tokens_seen": 10819120, + "step": 16550 + }, + { + "epoch": 8.676624737945493, + "grad_norm": 1.6125173568725586, + "learning_rate": 2.6223569707854444e-06, + "loss": 0.1676, + "num_input_tokens_seen": 10821744, + "step": 16555 + }, + { + "epoch": 8.679245283018869, + "grad_norm": 14.915236473083496, + "learning_rate": 2.612170292215482e-06, + "loss": 0.2162, + "num_input_tokens_seen": 10824336, + "step": 16560 + }, + { + "epoch": 8.681865828092244, + "grad_norm": 2.396352767944336, + "learning_rate": 2.6020023467587917e-06, + "loss": 0.2595, + "num_input_tokens_seen": 10826672, + "step": 16565 + }, + { + "epoch": 8.684486373165619, + "grad_norm": 1.5868748426437378, + "learning_rate": 2.5918531429234368e-06, + "loss": 0.18, + "num_input_tokens_seen": 10830192, + "step": 16570 + }, + { + "epoch": 8.687106918238994, + "grad_norm": 2.736950397491455, + "learning_rate": 2.5817226892018016e-06, + "loss": 0.3276, + "num_input_tokens_seen": 10834096, + "step": 16575 + }, + { + "epoch": 8.68972746331237, + "grad_norm": 2.123906135559082, + "learning_rate": 2.571610994070603e-06, + "loss": 0.2142, + "num_input_tokens_seen": 10837360, + "step": 16580 + }, + { + "epoch": 8.692348008385745, + "grad_norm": 1.6292669773101807, + "learning_rate": 2.561518065990834e-06, + "loss": 0.3037, + "num_input_tokens_seen": 10841168, + "step": 16585 + }, + { + "epoch": 8.69496855345912, + "grad_norm": 3.5257537364959717, + "learning_rate": 2.5514439134077945e-06, + "loss": 0.1603, + "num_input_tokens_seen": 10844784, + "step": 16590 + }, + { + "epoch": 8.697589098532495, + "grad_norm": 3.4697093963623047, + "learning_rate": 2.541388544751089e-06, + "loss": 0.2142, + "num_input_tokens_seen": 10847376, + "step": 16595 + }, + { + "epoch": 8.70020964360587, + "grad_norm": 3.1239218711853027, + "learning_rate": 2.53135196843457e-06, + "loss": 0.174, + "num_input_tokens_seen": 10850000, + "step": 16600 + }, + { + "epoch": 8.702830188679245, + "grad_norm": 2.180161476135254, + "learning_rate": 2.521334192856403e-06, + "loss": 0.1484, + "num_input_tokens_seen": 10852528, + "step": 16605 + }, + { + "epoch": 8.70545073375262, + "grad_norm": 2.544666051864624, + "learning_rate": 2.5113352263990005e-06, + "loss": 0.1675, + "num_input_tokens_seen": 10855312, + "step": 16610 + }, + { + "epoch": 8.708071278825996, + "grad_norm": 1.9725990295410156, + "learning_rate": 2.5013550774290322e-06, + "loss": 0.3575, + "num_input_tokens_seen": 10859376, + "step": 16615 + }, + { + "epoch": 8.71069182389937, + "grad_norm": 2.081634044647217, + "learning_rate": 2.491393754297444e-06, + "loss": 0.227, + "num_input_tokens_seen": 10862224, + "step": 16620 + }, + { + "epoch": 8.713312368972746, + "grad_norm": 2.2630910873413086, + "learning_rate": 2.48145126533941e-06, + "loss": 0.2434, + "num_input_tokens_seen": 10865104, + "step": 16625 + }, + { + "epoch": 8.715932914046121, + "grad_norm": 2.465106248855591, + "learning_rate": 2.4715276188743476e-06, + "loss": 0.1776, + "num_input_tokens_seen": 10867792, + "step": 16630 + }, + { + "epoch": 8.718553459119496, + "grad_norm": 2.4149041175842285, + "learning_rate": 2.461622823205917e-06, + "loss": 0.2068, + "num_input_tokens_seen": 10870768, + "step": 16635 + }, + { + "epoch": 8.721174004192871, + "grad_norm": 1.6442673206329346, + "learning_rate": 2.451736886621997e-06, + "loss": 0.133, + "num_input_tokens_seen": 10875056, + "step": 16640 + }, + { + "epoch": 8.723794549266248, + "grad_norm": 3.0464131832122803, + "learning_rate": 2.4418698173946872e-06, + "loss": 0.2429, + "num_input_tokens_seen": 10878288, + "step": 16645 + }, + { + "epoch": 8.726415094339622, + "grad_norm": 2.7190775871276855, + "learning_rate": 2.432021623780295e-06, + "loss": 0.2505, + "num_input_tokens_seen": 10880720, + "step": 16650 + }, + { + "epoch": 8.729035639412999, + "grad_norm": 2.1083199977874756, + "learning_rate": 2.4221923140193477e-06, + "loss": 0.1294, + "num_input_tokens_seen": 10884528, + "step": 16655 + }, + { + "epoch": 8.731656184486374, + "grad_norm": 2.0503146648406982, + "learning_rate": 2.41238189633656e-06, + "loss": 0.2665, + "num_input_tokens_seen": 10886960, + "step": 16660 + }, + { + "epoch": 8.734276729559749, + "grad_norm": 1.615939736366272, + "learning_rate": 2.402590378940836e-06, + "loss": 0.2221, + "num_input_tokens_seen": 10889904, + "step": 16665 + }, + { + "epoch": 8.736897274633124, + "grad_norm": 2.001241683959961, + "learning_rate": 2.3928177700252798e-06, + "loss": 0.2701, + "num_input_tokens_seen": 10893072, + "step": 16670 + }, + { + "epoch": 8.7395178197065, + "grad_norm": 2.590013027191162, + "learning_rate": 2.3830640777671583e-06, + "loss": 0.2631, + "num_input_tokens_seen": 10897392, + "step": 16675 + }, + { + "epoch": 8.742138364779874, + "grad_norm": 6.773374557495117, + "learning_rate": 2.3733293103279153e-06, + "loss": 0.2706, + "num_input_tokens_seen": 10901008, + "step": 16680 + }, + { + "epoch": 8.74475890985325, + "grad_norm": 1.996108055114746, + "learning_rate": 2.3636134758531604e-06, + "loss": 0.2411, + "num_input_tokens_seen": 10904848, + "step": 16685 + }, + { + "epoch": 8.747379454926625, + "grad_norm": 2.6384429931640625, + "learning_rate": 2.3539165824726565e-06, + "loss": 0.2201, + "num_input_tokens_seen": 10908528, + "step": 16690 + }, + { + "epoch": 8.75, + "grad_norm": 1.6181960105895996, + "learning_rate": 2.344238638300328e-06, + "loss": 0.2785, + "num_input_tokens_seen": 10912080, + "step": 16695 + }, + { + "epoch": 8.752620545073375, + "grad_norm": 2.1119303703308105, + "learning_rate": 2.334579651434235e-06, + "loss": 0.2837, + "num_input_tokens_seen": 10916528, + "step": 16700 + }, + { + "epoch": 8.75524109014675, + "grad_norm": 2.7142622470855713, + "learning_rate": 2.3249396299565683e-06, + "loss": 0.1919, + "num_input_tokens_seen": 10920688, + "step": 16705 + }, + { + "epoch": 8.757861635220126, + "grad_norm": 2.444998264312744, + "learning_rate": 2.3153185819336705e-06, + "loss": 0.2429, + "num_input_tokens_seen": 10923600, + "step": 16710 + }, + { + "epoch": 8.7604821802935, + "grad_norm": 1.968190312385559, + "learning_rate": 2.3057165154159873e-06, + "loss": 0.1835, + "num_input_tokens_seen": 10926544, + "step": 16715 + }, + { + "epoch": 8.763102725366876, + "grad_norm": 2.100193977355957, + "learning_rate": 2.296133438438086e-06, + "loss": 0.1913, + "num_input_tokens_seen": 10929392, + "step": 16720 + }, + { + "epoch": 8.765723270440251, + "grad_norm": 1.7482202053070068, + "learning_rate": 2.2865693590186616e-06, + "loss": 0.194, + "num_input_tokens_seen": 10933456, + "step": 16725 + }, + { + "epoch": 8.768343815513626, + "grad_norm": 2.333944797515869, + "learning_rate": 2.2770242851604813e-06, + "loss": 0.2955, + "num_input_tokens_seen": 10936336, + "step": 16730 + }, + { + "epoch": 8.770964360587001, + "grad_norm": 2.7593157291412354, + "learning_rate": 2.2674982248504395e-06, + "loss": 0.2308, + "num_input_tokens_seen": 10939696, + "step": 16735 + }, + { + "epoch": 8.773584905660378, + "grad_norm": 2.281930923461914, + "learning_rate": 2.257991186059502e-06, + "loss": 0.15, + "num_input_tokens_seen": 10942192, + "step": 16740 + }, + { + "epoch": 8.776205450733752, + "grad_norm": 2.2892673015594482, + "learning_rate": 2.248503176742725e-06, + "loss": 0.1341, + "num_input_tokens_seen": 10944624, + "step": 16745 + }, + { + "epoch": 8.778825995807129, + "grad_norm": 1.6543183326721191, + "learning_rate": 2.2390342048392467e-06, + "loss": 0.1738, + "num_input_tokens_seen": 10947952, + "step": 16750 + }, + { + "epoch": 8.781446540880504, + "grad_norm": 2.490699291229248, + "learning_rate": 2.229584278272265e-06, + "loss": 0.2022, + "num_input_tokens_seen": 10951440, + "step": 16755 + }, + { + "epoch": 8.784067085953879, + "grad_norm": 2.335862636566162, + "learning_rate": 2.2201534049490436e-06, + "loss": 0.1404, + "num_input_tokens_seen": 10954224, + "step": 16760 + }, + { + "epoch": 8.786687631027254, + "grad_norm": 2.854006767272949, + "learning_rate": 2.2107415927609176e-06, + "loss": 0.1948, + "num_input_tokens_seen": 10957392, + "step": 16765 + }, + { + "epoch": 8.78930817610063, + "grad_norm": 1.3944553136825562, + "learning_rate": 2.2013488495832542e-06, + "loss": 0.2789, + "num_input_tokens_seen": 10961136, + "step": 16770 + }, + { + "epoch": 8.791928721174004, + "grad_norm": 1.8178247213363647, + "learning_rate": 2.1919751832754714e-06, + "loss": 0.201, + "num_input_tokens_seen": 10964272, + "step": 16775 + }, + { + "epoch": 8.79454926624738, + "grad_norm": 1.9479976892471313, + "learning_rate": 2.182620601681029e-06, + "loss": 0.1305, + "num_input_tokens_seen": 10967344, + "step": 16780 + }, + { + "epoch": 8.797169811320755, + "grad_norm": 2.8159165382385254, + "learning_rate": 2.1732851126274047e-06, + "loss": 0.2082, + "num_input_tokens_seen": 10970800, + "step": 16785 + }, + { + "epoch": 8.79979035639413, + "grad_norm": 1.312361240386963, + "learning_rate": 2.1639687239261214e-06, + "loss": 0.1872, + "num_input_tokens_seen": 10974544, + "step": 16790 + }, + { + "epoch": 8.802410901467505, + "grad_norm": 50.94136047363281, + "learning_rate": 2.1546714433726993e-06, + "loss": 0.2327, + "num_input_tokens_seen": 10976784, + "step": 16795 + }, + { + "epoch": 8.80503144654088, + "grad_norm": 2.0000345706939697, + "learning_rate": 2.1453932787466767e-06, + "loss": 0.2663, + "num_input_tokens_seen": 10980400, + "step": 16800 + }, + { + "epoch": 8.807651991614255, + "grad_norm": 1.8258858919143677, + "learning_rate": 2.1361342378116072e-06, + "loss": 0.1614, + "num_input_tokens_seen": 10986160, + "step": 16805 + }, + { + "epoch": 8.81027253668763, + "grad_norm": 1.915705919265747, + "learning_rate": 2.1268943283150294e-06, + "loss": 0.2199, + "num_input_tokens_seen": 10989584, + "step": 16810 + }, + { + "epoch": 8.812893081761006, + "grad_norm": 3.0542373657226562, + "learning_rate": 2.1176735579884753e-06, + "loss": 0.2702, + "num_input_tokens_seen": 10992464, + "step": 16815 + }, + { + "epoch": 8.815513626834381, + "grad_norm": 2.021627187728882, + "learning_rate": 2.1084719345474597e-06, + "loss": 0.2637, + "num_input_tokens_seen": 10995472, + "step": 16820 + }, + { + "epoch": 8.818134171907756, + "grad_norm": 2.9926974773406982, + "learning_rate": 2.0992894656914895e-06, + "loss": 0.1911, + "num_input_tokens_seen": 10999088, + "step": 16825 + }, + { + "epoch": 8.820754716981131, + "grad_norm": 2.0948047637939453, + "learning_rate": 2.0901261591040333e-06, + "loss": 0.2172, + "num_input_tokens_seen": 11002000, + "step": 16830 + }, + { + "epoch": 8.823375262054507, + "grad_norm": 3.619046211242676, + "learning_rate": 2.0809820224525213e-06, + "loss": 0.2884, + "num_input_tokens_seen": 11005360, + "step": 16835 + }, + { + "epoch": 8.825995807127882, + "grad_norm": 1.3004310131072998, + "learning_rate": 2.0718570633883576e-06, + "loss": 0.1858, + "num_input_tokens_seen": 11008272, + "step": 16840 + }, + { + "epoch": 8.828616352201259, + "grad_norm": 2.174391746520996, + "learning_rate": 2.0627512895468883e-06, + "loss": 0.2105, + "num_input_tokens_seen": 11011760, + "step": 16845 + }, + { + "epoch": 8.831236897274634, + "grad_norm": 2.261776924133301, + "learning_rate": 2.0536647085474037e-06, + "loss": 0.2381, + "num_input_tokens_seen": 11014736, + "step": 16850 + }, + { + "epoch": 8.833857442348009, + "grad_norm": 1.4987136125564575, + "learning_rate": 2.044597327993153e-06, + "loss": 0.2126, + "num_input_tokens_seen": 11018128, + "step": 16855 + }, + { + "epoch": 8.836477987421384, + "grad_norm": 1.2934978008270264, + "learning_rate": 2.035549155471289e-06, + "loss": 0.1508, + "num_input_tokens_seen": 11022032, + "step": 16860 + }, + { + "epoch": 8.83909853249476, + "grad_norm": 3.0309250354766846, + "learning_rate": 2.0265201985529226e-06, + "loss": 0.1926, + "num_input_tokens_seen": 11024944, + "step": 16865 + }, + { + "epoch": 8.841719077568134, + "grad_norm": 2.088501453399658, + "learning_rate": 2.0175104647930655e-06, + "loss": 0.1836, + "num_input_tokens_seen": 11028272, + "step": 16870 + }, + { + "epoch": 8.84433962264151, + "grad_norm": 4.697885036468506, + "learning_rate": 2.008519961730651e-06, + "loss": 0.2175, + "num_input_tokens_seen": 11031088, + "step": 16875 + }, + { + "epoch": 8.846960167714885, + "grad_norm": 4.07885217666626, + "learning_rate": 1.9995486968885284e-06, + "loss": 0.2602, + "num_input_tokens_seen": 11033456, + "step": 16880 + }, + { + "epoch": 8.84958071278826, + "grad_norm": 1.8325204849243164, + "learning_rate": 1.990596677773435e-06, + "loss": 0.2631, + "num_input_tokens_seen": 11036976, + "step": 16885 + }, + { + "epoch": 8.852201257861635, + "grad_norm": 1.766755223274231, + "learning_rate": 1.981663911876014e-06, + "loss": 0.2348, + "num_input_tokens_seen": 11039472, + "step": 16890 + }, + { + "epoch": 8.85482180293501, + "grad_norm": 2.123033046722412, + "learning_rate": 1.972750406670801e-06, + "loss": 0.2363, + "num_input_tokens_seen": 11042064, + "step": 16895 + }, + { + "epoch": 8.857442348008385, + "grad_norm": 1.7174296379089355, + "learning_rate": 1.9638561696161962e-06, + "loss": 0.2432, + "num_input_tokens_seen": 11045232, + "step": 16900 + }, + { + "epoch": 8.86006289308176, + "grad_norm": 2.1640257835388184, + "learning_rate": 1.954981208154502e-06, + "loss": 0.2107, + "num_input_tokens_seen": 11048784, + "step": 16905 + }, + { + "epoch": 8.862683438155136, + "grad_norm": 1.4358725547790527, + "learning_rate": 1.9461255297118868e-06, + "loss": 0.2489, + "num_input_tokens_seen": 11052528, + "step": 16910 + }, + { + "epoch": 8.865303983228511, + "grad_norm": 3.641735315322876, + "learning_rate": 1.937289141698359e-06, + "loss": 0.3, + "num_input_tokens_seen": 11055824, + "step": 16915 + }, + { + "epoch": 8.867924528301886, + "grad_norm": 2.014132022857666, + "learning_rate": 1.928472051507821e-06, + "loss": 0.1533, + "num_input_tokens_seen": 11059504, + "step": 16920 + }, + { + "epoch": 8.870545073375261, + "grad_norm": 1.4326820373535156, + "learning_rate": 1.919674266518004e-06, + "loss": 0.1658, + "num_input_tokens_seen": 11062544, + "step": 16925 + }, + { + "epoch": 8.873165618448636, + "grad_norm": 1.196294903755188, + "learning_rate": 1.910895794090492e-06, + "loss": 0.2187, + "num_input_tokens_seen": 11065584, + "step": 16930 + }, + { + "epoch": 8.875786163522012, + "grad_norm": 1.83164381980896, + "learning_rate": 1.902136641570712e-06, + "loss": 0.1502, + "num_input_tokens_seen": 11068336, + "step": 16935 + }, + { + "epoch": 8.878406708595389, + "grad_norm": 2.3084049224853516, + "learning_rate": 1.8933968162879235e-06, + "loss": 0.1988, + "num_input_tokens_seen": 11071184, + "step": 16940 + }, + { + "epoch": 8.881027253668764, + "grad_norm": 2.0369110107421875, + "learning_rate": 1.8846763255552097e-06, + "loss": 0.1067, + "num_input_tokens_seen": 11074320, + "step": 16945 + }, + { + "epoch": 8.883647798742139, + "grad_norm": 1.9209212064743042, + "learning_rate": 1.8759751766694811e-06, + "loss": 0.1709, + "num_input_tokens_seen": 11077168, + "step": 16950 + }, + { + "epoch": 8.886268343815514, + "grad_norm": 3.2554659843444824, + "learning_rate": 1.8672933769114636e-06, + "loss": 0.2012, + "num_input_tokens_seen": 11080080, + "step": 16955 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 1.8061765432357788, + "learning_rate": 1.8586309335456908e-06, + "loss": 0.1648, + "num_input_tokens_seen": 11083408, + "step": 16960 + }, + { + "epoch": 8.891509433962264, + "grad_norm": 2.3280205726623535, + "learning_rate": 1.8499878538204951e-06, + "loss": 0.256, + "num_input_tokens_seen": 11087184, + "step": 16965 + }, + { + "epoch": 8.89412997903564, + "grad_norm": 2.053572654724121, + "learning_rate": 1.8413641449680081e-06, + "loss": 0.1358, + "num_input_tokens_seen": 11090864, + "step": 16970 + }, + { + "epoch": 8.896750524109015, + "grad_norm": 1.5772241353988647, + "learning_rate": 1.8327598142041658e-06, + "loss": 0.151, + "num_input_tokens_seen": 11094288, + "step": 16975 + }, + { + "epoch": 8.89937106918239, + "grad_norm": 1.5883121490478516, + "learning_rate": 1.824174868728673e-06, + "loss": 0.1296, + "num_input_tokens_seen": 11096656, + "step": 16980 + }, + { + "epoch": 8.901991614255765, + "grad_norm": 1.4781430959701538, + "learning_rate": 1.815609315725017e-06, + "loss": 0.2047, + "num_input_tokens_seen": 11100144, + "step": 16985 + }, + { + "epoch": 8.90461215932914, + "grad_norm": 1.2889639139175415, + "learning_rate": 1.80706316236047e-06, + "loss": 0.2016, + "num_input_tokens_seen": 11103440, + "step": 16990 + }, + { + "epoch": 8.907232704402515, + "grad_norm": 2.6630165576934814, + "learning_rate": 1.7985364157860562e-06, + "loss": 0.182, + "num_input_tokens_seen": 11107056, + "step": 16995 + }, + { + "epoch": 8.90985324947589, + "grad_norm": 1.9577651023864746, + "learning_rate": 1.7900290831365713e-06, + "loss": 0.1837, + "num_input_tokens_seen": 11110640, + "step": 17000 + }, + { + "epoch": 8.912473794549266, + "grad_norm": 0.3872951567173004, + "learning_rate": 1.781541171530554e-06, + "loss": 0.1504, + "num_input_tokens_seen": 11117008, + "step": 17005 + }, + { + "epoch": 8.915094339622641, + "grad_norm": 2.0534048080444336, + "learning_rate": 1.7730726880703125e-06, + "loss": 0.1582, + "num_input_tokens_seen": 11120176, + "step": 17010 + }, + { + "epoch": 8.917714884696016, + "grad_norm": 2.4833438396453857, + "learning_rate": 1.7646236398418835e-06, + "loss": 0.2069, + "num_input_tokens_seen": 11123504, + "step": 17015 + }, + { + "epoch": 8.920335429769391, + "grad_norm": 2.7762062549591064, + "learning_rate": 1.7561940339150373e-06, + "loss": 0.2937, + "num_input_tokens_seen": 11126000, + "step": 17020 + }, + { + "epoch": 8.922955974842766, + "grad_norm": 1.1343390941619873, + "learning_rate": 1.7477838773432926e-06, + "loss": 0.1786, + "num_input_tokens_seen": 11129584, + "step": 17025 + }, + { + "epoch": 8.925576519916142, + "grad_norm": 2.5990982055664062, + "learning_rate": 1.7393931771638839e-06, + "loss": 0.2408, + "num_input_tokens_seen": 11132240, + "step": 17030 + }, + { + "epoch": 8.928197064989519, + "grad_norm": 2.31701397895813, + "learning_rate": 1.7310219403977563e-06, + "loss": 0.2077, + "num_input_tokens_seen": 11135120, + "step": 17035 + }, + { + "epoch": 8.930817610062894, + "grad_norm": 1.9480128288269043, + "learning_rate": 1.7226701740495926e-06, + "loss": 0.1225, + "num_input_tokens_seen": 11137968, + "step": 17040 + }, + { + "epoch": 8.933438155136269, + "grad_norm": 1.6801413297653198, + "learning_rate": 1.714337885107753e-06, + "loss": 0.1799, + "num_input_tokens_seen": 11140368, + "step": 17045 + }, + { + "epoch": 8.936058700209644, + "grad_norm": 2.736799955368042, + "learning_rate": 1.7060250805443296e-06, + "loss": 0.255, + "num_input_tokens_seen": 11145168, + "step": 17050 + }, + { + "epoch": 8.93867924528302, + "grad_norm": 2.165738105773926, + "learning_rate": 1.6977317673150916e-06, + "loss": 0.2406, + "num_input_tokens_seen": 11149008, + "step": 17055 + }, + { + "epoch": 8.941299790356394, + "grad_norm": 2.5493388175964355, + "learning_rate": 1.6894579523595022e-06, + "loss": 0.1988, + "num_input_tokens_seen": 11156336, + "step": 17060 + }, + { + "epoch": 8.94392033542977, + "grad_norm": 2.6547956466674805, + "learning_rate": 1.6812036426007176e-06, + "loss": 0.2747, + "num_input_tokens_seen": 11159152, + "step": 17065 + }, + { + "epoch": 8.946540880503145, + "grad_norm": 1.8093276023864746, + "learning_rate": 1.6729688449455689e-06, + "loss": 0.2063, + "num_input_tokens_seen": 11162096, + "step": 17070 + }, + { + "epoch": 8.94916142557652, + "grad_norm": 4.253817081451416, + "learning_rate": 1.6647535662845466e-06, + "loss": 0.1796, + "num_input_tokens_seen": 11165200, + "step": 17075 + }, + { + "epoch": 8.951781970649895, + "grad_norm": 3.3027186393737793, + "learning_rate": 1.656557813491838e-06, + "loss": 0.2749, + "num_input_tokens_seen": 11168368, + "step": 17080 + }, + { + "epoch": 8.95440251572327, + "grad_norm": 1.6466386318206787, + "learning_rate": 1.6483815934252578e-06, + "loss": 0.2703, + "num_input_tokens_seen": 11172048, + "step": 17085 + }, + { + "epoch": 8.957023060796645, + "grad_norm": 1.6553006172180176, + "learning_rate": 1.6402249129263025e-06, + "loss": 0.2129, + "num_input_tokens_seen": 11175600, + "step": 17090 + }, + { + "epoch": 8.95964360587002, + "grad_norm": 2.661252498626709, + "learning_rate": 1.6320877788201127e-06, + "loss": 0.193, + "num_input_tokens_seen": 11178576, + "step": 17095 + }, + { + "epoch": 8.962264150943396, + "grad_norm": 1.813553810119629, + "learning_rate": 1.6239701979154614e-06, + "loss": 0.2514, + "num_input_tokens_seen": 11181168, + "step": 17100 + }, + { + "epoch": 8.964884696016771, + "grad_norm": 2.1401994228363037, + "learning_rate": 1.6158721770047762e-06, + "loss": 0.16, + "num_input_tokens_seen": 11184400, + "step": 17105 + }, + { + "epoch": 8.967505241090146, + "grad_norm": 1.8230998516082764, + "learning_rate": 1.6077937228641093e-06, + "loss": 0.272, + "num_input_tokens_seen": 11187536, + "step": 17110 + }, + { + "epoch": 8.970125786163521, + "grad_norm": 2.2341408729553223, + "learning_rate": 1.5997348422531395e-06, + "loss": 0.223, + "num_input_tokens_seen": 11191568, + "step": 17115 + }, + { + "epoch": 8.972746331236896, + "grad_norm": 2.5668587684631348, + "learning_rate": 1.5916955419151725e-06, + "loss": 0.1713, + "num_input_tokens_seen": 11195024, + "step": 17120 + }, + { + "epoch": 8.975366876310272, + "grad_norm": 3.131383180618286, + "learning_rate": 1.5836758285771303e-06, + "loss": 0.2585, + "num_input_tokens_seen": 11198480, + "step": 17125 + }, + { + "epoch": 8.977987421383649, + "grad_norm": 1.5005176067352295, + "learning_rate": 1.5756757089495366e-06, + "loss": 0.2272, + "num_input_tokens_seen": 11201712, + "step": 17130 + }, + { + "epoch": 8.980607966457024, + "grad_norm": 1.3393701314926147, + "learning_rate": 1.5676951897265313e-06, + "loss": 0.164, + "num_input_tokens_seen": 11204720, + "step": 17135 + }, + { + "epoch": 8.983228511530399, + "grad_norm": 2.5625317096710205, + "learning_rate": 1.5597342775858476e-06, + "loss": 0.241, + "num_input_tokens_seen": 11207440, + "step": 17140 + }, + { + "epoch": 8.985849056603774, + "grad_norm": 1.8057959079742432, + "learning_rate": 1.5517929791888125e-06, + "loss": 0.2196, + "num_input_tokens_seen": 11210384, + "step": 17145 + }, + { + "epoch": 8.98846960167715, + "grad_norm": 3.613022804260254, + "learning_rate": 1.5438713011803385e-06, + "loss": 0.2042, + "num_input_tokens_seen": 11213392, + "step": 17150 + }, + { + "epoch": 8.991090146750524, + "grad_norm": 2.2250709533691406, + "learning_rate": 1.535969250188926e-06, + "loss": 0.2406, + "num_input_tokens_seen": 11216016, + "step": 17155 + }, + { + "epoch": 8.9937106918239, + "grad_norm": 1.7852816581726074, + "learning_rate": 1.5280868328266528e-06, + "loss": 0.2404, + "num_input_tokens_seen": 11218160, + "step": 17160 + }, + { + "epoch": 8.996331236897275, + "grad_norm": 1.8422508239746094, + "learning_rate": 1.520224055689165e-06, + "loss": 0.3821, + "num_input_tokens_seen": 11222000, + "step": 17165 + }, + { + "epoch": 8.99895178197065, + "grad_norm": 1.7200772762298584, + "learning_rate": 1.5123809253556692e-06, + "loss": 0.2188, + "num_input_tokens_seen": 11224816, + "step": 17170 + }, + { + "epoch": 9.0, + "eval_loss": 0.6604991555213928, + "eval_runtime": 15.9293, + "eval_samples_per_second": 53.235, + "eval_steps_per_second": 13.309, + "num_input_tokens_seen": 11225416, + "step": 17172 + }, + { + "epoch": 9.001572327044025, + "grad_norm": 2.399247884750366, + "learning_rate": 1.5045574483889463e-06, + "loss": 0.247, + "num_input_tokens_seen": 11227112, + "step": 17175 + }, + { + "epoch": 9.0041928721174, + "grad_norm": 1.392547607421875, + "learning_rate": 1.4967536313353237e-06, + "loss": 0.1861, + "num_input_tokens_seen": 11229544, + "step": 17180 + }, + { + "epoch": 9.006813417190775, + "grad_norm": 1.3482989072799683, + "learning_rate": 1.4889694807246779e-06, + "loss": 0.2323, + "num_input_tokens_seen": 11233640, + "step": 17185 + }, + { + "epoch": 9.00943396226415, + "grad_norm": 2.3905816078186035, + "learning_rate": 1.481205003070424e-06, + "loss": 0.1362, + "num_input_tokens_seen": 11236552, + "step": 17190 + }, + { + "epoch": 9.012054507337526, + "grad_norm": 1.9459887742996216, + "learning_rate": 1.4734602048695312e-06, + "loss": 0.1822, + "num_input_tokens_seen": 11240008, + "step": 17195 + }, + { + "epoch": 9.014675052410901, + "grad_norm": 1.823833703994751, + "learning_rate": 1.465735092602491e-06, + "loss": 0.2346, + "num_input_tokens_seen": 11243176, + "step": 17200 + }, + { + "epoch": 9.017295597484276, + "grad_norm": 1.8316423892974854, + "learning_rate": 1.4580296727333187e-06, + "loss": 0.2101, + "num_input_tokens_seen": 11246568, + "step": 17205 + }, + { + "epoch": 9.019916142557651, + "grad_norm": 0.77811598777771, + "learning_rate": 1.450343951709568e-06, + "loss": 0.1574, + "num_input_tokens_seen": 11251080, + "step": 17210 + }, + { + "epoch": 9.022536687631026, + "grad_norm": 3.0723485946655273, + "learning_rate": 1.4426779359622916e-06, + "loss": 0.2614, + "num_input_tokens_seen": 11253256, + "step": 17215 + }, + { + "epoch": 9.025157232704403, + "grad_norm": 1.5802383422851562, + "learning_rate": 1.4350316319060585e-06, + "loss": 0.203, + "num_input_tokens_seen": 11256936, + "step": 17220 + }, + { + "epoch": 9.027777777777779, + "grad_norm": 1.7289087772369385, + "learning_rate": 1.4274050459389594e-06, + "loss": 0.1817, + "num_input_tokens_seen": 11259784, + "step": 17225 + }, + { + "epoch": 9.030398322851154, + "grad_norm": 1.8655149936676025, + "learning_rate": 1.4197981844425583e-06, + "loss": 0.2137, + "num_input_tokens_seen": 11262728, + "step": 17230 + }, + { + "epoch": 9.033018867924529, + "grad_norm": 1.7967422008514404, + "learning_rate": 1.4122110537819365e-06, + "loss": 0.1827, + "num_input_tokens_seen": 11265640, + "step": 17235 + }, + { + "epoch": 9.035639412997904, + "grad_norm": 1.1767266988754272, + "learning_rate": 1.4046436603056601e-06, + "loss": 0.1757, + "num_input_tokens_seen": 11270344, + "step": 17240 + }, + { + "epoch": 9.03825995807128, + "grad_norm": 4.176662445068359, + "learning_rate": 1.397096010345772e-06, + "loss": 0.2245, + "num_input_tokens_seen": 11272584, + "step": 17245 + }, + { + "epoch": 9.040880503144654, + "grad_norm": 2.021397352218628, + "learning_rate": 1.3895681102178094e-06, + "loss": 0.1936, + "num_input_tokens_seen": 11276872, + "step": 17250 + }, + { + "epoch": 9.04350104821803, + "grad_norm": 2.011284589767456, + "learning_rate": 1.3820599662207695e-06, + "loss": 0.2209, + "num_input_tokens_seen": 11279688, + "step": 17255 + }, + { + "epoch": 9.046121593291405, + "grad_norm": 1.9247568845748901, + "learning_rate": 1.3745715846371244e-06, + "loss": 0.1744, + "num_input_tokens_seen": 11282888, + "step": 17260 + }, + { + "epoch": 9.04874213836478, + "grad_norm": 2.1463828086853027, + "learning_rate": 1.3671029717328142e-06, + "loss": 0.3085, + "num_input_tokens_seen": 11285928, + "step": 17265 + }, + { + "epoch": 9.051362683438155, + "grad_norm": 2.423048973083496, + "learning_rate": 1.3596541337572265e-06, + "loss": 0.1979, + "num_input_tokens_seen": 11289288, + "step": 17270 + }, + { + "epoch": 9.05398322851153, + "grad_norm": 1.8313210010528564, + "learning_rate": 1.3522250769432115e-06, + "loss": 0.2229, + "num_input_tokens_seen": 11295976, + "step": 17275 + }, + { + "epoch": 9.056603773584905, + "grad_norm": 1.4615654945373535, + "learning_rate": 1.3448158075070687e-06, + "loss": 0.2252, + "num_input_tokens_seen": 11299816, + "step": 17280 + }, + { + "epoch": 9.05922431865828, + "grad_norm": 1.753225326538086, + "learning_rate": 1.337426331648528e-06, + "loss": 0.2149, + "num_input_tokens_seen": 11303176, + "step": 17285 + }, + { + "epoch": 9.061844863731656, + "grad_norm": 1.7017226219177246, + "learning_rate": 1.3300566555507709e-06, + "loss": 0.1377, + "num_input_tokens_seen": 11306376, + "step": 17290 + }, + { + "epoch": 9.064465408805031, + "grad_norm": 1.3255431652069092, + "learning_rate": 1.3227067853804065e-06, + "loss": 0.1683, + "num_input_tokens_seen": 11310696, + "step": 17295 + }, + { + "epoch": 9.067085953878406, + "grad_norm": 3.1225833892822266, + "learning_rate": 1.315376727287465e-06, + "loss": 0.1812, + "num_input_tokens_seen": 11314888, + "step": 17300 + }, + { + "epoch": 9.069706498951781, + "grad_norm": 2.565791130065918, + "learning_rate": 1.3080664874054127e-06, + "loss": 0.1949, + "num_input_tokens_seen": 11317576, + "step": 17305 + }, + { + "epoch": 9.072327044025156, + "grad_norm": 2.390211582183838, + "learning_rate": 1.3007760718511176e-06, + "loss": 0.2285, + "num_input_tokens_seen": 11320840, + "step": 17310 + }, + { + "epoch": 9.074947589098532, + "grad_norm": 1.8497686386108398, + "learning_rate": 1.2935054867248692e-06, + "loss": 0.1425, + "num_input_tokens_seen": 11324040, + "step": 17315 + }, + { + "epoch": 9.077568134171909, + "grad_norm": 2.541390895843506, + "learning_rate": 1.2862547381103567e-06, + "loss": 0.2331, + "num_input_tokens_seen": 11327784, + "step": 17320 + }, + { + "epoch": 9.080188679245284, + "grad_norm": 1.230920672416687, + "learning_rate": 1.2790238320746827e-06, + "loss": 0.175, + "num_input_tokens_seen": 11331304, + "step": 17325 + }, + { + "epoch": 9.082809224318659, + "grad_norm": 1.7825660705566406, + "learning_rate": 1.271812774668335e-06, + "loss": 0.265, + "num_input_tokens_seen": 11334312, + "step": 17330 + }, + { + "epoch": 9.085429769392034, + "grad_norm": 1.139296531677246, + "learning_rate": 1.2646215719251952e-06, + "loss": 0.2298, + "num_input_tokens_seen": 11339144, + "step": 17335 + }, + { + "epoch": 9.08805031446541, + "grad_norm": 1.3340473175048828, + "learning_rate": 1.2574502298625334e-06, + "loss": 0.1504, + "num_input_tokens_seen": 11342664, + "step": 17340 + }, + { + "epoch": 9.090670859538784, + "grad_norm": 2.77545428276062, + "learning_rate": 1.250298754481008e-06, + "loss": 0.1871, + "num_input_tokens_seen": 11345768, + "step": 17345 + }, + { + "epoch": 9.09329140461216, + "grad_norm": 1.594847559928894, + "learning_rate": 1.2431671517646403e-06, + "loss": 0.2855, + "num_input_tokens_seen": 11349448, + "step": 17350 + }, + { + "epoch": 9.095911949685535, + "grad_norm": 1.9039560556411743, + "learning_rate": 1.2360554276808295e-06, + "loss": 0.1549, + "num_input_tokens_seen": 11352744, + "step": 17355 + }, + { + "epoch": 9.09853249475891, + "grad_norm": 5.995811462402344, + "learning_rate": 1.228963588180343e-06, + "loss": 0.1963, + "num_input_tokens_seen": 11354920, + "step": 17360 + }, + { + "epoch": 9.101153039832285, + "grad_norm": 2.124880313873291, + "learning_rate": 1.2218916391973118e-06, + "loss": 0.1791, + "num_input_tokens_seen": 11359240, + "step": 17365 + }, + { + "epoch": 9.10377358490566, + "grad_norm": 2.038684606552124, + "learning_rate": 1.2148395866492135e-06, + "loss": 0.2646, + "num_input_tokens_seen": 11362056, + "step": 17370 + }, + { + "epoch": 9.106394129979035, + "grad_norm": 1.9594770669937134, + "learning_rate": 1.2078074364368862e-06, + "loss": 0.1106, + "num_input_tokens_seen": 11364648, + "step": 17375 + }, + { + "epoch": 9.10901467505241, + "grad_norm": 1.6617701053619385, + "learning_rate": 1.2007951944445122e-06, + "loss": 0.1846, + "num_input_tokens_seen": 11368168, + "step": 17380 + }, + { + "epoch": 9.111635220125786, + "grad_norm": 1.411332368850708, + "learning_rate": 1.1938028665396173e-06, + "loss": 0.196, + "num_input_tokens_seen": 11370856, + "step": 17385 + }, + { + "epoch": 9.114255765199161, + "grad_norm": 2.3460745811462402, + "learning_rate": 1.1868304585730571e-06, + "loss": 0.2205, + "num_input_tokens_seen": 11373768, + "step": 17390 + }, + { + "epoch": 9.116876310272536, + "grad_norm": 35.72917556762695, + "learning_rate": 1.1798779763790346e-06, + "loss": 0.2296, + "num_input_tokens_seen": 11377192, + "step": 17395 + }, + { + "epoch": 9.119496855345911, + "grad_norm": 1.5463839769363403, + "learning_rate": 1.1729454257750544e-06, + "loss": 0.1757, + "num_input_tokens_seen": 11379912, + "step": 17400 + }, + { + "epoch": 9.122117400419286, + "grad_norm": 1.937469720840454, + "learning_rate": 1.1660328125619652e-06, + "loss": 0.2089, + "num_input_tokens_seen": 11383496, + "step": 17405 + }, + { + "epoch": 9.124737945492662, + "grad_norm": 1.9038697481155396, + "learning_rate": 1.1591401425239318e-06, + "loss": 0.2251, + "num_input_tokens_seen": 11386504, + "step": 17410 + }, + { + "epoch": 9.127358490566039, + "grad_norm": 4.568230628967285, + "learning_rate": 1.1522674214284158e-06, + "loss": 0.1412, + "num_input_tokens_seen": 11388776, + "step": 17415 + }, + { + "epoch": 9.129979035639414, + "grad_norm": 2.0969624519348145, + "learning_rate": 1.145414655026203e-06, + "loss": 0.2454, + "num_input_tokens_seen": 11391976, + "step": 17420 + }, + { + "epoch": 9.132599580712789, + "grad_norm": 1.825900673866272, + "learning_rate": 1.1385818490513733e-06, + "loss": 0.2052, + "num_input_tokens_seen": 11394856, + "step": 17425 + }, + { + "epoch": 9.135220125786164, + "grad_norm": 2.9404866695404053, + "learning_rate": 1.1317690092213007e-06, + "loss": 0.2314, + "num_input_tokens_seen": 11397896, + "step": 17430 + }, + { + "epoch": 9.13784067085954, + "grad_norm": 2.128685235977173, + "learning_rate": 1.124976141236675e-06, + "loss": 0.2037, + "num_input_tokens_seen": 11401096, + "step": 17435 + }, + { + "epoch": 9.140461215932914, + "grad_norm": 1.9686188697814941, + "learning_rate": 1.1182032507814354e-06, + "loss": 0.2057, + "num_input_tokens_seen": 11404616, + "step": 17440 + }, + { + "epoch": 9.14308176100629, + "grad_norm": 1.7895787954330444, + "learning_rate": 1.1114503435228434e-06, + "loss": 0.1735, + "num_input_tokens_seen": 11407848, + "step": 17445 + }, + { + "epoch": 9.145702306079665, + "grad_norm": 2.168607711791992, + "learning_rate": 1.1047174251114234e-06, + "loss": 0.1843, + "num_input_tokens_seen": 11410760, + "step": 17450 + }, + { + "epoch": 9.14832285115304, + "grad_norm": 2.3520054817199707, + "learning_rate": 1.0980045011809604e-06, + "loss": 0.2563, + "num_input_tokens_seen": 11413864, + "step": 17455 + }, + { + "epoch": 9.150943396226415, + "grad_norm": 1.72475266456604, + "learning_rate": 1.0913115773485388e-06, + "loss": 0.1969, + "num_input_tokens_seen": 11416808, + "step": 17460 + }, + { + "epoch": 9.15356394129979, + "grad_norm": 1.9016457796096802, + "learning_rate": 1.084638659214482e-06, + "loss": 0.2888, + "num_input_tokens_seen": 11419944, + "step": 17465 + }, + { + "epoch": 9.156184486373165, + "grad_norm": 2.1487889289855957, + "learning_rate": 1.0779857523623815e-06, + "loss": 0.1907, + "num_input_tokens_seen": 11422728, + "step": 17470 + }, + { + "epoch": 9.15880503144654, + "grad_norm": 1.6789876222610474, + "learning_rate": 1.071352862359093e-06, + "loss": 0.2466, + "num_input_tokens_seen": 11430984, + "step": 17475 + }, + { + "epoch": 9.161425576519916, + "grad_norm": 2.209933280944824, + "learning_rate": 1.0647399947547127e-06, + "loss": 0.1729, + "num_input_tokens_seen": 11433672, + "step": 17480 + }, + { + "epoch": 9.164046121593291, + "grad_norm": 2.0762133598327637, + "learning_rate": 1.0581471550825812e-06, + "loss": 0.2152, + "num_input_tokens_seen": 11436168, + "step": 17485 + }, + { + "epoch": 9.166666666666666, + "grad_norm": 1.1532233953475952, + "learning_rate": 1.0515743488592939e-06, + "loss": 0.16, + "num_input_tokens_seen": 11439528, + "step": 17490 + }, + { + "epoch": 9.169287211740041, + "grad_norm": 3.02337908744812, + "learning_rate": 1.0450215815846736e-06, + "loss": 0.2509, + "num_input_tokens_seen": 11442312, + "step": 17495 + }, + { + "epoch": 9.171907756813416, + "grad_norm": 2.1352269649505615, + "learning_rate": 1.0384888587417736e-06, + "loss": 0.187, + "num_input_tokens_seen": 11445416, + "step": 17500 + }, + { + "epoch": 9.174528301886792, + "grad_norm": 2.9661142826080322, + "learning_rate": 1.0319761857968735e-06, + "loss": 0.272, + "num_input_tokens_seen": 11448712, + "step": 17505 + }, + { + "epoch": 9.177148846960169, + "grad_norm": 2.028432846069336, + "learning_rate": 1.0254835681994895e-06, + "loss": 0.1971, + "num_input_tokens_seen": 11452104, + "step": 17510 + }, + { + "epoch": 9.179769392033544, + "grad_norm": 1.7359328269958496, + "learning_rate": 1.0190110113823426e-06, + "loss": 0.1401, + "num_input_tokens_seen": 11456136, + "step": 17515 + }, + { + "epoch": 9.182389937106919, + "grad_norm": 2.1144840717315674, + "learning_rate": 1.0125585207613752e-06, + "loss": 0.2005, + "num_input_tokens_seen": 11459272, + "step": 17520 + }, + { + "epoch": 9.185010482180294, + "grad_norm": 2.202828884124756, + "learning_rate": 1.0061261017357327e-06, + "loss": 0.1893, + "num_input_tokens_seen": 11463208, + "step": 17525 + }, + { + "epoch": 9.18763102725367, + "grad_norm": 3.1505775451660156, + "learning_rate": 9.997137596877732e-07, + "loss": 0.1934, + "num_input_tokens_seen": 11466472, + "step": 17530 + }, + { + "epoch": 9.190251572327044, + "grad_norm": 3.10463809967041, + "learning_rate": 9.93321499983052e-07, + "loss": 0.232, + "num_input_tokens_seen": 11469192, + "step": 17535 + }, + { + "epoch": 9.19287211740042, + "grad_norm": 2.4797191619873047, + "learning_rate": 9.869493279703158e-07, + "loss": 0.1902, + "num_input_tokens_seen": 11472232, + "step": 17540 + }, + { + "epoch": 9.195492662473795, + "grad_norm": 2.9527039527893066, + "learning_rate": 9.805972489815102e-07, + "loss": 0.249, + "num_input_tokens_seen": 11476040, + "step": 17545 + }, + { + "epoch": 9.19811320754717, + "grad_norm": 3.194350242614746, + "learning_rate": 9.742652683317643e-07, + "loss": 0.186, + "num_input_tokens_seen": 11478728, + "step": 17550 + }, + { + "epoch": 9.200733752620545, + "grad_norm": 2.296360731124878, + "learning_rate": 9.679533913193927e-07, + "loss": 0.2088, + "num_input_tokens_seen": 11481448, + "step": 17555 + }, + { + "epoch": 9.20335429769392, + "grad_norm": 2.2180166244506836, + "learning_rate": 9.61661623225879e-07, + "loss": 0.1713, + "num_input_tokens_seen": 11484424, + "step": 17560 + }, + { + "epoch": 9.205974842767295, + "grad_norm": 1.477434515953064, + "learning_rate": 9.553899693158951e-07, + "loss": 0.2067, + "num_input_tokens_seen": 11487560, + "step": 17565 + }, + { + "epoch": 9.20859538784067, + "grad_norm": 3.252474784851074, + "learning_rate": 9.491384348372684e-07, + "loss": 0.1901, + "num_input_tokens_seen": 11490632, + "step": 17570 + }, + { + "epoch": 9.211215932914046, + "grad_norm": 2.7938733100891113, + "learning_rate": 9.429070250210004e-07, + "loss": 0.1972, + "num_input_tokens_seen": 11493032, + "step": 17575 + }, + { + "epoch": 9.213836477987421, + "grad_norm": 2.5741519927978516, + "learning_rate": 9.366957450812535e-07, + "loss": 0.1555, + "num_input_tokens_seen": 11496200, + "step": 17580 + }, + { + "epoch": 9.216457023060796, + "grad_norm": 2.295999050140381, + "learning_rate": 9.305046002153345e-07, + "loss": 0.2612, + "num_input_tokens_seen": 11498696, + "step": 17585 + }, + { + "epoch": 9.219077568134171, + "grad_norm": 2.371086359024048, + "learning_rate": 9.243335956037186e-07, + "loss": 0.1617, + "num_input_tokens_seen": 11502312, + "step": 17590 + }, + { + "epoch": 9.221698113207546, + "grad_norm": 1.547320008277893, + "learning_rate": 9.181827364100171e-07, + "loss": 0.193, + "num_input_tokens_seen": 11505160, + "step": 17595 + }, + { + "epoch": 9.224318658280922, + "grad_norm": 1.5848252773284912, + "learning_rate": 9.120520277809852e-07, + "loss": 0.1312, + "num_input_tokens_seen": 11508456, + "step": 17600 + }, + { + "epoch": 9.226939203354299, + "grad_norm": 2.273747205734253, + "learning_rate": 9.059414748465278e-07, + "loss": 0.2016, + "num_input_tokens_seen": 11511816, + "step": 17605 + }, + { + "epoch": 9.229559748427674, + "grad_norm": 1.704892873764038, + "learning_rate": 8.998510827196715e-07, + "loss": 0.1919, + "num_input_tokens_seen": 11514088, + "step": 17610 + }, + { + "epoch": 9.232180293501049, + "grad_norm": 1.6342811584472656, + "learning_rate": 8.937808564965733e-07, + "loss": 0.1706, + "num_input_tokens_seen": 11517096, + "step": 17615 + }, + { + "epoch": 9.234800838574424, + "grad_norm": 1.8304009437561035, + "learning_rate": 8.877308012565339e-07, + "loss": 0.1835, + "num_input_tokens_seen": 11520168, + "step": 17620 + }, + { + "epoch": 9.2374213836478, + "grad_norm": 2.2229514122009277, + "learning_rate": 8.817009220619482e-07, + "loss": 0.3027, + "num_input_tokens_seen": 11523944, + "step": 17625 + }, + { + "epoch": 9.240041928721174, + "grad_norm": 1.1686148643493652, + "learning_rate": 8.756912239583554e-07, + "loss": 0.2684, + "num_input_tokens_seen": 11527720, + "step": 17630 + }, + { + "epoch": 9.24266247379455, + "grad_norm": 1.4338371753692627, + "learning_rate": 8.697017119743911e-07, + "loss": 0.2851, + "num_input_tokens_seen": 11531304, + "step": 17635 + }, + { + "epoch": 9.245283018867925, + "grad_norm": 1.8823795318603516, + "learning_rate": 8.637323911218048e-07, + "loss": 0.1969, + "num_input_tokens_seen": 11535176, + "step": 17640 + }, + { + "epoch": 9.2479035639413, + "grad_norm": 1.1685763597488403, + "learning_rate": 8.577832663954538e-07, + "loss": 0.2768, + "num_input_tokens_seen": 11539016, + "step": 17645 + }, + { + "epoch": 9.250524109014675, + "grad_norm": 1.5326272249221802, + "learning_rate": 8.51854342773295e-07, + "loss": 0.1551, + "num_input_tokens_seen": 11543752, + "step": 17650 + }, + { + "epoch": 9.25314465408805, + "grad_norm": 2.4748454093933105, + "learning_rate": 8.459456252163739e-07, + "loss": 0.23, + "num_input_tokens_seen": 11546664, + "step": 17655 + }, + { + "epoch": 9.255765199161425, + "grad_norm": 3.1630046367645264, + "learning_rate": 8.400571186688466e-07, + "loss": 0.1992, + "num_input_tokens_seen": 11549032, + "step": 17660 + }, + { + "epoch": 9.2583857442348, + "grad_norm": 2.4584858417510986, + "learning_rate": 8.341888280579386e-07, + "loss": 0.3095, + "num_input_tokens_seen": 11552328, + "step": 17665 + }, + { + "epoch": 9.261006289308176, + "grad_norm": 1.944806456565857, + "learning_rate": 8.283407582939689e-07, + "loss": 0.284, + "num_input_tokens_seen": 11555464, + "step": 17670 + }, + { + "epoch": 9.26362683438155, + "grad_norm": 2.624269723892212, + "learning_rate": 8.22512914270332e-07, + "loss": 0.1909, + "num_input_tokens_seen": 11558088, + "step": 17675 + }, + { + "epoch": 9.266247379454926, + "grad_norm": 2.3500468730926514, + "learning_rate": 8.167053008635101e-07, + "loss": 0.2348, + "num_input_tokens_seen": 11560872, + "step": 17680 + }, + { + "epoch": 9.268867924528301, + "grad_norm": 3.2007787227630615, + "learning_rate": 8.109179229330438e-07, + "loss": 0.1645, + "num_input_tokens_seen": 11564264, + "step": 17685 + }, + { + "epoch": 9.271488469601676, + "grad_norm": 1.7680706977844238, + "learning_rate": 8.051507853215401e-07, + "loss": 0.1806, + "num_input_tokens_seen": 11567656, + "step": 17690 + }, + { + "epoch": 9.274109014675052, + "grad_norm": 2.236210346221924, + "learning_rate": 7.994038928546887e-07, + "loss": 0.1994, + "num_input_tokens_seen": 11571176, + "step": 17695 + }, + { + "epoch": 9.276729559748428, + "grad_norm": 1.1644686460494995, + "learning_rate": 7.93677250341221e-07, + "loss": 0.262, + "num_input_tokens_seen": 11574216, + "step": 17700 + }, + { + "epoch": 9.279350104821804, + "grad_norm": 2.2718820571899414, + "learning_rate": 7.879708625729287e-07, + "loss": 0.168, + "num_input_tokens_seen": 11577608, + "step": 17705 + }, + { + "epoch": 9.281970649895179, + "grad_norm": 1.9329078197479248, + "learning_rate": 7.822847343246564e-07, + "loss": 0.2086, + "num_input_tokens_seen": 11581000, + "step": 17710 + }, + { + "epoch": 9.284591194968554, + "grad_norm": 1.6744704246520996, + "learning_rate": 7.766188703542954e-07, + "loss": 0.2397, + "num_input_tokens_seen": 11584840, + "step": 17715 + }, + { + "epoch": 9.28721174004193, + "grad_norm": 2.065079689025879, + "learning_rate": 7.709732754027866e-07, + "loss": 0.2179, + "num_input_tokens_seen": 11587912, + "step": 17720 + }, + { + "epoch": 9.289832285115304, + "grad_norm": 2.246366262435913, + "learning_rate": 7.653479541941038e-07, + "loss": 0.1985, + "num_input_tokens_seen": 11591752, + "step": 17725 + }, + { + "epoch": 9.29245283018868, + "grad_norm": 2.7670583724975586, + "learning_rate": 7.597429114352572e-07, + "loss": 0.2611, + "num_input_tokens_seen": 11594248, + "step": 17730 + }, + { + "epoch": 9.295073375262055, + "grad_norm": 2.0991156101226807, + "learning_rate": 7.541581518162922e-07, + "loss": 0.1854, + "num_input_tokens_seen": 11597448, + "step": 17735 + }, + { + "epoch": 9.29769392033543, + "grad_norm": 1.3728463649749756, + "learning_rate": 7.485936800102788e-07, + "loss": 0.3018, + "num_input_tokens_seen": 11600360, + "step": 17740 + }, + { + "epoch": 9.300314465408805, + "grad_norm": 2.913738489151001, + "learning_rate": 7.430495006733152e-07, + "loss": 0.2788, + "num_input_tokens_seen": 11603528, + "step": 17745 + }, + { + "epoch": 9.30293501048218, + "grad_norm": 2.093806028366089, + "learning_rate": 7.375256184445178e-07, + "loss": 0.2482, + "num_input_tokens_seen": 11606600, + "step": 17750 + }, + { + "epoch": 9.305555555555555, + "grad_norm": 1.798256516456604, + "learning_rate": 7.320220379460146e-07, + "loss": 0.1853, + "num_input_tokens_seen": 11609064, + "step": 17755 + }, + { + "epoch": 9.30817610062893, + "grad_norm": 1.3189986944198608, + "learning_rate": 7.265387637829524e-07, + "loss": 0.2002, + "num_input_tokens_seen": 11612328, + "step": 17760 + }, + { + "epoch": 9.310796645702306, + "grad_norm": 2.2489774227142334, + "learning_rate": 7.210758005434887e-07, + "loss": 0.2481, + "num_input_tokens_seen": 11615912, + "step": 17765 + }, + { + "epoch": 9.31341719077568, + "grad_norm": 2.621434211730957, + "learning_rate": 7.156331527987753e-07, + "loss": 0.2542, + "num_input_tokens_seen": 11618888, + "step": 17770 + }, + { + "epoch": 9.316037735849056, + "grad_norm": 2.4049570560455322, + "learning_rate": 7.102108251029777e-07, + "loss": 0.3932, + "num_input_tokens_seen": 11621544, + "step": 17775 + }, + { + "epoch": 9.318658280922431, + "grad_norm": 2.2394423484802246, + "learning_rate": 7.04808821993247e-07, + "loss": 0.1779, + "num_input_tokens_seen": 11624040, + "step": 17780 + }, + { + "epoch": 9.321278825995806, + "grad_norm": 2.740093231201172, + "learning_rate": 6.994271479897314e-07, + "loss": 0.1761, + "num_input_tokens_seen": 11626728, + "step": 17785 + }, + { + "epoch": 9.323899371069182, + "grad_norm": 3.1075949668884277, + "learning_rate": 6.940658075955759e-07, + "loss": 0.1322, + "num_input_tokens_seen": 11629832, + "step": 17790 + }, + { + "epoch": 9.326519916142558, + "grad_norm": 1.6362838745117188, + "learning_rate": 6.887248052969003e-07, + "loss": 0.1471, + "num_input_tokens_seen": 11633320, + "step": 17795 + }, + { + "epoch": 9.329140461215934, + "grad_norm": 1.940351963043213, + "learning_rate": 6.834041455628104e-07, + "loss": 0.2028, + "num_input_tokens_seen": 11636104, + "step": 17800 + }, + { + "epoch": 9.331761006289309, + "grad_norm": 1.8335752487182617, + "learning_rate": 6.781038328454003e-07, + "loss": 0.2262, + "num_input_tokens_seen": 11639752, + "step": 17805 + }, + { + "epoch": 9.334381551362684, + "grad_norm": 1.6309316158294678, + "learning_rate": 6.728238715797169e-07, + "loss": 0.2166, + "num_input_tokens_seen": 11642664, + "step": 17810 + }, + { + "epoch": 9.33700209643606, + "grad_norm": 2.420827865600586, + "learning_rate": 6.675642661838011e-07, + "loss": 0.2155, + "num_input_tokens_seen": 11646024, + "step": 17815 + }, + { + "epoch": 9.339622641509434, + "grad_norm": 1.5077571868896484, + "learning_rate": 6.623250210586463e-07, + "loss": 0.1963, + "num_input_tokens_seen": 11649192, + "step": 17820 + }, + { + "epoch": 9.34224318658281, + "grad_norm": 1.6106961965560913, + "learning_rate": 6.571061405882095e-07, + "loss": 0.2791, + "num_input_tokens_seen": 11653192, + "step": 17825 + }, + { + "epoch": 9.344863731656185, + "grad_norm": 5.386600017547607, + "learning_rate": 6.519076291394172e-07, + "loss": 0.187, + "num_input_tokens_seen": 11657224, + "step": 17830 + }, + { + "epoch": 9.34748427672956, + "grad_norm": 2.4729816913604736, + "learning_rate": 6.467294910621452e-07, + "loss": 0.3241, + "num_input_tokens_seen": 11660328, + "step": 17835 + }, + { + "epoch": 9.350104821802935, + "grad_norm": 1.929837703704834, + "learning_rate": 6.415717306892193e-07, + "loss": 0.2311, + "num_input_tokens_seen": 11663400, + "step": 17840 + }, + { + "epoch": 9.35272536687631, + "grad_norm": 2.0596976280212402, + "learning_rate": 6.364343523364263e-07, + "loss": 0.2316, + "num_input_tokens_seen": 11665992, + "step": 17845 + }, + { + "epoch": 9.355345911949685, + "grad_norm": 2.1611275672912598, + "learning_rate": 6.313173603024802e-07, + "loss": 0.1911, + "num_input_tokens_seen": 11669000, + "step": 17850 + }, + { + "epoch": 9.35796645702306, + "grad_norm": 1.4549570083618164, + "learning_rate": 6.262207588690533e-07, + "loss": 0.1792, + "num_input_tokens_seen": 11671976, + "step": 17855 + }, + { + "epoch": 9.360587002096436, + "grad_norm": 1.9269940853118896, + "learning_rate": 6.211445523007398e-07, + "loss": 0.1825, + "num_input_tokens_seen": 11674984, + "step": 17860 + }, + { + "epoch": 9.36320754716981, + "grad_norm": 2.0137410163879395, + "learning_rate": 6.160887448450892e-07, + "loss": 0.1849, + "num_input_tokens_seen": 11677864, + "step": 17865 + }, + { + "epoch": 9.365828092243186, + "grad_norm": 1.4424363374710083, + "learning_rate": 6.11053340732562e-07, + "loss": 0.2051, + "num_input_tokens_seen": 11681128, + "step": 17870 + }, + { + "epoch": 9.368448637316561, + "grad_norm": 2.51884126663208, + "learning_rate": 6.060383441765544e-07, + "loss": 0.1878, + "num_input_tokens_seen": 11683880, + "step": 17875 + }, + { + "epoch": 9.371069182389936, + "grad_norm": 1.7698864936828613, + "learning_rate": 6.01043759373393e-07, + "loss": 0.1804, + "num_input_tokens_seen": 11687496, + "step": 17880 + }, + { + "epoch": 9.373689727463312, + "grad_norm": 2.3470804691314697, + "learning_rate": 5.960695905023128e-07, + "loss": 0.2486, + "num_input_tokens_seen": 11690696, + "step": 17885 + }, + { + "epoch": 9.376310272536688, + "grad_norm": 1.8400574922561646, + "learning_rate": 5.91115841725473e-07, + "loss": 0.1964, + "num_input_tokens_seen": 11693544, + "step": 17890 + }, + { + "epoch": 9.378930817610064, + "grad_norm": 3.357168436050415, + "learning_rate": 5.861825171879415e-07, + "loss": 0.2301, + "num_input_tokens_seen": 11696840, + "step": 17895 + }, + { + "epoch": 9.381551362683439, + "grad_norm": 1.8564375638961792, + "learning_rate": 5.812696210177021e-07, + "loss": 0.2105, + "num_input_tokens_seen": 11700264, + "step": 17900 + }, + { + "epoch": 9.384171907756814, + "grad_norm": 4.085783958435059, + "learning_rate": 5.763771573256415e-07, + "loss": 0.1389, + "num_input_tokens_seen": 11703496, + "step": 17905 + }, + { + "epoch": 9.38679245283019, + "grad_norm": 2.1031100749969482, + "learning_rate": 5.715051302055491e-07, + "loss": 0.2668, + "num_input_tokens_seen": 11706664, + "step": 17910 + }, + { + "epoch": 9.389412997903564, + "grad_norm": 3.2411510944366455, + "learning_rate": 5.666535437341108e-07, + "loss": 0.2157, + "num_input_tokens_seen": 11709480, + "step": 17915 + }, + { + "epoch": 9.39203354297694, + "grad_norm": 1.8859463930130005, + "learning_rate": 5.618224019709212e-07, + "loss": 0.1569, + "num_input_tokens_seen": 11712712, + "step": 17920 + }, + { + "epoch": 9.394654088050315, + "grad_norm": 2.3078670501708984, + "learning_rate": 5.570117089584548e-07, + "loss": 0.2357, + "num_input_tokens_seen": 11715464, + "step": 17925 + }, + { + "epoch": 9.39727463312369, + "grad_norm": 2.716326951980591, + "learning_rate": 5.522214687220751e-07, + "loss": 0.1958, + "num_input_tokens_seen": 11718120, + "step": 17930 + }, + { + "epoch": 9.399895178197065, + "grad_norm": 1.5990817546844482, + "learning_rate": 5.474516852700451e-07, + "loss": 0.2373, + "num_input_tokens_seen": 11721512, + "step": 17935 + }, + { + "epoch": 9.40251572327044, + "grad_norm": 2.033259630203247, + "learning_rate": 5.427023625934946e-07, + "loss": 0.1557, + "num_input_tokens_seen": 11724008, + "step": 17940 + }, + { + "epoch": 9.405136268343815, + "grad_norm": 1.6428958177566528, + "learning_rate": 5.379735046664419e-07, + "loss": 0.1922, + "num_input_tokens_seen": 11726696, + "step": 17945 + }, + { + "epoch": 9.40775681341719, + "grad_norm": 1.3341647386550903, + "learning_rate": 5.33265115445783e-07, + "loss": 0.1791, + "num_input_tokens_seen": 11730408, + "step": 17950 + }, + { + "epoch": 9.410377358490566, + "grad_norm": 2.3052706718444824, + "learning_rate": 5.285771988712746e-07, + "loss": 0.2329, + "num_input_tokens_seen": 11733320, + "step": 17955 + }, + { + "epoch": 9.41299790356394, + "grad_norm": 2.433615207672119, + "learning_rate": 5.239097588655595e-07, + "loss": 0.2134, + "num_input_tokens_seen": 11738472, + "step": 17960 + }, + { + "epoch": 9.415618448637316, + "grad_norm": 2.1821181774139404, + "learning_rate": 5.192627993341359e-07, + "loss": 0.2347, + "num_input_tokens_seen": 11741032, + "step": 17965 + }, + { + "epoch": 9.418238993710691, + "grad_norm": 3.049649953842163, + "learning_rate": 5.146363241653657e-07, + "loss": 0.1797, + "num_input_tokens_seen": 11744328, + "step": 17970 + }, + { + "epoch": 9.420859538784066, + "grad_norm": 1.9595361948013306, + "learning_rate": 5.100303372304716e-07, + "loss": 0.1522, + "num_input_tokens_seen": 11747976, + "step": 17975 + }, + { + "epoch": 9.423480083857442, + "grad_norm": 2.4393022060394287, + "learning_rate": 5.054448423835373e-07, + "loss": 0.3175, + "num_input_tokens_seen": 11751144, + "step": 17980 + }, + { + "epoch": 9.426100628930818, + "grad_norm": 2.139101266860962, + "learning_rate": 5.008798434614908e-07, + "loss": 0.2523, + "num_input_tokens_seen": 11754312, + "step": 17985 + }, + { + "epoch": 9.428721174004194, + "grad_norm": 1.5991681814193726, + "learning_rate": 4.963353442841156e-07, + "loss": 0.1624, + "num_input_tokens_seen": 11757896, + "step": 17990 + }, + { + "epoch": 9.431341719077569, + "grad_norm": 1.552502989768982, + "learning_rate": 4.918113486540393e-07, + "loss": 0.1476, + "num_input_tokens_seen": 11760840, + "step": 17995 + }, + { + "epoch": 9.433962264150944, + "grad_norm": 3.0222675800323486, + "learning_rate": 4.873078603567421e-07, + "loss": 0.1717, + "num_input_tokens_seen": 11764136, + "step": 18000 + }, + { + "epoch": 9.43658280922432, + "grad_norm": 2.8564655780792236, + "learning_rate": 4.828248831605292e-07, + "loss": 0.2438, + "num_input_tokens_seen": 11766696, + "step": 18005 + }, + { + "epoch": 9.439203354297694, + "grad_norm": 3.5283045768737793, + "learning_rate": 4.783624208165554e-07, + "loss": 0.2162, + "num_input_tokens_seen": 11769160, + "step": 18010 + }, + { + "epoch": 9.44182389937107, + "grad_norm": 1.6555898189544678, + "learning_rate": 4.739204770588035e-07, + "loss": 0.2251, + "num_input_tokens_seen": 11772360, + "step": 18015 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 3.660688877105713, + "learning_rate": 4.694990556040918e-07, + "loss": 0.2422, + "num_input_tokens_seen": 11774984, + "step": 18020 + }, + { + "epoch": 9.44706498951782, + "grad_norm": 1.7806142568588257, + "learning_rate": 4.65098160152061e-07, + "loss": 0.1911, + "num_input_tokens_seen": 11777928, + "step": 18025 + }, + { + "epoch": 9.449685534591195, + "grad_norm": 2.145226240158081, + "learning_rate": 4.6071779438517924e-07, + "loss": 0.1798, + "num_input_tokens_seen": 11781096, + "step": 18030 + }, + { + "epoch": 9.45230607966457, + "grad_norm": 1.1849910020828247, + "learning_rate": 4.563579619687369e-07, + "loss": 0.1613, + "num_input_tokens_seen": 11784104, + "step": 18035 + }, + { + "epoch": 9.454926624737945, + "grad_norm": 1.9778201580047607, + "learning_rate": 4.5201866655084636e-07, + "loss": 0.2106, + "num_input_tokens_seen": 11787304, + "step": 18040 + }, + { + "epoch": 9.45754716981132, + "grad_norm": 2.436704635620117, + "learning_rate": 4.4769991176242533e-07, + "loss": 0.1358, + "num_input_tokens_seen": 11790024, + "step": 18045 + }, + { + "epoch": 9.460167714884696, + "grad_norm": 2.029385805130005, + "learning_rate": 4.4340170121721645e-07, + "loss": 0.144, + "num_input_tokens_seen": 11792840, + "step": 18050 + }, + { + "epoch": 9.46278825995807, + "grad_norm": 3.3645710945129395, + "learning_rate": 4.3912403851176234e-07, + "loss": 0.1601, + "num_input_tokens_seen": 11795720, + "step": 18055 + }, + { + "epoch": 9.465408805031446, + "grad_norm": 2.478837728500366, + "learning_rate": 4.348669272254163e-07, + "loss": 0.176, + "num_input_tokens_seen": 11798472, + "step": 18060 + }, + { + "epoch": 9.468029350104821, + "grad_norm": 2.385051965713501, + "learning_rate": 4.306303709203374e-07, + "loss": 0.1684, + "num_input_tokens_seen": 11801800, + "step": 18065 + }, + { + "epoch": 9.470649895178196, + "grad_norm": 3.419631242752075, + "learning_rate": 4.264143731414788e-07, + "loss": 0.2118, + "num_input_tokens_seen": 11805160, + "step": 18070 + }, + { + "epoch": 9.473270440251572, + "grad_norm": 2.070904016494751, + "learning_rate": 4.2221893741659636e-07, + "loss": 0.2005, + "num_input_tokens_seen": 11811816, + "step": 18075 + }, + { + "epoch": 9.475890985324948, + "grad_norm": 1.2780120372772217, + "learning_rate": 4.180440672562402e-07, + "loss": 0.2876, + "num_input_tokens_seen": 11816040, + "step": 18080 + }, + { + "epoch": 9.478511530398324, + "grad_norm": 1.7168736457824707, + "learning_rate": 4.1388976615374665e-07, + "loss": 0.2507, + "num_input_tokens_seen": 11818920, + "step": 18085 + }, + { + "epoch": 9.481132075471699, + "grad_norm": 2.3513870239257812, + "learning_rate": 4.097560375852516e-07, + "loss": 0.2246, + "num_input_tokens_seen": 11821736, + "step": 18090 + }, + { + "epoch": 9.483752620545074, + "grad_norm": 3.2546145915985107, + "learning_rate": 4.056428850096661e-07, + "loss": 0.2317, + "num_input_tokens_seen": 11825256, + "step": 18095 + }, + { + "epoch": 9.48637316561845, + "grad_norm": 1.6579781770706177, + "learning_rate": 4.01550311868687e-07, + "loss": 0.2561, + "num_input_tokens_seen": 11829416, + "step": 18100 + }, + { + "epoch": 9.488993710691824, + "grad_norm": 1.6533126831054688, + "learning_rate": 3.974783215867972e-07, + "loss": 0.267, + "num_input_tokens_seen": 11832200, + "step": 18105 + }, + { + "epoch": 9.4916142557652, + "grad_norm": 2.7443768978118896, + "learning_rate": 3.9342691757124626e-07, + "loss": 0.2254, + "num_input_tokens_seen": 11835592, + "step": 18110 + }, + { + "epoch": 9.494234800838575, + "grad_norm": 2.68881893157959, + "learning_rate": 3.8939610321206966e-07, + "loss": 0.2257, + "num_input_tokens_seen": 11839400, + "step": 18115 + }, + { + "epoch": 9.49685534591195, + "grad_norm": 2.4995949268341064, + "learning_rate": 3.853858818820694e-07, + "loss": 0.2322, + "num_input_tokens_seen": 11842216, + "step": 18120 + }, + { + "epoch": 9.499475890985325, + "grad_norm": 1.268731951713562, + "learning_rate": 3.8139625693680847e-07, + "loss": 0.152, + "num_input_tokens_seen": 11844936, + "step": 18125 + }, + { + "epoch": 9.5, + "eval_loss": 0.6764101982116699, + "eval_runtime": 15.9722, + "eval_samples_per_second": 53.092, + "eval_steps_per_second": 13.273, + "num_input_tokens_seen": 11845704, + "step": 18126 + }, + { + "epoch": 9.5020964360587, + "grad_norm": 2.9328277111053467, + "learning_rate": 3.774272317146277e-07, + "loss": 0.2177, + "num_input_tokens_seen": 11847912, + "step": 18130 + }, + { + "epoch": 9.504716981132075, + "grad_norm": 1.2567301988601685, + "learning_rate": 3.7347880953662597e-07, + "loss": 0.1504, + "num_input_tokens_seen": 11853192, + "step": 18135 + }, + { + "epoch": 9.50733752620545, + "grad_norm": 2.3333401679992676, + "learning_rate": 3.6955099370666045e-07, + "loss": 0.2218, + "num_input_tokens_seen": 11856808, + "step": 18140 + }, + { + "epoch": 9.509958071278826, + "grad_norm": 3.286998748779297, + "learning_rate": 3.656437875113522e-07, + "loss": 0.395, + "num_input_tokens_seen": 11860360, + "step": 18145 + }, + { + "epoch": 9.5125786163522, + "grad_norm": 2.0358588695526123, + "learning_rate": 3.617571942200693e-07, + "loss": 0.1542, + "num_input_tokens_seen": 11863048, + "step": 18150 + }, + { + "epoch": 9.515199161425576, + "grad_norm": 2.25559663772583, + "learning_rate": 3.5789121708493523e-07, + "loss": 0.2159, + "num_input_tokens_seen": 11867016, + "step": 18155 + }, + { + "epoch": 9.517819706498951, + "grad_norm": 2.7882609367370605, + "learning_rate": 3.5404585934082635e-07, + "loss": 0.1665, + "num_input_tokens_seen": 11870728, + "step": 18160 + }, + { + "epoch": 9.520440251572326, + "grad_norm": 1.8731576204299927, + "learning_rate": 3.502211242053577e-07, + "loss": 0.1925, + "num_input_tokens_seen": 11873928, + "step": 18165 + }, + { + "epoch": 9.523060796645701, + "grad_norm": 2.4618144035339355, + "learning_rate": 3.4641701487889697e-07, + "loss": 0.1691, + "num_input_tokens_seen": 11876968, + "step": 18170 + }, + { + "epoch": 9.525681341719078, + "grad_norm": 1.4696531295776367, + "learning_rate": 3.4263353454454806e-07, + "loss": 0.2048, + "num_input_tokens_seen": 11880648, + "step": 18175 + }, + { + "epoch": 9.528301886792454, + "grad_norm": 2.308501958847046, + "learning_rate": 3.3887068636815346e-07, + "loss": 0.2346, + "num_input_tokens_seen": 11883560, + "step": 18180 + }, + { + "epoch": 9.530922431865829, + "grad_norm": 2.369633197784424, + "learning_rate": 3.351284734982918e-07, + "loss": 0.2058, + "num_input_tokens_seen": 11888296, + "step": 18185 + }, + { + "epoch": 9.533542976939204, + "grad_norm": 2.034019947052002, + "learning_rate": 3.3140689906628054e-07, + "loss": 0.211, + "num_input_tokens_seen": 11892872, + "step": 18190 + }, + { + "epoch": 9.536163522012579, + "grad_norm": 2.647352695465088, + "learning_rate": 3.2770596618615645e-07, + "loss": 0.1548, + "num_input_tokens_seen": 11895944, + "step": 18195 + }, + { + "epoch": 9.538784067085954, + "grad_norm": 2.4771955013275146, + "learning_rate": 3.240256779546952e-07, + "loss": 0.1328, + "num_input_tokens_seen": 11898696, + "step": 18200 + }, + { + "epoch": 9.54140461215933, + "grad_norm": 0.533099353313446, + "learning_rate": 3.2036603745139447e-07, + "loss": 0.1451, + "num_input_tokens_seen": 11904744, + "step": 18205 + }, + { + "epoch": 9.544025157232705, + "grad_norm": 1.4481667280197144, + "learning_rate": 3.167270477384743e-07, + "loss": 0.1608, + "num_input_tokens_seen": 11907656, + "step": 18210 + }, + { + "epoch": 9.54664570230608, + "grad_norm": 2.917510986328125, + "learning_rate": 3.1310871186086834e-07, + "loss": 0.2666, + "num_input_tokens_seen": 11910504, + "step": 18215 + }, + { + "epoch": 9.549266247379455, + "grad_norm": 3.6054742336273193, + "learning_rate": 3.095110328462464e-07, + "loss": 0.1847, + "num_input_tokens_seen": 11913192, + "step": 18220 + }, + { + "epoch": 9.55188679245283, + "grad_norm": 2.1486268043518066, + "learning_rate": 3.0593401370497264e-07, + "loss": 0.3092, + "num_input_tokens_seen": 11916680, + "step": 18225 + }, + { + "epoch": 9.554507337526205, + "grad_norm": 3.7538371086120605, + "learning_rate": 3.0237765743013626e-07, + "loss": 0.1675, + "num_input_tokens_seen": 11919656, + "step": 18230 + }, + { + "epoch": 9.55712788259958, + "grad_norm": 1.8426475524902344, + "learning_rate": 2.9884196699753453e-07, + "loss": 0.2035, + "num_input_tokens_seen": 11922632, + "step": 18235 + }, + { + "epoch": 9.559748427672956, + "grad_norm": 1.8955600261688232, + "learning_rate": 2.953269453656704e-07, + "loss": 0.194, + "num_input_tokens_seen": 11926280, + "step": 18240 + }, + { + "epoch": 9.56236897274633, + "grad_norm": 0.6097882390022278, + "learning_rate": 2.9183259547575504e-07, + "loss": 0.1185, + "num_input_tokens_seen": 11929768, + "step": 18245 + }, + { + "epoch": 9.564989517819706, + "grad_norm": 1.4693992137908936, + "learning_rate": 2.883589202517023e-07, + "loss": 0.2141, + "num_input_tokens_seen": 11933480, + "step": 18250 + }, + { + "epoch": 9.567610062893081, + "grad_norm": 2.580925464630127, + "learning_rate": 2.849059226001177e-07, + "loss": 0.1422, + "num_input_tokens_seen": 11936200, + "step": 18255 + }, + { + "epoch": 9.570230607966456, + "grad_norm": 2.904174327850342, + "learning_rate": 2.8147360541032065e-07, + "loss": 0.2477, + "num_input_tokens_seen": 11938472, + "step": 18260 + }, + { + "epoch": 9.572851153039831, + "grad_norm": 2.0403616428375244, + "learning_rate": 2.780619715543109e-07, + "loss": 0.2637, + "num_input_tokens_seen": 11942280, + "step": 18265 + }, + { + "epoch": 9.575471698113208, + "grad_norm": 2.2074332237243652, + "learning_rate": 2.746710238867911e-07, + "loss": 0.166, + "num_input_tokens_seen": 11945800, + "step": 18270 + }, + { + "epoch": 9.578092243186584, + "grad_norm": 1.1595823764801025, + "learning_rate": 2.713007652451499e-07, + "loss": 0.1826, + "num_input_tokens_seen": 11948200, + "step": 18275 + }, + { + "epoch": 9.580712788259959, + "grad_norm": 2.601531505584717, + "learning_rate": 2.6795119844946757e-07, + "loss": 0.2232, + "num_input_tokens_seen": 11951656, + "step": 18280 + }, + { + "epoch": 9.583333333333334, + "grad_norm": 1.0634233951568604, + "learning_rate": 2.646223263025077e-07, + "loss": 0.1707, + "num_input_tokens_seen": 11955208, + "step": 18285 + }, + { + "epoch": 9.585953878406709, + "grad_norm": 1.62983238697052, + "learning_rate": 2.6131415158971993e-07, + "loss": 0.1431, + "num_input_tokens_seen": 11957768, + "step": 18290 + }, + { + "epoch": 9.588574423480084, + "grad_norm": 2.6630430221557617, + "learning_rate": 2.5802667707922887e-07, + "loss": 0.204, + "num_input_tokens_seen": 11960552, + "step": 18295 + }, + { + "epoch": 9.59119496855346, + "grad_norm": 2.3422563076019287, + "learning_rate": 2.54759905521848e-07, + "loss": 0.2429, + "num_input_tokens_seen": 11964552, + "step": 18300 + }, + { + "epoch": 9.593815513626835, + "grad_norm": 2.7632555961608887, + "learning_rate": 2.51513839651063e-07, + "loss": 0.1847, + "num_input_tokens_seen": 11966824, + "step": 18305 + }, + { + "epoch": 9.59643605870021, + "grad_norm": 2.8852787017822266, + "learning_rate": 2.4828848218302615e-07, + "loss": 0.1888, + "num_input_tokens_seen": 11969832, + "step": 18310 + }, + { + "epoch": 9.599056603773585, + "grad_norm": 2.2954115867614746, + "learning_rate": 2.450838358165786e-07, + "loss": 0.149, + "num_input_tokens_seen": 11972424, + "step": 18315 + }, + { + "epoch": 9.60167714884696, + "grad_norm": 2.5609493255615234, + "learning_rate": 2.41899903233217e-07, + "loss": 0.2274, + "num_input_tokens_seen": 11976552, + "step": 18320 + }, + { + "epoch": 9.604297693920335, + "grad_norm": 1.4645768404006958, + "learning_rate": 2.387366870971103e-07, + "loss": 0.2065, + "num_input_tokens_seen": 11979720, + "step": 18325 + }, + { + "epoch": 9.60691823899371, + "grad_norm": 1.824829339981079, + "learning_rate": 2.3559419005509675e-07, + "loss": 0.2473, + "num_input_tokens_seen": 11982536, + "step": 18330 + }, + { + "epoch": 9.609538784067086, + "grad_norm": 2.075368881225586, + "learning_rate": 2.3247241473667026e-07, + "loss": 0.136, + "num_input_tokens_seen": 11985384, + "step": 18335 + }, + { + "epoch": 9.61215932914046, + "grad_norm": 1.674567699432373, + "learning_rate": 2.2937136375399126e-07, + "loss": 0.1659, + "num_input_tokens_seen": 11988712, + "step": 18340 + }, + { + "epoch": 9.614779874213836, + "grad_norm": 1.3033084869384766, + "learning_rate": 2.2629103970188137e-07, + "loss": 0.1984, + "num_input_tokens_seen": 11993352, + "step": 18345 + }, + { + "epoch": 9.617400419287211, + "grad_norm": 1.7133715152740479, + "learning_rate": 2.2323144515780935e-07, + "loss": 0.2087, + "num_input_tokens_seen": 11995848, + "step": 18350 + }, + { + "epoch": 9.620020964360586, + "grad_norm": 1.654833436012268, + "learning_rate": 2.201925826819079e-07, + "loss": 0.2814, + "num_input_tokens_seen": 11999336, + "step": 18355 + }, + { + "epoch": 9.622641509433961, + "grad_norm": 1.4239252805709839, + "learning_rate": 2.1717445481695408e-07, + "loss": 0.2234, + "num_input_tokens_seen": 12003080, + "step": 18360 + }, + { + "epoch": 9.625262054507338, + "grad_norm": 0.7865161895751953, + "learning_rate": 2.1417706408838333e-07, + "loss": 0.1069, + "num_input_tokens_seen": 12007240, + "step": 18365 + }, + { + "epoch": 9.627882599580714, + "grad_norm": 1.472185492515564, + "learning_rate": 2.112004130042755e-07, + "loss": 0.2357, + "num_input_tokens_seen": 12009928, + "step": 18370 + }, + { + "epoch": 9.630503144654089, + "grad_norm": 2.1096017360687256, + "learning_rate": 2.082445040553549e-07, + "loss": 0.1974, + "num_input_tokens_seen": 12013384, + "step": 18375 + }, + { + "epoch": 9.633123689727464, + "grad_norm": 2.610865354537964, + "learning_rate": 2.053093397149902e-07, + "loss": 0.1731, + "num_input_tokens_seen": 12016520, + "step": 18380 + }, + { + "epoch": 9.635744234800839, + "grad_norm": 2.189488410949707, + "learning_rate": 2.0239492243919467e-07, + "loss": 0.1849, + "num_input_tokens_seen": 12020040, + "step": 18385 + }, + { + "epoch": 9.638364779874214, + "grad_norm": 1.5688245296478271, + "learning_rate": 1.9950125466662028e-07, + "loss": 0.2053, + "num_input_tokens_seen": 12023464, + "step": 18390 + }, + { + "epoch": 9.64098532494759, + "grad_norm": 1.3572442531585693, + "learning_rate": 1.9662833881855248e-07, + "loss": 0.1342, + "num_input_tokens_seen": 12026664, + "step": 18395 + }, + { + "epoch": 9.643605870020965, + "grad_norm": 1.3705387115478516, + "learning_rate": 1.9377617729891828e-07, + "loss": 0.1883, + "num_input_tokens_seen": 12030440, + "step": 18400 + }, + { + "epoch": 9.64622641509434, + "grad_norm": 1.5784013271331787, + "learning_rate": 1.9094477249427534e-07, + "loss": 0.2082, + "num_input_tokens_seen": 12032872, + "step": 18405 + }, + { + "epoch": 9.648846960167715, + "grad_norm": 1.2564842700958252, + "learning_rate": 1.8813412677381737e-07, + "loss": 0.17, + "num_input_tokens_seen": 12037864, + "step": 18410 + }, + { + "epoch": 9.65146750524109, + "grad_norm": 2.1727113723754883, + "learning_rate": 1.8534424248935756e-07, + "loss": 0.1484, + "num_input_tokens_seen": 12040904, + "step": 18415 + }, + { + "epoch": 9.654088050314465, + "grad_norm": 1.1407520771026611, + "learning_rate": 1.8257512197535076e-07, + "loss": 0.1732, + "num_input_tokens_seen": 12044296, + "step": 18420 + }, + { + "epoch": 9.65670859538784, + "grad_norm": 1.8990728855133057, + "learning_rate": 1.7982676754886574e-07, + "loss": 0.1678, + "num_input_tokens_seen": 12047208, + "step": 18425 + }, + { + "epoch": 9.659329140461216, + "grad_norm": 1.7458282709121704, + "learning_rate": 1.7709918150959904e-07, + "loss": 0.1896, + "num_input_tokens_seen": 12050696, + "step": 18430 + }, + { + "epoch": 9.66194968553459, + "grad_norm": 2.018489360809326, + "learning_rate": 1.7439236613987775e-07, + "loss": 0.2246, + "num_input_tokens_seen": 12054536, + "step": 18435 + }, + { + "epoch": 9.664570230607966, + "grad_norm": 3.867915153503418, + "learning_rate": 1.717063237046318e-07, + "loss": 0.1987, + "num_input_tokens_seen": 12056776, + "step": 18440 + }, + { + "epoch": 9.667190775681341, + "grad_norm": 1.9100679159164429, + "learning_rate": 1.6904105645142444e-07, + "loss": 0.2037, + "num_input_tokens_seen": 12059720, + "step": 18445 + }, + { + "epoch": 9.669811320754716, + "grad_norm": 2.3963563442230225, + "learning_rate": 1.6639656661043e-07, + "loss": 0.2111, + "num_input_tokens_seen": 12063304, + "step": 18450 + }, + { + "epoch": 9.672431865828091, + "grad_norm": 1.5803028345108032, + "learning_rate": 1.6377285639443407e-07, + "loss": 0.2242, + "num_input_tokens_seen": 12067592, + "step": 18455 + }, + { + "epoch": 9.675052410901468, + "grad_norm": 1.4102493524551392, + "learning_rate": 1.61169927998836e-07, + "loss": 0.2144, + "num_input_tokens_seen": 12070856, + "step": 18460 + }, + { + "epoch": 9.677672955974844, + "grad_norm": 1.1199527978897095, + "learning_rate": 1.5858778360165195e-07, + "loss": 0.2126, + "num_input_tokens_seen": 12074280, + "step": 18465 + }, + { + "epoch": 9.680293501048219, + "grad_norm": 1.7735329866409302, + "learning_rate": 1.5602642536350075e-07, + "loss": 0.1615, + "num_input_tokens_seen": 12077288, + "step": 18470 + }, + { + "epoch": 9.682914046121594, + "grad_norm": 1.7243341207504272, + "learning_rate": 1.5348585542760974e-07, + "loss": 0.1603, + "num_input_tokens_seen": 12082056, + "step": 18475 + }, + { + "epoch": 9.685534591194969, + "grad_norm": 2.0604002475738525, + "learning_rate": 1.5096607591980894e-07, + "loss": 0.2342, + "num_input_tokens_seen": 12085128, + "step": 18480 + }, + { + "epoch": 9.688155136268344, + "grad_norm": 1.58133065700531, + "learning_rate": 1.4846708894853955e-07, + "loss": 0.2181, + "num_input_tokens_seen": 12089032, + "step": 18485 + }, + { + "epoch": 9.69077568134172, + "grad_norm": 2.0616261959075928, + "learning_rate": 1.459888966048373e-07, + "loss": 0.1645, + "num_input_tokens_seen": 12091496, + "step": 18490 + }, + { + "epoch": 9.693396226415095, + "grad_norm": 2.8473870754241943, + "learning_rate": 1.4353150096234058e-07, + "loss": 0.2559, + "num_input_tokens_seen": 12095208, + "step": 18495 + }, + { + "epoch": 9.69601677148847, + "grad_norm": 1.7503589391708374, + "learning_rate": 1.410949040772852e-07, + "loss": 0.2692, + "num_input_tokens_seen": 12099176, + "step": 18500 + }, + { + "epoch": 9.698637316561845, + "grad_norm": 2.744154453277588, + "learning_rate": 1.3867910798850692e-07, + "loss": 0.2361, + "num_input_tokens_seen": 12102408, + "step": 18505 + }, + { + "epoch": 9.70125786163522, + "grad_norm": 1.6623018980026245, + "learning_rate": 1.3628411471742764e-07, + "loss": 0.2525, + "num_input_tokens_seen": 12105704, + "step": 18510 + }, + { + "epoch": 9.703878406708595, + "grad_norm": 1.3067378997802734, + "learning_rate": 1.3390992626807485e-07, + "loss": 0.2855, + "num_input_tokens_seen": 12108520, + "step": 18515 + }, + { + "epoch": 9.70649895178197, + "grad_norm": 2.503415107727051, + "learning_rate": 1.315565446270567e-07, + "loss": 0.1961, + "num_input_tokens_seen": 12111912, + "step": 18520 + }, + { + "epoch": 9.709119496855346, + "grad_norm": 2.518883228302002, + "learning_rate": 1.292239717635785e-07, + "loss": 0.1868, + "num_input_tokens_seen": 12116040, + "step": 18525 + }, + { + "epoch": 9.71174004192872, + "grad_norm": 1.8636502027511597, + "learning_rate": 1.269122096294262e-07, + "loss": 0.1718, + "num_input_tokens_seen": 12118632, + "step": 18530 + }, + { + "epoch": 9.714360587002096, + "grad_norm": 1.6444909572601318, + "learning_rate": 1.24621260158983e-07, + "loss": 0.1279, + "num_input_tokens_seen": 12121192, + "step": 18535 + }, + { + "epoch": 9.716981132075471, + "grad_norm": 2.08396315574646, + "learning_rate": 1.2235112526920723e-07, + "loss": 0.1686, + "num_input_tokens_seen": 12123976, + "step": 18540 + }, + { + "epoch": 9.719601677148846, + "grad_norm": 1.5353634357452393, + "learning_rate": 1.2010180685964324e-07, + "loss": 0.2811, + "num_input_tokens_seen": 12127816, + "step": 18545 + }, + { + "epoch": 9.722222222222221, + "grad_norm": 1.7465927600860596, + "learning_rate": 1.1787330681241881e-07, + "loss": 0.1819, + "num_input_tokens_seen": 12131848, + "step": 18550 + }, + { + "epoch": 9.724842767295598, + "grad_norm": 2.0792176723480225, + "learning_rate": 1.156656269922396e-07, + "loss": 0.207, + "num_input_tokens_seen": 12135432, + "step": 18555 + }, + { + "epoch": 9.727463312368974, + "grad_norm": 1.8543446063995361, + "learning_rate": 1.1347876924639455e-07, + "loss": 0.2295, + "num_input_tokens_seen": 12138376, + "step": 18560 + }, + { + "epoch": 9.730083857442349, + "grad_norm": 2.0432817935943604, + "learning_rate": 1.1131273540474496e-07, + "loss": 0.2558, + "num_input_tokens_seen": 12141480, + "step": 18565 + }, + { + "epoch": 9.732704402515724, + "grad_norm": 2.3458032608032227, + "learning_rate": 1.091675272797299e-07, + "loss": 0.1591, + "num_input_tokens_seen": 12144168, + "step": 18570 + }, + { + "epoch": 9.735324947589099, + "grad_norm": 1.6456936597824097, + "learning_rate": 1.0704314666635795e-07, + "loss": 0.2554, + "num_input_tokens_seen": 12148168, + "step": 18575 + }, + { + "epoch": 9.737945492662474, + "grad_norm": 1.7487496137619019, + "learning_rate": 1.0493959534221832e-07, + "loss": 0.144, + "num_input_tokens_seen": 12150696, + "step": 18580 + }, + { + "epoch": 9.74056603773585, + "grad_norm": 2.139617919921875, + "learning_rate": 1.0285687506746133e-07, + "loss": 0.1574, + "num_input_tokens_seen": 12153928, + "step": 18585 + }, + { + "epoch": 9.743186582809225, + "grad_norm": 2.2615816593170166, + "learning_rate": 1.0079498758481798e-07, + "loss": 0.1859, + "num_input_tokens_seen": 12157384, + "step": 18590 + }, + { + "epoch": 9.7458071278826, + "grad_norm": 1.6813504695892334, + "learning_rate": 9.87539346195776e-08, + "loss": 0.1942, + "num_input_tokens_seen": 12160200, + "step": 18595 + }, + { + "epoch": 9.748427672955975, + "grad_norm": 2.0348854064941406, + "learning_rate": 9.673371787960183e-08, + "loss": 0.256, + "num_input_tokens_seen": 12162504, + "step": 18600 + }, + { + "epoch": 9.75104821802935, + "grad_norm": 2.16823410987854, + "learning_rate": 9.473433905531626e-08, + "loss": 0.2052, + "num_input_tokens_seen": 12165288, + "step": 18605 + }, + { + "epoch": 9.753668763102725, + "grad_norm": 2.92033052444458, + "learning_rate": 9.275579981970483e-08, + "loss": 0.1821, + "num_input_tokens_seen": 12167912, + "step": 18610 + }, + { + "epoch": 9.7562893081761, + "grad_norm": 15.674690246582031, + "learning_rate": 9.07981018283266e-08, + "loss": 0.2621, + "num_input_tokens_seen": 12171624, + "step": 18615 + }, + { + "epoch": 9.758909853249476, + "grad_norm": 2.256648063659668, + "learning_rate": 8.886124671928786e-08, + "loss": 0.1644, + "num_input_tokens_seen": 12174632, + "step": 18620 + }, + { + "epoch": 9.76153039832285, + "grad_norm": 4.707893371582031, + "learning_rate": 8.694523611326444e-08, + "loss": 0.3071, + "num_input_tokens_seen": 12177896, + "step": 18625 + }, + { + "epoch": 9.764150943396226, + "grad_norm": 2.429788112640381, + "learning_rate": 8.505007161348222e-08, + "loss": 0.1994, + "num_input_tokens_seen": 12181256, + "step": 18630 + }, + { + "epoch": 9.766771488469601, + "grad_norm": 2.5867080688476562, + "learning_rate": 8.31757548057338e-08, + "loss": 0.2394, + "num_input_tokens_seen": 12183848, + "step": 18635 + }, + { + "epoch": 9.769392033542976, + "grad_norm": 2.024838924407959, + "learning_rate": 8.132228725835634e-08, + "loss": 0.2171, + "num_input_tokens_seen": 12187208, + "step": 18640 + }, + { + "epoch": 9.772012578616351, + "grad_norm": 1.873658299446106, + "learning_rate": 7.948967052225087e-08, + "loss": 0.2163, + "num_input_tokens_seen": 12190568, + "step": 18645 + }, + { + "epoch": 9.774633123689728, + "grad_norm": 1.8301262855529785, + "learning_rate": 7.767790613086301e-08, + "loss": 0.1506, + "num_input_tokens_seen": 12193224, + "step": 18650 + }, + { + "epoch": 9.777253668763104, + "grad_norm": 1.593110203742981, + "learning_rate": 7.588699560019952e-08, + "loss": 0.1625, + "num_input_tokens_seen": 12195816, + "step": 18655 + }, + { + "epoch": 9.779874213836479, + "grad_norm": 3.6583030223846436, + "learning_rate": 7.411694042881168e-08, + "loss": 0.219, + "num_input_tokens_seen": 12198856, + "step": 18660 + }, + { + "epoch": 9.782494758909854, + "grad_norm": 2.038177013397217, + "learning_rate": 7.23677420977953e-08, + "loss": 0.2299, + "num_input_tokens_seen": 12201992, + "step": 18665 + }, + { + "epoch": 9.785115303983229, + "grad_norm": 2.0245089530944824, + "learning_rate": 7.063940207080733e-08, + "loss": 0.2013, + "num_input_tokens_seen": 12205608, + "step": 18670 + }, + { + "epoch": 9.787735849056604, + "grad_norm": 2.1398632526397705, + "learning_rate": 6.893192179403817e-08, + "loss": 0.2355, + "num_input_tokens_seen": 12209352, + "step": 18675 + }, + { + "epoch": 9.79035639412998, + "grad_norm": 2.262556791305542, + "learning_rate": 6.724530269623108e-08, + "loss": 0.1831, + "num_input_tokens_seen": 12213768, + "step": 18680 + }, + { + "epoch": 9.792976939203355, + "grad_norm": 2.429652690887451, + "learning_rate": 6.557954618867102e-08, + "loss": 0.1776, + "num_input_tokens_seen": 12216776, + "step": 18685 + }, + { + "epoch": 9.79559748427673, + "grad_norm": 1.1257370710372925, + "learning_rate": 6.393465366519024e-08, + "loss": 0.176, + "num_input_tokens_seen": 12221000, + "step": 18690 + }, + { + "epoch": 9.798218029350105, + "grad_norm": 3.8319168090820312, + "learning_rate": 6.231062650215724e-08, + "loss": 0.222, + "num_input_tokens_seen": 12224200, + "step": 18695 + }, + { + "epoch": 9.80083857442348, + "grad_norm": 1.852075219154358, + "learning_rate": 6.070746605848221e-08, + "loss": 0.1646, + "num_input_tokens_seen": 12226920, + "step": 18700 + }, + { + "epoch": 9.803459119496855, + "grad_norm": 2.4144811630249023, + "learning_rate": 5.912517367561987e-08, + "loss": 0.2519, + "num_input_tokens_seen": 12229960, + "step": 18705 + }, + { + "epoch": 9.80607966457023, + "grad_norm": 2.0747292041778564, + "learning_rate": 5.756375067755837e-08, + "loss": 0.1821, + "num_input_tokens_seen": 12232040, + "step": 18710 + }, + { + "epoch": 9.808700209643606, + "grad_norm": 1.3122594356536865, + "learning_rate": 5.602319837082481e-08, + "loss": 0.1456, + "num_input_tokens_seen": 12234472, + "step": 18715 + }, + { + "epoch": 9.81132075471698, + "grad_norm": 3.1182022094726562, + "learning_rate": 5.450351804448528e-08, + "loss": 0.1499, + "num_input_tokens_seen": 12237448, + "step": 18720 + }, + { + "epoch": 9.813941299790356, + "grad_norm": 1.8886851072311401, + "learning_rate": 5.3004710970133705e-08, + "loss": 0.2284, + "num_input_tokens_seen": 12240264, + "step": 18725 + }, + { + "epoch": 9.816561844863731, + "grad_norm": 1.4782249927520752, + "learning_rate": 5.1526778401911334e-08, + "loss": 0.1809, + "num_input_tokens_seen": 12243176, + "step": 18730 + }, + { + "epoch": 9.819182389937106, + "grad_norm": 3.54781174659729, + "learning_rate": 5.0069721576476156e-08, + "loss": 0.1922, + "num_input_tokens_seen": 12245864, + "step": 18735 + }, + { + "epoch": 9.821802935010481, + "grad_norm": 1.7837921380996704, + "learning_rate": 4.863354171303347e-08, + "loss": 0.2115, + "num_input_tokens_seen": 12248712, + "step": 18740 + }, + { + "epoch": 9.824423480083858, + "grad_norm": 2.088392734527588, + "learning_rate": 4.72182400133081e-08, + "loss": 0.1998, + "num_input_tokens_seen": 12251144, + "step": 18745 + }, + { + "epoch": 9.827044025157234, + "grad_norm": 2.0164222717285156, + "learning_rate": 4.582381766156385e-08, + "loss": 0.2153, + "num_input_tokens_seen": 12255336, + "step": 18750 + }, + { + "epoch": 9.829664570230609, + "grad_norm": 3.166515350341797, + "learning_rate": 4.445027582458683e-08, + "loss": 0.1861, + "num_input_tokens_seen": 12257672, + "step": 18755 + }, + { + "epoch": 9.832285115303984, + "grad_norm": 1.9609159231185913, + "learning_rate": 4.309761565169379e-08, + "loss": 0.1773, + "num_input_tokens_seen": 12261032, + "step": 18760 + }, + { + "epoch": 9.834905660377359, + "grad_norm": 1.9990642070770264, + "learning_rate": 4.1765838274732125e-08, + "loss": 0.1952, + "num_input_tokens_seen": 12264488, + "step": 18765 + }, + { + "epoch": 9.837526205450734, + "grad_norm": 2.4211738109588623, + "learning_rate": 4.045494480807155e-08, + "loss": 0.1964, + "num_input_tokens_seen": 12267432, + "step": 18770 + }, + { + "epoch": 9.84014675052411, + "grad_norm": 2.6195132732391357, + "learning_rate": 3.916493634860407e-08, + "loss": 0.1542, + "num_input_tokens_seen": 12270888, + "step": 18775 + }, + { + "epoch": 9.842767295597485, + "grad_norm": 2.223850727081299, + "learning_rate": 3.789581397575515e-08, + "loss": 0.1844, + "num_input_tokens_seen": 12273896, + "step": 18780 + }, + { + "epoch": 9.84538784067086, + "grad_norm": 2.084920644760132, + "learning_rate": 3.664757875146418e-08, + "loss": 0.2003, + "num_input_tokens_seen": 12276328, + "step": 18785 + }, + { + "epoch": 9.848008385744235, + "grad_norm": 1.6858866214752197, + "learning_rate": 3.5420231720198485e-08, + "loss": 0.1712, + "num_input_tokens_seen": 12279016, + "step": 18790 + }, + { + "epoch": 9.85062893081761, + "grad_norm": 1.7856684923171997, + "learning_rate": 3.421377390894764e-08, + "loss": 0.1335, + "num_input_tokens_seen": 12281512, + "step": 18795 + }, + { + "epoch": 9.853249475890985, + "grad_norm": 1.7614686489105225, + "learning_rate": 3.3028206327218035e-08, + "loss": 0.2554, + "num_input_tokens_seen": 12285160, + "step": 18800 + }, + { + "epoch": 9.85587002096436, + "grad_norm": 1.4408208131790161, + "learning_rate": 3.1863529967041117e-08, + "loss": 0.2177, + "num_input_tokens_seen": 12288616, + "step": 18805 + }, + { + "epoch": 9.858490566037736, + "grad_norm": 2.0140910148620605, + "learning_rate": 3.071974580296233e-08, + "loss": 0.3209, + "num_input_tokens_seen": 12292680, + "step": 18810 + }, + { + "epoch": 9.86111111111111, + "grad_norm": 1.1100860834121704, + "learning_rate": 2.9596854792052207e-08, + "loss": 0.1889, + "num_input_tokens_seen": 12297160, + "step": 18815 + }, + { + "epoch": 9.863731656184486, + "grad_norm": 2.4930942058563232, + "learning_rate": 2.8494857873889724e-08, + "loss": 0.2506, + "num_input_tokens_seen": 12300520, + "step": 18820 + }, + { + "epoch": 9.866352201257861, + "grad_norm": 1.289282202720642, + "learning_rate": 2.741375597057616e-08, + "loss": 0.2419, + "num_input_tokens_seen": 12303496, + "step": 18825 + }, + { + "epoch": 9.868972746331236, + "grad_norm": 2.258014678955078, + "learning_rate": 2.6353549986729566e-08, + "loss": 0.2213, + "num_input_tokens_seen": 12306568, + "step": 18830 + }, + { + "epoch": 9.871593291404611, + "grad_norm": 1.887428641319275, + "learning_rate": 2.531424080948197e-08, + "loss": 0.2368, + "num_input_tokens_seen": 12309160, + "step": 18835 + }, + { + "epoch": 9.874213836477988, + "grad_norm": 1.5274162292480469, + "learning_rate": 2.4295829308482176e-08, + "loss": 0.2018, + "num_input_tokens_seen": 12312776, + "step": 18840 + }, + { + "epoch": 9.876834381551364, + "grad_norm": 2.5760204792022705, + "learning_rate": 2.329831633588464e-08, + "loss": 0.153, + "num_input_tokens_seen": 12316104, + "step": 18845 + }, + { + "epoch": 9.879454926624739, + "grad_norm": 2.795894145965576, + "learning_rate": 2.232170272636891e-08, + "loss": 0.2396, + "num_input_tokens_seen": 12318568, + "step": 18850 + }, + { + "epoch": 9.882075471698114, + "grad_norm": 1.9848110675811768, + "learning_rate": 2.136598929711464e-08, + "loss": 0.2184, + "num_input_tokens_seen": 12321736, + "step": 18855 + }, + { + "epoch": 9.884696016771489, + "grad_norm": 1.5976333618164062, + "learning_rate": 2.0431176847823807e-08, + "loss": 0.2428, + "num_input_tokens_seen": 12325032, + "step": 18860 + }, + { + "epoch": 9.887316561844864, + "grad_norm": 1.5707284212112427, + "learning_rate": 1.9517266160704038e-08, + "loss": 0.2541, + "num_input_tokens_seen": 12329160, + "step": 18865 + }, + { + "epoch": 9.88993710691824, + "grad_norm": 1.6852447986602783, + "learning_rate": 1.8624258000471405e-08, + "loss": 0.3971, + "num_input_tokens_seen": 12332392, + "step": 18870 + }, + { + "epoch": 9.892557651991615, + "grad_norm": 1.887164831161499, + "learning_rate": 1.7752153114358737e-08, + "loss": 0.2291, + "num_input_tokens_seen": 12336456, + "step": 18875 + }, + { + "epoch": 9.89517819706499, + "grad_norm": 1.9743479490280151, + "learning_rate": 1.6900952232098977e-08, + "loss": 0.1979, + "num_input_tokens_seen": 12339080, + "step": 18880 + }, + { + "epoch": 9.897798742138365, + "grad_norm": 1.8529651165008545, + "learning_rate": 1.6070656065939048e-08, + "loss": 0.1701, + "num_input_tokens_seen": 12341768, + "step": 18885 + }, + { + "epoch": 9.90041928721174, + "grad_norm": 1.958155632019043, + "learning_rate": 1.526126531063432e-08, + "loss": 0.1987, + "num_input_tokens_seen": 12344936, + "step": 18890 + }, + { + "epoch": 9.903039832285115, + "grad_norm": 1.9846580028533936, + "learning_rate": 1.4472780643445817e-08, + "loss": 0.1666, + "num_input_tokens_seen": 12349416, + "step": 18895 + }, + { + "epoch": 9.90566037735849, + "grad_norm": 2.3476858139038086, + "learning_rate": 1.3705202724142996e-08, + "loss": 0.2051, + "num_input_tokens_seen": 12352360, + "step": 18900 + }, + { + "epoch": 9.908280922431866, + "grad_norm": 1.9905366897583008, + "learning_rate": 1.2958532194995432e-08, + "loss": 0.2463, + "num_input_tokens_seen": 12355688, + "step": 18905 + }, + { + "epoch": 9.91090146750524, + "grad_norm": 1.1543928384780884, + "learning_rate": 1.2232769680789457e-08, + "loss": 0.2172, + "num_input_tokens_seen": 12359560, + "step": 18910 + }, + { + "epoch": 9.913522012578616, + "grad_norm": 1.8548483848571777, + "learning_rate": 1.152791578880319e-08, + "loss": 0.2712, + "num_input_tokens_seen": 12363656, + "step": 18915 + }, + { + "epoch": 9.916142557651991, + "grad_norm": 1.4029093980789185, + "learning_rate": 1.0843971108828732e-08, + "loss": 0.173, + "num_input_tokens_seen": 12367688, + "step": 18920 + }, + { + "epoch": 9.918763102725366, + "grad_norm": 2.5014874935150146, + "learning_rate": 1.018093621316385e-08, + "loss": 0.207, + "num_input_tokens_seen": 12371400, + "step": 18925 + }, + { + "epoch": 9.921383647798741, + "grad_norm": 2.102825880050659, + "learning_rate": 9.53881165659809e-09, + "loss": 0.1976, + "num_input_tokens_seen": 12374152, + "step": 18930 + }, + { + "epoch": 9.924004192872118, + "grad_norm": 1.7338072061538696, + "learning_rate": 8.91759797644054e-09, + "loss": 0.1312, + "num_input_tokens_seen": 12378472, + "step": 18935 + }, + { + "epoch": 9.926624737945493, + "grad_norm": 2.432953357696533, + "learning_rate": 8.317295692486516e-09, + "loss": 0.2356, + "num_input_tokens_seen": 12381480, + "step": 18940 + }, + { + "epoch": 9.929245283018869, + "grad_norm": 1.0787451267242432, + "learning_rate": 7.737905307045323e-09, + "loss": 0.2243, + "num_input_tokens_seen": 12384648, + "step": 18945 + }, + { + "epoch": 9.931865828092244, + "grad_norm": 1.4920077323913574, + "learning_rate": 7.179427304926378e-09, + "loss": 0.167, + "num_input_tokens_seen": 12387432, + "step": 18950 + }, + { + "epoch": 9.934486373165619, + "grad_norm": 2.5539700984954834, + "learning_rate": 6.641862153433653e-09, + "loss": 0.2268, + "num_input_tokens_seen": 12390984, + "step": 18955 + }, + { + "epoch": 9.937106918238994, + "grad_norm": 1.8679301738739014, + "learning_rate": 6.125210302382333e-09, + "loss": 0.216, + "num_input_tokens_seen": 12394760, + "step": 18960 + }, + { + "epoch": 9.93972746331237, + "grad_norm": 3.165524482727051, + "learning_rate": 5.629472184079387e-09, + "loss": 0.2906, + "num_input_tokens_seen": 12397768, + "step": 18965 + }, + { + "epoch": 9.942348008385745, + "grad_norm": 1.925972819328308, + "learning_rate": 5.154648213334668e-09, + "loss": 0.1896, + "num_input_tokens_seen": 12400968, + "step": 18970 + }, + { + "epoch": 9.94496855345912, + "grad_norm": 2.299572706222534, + "learning_rate": 4.700738787466463e-09, + "loss": 0.2504, + "num_input_tokens_seen": 12406664, + "step": 18975 + }, + { + "epoch": 9.947589098532495, + "grad_norm": 2.0092318058013916, + "learning_rate": 4.26774428627652e-09, + "loss": 0.166, + "num_input_tokens_seen": 12409448, + "step": 18980 + }, + { + "epoch": 9.95020964360587, + "grad_norm": 1.2498681545257568, + "learning_rate": 3.855665072080572e-09, + "loss": 0.1027, + "num_input_tokens_seen": 12412744, + "step": 18985 + }, + { + "epoch": 9.952830188679245, + "grad_norm": 1.9077186584472656, + "learning_rate": 3.464501489683358e-09, + "loss": 0.3511, + "num_input_tokens_seen": 12415656, + "step": 18990 + }, + { + "epoch": 9.95545073375262, + "grad_norm": 1.5354210138320923, + "learning_rate": 3.094253866398056e-09, + "loss": 0.2944, + "num_input_tokens_seen": 12418472, + "step": 18995 + }, + { + "epoch": 9.958071278825996, + "grad_norm": 2.591525077819824, + "learning_rate": 2.7449225120268484e-09, + "loss": 0.1688, + "num_input_tokens_seen": 12421768, + "step": 19000 + }, + { + "epoch": 9.96069182389937, + "grad_norm": 1.6509802341461182, + "learning_rate": 2.416507718877581e-09, + "loss": 0.2327, + "num_input_tokens_seen": 12425800, + "step": 19005 + }, + { + "epoch": 9.963312368972746, + "grad_norm": 1.9795682430267334, + "learning_rate": 2.109009761747105e-09, + "loss": 0.2447, + "num_input_tokens_seen": 12429288, + "step": 19010 + }, + { + "epoch": 9.965932914046121, + "grad_norm": 1.9158377647399902, + "learning_rate": 1.8224288979434844e-09, + "loss": 0.1465, + "num_input_tokens_seen": 12433160, + "step": 19015 + }, + { + "epoch": 9.968553459119496, + "grad_norm": 2.6064488887786865, + "learning_rate": 1.5567653672554638e-09, + "loss": 0.1619, + "num_input_tokens_seen": 12435944, + "step": 19020 + }, + { + "epoch": 9.971174004192871, + "grad_norm": 1.5369117259979248, + "learning_rate": 1.3120193919857748e-09, + "loss": 0.1746, + "num_input_tokens_seen": 12438216, + "step": 19025 + }, + { + "epoch": 9.973794549266248, + "grad_norm": 1.9255480766296387, + "learning_rate": 1.0881911769261565e-09, + "loss": 0.1743, + "num_input_tokens_seen": 12440904, + "step": 19030 + }, + { + "epoch": 9.976415094339622, + "grad_norm": 3.0324313640594482, + "learning_rate": 8.852809093601311e-10, + "loss": 0.219, + "num_input_tokens_seen": 12443752, + "step": 19035 + }, + { + "epoch": 9.979035639412999, + "grad_norm": 4.0194010734558105, + "learning_rate": 7.03288759076881e-10, + "loss": 0.2648, + "num_input_tokens_seen": 12446152, + "step": 19040 + }, + { + "epoch": 9.981656184486374, + "grad_norm": 1.9542360305786133, + "learning_rate": 5.422148783629233e-10, + "loss": 0.1536, + "num_input_tokens_seen": 12449160, + "step": 19045 + }, + { + "epoch": 9.984276729559749, + "grad_norm": 1.6942179203033447, + "learning_rate": 4.0205940199100623e-10, + "loss": 0.2432, + "num_input_tokens_seen": 12453064, + "step": 19050 + }, + { + "epoch": 9.986897274633124, + "grad_norm": 1.8916586637496948, + "learning_rate": 2.828224472395391e-10, + "loss": 0.164, + "num_input_tokens_seen": 12455944, + "step": 19055 + }, + { + "epoch": 9.9895178197065, + "grad_norm": 3.6136910915374756, + "learning_rate": 1.8450411388426515e-10, + "loss": 0.2456, + "num_input_tokens_seen": 12459528, + "step": 19060 + }, + { + "epoch": 9.992138364779874, + "grad_norm": 1.5156410932540894, + "learning_rate": 1.0710448418715935e-10, + "loss": 0.2544, + "num_input_tokens_seen": 12463048, + "step": 19065 + }, + { + "epoch": 9.99475890985325, + "grad_norm": 1.6779978275299072, + "learning_rate": 5.062362291585743e-11, + "loss": 0.1928, + "num_input_tokens_seen": 12467240, + "step": 19070 + }, + { + "epoch": 9.997379454926625, + "grad_norm": 2.0392026901245117, + "learning_rate": 1.5061577329777976e-11, + "loss": 0.3794, + "num_input_tokens_seen": 12470216, + "step": 19075 + }, + { + "epoch": 10.0, + "grad_norm": 3.00492000579834, + "learning_rate": 4.183771884491705e-13, + "loss": 0.277, + "num_input_tokens_seen": 12472912, + "step": 19080 + }, + { + "epoch": 10.0, + "eval_loss": 0.6786032915115356, + "eval_runtime": 15.9293, + "eval_samples_per_second": 53.235, + "eval_steps_per_second": 13.309, + "num_input_tokens_seen": 12472912, + "step": 19080 + }, + { + "epoch": 10.0, + "num_input_tokens_seen": 12472912, + "step": 19080, + "total_flos": 5.627482088814674e+17, + "train_loss": 0.3423160022456566, + "train_runtime": 5308.2579, + "train_samples_per_second": 14.372, + "train_steps_per_second": 3.594 + } + ], + "logging_steps": 5, + "max_steps": 19080, + "num_input_tokens_seen": 12472912, + "num_train_epochs": 10, + "save_steps": 954, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.627482088814674e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}