{ "best_global_step": 758667, "best_metric": 0.06412914395332336, "best_model_checkpoint": "/media/user/Expansion1/multilingual-e5-small-aligned-v2-text-quality-v3/checkpoint-758667", "epoch": 10.0, "eval_steps": 500, "global_step": 1083810, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004613354739299324, "grad_norm": 0.9464718699455261, "learning_rate": 4.99769793598509e-05, "loss": 0.2108, "num_input_tokens_seen": 512000, "step": 500 }, { "epoch": 0.009226709478598648, "grad_norm": 1.6402217149734497, "learning_rate": 4.99539125861544e-05, "loss": 0.1148, "num_input_tokens_seen": 1024000, "step": 1000 }, { "epoch": 0.013840064217897971, "grad_norm": 2.3964197635650635, "learning_rate": 4.9930845812457905e-05, "loss": 0.115, "num_input_tokens_seen": 1536000, "step": 1500 }, { "epoch": 0.018453418957197296, "grad_norm": 1.5508780479431152, "learning_rate": 4.990777903876141e-05, "loss": 0.0986, "num_input_tokens_seen": 2048000, "step": 2000 }, { "epoch": 0.02306677369649662, "grad_norm": 1.0917489528656006, "learning_rate": 4.9884712265064913e-05, "loss": 0.1006, "num_input_tokens_seen": 2560000, "step": 2500 }, { "epoch": 0.027680128435795943, "grad_norm": 3.2608118057250977, "learning_rate": 4.9861645491368414e-05, "loss": 0.0966, "num_input_tokens_seen": 3072000, "step": 3000 }, { "epoch": 0.03229348317509526, "grad_norm": 0.6695080995559692, "learning_rate": 4.983857871767192e-05, "loss": 0.0966, "num_input_tokens_seen": 3584000, "step": 3500 }, { "epoch": 0.03690683791439459, "grad_norm": 0.9232053756713867, "learning_rate": 4.981551194397542e-05, "loss": 0.0937, "num_input_tokens_seen": 4096000, "step": 4000 }, { "epoch": 0.041520192653693916, "grad_norm": 1.5442851781845093, "learning_rate": 4.979244517027893e-05, "loss": 0.0966, "num_input_tokens_seen": 4608000, "step": 4500 }, { "epoch": 0.04613354739299324, "grad_norm": 1.1777746677398682, "learning_rate": 4.976937839658243e-05, "loss": 0.0928, "num_input_tokens_seen": 5120000, "step": 5000 }, { "epoch": 0.05074690213229256, "grad_norm": 1.1882743835449219, "learning_rate": 4.974631162288593e-05, "loss": 0.0982, "num_input_tokens_seen": 5632000, "step": 5500 }, { "epoch": 0.055360256871591886, "grad_norm": 1.9017492532730103, "learning_rate": 4.972324484918944e-05, "loss": 0.0968, "num_input_tokens_seen": 6144000, "step": 6000 }, { "epoch": 0.05997361161089121, "grad_norm": 0.9373461008071899, "learning_rate": 4.970017807549294e-05, "loss": 0.0942, "num_input_tokens_seen": 6656000, "step": 6500 }, { "epoch": 0.06458696635019053, "grad_norm": 1.4917376041412354, "learning_rate": 4.967711130179644e-05, "loss": 0.0935, "num_input_tokens_seen": 7168000, "step": 7000 }, { "epoch": 0.06920032108948986, "grad_norm": 0.534630298614502, "learning_rate": 4.9654044528099946e-05, "loss": 0.0879, "num_input_tokens_seen": 7680000, "step": 7500 }, { "epoch": 0.07381367582878919, "grad_norm": 1.9700461626052856, "learning_rate": 4.9630977754403454e-05, "loss": 0.0913, "num_input_tokens_seen": 8192000, "step": 8000 }, { "epoch": 0.0784270305680885, "grad_norm": 2.11916446685791, "learning_rate": 4.960791098070695e-05, "loss": 0.0929, "num_input_tokens_seen": 8704000, "step": 8500 }, { "epoch": 0.08304038530738783, "grad_norm": 1.961242437362671, "learning_rate": 4.9584844207010455e-05, "loss": 0.0883, "num_input_tokens_seen": 9216000, "step": 9000 }, { "epoch": 0.08765374004668715, "grad_norm": 1.5819107294082642, "learning_rate": 4.956177743331396e-05, "loss": 0.0849, "num_input_tokens_seen": 9728000, "step": 9500 }, { "epoch": 0.09226709478598648, "grad_norm": 0.8099465370178223, "learning_rate": 4.953871065961746e-05, "loss": 0.0925, "num_input_tokens_seen": 10240000, "step": 10000 }, { "epoch": 0.0968804495252858, "grad_norm": 0.8762685656547546, "learning_rate": 4.9515643885920963e-05, "loss": 0.0867, "num_input_tokens_seen": 10752000, "step": 10500 }, { "epoch": 0.10149380426458512, "grad_norm": 2.166046142578125, "learning_rate": 4.949257711222447e-05, "loss": 0.0906, "num_input_tokens_seen": 11264000, "step": 11000 }, { "epoch": 0.10610715900388444, "grad_norm": 0.9908414483070374, "learning_rate": 4.946951033852797e-05, "loss": 0.0989, "num_input_tokens_seen": 11776000, "step": 11500 }, { "epoch": 0.11072051374318377, "grad_norm": 0.9543777704238892, "learning_rate": 4.944644356483147e-05, "loss": 0.0917, "num_input_tokens_seen": 12288000, "step": 12000 }, { "epoch": 0.11533386848248309, "grad_norm": 2.302893877029419, "learning_rate": 4.942337679113498e-05, "loss": 0.0906, "num_input_tokens_seen": 12800000, "step": 12500 }, { "epoch": 0.11994722322178242, "grad_norm": 1.214758038520813, "learning_rate": 4.940031001743849e-05, "loss": 0.0831, "num_input_tokens_seen": 13312000, "step": 13000 }, { "epoch": 0.12456057796108173, "grad_norm": 1.4494785070419312, "learning_rate": 4.937724324374199e-05, "loss": 0.0949, "num_input_tokens_seen": 13824000, "step": 13500 }, { "epoch": 0.12917393270038105, "grad_norm": 1.3759499788284302, "learning_rate": 4.935417647004549e-05, "loss": 0.0775, "num_input_tokens_seen": 14336000, "step": 14000 }, { "epoch": 0.13378728743968038, "grad_norm": 1.4409326314926147, "learning_rate": 4.9331109696348995e-05, "loss": 0.0874, "num_input_tokens_seen": 14848000, "step": 14500 }, { "epoch": 0.1384006421789797, "grad_norm": 0.6916935443878174, "learning_rate": 4.9308042922652496e-05, "loss": 0.0888, "num_input_tokens_seen": 15360000, "step": 15000 }, { "epoch": 0.14301399691827904, "grad_norm": 2.6819546222686768, "learning_rate": 4.9284976148956e-05, "loss": 0.0866, "num_input_tokens_seen": 15872000, "step": 15500 }, { "epoch": 0.14762735165757837, "grad_norm": 2.243403434753418, "learning_rate": 4.9261909375259504e-05, "loss": 0.0858, "num_input_tokens_seen": 16384000, "step": 16000 }, { "epoch": 0.15224070639687767, "grad_norm": 0.6077441573143005, "learning_rate": 4.9238842601563004e-05, "loss": 0.0829, "num_input_tokens_seen": 16896000, "step": 16500 }, { "epoch": 0.156854061136177, "grad_norm": 0.7938207387924194, "learning_rate": 4.921577582786651e-05, "loss": 0.0807, "num_input_tokens_seen": 17408000, "step": 17000 }, { "epoch": 0.16146741587547633, "grad_norm": 1.7776683568954468, "learning_rate": 4.919270905417002e-05, "loss": 0.0801, "num_input_tokens_seen": 17920000, "step": 17500 }, { "epoch": 0.16608077061477566, "grad_norm": 0.9043099880218506, "learning_rate": 4.916964228047351e-05, "loss": 0.0806, "num_input_tokens_seen": 18432000, "step": 18000 }, { "epoch": 0.17069412535407497, "grad_norm": 3.0099849700927734, "learning_rate": 4.914657550677702e-05, "loss": 0.0801, "num_input_tokens_seen": 18944000, "step": 18500 }, { "epoch": 0.1753074800933743, "grad_norm": 1.3632686138153076, "learning_rate": 4.912350873308053e-05, "loss": 0.0858, "num_input_tokens_seen": 19456000, "step": 19000 }, { "epoch": 0.17992083483267363, "grad_norm": 1.3890104293823242, "learning_rate": 4.910044195938403e-05, "loss": 0.0807, "num_input_tokens_seen": 19968000, "step": 19500 }, { "epoch": 0.18453418957197296, "grad_norm": 1.393978476524353, "learning_rate": 4.907737518568753e-05, "loss": 0.09, "num_input_tokens_seen": 20480000, "step": 20000 }, { "epoch": 0.18914754431127226, "grad_norm": 0.9538819193840027, "learning_rate": 4.9054308411991036e-05, "loss": 0.0862, "num_input_tokens_seen": 20992000, "step": 20500 }, { "epoch": 0.1937608990505716, "grad_norm": 1.6974983215332031, "learning_rate": 4.903124163829454e-05, "loss": 0.0778, "num_input_tokens_seen": 21504000, "step": 21000 }, { "epoch": 0.19837425378987092, "grad_norm": 0.43043065071105957, "learning_rate": 4.900817486459804e-05, "loss": 0.0927, "num_input_tokens_seen": 22016000, "step": 21500 }, { "epoch": 0.20298760852917025, "grad_norm": 0.9475088119506836, "learning_rate": 4.8985108090901545e-05, "loss": 0.0813, "num_input_tokens_seen": 22528000, "step": 22000 }, { "epoch": 0.20760096326846958, "grad_norm": 3.547081470489502, "learning_rate": 4.8962041317205045e-05, "loss": 0.0849, "num_input_tokens_seen": 23040000, "step": 22500 }, { "epoch": 0.21221431800776888, "grad_norm": 1.2342774868011475, "learning_rate": 4.893897454350855e-05, "loss": 0.0831, "num_input_tokens_seen": 23552000, "step": 23000 }, { "epoch": 0.2168276727470682, "grad_norm": 2.133857488632202, "learning_rate": 4.891590776981205e-05, "loss": 0.0774, "num_input_tokens_seen": 24064000, "step": 23500 }, { "epoch": 0.22144102748636754, "grad_norm": 2.0566883087158203, "learning_rate": 4.889284099611556e-05, "loss": 0.0778, "num_input_tokens_seen": 24576000, "step": 24000 }, { "epoch": 0.22605438222566687, "grad_norm": 0.5913178324699402, "learning_rate": 4.886977422241906e-05, "loss": 0.0811, "num_input_tokens_seen": 25088000, "step": 24500 }, { "epoch": 0.23066773696496617, "grad_norm": 1.9674791097640991, "learning_rate": 4.884670744872256e-05, "loss": 0.0743, "num_input_tokens_seen": 25600000, "step": 25000 }, { "epoch": 0.2352810917042655, "grad_norm": 0.5584122538566589, "learning_rate": 4.882364067502607e-05, "loss": 0.0852, "num_input_tokens_seen": 26112000, "step": 25500 }, { "epoch": 0.23989444644356483, "grad_norm": 1.9229296445846558, "learning_rate": 4.880057390132957e-05, "loss": 0.0828, "num_input_tokens_seen": 26624000, "step": 26000 }, { "epoch": 0.24450780118286417, "grad_norm": 1.968058466911316, "learning_rate": 4.877750712763308e-05, "loss": 0.0822, "num_input_tokens_seen": 27136000, "step": 26500 }, { "epoch": 0.24912115592216347, "grad_norm": 1.6034080982208252, "learning_rate": 4.875444035393658e-05, "loss": 0.0822, "num_input_tokens_seen": 27648000, "step": 27000 }, { "epoch": 0.2537345106614628, "grad_norm": 1.7301759719848633, "learning_rate": 4.873137358024008e-05, "loss": 0.0833, "num_input_tokens_seen": 28160000, "step": 27500 }, { "epoch": 0.2583478654007621, "grad_norm": 2.2902233600616455, "learning_rate": 4.8708306806543585e-05, "loss": 0.0904, "num_input_tokens_seen": 28672000, "step": 28000 }, { "epoch": 0.26296122014006146, "grad_norm": 2.805758476257324, "learning_rate": 4.868524003284709e-05, "loss": 0.0854, "num_input_tokens_seen": 29184000, "step": 28500 }, { "epoch": 0.26757457487936076, "grad_norm": 1.0350342988967896, "learning_rate": 4.8662173259150587e-05, "loss": 0.0806, "num_input_tokens_seen": 29696000, "step": 29000 }, { "epoch": 0.2721879296186601, "grad_norm": 0.6509085893630981, "learning_rate": 4.8639106485454094e-05, "loss": 0.0846, "num_input_tokens_seen": 30208000, "step": 29500 }, { "epoch": 0.2768012843579594, "grad_norm": 1.2850301265716553, "learning_rate": 4.86160397117576e-05, "loss": 0.0857, "num_input_tokens_seen": 30720000, "step": 30000 }, { "epoch": 0.2814146390972587, "grad_norm": 1.7259219884872437, "learning_rate": 4.85929729380611e-05, "loss": 0.0839, "num_input_tokens_seen": 31232000, "step": 30500 }, { "epoch": 0.2860279938365581, "grad_norm": 1.7700318098068237, "learning_rate": 4.85699061643646e-05, "loss": 0.0768, "num_input_tokens_seen": 31744000, "step": 31000 }, { "epoch": 0.2906413485758574, "grad_norm": 1.1451270580291748, "learning_rate": 4.854683939066811e-05, "loss": 0.0824, "num_input_tokens_seen": 32256000, "step": 31500 }, { "epoch": 0.29525470331515674, "grad_norm": 1.772096872329712, "learning_rate": 4.852377261697161e-05, "loss": 0.0847, "num_input_tokens_seen": 32768000, "step": 32000 }, { "epoch": 0.29986805805445604, "grad_norm": 1.671513557434082, "learning_rate": 4.850070584327511e-05, "loss": 0.0838, "num_input_tokens_seen": 33280000, "step": 32500 }, { "epoch": 0.30448141279375535, "grad_norm": 0.9703548550605774, "learning_rate": 4.847763906957862e-05, "loss": 0.08, "num_input_tokens_seen": 33792000, "step": 33000 }, { "epoch": 0.3090947675330547, "grad_norm": 0.7928164601325989, "learning_rate": 4.8454572295882126e-05, "loss": 0.08, "num_input_tokens_seen": 34304000, "step": 33500 }, { "epoch": 0.313708122272354, "grad_norm": 1.1138111352920532, "learning_rate": 4.8431505522185626e-05, "loss": 0.0733, "num_input_tokens_seen": 34816000, "step": 34000 }, { "epoch": 0.3183214770116533, "grad_norm": 0.89890056848526, "learning_rate": 4.840843874848913e-05, "loss": 0.0828, "num_input_tokens_seen": 35328000, "step": 34500 }, { "epoch": 0.32293483175095267, "grad_norm": 2.127382516860962, "learning_rate": 4.8385371974792634e-05, "loss": 0.0818, "num_input_tokens_seen": 35840000, "step": 35000 }, { "epoch": 0.32754818649025197, "grad_norm": 1.0730081796646118, "learning_rate": 4.8362305201096135e-05, "loss": 0.0776, "num_input_tokens_seen": 36352000, "step": 35500 }, { "epoch": 0.3321615412295513, "grad_norm": 0.5055031180381775, "learning_rate": 4.833923842739964e-05, "loss": 0.085, "num_input_tokens_seen": 36864000, "step": 36000 }, { "epoch": 0.33677489596885063, "grad_norm": 2.764418601989746, "learning_rate": 4.831617165370314e-05, "loss": 0.0795, "num_input_tokens_seen": 37376000, "step": 36500 }, { "epoch": 0.34138825070814993, "grad_norm": 2.272135019302368, "learning_rate": 4.829310488000664e-05, "loss": 0.0757, "num_input_tokens_seen": 37888000, "step": 37000 }, { "epoch": 0.3460016054474493, "grad_norm": 2.2221481800079346, "learning_rate": 4.827003810631015e-05, "loss": 0.0881, "num_input_tokens_seen": 38400000, "step": 37500 }, { "epoch": 0.3506149601867486, "grad_norm": 1.7147547006607056, "learning_rate": 4.824697133261365e-05, "loss": 0.0805, "num_input_tokens_seen": 38912000, "step": 38000 }, { "epoch": 0.35522831492604795, "grad_norm": 2.031804084777832, "learning_rate": 4.822390455891715e-05, "loss": 0.0762, "num_input_tokens_seen": 39424000, "step": 38500 }, { "epoch": 0.35984166966534725, "grad_norm": 0.8008927702903748, "learning_rate": 4.820083778522066e-05, "loss": 0.0794, "num_input_tokens_seen": 39936000, "step": 39000 }, { "epoch": 0.36445502440464655, "grad_norm": 1.5696818828582764, "learning_rate": 4.8177771011524167e-05, "loss": 0.0821, "num_input_tokens_seen": 40448000, "step": 39500 }, { "epoch": 0.3690683791439459, "grad_norm": 0.7710667252540588, "learning_rate": 4.815470423782766e-05, "loss": 0.0776, "num_input_tokens_seen": 40960000, "step": 40000 }, { "epoch": 0.3736817338832452, "grad_norm": 1.0794172286987305, "learning_rate": 4.813163746413117e-05, "loss": 0.0781, "num_input_tokens_seen": 41472000, "step": 40500 }, { "epoch": 0.3782950886225445, "grad_norm": 2.43756365776062, "learning_rate": 4.8108570690434675e-05, "loss": 0.0787, "num_input_tokens_seen": 41984000, "step": 41000 }, { "epoch": 0.3829084433618439, "grad_norm": 0.6750785112380981, "learning_rate": 4.8085503916738176e-05, "loss": 0.081, "num_input_tokens_seen": 42496000, "step": 41500 }, { "epoch": 0.3875217981011432, "grad_norm": 0.7780609726905823, "learning_rate": 4.8062437143041676e-05, "loss": 0.0791, "num_input_tokens_seen": 43008000, "step": 42000 }, { "epoch": 0.39213515284044254, "grad_norm": 1.1585677862167358, "learning_rate": 4.8039370369345184e-05, "loss": 0.0811, "num_input_tokens_seen": 43520000, "step": 42500 }, { "epoch": 0.39674850757974184, "grad_norm": 2.7044448852539062, "learning_rate": 4.8016303595648684e-05, "loss": 0.0775, "num_input_tokens_seen": 44032000, "step": 43000 }, { "epoch": 0.40136186231904114, "grad_norm": 2.9311044216156006, "learning_rate": 4.799323682195219e-05, "loss": 0.0739, "num_input_tokens_seen": 44544000, "step": 43500 }, { "epoch": 0.4059752170583405, "grad_norm": 2.255924940109253, "learning_rate": 4.797017004825569e-05, "loss": 0.0814, "num_input_tokens_seen": 45056000, "step": 44000 }, { "epoch": 0.4105885717976398, "grad_norm": 3.5307369232177734, "learning_rate": 4.79471032745592e-05, "loss": 0.0773, "num_input_tokens_seen": 45568000, "step": 44500 }, { "epoch": 0.41520192653693916, "grad_norm": 0.7721351385116577, "learning_rate": 4.79240365008627e-05, "loss": 0.074, "num_input_tokens_seen": 46080000, "step": 45000 }, { "epoch": 0.41981528127623846, "grad_norm": 1.668393611907959, "learning_rate": 4.79009697271662e-05, "loss": 0.0763, "num_input_tokens_seen": 46592000, "step": 45500 }, { "epoch": 0.42442863601553776, "grad_norm": 2.3824353218078613, "learning_rate": 4.787790295346971e-05, "loss": 0.0772, "num_input_tokens_seen": 47104000, "step": 46000 }, { "epoch": 0.4290419907548371, "grad_norm": 2.127598762512207, "learning_rate": 4.785483617977321e-05, "loss": 0.0803, "num_input_tokens_seen": 47616000, "step": 46500 }, { "epoch": 0.4336553454941364, "grad_norm": 2.958203077316284, "learning_rate": 4.7831769406076716e-05, "loss": 0.0781, "num_input_tokens_seen": 48128000, "step": 47000 }, { "epoch": 0.4382687002334357, "grad_norm": 0.7533183693885803, "learning_rate": 4.7808702632380217e-05, "loss": 0.0793, "num_input_tokens_seen": 48640000, "step": 47500 }, { "epoch": 0.4428820549727351, "grad_norm": 1.3638031482696533, "learning_rate": 4.778563585868372e-05, "loss": 0.081, "num_input_tokens_seen": 49152000, "step": 48000 }, { "epoch": 0.4474954097120344, "grad_norm": 1.3746527433395386, "learning_rate": 4.7762569084987225e-05, "loss": 0.0863, "num_input_tokens_seen": 49664000, "step": 48500 }, { "epoch": 0.45210876445133374, "grad_norm": 1.5628637075424194, "learning_rate": 4.773950231129073e-05, "loss": 0.0799, "num_input_tokens_seen": 50176000, "step": 49000 }, { "epoch": 0.45672211919063305, "grad_norm": 1.8787376880645752, "learning_rate": 4.7716435537594226e-05, "loss": 0.0782, "num_input_tokens_seen": 50688000, "step": 49500 }, { "epoch": 0.46133547392993235, "grad_norm": 1.3804419040679932, "learning_rate": 4.769336876389773e-05, "loss": 0.0833, "num_input_tokens_seen": 51200000, "step": 50000 }, { "epoch": 0.4659488286692317, "grad_norm": 1.6135491132736206, "learning_rate": 4.767030199020124e-05, "loss": 0.0762, "num_input_tokens_seen": 51712000, "step": 50500 }, { "epoch": 0.470562183408531, "grad_norm": 2.186791181564331, "learning_rate": 4.7647235216504734e-05, "loss": 0.0797, "num_input_tokens_seen": 52224000, "step": 51000 }, { "epoch": 0.4751755381478303, "grad_norm": 1.6921688318252563, "learning_rate": 4.762416844280824e-05, "loss": 0.0812, "num_input_tokens_seen": 52736000, "step": 51500 }, { "epoch": 0.47978889288712967, "grad_norm": 0.95241379737854, "learning_rate": 4.760110166911175e-05, "loss": 0.0788, "num_input_tokens_seen": 53248000, "step": 52000 }, { "epoch": 0.484402247626429, "grad_norm": 3.2142257690429688, "learning_rate": 4.757803489541525e-05, "loss": 0.0776, "num_input_tokens_seen": 53760000, "step": 52500 }, { "epoch": 0.48901560236572833, "grad_norm": 3.2678260803222656, "learning_rate": 4.755496812171875e-05, "loss": 0.0753, "num_input_tokens_seen": 54272000, "step": 53000 }, { "epoch": 0.49362895710502763, "grad_norm": 2.8343145847320557, "learning_rate": 4.753190134802226e-05, "loss": 0.0784, "num_input_tokens_seen": 54784000, "step": 53500 }, { "epoch": 0.49824231184432693, "grad_norm": 1.4818017482757568, "learning_rate": 4.750883457432576e-05, "loss": 0.0752, "num_input_tokens_seen": 55296000, "step": 54000 }, { "epoch": 0.5028556665836262, "grad_norm": 1.2139348983764648, "learning_rate": 4.7485767800629265e-05, "loss": 0.0734, "num_input_tokens_seen": 55808000, "step": 54500 }, { "epoch": 0.5074690213229256, "grad_norm": 1.3937476873397827, "learning_rate": 4.7462701026932766e-05, "loss": 0.0759, "num_input_tokens_seen": 56320000, "step": 55000 }, { "epoch": 0.512082376062225, "grad_norm": 1.7801790237426758, "learning_rate": 4.743963425323627e-05, "loss": 0.0799, "num_input_tokens_seen": 56832000, "step": 55500 }, { "epoch": 0.5166957308015242, "grad_norm": 0.9710603952407837, "learning_rate": 4.7416567479539774e-05, "loss": 0.0705, "num_input_tokens_seen": 57344000, "step": 56000 }, { "epoch": 0.5213090855408236, "grad_norm": 1.3923077583312988, "learning_rate": 4.739350070584328e-05, "loss": 0.0778, "num_input_tokens_seen": 57856000, "step": 56500 }, { "epoch": 0.5259224402801229, "grad_norm": 0.5901740193367004, "learning_rate": 4.737043393214678e-05, "loss": 0.0729, "num_input_tokens_seen": 58368000, "step": 57000 }, { "epoch": 0.5305357950194223, "grad_norm": 1.3465195894241333, "learning_rate": 4.734736715845028e-05, "loss": 0.0797, "num_input_tokens_seen": 58880000, "step": 57500 }, { "epoch": 0.5351491497587215, "grad_norm": 0.48033392429351807, "learning_rate": 4.732430038475379e-05, "loss": 0.0736, "num_input_tokens_seen": 59392000, "step": 58000 }, { "epoch": 0.5397625044980209, "grad_norm": 1.3446660041809082, "learning_rate": 4.730123361105729e-05, "loss": 0.0778, "num_input_tokens_seen": 59904000, "step": 58500 }, { "epoch": 0.5443758592373202, "grad_norm": 0.895521342754364, "learning_rate": 4.727816683736079e-05, "loss": 0.0754, "num_input_tokens_seen": 60416000, "step": 59000 }, { "epoch": 0.5489892139766195, "grad_norm": 1.3843989372253418, "learning_rate": 4.72551000636643e-05, "loss": 0.0817, "num_input_tokens_seen": 60928000, "step": 59500 }, { "epoch": 0.5536025687159188, "grad_norm": 1.5670028924942017, "learning_rate": 4.7232033289967806e-05, "loss": 0.0742, "num_input_tokens_seen": 61440000, "step": 60000 }, { "epoch": 0.5582159234552182, "grad_norm": 1.4761849641799927, "learning_rate": 4.72089665162713e-05, "loss": 0.0688, "num_input_tokens_seen": 61952000, "step": 60500 }, { "epoch": 0.5628292781945174, "grad_norm": 6.005481719970703, "learning_rate": 4.718589974257481e-05, "loss": 0.0836, "num_input_tokens_seen": 62464000, "step": 61000 }, { "epoch": 0.5674426329338168, "grad_norm": 1.2835499048233032, "learning_rate": 4.7162832968878314e-05, "loss": 0.0731, "num_input_tokens_seen": 62976000, "step": 61500 }, { "epoch": 0.5720559876731162, "grad_norm": 1.769403338432312, "learning_rate": 4.7139766195181815e-05, "loss": 0.079, "num_input_tokens_seen": 63488000, "step": 62000 }, { "epoch": 0.5766693424124154, "grad_norm": 1.8391185998916626, "learning_rate": 4.7116699421485315e-05, "loss": 0.082, "num_input_tokens_seen": 64000000, "step": 62500 }, { "epoch": 0.5812826971517148, "grad_norm": 1.3075145483016968, "learning_rate": 4.709363264778882e-05, "loss": 0.0753, "num_input_tokens_seen": 64512000, "step": 63000 }, { "epoch": 0.5858960518910141, "grad_norm": 2.2406928539276123, "learning_rate": 4.707056587409232e-05, "loss": 0.0737, "num_input_tokens_seen": 65024000, "step": 63500 }, { "epoch": 0.5905094066303135, "grad_norm": 2.2750511169433594, "learning_rate": 4.7047499100395824e-05, "loss": 0.077, "num_input_tokens_seen": 65536000, "step": 64000 }, { "epoch": 0.5951227613696127, "grad_norm": 1.7060987949371338, "learning_rate": 4.702443232669933e-05, "loss": 0.0764, "num_input_tokens_seen": 66048000, "step": 64500 }, { "epoch": 0.5997361161089121, "grad_norm": 1.3420023918151855, "learning_rate": 4.700136555300283e-05, "loss": 0.0803, "num_input_tokens_seen": 66560000, "step": 65000 }, { "epoch": 0.6043494708482114, "grad_norm": 0.8915556073188782, "learning_rate": 4.697829877930634e-05, "loss": 0.0765, "num_input_tokens_seen": 67072000, "step": 65500 }, { "epoch": 0.6089628255875107, "grad_norm": 2.3567070960998535, "learning_rate": 4.695523200560984e-05, "loss": 0.0739, "num_input_tokens_seen": 67584000, "step": 66000 }, { "epoch": 0.61357618032681, "grad_norm": 1.8976528644561768, "learning_rate": 4.693216523191335e-05, "loss": 0.0738, "num_input_tokens_seen": 68096000, "step": 66500 }, { "epoch": 0.6181895350661094, "grad_norm": 2.0413930416107178, "learning_rate": 4.690909845821685e-05, "loss": 0.0826, "num_input_tokens_seen": 68608000, "step": 67000 }, { "epoch": 0.6228028898054087, "grad_norm": 4.672994613647461, "learning_rate": 4.6886031684520355e-05, "loss": 0.0773, "num_input_tokens_seen": 69120000, "step": 67500 }, { "epoch": 0.627416244544708, "grad_norm": 1.1743087768554688, "learning_rate": 4.6862964910823856e-05, "loss": 0.0745, "num_input_tokens_seen": 69632000, "step": 68000 }, { "epoch": 0.6320295992840074, "grad_norm": 0.7749766707420349, "learning_rate": 4.6839898137127356e-05, "loss": 0.0738, "num_input_tokens_seen": 70144000, "step": 68500 }, { "epoch": 0.6366429540233066, "grad_norm": 0.5075979232788086, "learning_rate": 4.6816831363430864e-05, "loss": 0.0747, "num_input_tokens_seen": 70656000, "step": 69000 }, { "epoch": 0.641256308762606, "grad_norm": 2.802272081375122, "learning_rate": 4.679376458973437e-05, "loss": 0.0825, "num_input_tokens_seen": 71168000, "step": 69500 }, { "epoch": 0.6458696635019053, "grad_norm": 1.798438549041748, "learning_rate": 4.6770697816037865e-05, "loss": 0.0766, "num_input_tokens_seen": 71680000, "step": 70000 }, { "epoch": 0.6504830182412047, "grad_norm": 1.7648403644561768, "learning_rate": 4.674763104234137e-05, "loss": 0.077, "num_input_tokens_seen": 72192000, "step": 70500 }, { "epoch": 0.6550963729805039, "grad_norm": 2.0195560455322266, "learning_rate": 4.672456426864488e-05, "loss": 0.0767, "num_input_tokens_seen": 72704000, "step": 71000 }, { "epoch": 0.6597097277198033, "grad_norm": 3.9862349033355713, "learning_rate": 4.670149749494837e-05, "loss": 0.0745, "num_input_tokens_seen": 73216000, "step": 71500 }, { "epoch": 0.6643230824591027, "grad_norm": 2.7226781845092773, "learning_rate": 4.667843072125188e-05, "loss": 0.0703, "num_input_tokens_seen": 73728000, "step": 72000 }, { "epoch": 0.6689364371984019, "grad_norm": 2.0484044551849365, "learning_rate": 4.665536394755539e-05, "loss": 0.0765, "num_input_tokens_seen": 74240000, "step": 72500 }, { "epoch": 0.6735497919377013, "grad_norm": 0.4825538694858551, "learning_rate": 4.663229717385889e-05, "loss": 0.0823, "num_input_tokens_seen": 74752000, "step": 73000 }, { "epoch": 0.6781631466770006, "grad_norm": 1.2127926349639893, "learning_rate": 4.660923040016239e-05, "loss": 0.0754, "num_input_tokens_seen": 75264000, "step": 73500 }, { "epoch": 0.6827765014162999, "grad_norm": 3.139049768447876, "learning_rate": 4.6586163626465897e-05, "loss": 0.0749, "num_input_tokens_seen": 75776000, "step": 74000 }, { "epoch": 0.6873898561555992, "grad_norm": 2.038872480392456, "learning_rate": 4.65630968527694e-05, "loss": 0.0753, "num_input_tokens_seen": 76288000, "step": 74500 }, { "epoch": 0.6920032108948986, "grad_norm": 4.1413469314575195, "learning_rate": 4.6540030079072904e-05, "loss": 0.0761, "num_input_tokens_seen": 76800000, "step": 75000 }, { "epoch": 0.6966165656341978, "grad_norm": 1.3078006505966187, "learning_rate": 4.6516963305376405e-05, "loss": 0.0766, "num_input_tokens_seen": 77312000, "step": 75500 }, { "epoch": 0.7012299203734972, "grad_norm": 1.2052334547042847, "learning_rate": 4.649389653167991e-05, "loss": 0.0749, "num_input_tokens_seen": 77824000, "step": 76000 }, { "epoch": 0.7058432751127965, "grad_norm": 1.5266985893249512, "learning_rate": 4.647082975798341e-05, "loss": 0.0768, "num_input_tokens_seen": 78336000, "step": 76500 }, { "epoch": 0.7104566298520959, "grad_norm": 13.878520011901855, "learning_rate": 4.6447762984286914e-05, "loss": 0.0813, "num_input_tokens_seen": 78848000, "step": 77000 }, { "epoch": 0.7150699845913951, "grad_norm": 0.8548376560211182, "learning_rate": 4.642469621059042e-05, "loss": 0.0693, "num_input_tokens_seen": 79360000, "step": 77500 }, { "epoch": 0.7196833393306945, "grad_norm": 1.8979346752166748, "learning_rate": 4.640162943689392e-05, "loss": 0.0795, "num_input_tokens_seen": 79872000, "step": 78000 }, { "epoch": 0.7242966940699939, "grad_norm": 0.6193153262138367, "learning_rate": 4.637856266319743e-05, "loss": 0.0776, "num_input_tokens_seen": 80384000, "step": 78500 }, { "epoch": 0.7289100488092931, "grad_norm": 1.736380934715271, "learning_rate": 4.635549588950093e-05, "loss": 0.079, "num_input_tokens_seen": 80896000, "step": 79000 }, { "epoch": 0.7335234035485925, "grad_norm": 3.559295415878296, "learning_rate": 4.633242911580443e-05, "loss": 0.0792, "num_input_tokens_seen": 81408000, "step": 79500 }, { "epoch": 0.7381367582878918, "grad_norm": 1.017986536026001, "learning_rate": 4.630936234210794e-05, "loss": 0.0782, "num_input_tokens_seen": 81920000, "step": 80000 }, { "epoch": 0.7427501130271911, "grad_norm": 1.2457808256149292, "learning_rate": 4.6286295568411445e-05, "loss": 0.0766, "num_input_tokens_seen": 82432000, "step": 80500 }, { "epoch": 0.7473634677664904, "grad_norm": 0.6746057271957397, "learning_rate": 4.626322879471494e-05, "loss": 0.0728, "num_input_tokens_seen": 82944000, "step": 81000 }, { "epoch": 0.7519768225057898, "grad_norm": 1.1048623323440552, "learning_rate": 4.6240162021018446e-05, "loss": 0.0763, "num_input_tokens_seen": 83456000, "step": 81500 }, { "epoch": 0.756590177245089, "grad_norm": 2.0804615020751953, "learning_rate": 4.621709524732195e-05, "loss": 0.0736, "num_input_tokens_seen": 83968000, "step": 82000 }, { "epoch": 0.7612035319843884, "grad_norm": 0.7726876735687256, "learning_rate": 4.6194028473625454e-05, "loss": 0.0756, "num_input_tokens_seen": 84480000, "step": 82500 }, { "epoch": 0.7658168867236878, "grad_norm": 1.618414044380188, "learning_rate": 4.6170961699928954e-05, "loss": 0.0736, "num_input_tokens_seen": 84992000, "step": 83000 }, { "epoch": 0.7704302414629871, "grad_norm": 0.2806508243083954, "learning_rate": 4.614789492623246e-05, "loss": 0.0757, "num_input_tokens_seen": 85504000, "step": 83500 }, { "epoch": 0.7750435962022864, "grad_norm": 1.093205451965332, "learning_rate": 4.612482815253596e-05, "loss": 0.0746, "num_input_tokens_seen": 86016000, "step": 84000 }, { "epoch": 0.7796569509415857, "grad_norm": 0.8395510911941528, "learning_rate": 4.610176137883946e-05, "loss": 0.0728, "num_input_tokens_seen": 86528000, "step": 84500 }, { "epoch": 0.7842703056808851, "grad_norm": 5.429121017456055, "learning_rate": 4.607869460514297e-05, "loss": 0.0752, "num_input_tokens_seen": 87040000, "step": 85000 }, { "epoch": 0.7888836604201843, "grad_norm": 1.0684977769851685, "learning_rate": 4.605562783144647e-05, "loss": 0.0734, "num_input_tokens_seen": 87552000, "step": 85500 }, { "epoch": 0.7934970151594837, "grad_norm": 4.412910461425781, "learning_rate": 4.603256105774998e-05, "loss": 0.0724, "num_input_tokens_seen": 88064000, "step": 86000 }, { "epoch": 0.798110369898783, "grad_norm": 1.352186918258667, "learning_rate": 4.600949428405348e-05, "loss": 0.0752, "num_input_tokens_seen": 88576000, "step": 86500 }, { "epoch": 0.8027237246380823, "grad_norm": 3.716979742050171, "learning_rate": 4.5986427510356986e-05, "loss": 0.0712, "num_input_tokens_seen": 89088000, "step": 87000 }, { "epoch": 0.8073370793773816, "grad_norm": 1.6584104299545288, "learning_rate": 4.596336073666049e-05, "loss": 0.0733, "num_input_tokens_seen": 89600000, "step": 87500 }, { "epoch": 0.811950434116681, "grad_norm": 2.3811452388763428, "learning_rate": 4.5940293962963994e-05, "loss": 0.0763, "num_input_tokens_seen": 90112000, "step": 88000 }, { "epoch": 0.8165637888559802, "grad_norm": 1.4352256059646606, "learning_rate": 4.5917227189267495e-05, "loss": 0.0696, "num_input_tokens_seen": 90624000, "step": 88500 }, { "epoch": 0.8211771435952796, "grad_norm": 2.95996356010437, "learning_rate": 4.5894160415570995e-05, "loss": 0.0675, "num_input_tokens_seen": 91136000, "step": 89000 }, { "epoch": 0.825790498334579, "grad_norm": 1.790480375289917, "learning_rate": 4.58710936418745e-05, "loss": 0.0737, "num_input_tokens_seen": 91648000, "step": 89500 }, { "epoch": 0.8304038530738783, "grad_norm": 2.4636244773864746, "learning_rate": 4.5848026868178e-05, "loss": 0.0725, "num_input_tokens_seen": 92160000, "step": 90000 }, { "epoch": 0.8350172078131776, "grad_norm": 1.4085214138031006, "learning_rate": 4.5824960094481504e-05, "loss": 0.0801, "num_input_tokens_seen": 92672000, "step": 90500 }, { "epoch": 0.8396305625524769, "grad_norm": 1.5080194473266602, "learning_rate": 4.580189332078501e-05, "loss": 0.0707, "num_input_tokens_seen": 93184000, "step": 91000 }, { "epoch": 0.8442439172917763, "grad_norm": 0.8035141229629517, "learning_rate": 4.577882654708852e-05, "loss": 0.0775, "num_input_tokens_seen": 93696000, "step": 91500 }, { "epoch": 0.8488572720310755, "grad_norm": 1.832581639289856, "learning_rate": 4.575575977339201e-05, "loss": 0.076, "num_input_tokens_seen": 94208000, "step": 92000 }, { "epoch": 0.8534706267703749, "grad_norm": 0.5887289047241211, "learning_rate": 4.573269299969552e-05, "loss": 0.0752, "num_input_tokens_seen": 94720000, "step": 92500 }, { "epoch": 0.8580839815096742, "grad_norm": 0.7849867939949036, "learning_rate": 4.570962622599903e-05, "loss": 0.0815, "num_input_tokens_seen": 95232000, "step": 93000 }, { "epoch": 0.8626973362489735, "grad_norm": 2.76053524017334, "learning_rate": 4.568655945230253e-05, "loss": 0.0696, "num_input_tokens_seen": 95744000, "step": 93500 }, { "epoch": 0.8673106909882728, "grad_norm": 0.608044445514679, "learning_rate": 4.566349267860603e-05, "loss": 0.0764, "num_input_tokens_seen": 96256000, "step": 94000 }, { "epoch": 0.8719240457275722, "grad_norm": 2.4751555919647217, "learning_rate": 4.5640425904909536e-05, "loss": 0.0706, "num_input_tokens_seen": 96768000, "step": 94500 }, { "epoch": 0.8765374004668715, "grad_norm": 0.5605325698852539, "learning_rate": 4.5617359131213036e-05, "loss": 0.074, "num_input_tokens_seen": 97280000, "step": 95000 }, { "epoch": 0.8811507552061708, "grad_norm": 2.0805656909942627, "learning_rate": 4.5594292357516544e-05, "loss": 0.0723, "num_input_tokens_seen": 97792000, "step": 95500 }, { "epoch": 0.8857641099454702, "grad_norm": 0.8538010120391846, "learning_rate": 4.5571225583820044e-05, "loss": 0.0755, "num_input_tokens_seen": 98304000, "step": 96000 }, { "epoch": 0.8903774646847694, "grad_norm": 0.7344834804534912, "learning_rate": 4.5548158810123545e-05, "loss": 0.0722, "num_input_tokens_seen": 98816000, "step": 96500 }, { "epoch": 0.8949908194240688, "grad_norm": 0.9666327238082886, "learning_rate": 4.552509203642705e-05, "loss": 0.0777, "num_input_tokens_seen": 99328000, "step": 97000 }, { "epoch": 0.8996041741633681, "grad_norm": 1.5512099266052246, "learning_rate": 4.550202526273055e-05, "loss": 0.0751, "num_input_tokens_seen": 99840000, "step": 97500 }, { "epoch": 0.9042175289026675, "grad_norm": 0.9923927187919617, "learning_rate": 4.547895848903406e-05, "loss": 0.073, "num_input_tokens_seen": 100352000, "step": 98000 }, { "epoch": 0.9088308836419667, "grad_norm": 1.5789976119995117, "learning_rate": 4.545589171533756e-05, "loss": 0.068, "num_input_tokens_seen": 100864000, "step": 98500 }, { "epoch": 0.9134442383812661, "grad_norm": 0.3622562885284424, "learning_rate": 4.543282494164107e-05, "loss": 0.0711, "num_input_tokens_seen": 101376000, "step": 99000 }, { "epoch": 0.9180575931205655, "grad_norm": 1.9762753248214722, "learning_rate": 4.540975816794457e-05, "loss": 0.0678, "num_input_tokens_seen": 101888000, "step": 99500 }, { "epoch": 0.9226709478598647, "grad_norm": 2.144947052001953, "learning_rate": 4.538669139424807e-05, "loss": 0.0705, "num_input_tokens_seen": 102400000, "step": 100000 }, { "epoch": 0.9272843025991641, "grad_norm": 0.5793939232826233, "learning_rate": 4.5363624620551576e-05, "loss": 0.0798, "num_input_tokens_seen": 102912000, "step": 100500 }, { "epoch": 0.9318976573384634, "grad_norm": 1.8652976751327515, "learning_rate": 4.5340557846855084e-05, "loss": 0.0723, "num_input_tokens_seen": 103424000, "step": 101000 }, { "epoch": 0.9365110120777627, "grad_norm": 1.8371716737747192, "learning_rate": 4.531749107315858e-05, "loss": 0.0752, "num_input_tokens_seen": 103936000, "step": 101500 }, { "epoch": 0.941124366817062, "grad_norm": 1.0695359706878662, "learning_rate": 4.5294424299462085e-05, "loss": 0.0786, "num_input_tokens_seen": 104448000, "step": 102000 }, { "epoch": 0.9457377215563614, "grad_norm": 1.6259958744049072, "learning_rate": 4.527135752576559e-05, "loss": 0.0726, "num_input_tokens_seen": 104960000, "step": 102500 }, { "epoch": 0.9503510762956606, "grad_norm": 2.0838193893432617, "learning_rate": 4.5248290752069086e-05, "loss": 0.0729, "num_input_tokens_seen": 105472000, "step": 103000 }, { "epoch": 0.95496443103496, "grad_norm": 1.8072469234466553, "learning_rate": 4.5225223978372593e-05, "loss": 0.0725, "num_input_tokens_seen": 105984000, "step": 103500 }, { "epoch": 0.9595777857742593, "grad_norm": 1.4469674825668335, "learning_rate": 4.52021572046761e-05, "loss": 0.0762, "num_input_tokens_seen": 106496000, "step": 104000 }, { "epoch": 0.9641911405135587, "grad_norm": 0.8151160478591919, "learning_rate": 4.51790904309796e-05, "loss": 0.0713, "num_input_tokens_seen": 107008000, "step": 104500 }, { "epoch": 0.968804495252858, "grad_norm": 2.5363306999206543, "learning_rate": 4.51560236572831e-05, "loss": 0.0717, "num_input_tokens_seen": 107520000, "step": 105000 }, { "epoch": 0.9734178499921573, "grad_norm": 2.3089513778686523, "learning_rate": 4.513295688358661e-05, "loss": 0.075, "num_input_tokens_seen": 108032000, "step": 105500 }, { "epoch": 0.9780312047314567, "grad_norm": 1.2738145589828491, "learning_rate": 4.510989010989011e-05, "loss": 0.0739, "num_input_tokens_seen": 108544000, "step": 106000 }, { "epoch": 0.9826445594707559, "grad_norm": 0.9310311675071716, "learning_rate": 4.508682333619362e-05, "loss": 0.0715, "num_input_tokens_seen": 109056000, "step": 106500 }, { "epoch": 0.9872579142100553, "grad_norm": 1.332413911819458, "learning_rate": 4.506375656249712e-05, "loss": 0.0762, "num_input_tokens_seen": 109568000, "step": 107000 }, { "epoch": 0.9918712689493546, "grad_norm": 1.171770691871643, "learning_rate": 4.504068978880062e-05, "loss": 0.0682, "num_input_tokens_seen": 110080000, "step": 107500 }, { "epoch": 0.9964846236886539, "grad_norm": 1.318642497062683, "learning_rate": 4.5017623015104126e-05, "loss": 0.0725, "num_input_tokens_seen": 110592000, "step": 108000 }, { "epoch": 1.0, "eval_combined_score": 0.07267016709729579, "eval_loss": 0.07267016172409058, "eval_mse": 0.07267016501992041, "eval_runtime": 46.4186, "eval_samples_per_second": 2075.42, "eval_steps_per_second": 259.444, "num_input_tokens_seen": 110981376, "step": 108381 }, { "epoch": 1.0010979784279532, "grad_norm": 2.0301551818847656, "learning_rate": 4.499455624140763e-05, "loss": 0.0723, "num_input_tokens_seen": 111103232, "step": 108500 }, { "epoch": 1.0057113331672525, "grad_norm": 0.46064960956573486, "learning_rate": 4.4971489467711134e-05, "loss": 0.066, "num_input_tokens_seen": 111615232, "step": 109000 }, { "epoch": 1.010324687906552, "grad_norm": 2.481804132461548, "learning_rate": 4.4948422694014634e-05, "loss": 0.0567, "num_input_tokens_seen": 112127232, "step": 109500 }, { "epoch": 1.0149380426458512, "grad_norm": 1.0883979797363281, "learning_rate": 4.492535592031814e-05, "loss": 0.0591, "num_input_tokens_seen": 112639232, "step": 110000 }, { "epoch": 1.0195513973851504, "grad_norm": 1.5821534395217896, "learning_rate": 4.490228914662164e-05, "loss": 0.0575, "num_input_tokens_seen": 113151232, "step": 110500 }, { "epoch": 1.02416475212445, "grad_norm": 1.1834355592727661, "learning_rate": 4.487922237292514e-05, "loss": 0.0643, "num_input_tokens_seen": 113663232, "step": 111000 }, { "epoch": 1.0287781068637492, "grad_norm": 0.5016165375709534, "learning_rate": 4.485615559922865e-05, "loss": 0.0598, "num_input_tokens_seen": 114175232, "step": 111500 }, { "epoch": 1.0333914616030484, "grad_norm": 2.372044086456299, "learning_rate": 4.483308882553216e-05, "loss": 0.0608, "num_input_tokens_seen": 114687232, "step": 112000 }, { "epoch": 1.0380048163423479, "grad_norm": 1.4434441328048706, "learning_rate": 4.481002205183565e-05, "loss": 0.059, "num_input_tokens_seen": 115199232, "step": 112500 }, { "epoch": 1.0426181710816471, "grad_norm": 1.329825520515442, "learning_rate": 4.478695527813916e-05, "loss": 0.061, "num_input_tokens_seen": 115711232, "step": 113000 }, { "epoch": 1.0472315258209464, "grad_norm": 0.6627879738807678, "learning_rate": 4.4763888504442666e-05, "loss": 0.0562, "num_input_tokens_seen": 116223232, "step": 113500 }, { "epoch": 1.0518448805602458, "grad_norm": 1.4965338706970215, "learning_rate": 4.474082173074617e-05, "loss": 0.0614, "num_input_tokens_seen": 116735232, "step": 114000 }, { "epoch": 1.056458235299545, "grad_norm": 4.595455646514893, "learning_rate": 4.471775495704967e-05, "loss": 0.0569, "num_input_tokens_seen": 117247232, "step": 114500 }, { "epoch": 1.0610715900388445, "grad_norm": 1.5899192094802856, "learning_rate": 4.4694688183353175e-05, "loss": 0.058, "num_input_tokens_seen": 117759232, "step": 115000 }, { "epoch": 1.0656849447781438, "grad_norm": 1.812812328338623, "learning_rate": 4.4671621409656675e-05, "loss": 0.0564, "num_input_tokens_seen": 118271232, "step": 115500 }, { "epoch": 1.070298299517443, "grad_norm": 1.8089003562927246, "learning_rate": 4.4648554635960176e-05, "loss": 0.0664, "num_input_tokens_seen": 118783232, "step": 116000 }, { "epoch": 1.0749116542567425, "grad_norm": 2.216608762741089, "learning_rate": 4.462548786226368e-05, "loss": 0.0599, "num_input_tokens_seen": 119295232, "step": 116500 }, { "epoch": 1.0795250089960418, "grad_norm": 2.6362509727478027, "learning_rate": 4.4602421088567184e-05, "loss": 0.0585, "num_input_tokens_seen": 119807232, "step": 117000 }, { "epoch": 1.084138363735341, "grad_norm": 0.8326151371002197, "learning_rate": 4.457935431487069e-05, "loss": 0.0593, "num_input_tokens_seen": 120319232, "step": 117500 }, { "epoch": 1.0887517184746405, "grad_norm": 1.3363105058670044, "learning_rate": 4.455628754117419e-05, "loss": 0.056, "num_input_tokens_seen": 120831232, "step": 118000 }, { "epoch": 1.0933650732139397, "grad_norm": 2.2342283725738525, "learning_rate": 4.45332207674777e-05, "loss": 0.0607, "num_input_tokens_seen": 121343232, "step": 118500 }, { "epoch": 1.097978427953239, "grad_norm": 1.9718506336212158, "learning_rate": 4.45101539937812e-05, "loss": 0.0625, "num_input_tokens_seen": 121855232, "step": 119000 }, { "epoch": 1.1025917826925384, "grad_norm": 0.7142735123634338, "learning_rate": 4.448708722008471e-05, "loss": 0.0565, "num_input_tokens_seen": 122367232, "step": 119500 }, { "epoch": 1.1072051374318377, "grad_norm": 1.1628931760787964, "learning_rate": 4.446402044638821e-05, "loss": 0.0583, "num_input_tokens_seen": 122879232, "step": 120000 }, { "epoch": 1.111818492171137, "grad_norm": 1.8776410818099976, "learning_rate": 4.444095367269171e-05, "loss": 0.0642, "num_input_tokens_seen": 123391232, "step": 120500 }, { "epoch": 1.1164318469104364, "grad_norm": 1.5755925178527832, "learning_rate": 4.4417886898995216e-05, "loss": 0.0631, "num_input_tokens_seen": 123903232, "step": 121000 }, { "epoch": 1.1210452016497356, "grad_norm": 1.7925944328308105, "learning_rate": 4.4394820125298716e-05, "loss": 0.0603, "num_input_tokens_seen": 124415232, "step": 121500 }, { "epoch": 1.125658556389035, "grad_norm": 2.4041876792907715, "learning_rate": 4.437175335160222e-05, "loss": 0.0552, "num_input_tokens_seen": 124927232, "step": 122000 }, { "epoch": 1.1302719111283344, "grad_norm": 2.1456570625305176, "learning_rate": 4.4348686577905724e-05, "loss": 0.065, "num_input_tokens_seen": 125439232, "step": 122500 }, { "epoch": 1.1348852658676336, "grad_norm": 1.278905987739563, "learning_rate": 4.432561980420923e-05, "loss": 0.0648, "num_input_tokens_seen": 125951232, "step": 123000 }, { "epoch": 1.1394986206069329, "grad_norm": 1.4145876169204712, "learning_rate": 4.4302553030512725e-05, "loss": 0.0603, "num_input_tokens_seen": 126463232, "step": 123500 }, { "epoch": 1.1441119753462323, "grad_norm": 1.247292160987854, "learning_rate": 4.427948625681623e-05, "loss": 0.0616, "num_input_tokens_seen": 126975232, "step": 124000 }, { "epoch": 1.1487253300855316, "grad_norm": 1.0648530721664429, "learning_rate": 4.425641948311974e-05, "loss": 0.0577, "num_input_tokens_seen": 127487232, "step": 124500 }, { "epoch": 1.1533386848248308, "grad_norm": 2.285616874694824, "learning_rate": 4.423335270942324e-05, "loss": 0.0574, "num_input_tokens_seen": 127999232, "step": 125000 }, { "epoch": 1.1579520395641303, "grad_norm": 1.124847173690796, "learning_rate": 4.421028593572674e-05, "loss": 0.0599, "num_input_tokens_seen": 128511232, "step": 125500 }, { "epoch": 1.1625653943034295, "grad_norm": 2.4443585872650146, "learning_rate": 4.418721916203025e-05, "loss": 0.0568, "num_input_tokens_seen": 129023232, "step": 126000 }, { "epoch": 1.167178749042729, "grad_norm": 0.8579834699630737, "learning_rate": 4.416415238833375e-05, "loss": 0.0628, "num_input_tokens_seen": 129535232, "step": 126500 }, { "epoch": 1.1717921037820282, "grad_norm": 3.7771518230438232, "learning_rate": 4.4141085614637256e-05, "loss": 0.0618, "num_input_tokens_seen": 130047232, "step": 127000 }, { "epoch": 1.1764054585213275, "grad_norm": 1.2302302122116089, "learning_rate": 4.411801884094076e-05, "loss": 0.0569, "num_input_tokens_seen": 130559232, "step": 127500 }, { "epoch": 1.1810188132606267, "grad_norm": 5.366886615753174, "learning_rate": 4.409495206724426e-05, "loss": 0.0581, "num_input_tokens_seen": 131071232, "step": 128000 }, { "epoch": 1.1856321679999262, "grad_norm": 1.6237967014312744, "learning_rate": 4.4071885293547765e-05, "loss": 0.0564, "num_input_tokens_seen": 131583232, "step": 128500 }, { "epoch": 1.1902455227392255, "grad_norm": 1.025489091873169, "learning_rate": 4.4048818519851265e-05, "loss": 0.062, "num_input_tokens_seen": 132095232, "step": 129000 }, { "epoch": 1.194858877478525, "grad_norm": 3.0035746097564697, "learning_rate": 4.402575174615477e-05, "loss": 0.0567, "num_input_tokens_seen": 132607232, "step": 129500 }, { "epoch": 1.1994722322178242, "grad_norm": 0.4716099202632904, "learning_rate": 4.4002684972458273e-05, "loss": 0.0594, "num_input_tokens_seen": 133119232, "step": 130000 }, { "epoch": 1.2040855869571234, "grad_norm": 1.073433756828308, "learning_rate": 4.397961819876178e-05, "loss": 0.0638, "num_input_tokens_seen": 133631232, "step": 130500 }, { "epoch": 1.208698941696423, "grad_norm": 1.676879644393921, "learning_rate": 4.395655142506528e-05, "loss": 0.0665, "num_input_tokens_seen": 134143232, "step": 131000 }, { "epoch": 1.2133122964357221, "grad_norm": 1.4313554763793945, "learning_rate": 4.393348465136878e-05, "loss": 0.062, "num_input_tokens_seen": 134655232, "step": 131500 }, { "epoch": 1.2179256511750214, "grad_norm": 1.8880019187927246, "learning_rate": 4.391041787767229e-05, "loss": 0.0568, "num_input_tokens_seen": 135167232, "step": 132000 }, { "epoch": 1.2225390059143209, "grad_norm": 1.572786569595337, "learning_rate": 4.38873511039758e-05, "loss": 0.0581, "num_input_tokens_seen": 135679232, "step": 132500 }, { "epoch": 1.22715236065362, "grad_norm": 1.1069833040237427, "learning_rate": 4.386428433027929e-05, "loss": 0.0567, "num_input_tokens_seen": 136191232, "step": 133000 }, { "epoch": 1.2317657153929193, "grad_norm": 1.1832222938537598, "learning_rate": 4.38412175565828e-05, "loss": 0.0589, "num_input_tokens_seen": 136703232, "step": 133500 }, { "epoch": 1.2363790701322188, "grad_norm": 0.8395095467567444, "learning_rate": 4.3818150782886305e-05, "loss": 0.0607, "num_input_tokens_seen": 137215232, "step": 134000 }, { "epoch": 1.240992424871518, "grad_norm": 1.2240726947784424, "learning_rate": 4.3795084009189806e-05, "loss": 0.0639, "num_input_tokens_seen": 137727232, "step": 134500 }, { "epoch": 1.2456057796108173, "grad_norm": 0.596113383769989, "learning_rate": 4.3772017235493306e-05, "loss": 0.0622, "num_input_tokens_seen": 138239232, "step": 135000 }, { "epoch": 1.2502191343501168, "grad_norm": 1.9236828088760376, "learning_rate": 4.3748950461796814e-05, "loss": 0.0607, "num_input_tokens_seen": 138751232, "step": 135500 }, { "epoch": 1.254832489089416, "grad_norm": 0.9456164836883545, "learning_rate": 4.3725883688100314e-05, "loss": 0.0583, "num_input_tokens_seen": 139263232, "step": 136000 }, { "epoch": 1.2594458438287153, "grad_norm": 3.4136688709259033, "learning_rate": 4.3702816914403815e-05, "loss": 0.0638, "num_input_tokens_seen": 139775232, "step": 136500 }, { "epoch": 1.2640591985680147, "grad_norm": 1.01094388961792, "learning_rate": 4.367975014070732e-05, "loss": 0.0598, "num_input_tokens_seen": 140287232, "step": 137000 }, { "epoch": 1.268672553307314, "grad_norm": 1.1260863542556763, "learning_rate": 4.365668336701082e-05, "loss": 0.0586, "num_input_tokens_seen": 140799232, "step": 137500 }, { "epoch": 1.2732859080466135, "grad_norm": 3.8169174194335938, "learning_rate": 4.363361659331433e-05, "loss": 0.0616, "num_input_tokens_seen": 141311232, "step": 138000 }, { "epoch": 1.2778992627859127, "grad_norm": 0.5968789458274841, "learning_rate": 4.361054981961783e-05, "loss": 0.0586, "num_input_tokens_seen": 141823232, "step": 138500 }, { "epoch": 1.282512617525212, "grad_norm": 1.5847851037979126, "learning_rate": 4.358748304592133e-05, "loss": 0.0531, "num_input_tokens_seen": 142335232, "step": 139000 }, { "epoch": 1.2871259722645112, "grad_norm": 1.6152338981628418, "learning_rate": 4.356441627222484e-05, "loss": 0.0621, "num_input_tokens_seen": 142847232, "step": 139500 }, { "epoch": 1.2917393270038107, "grad_norm": 1.3131306171417236, "learning_rate": 4.3541349498528346e-05, "loss": 0.0596, "num_input_tokens_seen": 143359232, "step": 140000 }, { "epoch": 1.29635268174311, "grad_norm": 1.424111247062683, "learning_rate": 4.351828272483185e-05, "loss": 0.0606, "num_input_tokens_seen": 143871232, "step": 140500 }, { "epoch": 1.3009660364824094, "grad_norm": 0.8023368716239929, "learning_rate": 4.349521595113535e-05, "loss": 0.0644, "num_input_tokens_seen": 144383232, "step": 141000 }, { "epoch": 1.3055793912217086, "grad_norm": 1.9093987941741943, "learning_rate": 4.3472149177438855e-05, "loss": 0.063, "num_input_tokens_seen": 144895232, "step": 141500 }, { "epoch": 1.3101927459610079, "grad_norm": 2.1738569736480713, "learning_rate": 4.3449082403742355e-05, "loss": 0.0627, "num_input_tokens_seen": 145407232, "step": 142000 }, { "epoch": 1.3148061007003071, "grad_norm": 2.2907350063323975, "learning_rate": 4.3426015630045856e-05, "loss": 0.0628, "num_input_tokens_seen": 145919232, "step": 142500 }, { "epoch": 1.3194194554396066, "grad_norm": 1.2344714403152466, "learning_rate": 4.340294885634936e-05, "loss": 0.0589, "num_input_tokens_seen": 146431232, "step": 143000 }, { "epoch": 1.3240328101789058, "grad_norm": 2.3011679649353027, "learning_rate": 4.337988208265287e-05, "loss": 0.0639, "num_input_tokens_seen": 146943232, "step": 143500 }, { "epoch": 1.3286461649182053, "grad_norm": 1.3081352710723877, "learning_rate": 4.3356815308956364e-05, "loss": 0.0607, "num_input_tokens_seen": 147455232, "step": 144000 }, { "epoch": 1.3332595196575046, "grad_norm": 1.5605255365371704, "learning_rate": 4.333374853525987e-05, "loss": 0.0619, "num_input_tokens_seen": 147967232, "step": 144500 }, { "epoch": 1.3378728743968038, "grad_norm": 1.3698718547821045, "learning_rate": 4.331068176156338e-05, "loss": 0.0592, "num_input_tokens_seen": 148479232, "step": 145000 }, { "epoch": 1.3424862291361033, "grad_norm": 0.7845633029937744, "learning_rate": 4.328761498786688e-05, "loss": 0.0649, "num_input_tokens_seen": 148991232, "step": 145500 }, { "epoch": 1.3470995838754025, "grad_norm": 2.0420374870300293, "learning_rate": 4.326454821417038e-05, "loss": 0.0598, "num_input_tokens_seen": 149503232, "step": 146000 }, { "epoch": 1.3517129386147018, "grad_norm": 2.2831552028656006, "learning_rate": 4.324148144047389e-05, "loss": 0.0614, "num_input_tokens_seen": 150015232, "step": 146500 }, { "epoch": 1.3563262933540012, "grad_norm": 0.9809445738792419, "learning_rate": 4.321841466677739e-05, "loss": 0.0588, "num_input_tokens_seen": 150527232, "step": 147000 }, { "epoch": 1.3609396480933005, "grad_norm": 1.6517871618270874, "learning_rate": 4.3195347893080895e-05, "loss": 0.061, "num_input_tokens_seen": 151039232, "step": 147500 }, { "epoch": 1.3655530028325997, "grad_norm": 0.8756200075149536, "learning_rate": 4.3172281119384396e-05, "loss": 0.0601, "num_input_tokens_seen": 151551232, "step": 148000 }, { "epoch": 1.3701663575718992, "grad_norm": 4.2246317863464355, "learning_rate": 4.31492143456879e-05, "loss": 0.0559, "num_input_tokens_seen": 152063232, "step": 148500 }, { "epoch": 1.3747797123111984, "grad_norm": 3.220839738845825, "learning_rate": 4.3126147571991404e-05, "loss": 0.0572, "num_input_tokens_seen": 152575232, "step": 149000 }, { "epoch": 1.379393067050498, "grad_norm": 1.6114301681518555, "learning_rate": 4.3103080798294905e-05, "loss": 0.0593, "num_input_tokens_seen": 153087232, "step": 149500 }, { "epoch": 1.3840064217897972, "grad_norm": 0.6551116108894348, "learning_rate": 4.3080014024598405e-05, "loss": 0.0626, "num_input_tokens_seen": 153599232, "step": 150000 }, { "epoch": 1.3886197765290964, "grad_norm": 2.2895658016204834, "learning_rate": 4.305694725090191e-05, "loss": 0.064, "num_input_tokens_seen": 154111232, "step": 150500 }, { "epoch": 1.3932331312683957, "grad_norm": 2.927482843399048, "learning_rate": 4.303388047720542e-05, "loss": 0.0625, "num_input_tokens_seen": 154623232, "step": 151000 }, { "epoch": 1.3978464860076951, "grad_norm": 1.2749851942062378, "learning_rate": 4.301081370350892e-05, "loss": 0.0579, "num_input_tokens_seen": 155135232, "step": 151500 }, { "epoch": 1.4024598407469944, "grad_norm": 1.7866413593292236, "learning_rate": 4.298774692981242e-05, "loss": 0.0574, "num_input_tokens_seen": 155647232, "step": 152000 }, { "epoch": 1.4070731954862938, "grad_norm": 2.288804292678833, "learning_rate": 4.296468015611593e-05, "loss": 0.0631, "num_input_tokens_seen": 156159232, "step": 152500 }, { "epoch": 1.411686550225593, "grad_norm": 1.509840965270996, "learning_rate": 4.294161338241943e-05, "loss": 0.0585, "num_input_tokens_seen": 156671232, "step": 153000 }, { "epoch": 1.4162999049648923, "grad_norm": 0.8478446006774902, "learning_rate": 4.291854660872293e-05, "loss": 0.0593, "num_input_tokens_seen": 157183232, "step": 153500 }, { "epoch": 1.4209132597041916, "grad_norm": 1.4515230655670166, "learning_rate": 4.289547983502644e-05, "loss": 0.0599, "num_input_tokens_seen": 157695232, "step": 154000 }, { "epoch": 1.425526614443491, "grad_norm": 0.7513217926025391, "learning_rate": 4.2872413061329944e-05, "loss": 0.0602, "num_input_tokens_seen": 158207232, "step": 154500 }, { "epoch": 1.4301399691827903, "grad_norm": 2.4477181434631348, "learning_rate": 4.284934628763344e-05, "loss": 0.0583, "num_input_tokens_seen": 158719232, "step": 155000 }, { "epoch": 1.4347533239220898, "grad_norm": 1.2855825424194336, "learning_rate": 4.2826279513936945e-05, "loss": 0.0653, "num_input_tokens_seen": 159231232, "step": 155500 }, { "epoch": 1.439366678661389, "grad_norm": 0.5422343611717224, "learning_rate": 4.280321274024045e-05, "loss": 0.0601, "num_input_tokens_seen": 159743232, "step": 156000 }, { "epoch": 1.4439800334006883, "grad_norm": 1.519142746925354, "learning_rate": 4.278014596654395e-05, "loss": 0.0558, "num_input_tokens_seen": 160255232, "step": 156500 }, { "epoch": 1.4485933881399875, "grad_norm": 1.936989426612854, "learning_rate": 4.2757079192847454e-05, "loss": 0.0572, "num_input_tokens_seen": 160767232, "step": 157000 }, { "epoch": 1.453206742879287, "grad_norm": 2.0965301990509033, "learning_rate": 4.273401241915096e-05, "loss": 0.0655, "num_input_tokens_seen": 161279232, "step": 157500 }, { "epoch": 1.4578200976185862, "grad_norm": 1.300350308418274, "learning_rate": 4.271094564545446e-05, "loss": 0.0606, "num_input_tokens_seen": 161791232, "step": 158000 }, { "epoch": 1.4624334523578857, "grad_norm": 2.8612143993377686, "learning_rate": 4.268787887175797e-05, "loss": 0.0587, "num_input_tokens_seen": 162303232, "step": 158500 }, { "epoch": 1.467046807097185, "grad_norm": 1.869927167892456, "learning_rate": 4.266481209806147e-05, "loss": 0.0626, "num_input_tokens_seen": 162815232, "step": 159000 }, { "epoch": 1.4716601618364842, "grad_norm": 0.6784268617630005, "learning_rate": 4.264174532436497e-05, "loss": 0.0587, "num_input_tokens_seen": 163327232, "step": 159500 }, { "epoch": 1.4762735165757837, "grad_norm": 1.315468192100525, "learning_rate": 4.261867855066848e-05, "loss": 0.0558, "num_input_tokens_seen": 163839232, "step": 160000 }, { "epoch": 1.480886871315083, "grad_norm": 0.5266712307929993, "learning_rate": 4.2595611776971985e-05, "loss": 0.0601, "num_input_tokens_seen": 164351232, "step": 160500 }, { "epoch": 1.4855002260543821, "grad_norm": 0.976466178894043, "learning_rate": 4.2572545003275486e-05, "loss": 0.059, "num_input_tokens_seen": 164863232, "step": 161000 }, { "epoch": 1.4901135807936816, "grad_norm": 2.195340633392334, "learning_rate": 4.2549478229578986e-05, "loss": 0.0618, "num_input_tokens_seen": 165375232, "step": 161500 }, { "epoch": 1.4947269355329809, "grad_norm": 0.6188003420829773, "learning_rate": 4.2526411455882494e-05, "loss": 0.062, "num_input_tokens_seen": 165887232, "step": 162000 }, { "epoch": 1.49934029027228, "grad_norm": 1.496407389640808, "learning_rate": 4.2503344682185994e-05, "loss": 0.0591, "num_input_tokens_seen": 166399232, "step": 162500 }, { "epoch": 1.5039536450115794, "grad_norm": 0.94919753074646, "learning_rate": 4.2480277908489495e-05, "loss": 0.06, "num_input_tokens_seen": 166911232, "step": 163000 }, { "epoch": 1.5085669997508788, "grad_norm": 1.6207939386367798, "learning_rate": 4.2457211134793e-05, "loss": 0.0599, "num_input_tokens_seen": 167423232, "step": 163500 }, { "epoch": 1.5131803544901783, "grad_norm": 1.1205254793167114, "learning_rate": 4.24341443610965e-05, "loss": 0.0617, "num_input_tokens_seen": 167935232, "step": 164000 }, { "epoch": 1.5177937092294775, "grad_norm": 1.0323721170425415, "learning_rate": 4.24110775874e-05, "loss": 0.0601, "num_input_tokens_seen": 168447232, "step": 164500 }, { "epoch": 1.5224070639687768, "grad_norm": 0.6799350380897522, "learning_rate": 4.238801081370351e-05, "loss": 0.0631, "num_input_tokens_seen": 168959232, "step": 165000 }, { "epoch": 1.527020418708076, "grad_norm": 1.2749136686325073, "learning_rate": 4.236494404000702e-05, "loss": 0.058, "num_input_tokens_seen": 169471232, "step": 165500 }, { "epoch": 1.5316337734473755, "grad_norm": 2.35078763961792, "learning_rate": 4.234187726631052e-05, "loss": 0.066, "num_input_tokens_seen": 169983232, "step": 166000 }, { "epoch": 1.5362471281866747, "grad_norm": 1.8924311399459839, "learning_rate": 4.231881049261402e-05, "loss": 0.0591, "num_input_tokens_seen": 170495232, "step": 166500 }, { "epoch": 1.5408604829259742, "grad_norm": 2.8488757610321045, "learning_rate": 4.2295743718917527e-05, "loss": 0.0584, "num_input_tokens_seen": 171007232, "step": 167000 }, { "epoch": 1.5454738376652735, "grad_norm": 1.7758262157440186, "learning_rate": 4.227267694522103e-05, "loss": 0.0661, "num_input_tokens_seen": 171519232, "step": 167500 }, { "epoch": 1.5500871924045727, "grad_norm": 0.7893622517585754, "learning_rate": 4.224961017152453e-05, "loss": 0.0594, "num_input_tokens_seen": 172031232, "step": 168000 }, { "epoch": 1.554700547143872, "grad_norm": 1.069485068321228, "learning_rate": 4.2226543397828035e-05, "loss": 0.0656, "num_input_tokens_seen": 172543232, "step": 168500 }, { "epoch": 1.5593139018831714, "grad_norm": 2.2371785640716553, "learning_rate": 4.2203476624131536e-05, "loss": 0.058, "num_input_tokens_seen": 173055232, "step": 169000 }, { "epoch": 1.5639272566224707, "grad_norm": 1.76310396194458, "learning_rate": 4.218040985043504e-05, "loss": 0.0623, "num_input_tokens_seen": 173567232, "step": 169500 }, { "epoch": 1.5685406113617701, "grad_norm": 2.7890520095825195, "learning_rate": 4.2157343076738544e-05, "loss": 0.0582, "num_input_tokens_seen": 174079232, "step": 170000 }, { "epoch": 1.5731539661010694, "grad_norm": 2.2342007160186768, "learning_rate": 4.2134276303042044e-05, "loss": 0.0645, "num_input_tokens_seen": 174591232, "step": 170500 }, { "epoch": 1.5777673208403686, "grad_norm": 1.6538183689117432, "learning_rate": 4.211120952934555e-05, "loss": 0.0578, "num_input_tokens_seen": 175103232, "step": 171000 }, { "epoch": 1.5823806755796679, "grad_norm": 6.509249687194824, "learning_rate": 4.208814275564906e-05, "loss": 0.0638, "num_input_tokens_seen": 175615232, "step": 171500 }, { "epoch": 1.5869940303189674, "grad_norm": 2.7748773097991943, "learning_rate": 4.206507598195256e-05, "loss": 0.0646, "num_input_tokens_seen": 176127232, "step": 172000 }, { "epoch": 1.5916073850582668, "grad_norm": 4.16091251373291, "learning_rate": 4.204200920825606e-05, "loss": 0.0653, "num_input_tokens_seen": 176639232, "step": 172500 }, { "epoch": 1.596220739797566, "grad_norm": 1.4821609258651733, "learning_rate": 4.201894243455957e-05, "loss": 0.0642, "num_input_tokens_seen": 177151232, "step": 173000 }, { "epoch": 1.6008340945368653, "grad_norm": 0.9436431527137756, "learning_rate": 4.199587566086307e-05, "loss": 0.0603, "num_input_tokens_seen": 177663232, "step": 173500 }, { "epoch": 1.6054474492761646, "grad_norm": 1.735992193222046, "learning_rate": 4.197280888716657e-05, "loss": 0.0596, "num_input_tokens_seen": 178175232, "step": 174000 }, { "epoch": 1.6100608040154638, "grad_norm": 1.1625646352767944, "learning_rate": 4.1949742113470076e-05, "loss": 0.0601, "num_input_tokens_seen": 178687232, "step": 174500 }, { "epoch": 1.6146741587547633, "grad_norm": 1.0174745321273804, "learning_rate": 4.192667533977358e-05, "loss": 0.058, "num_input_tokens_seen": 179199232, "step": 175000 }, { "epoch": 1.6192875134940627, "grad_norm": 1.141682744026184, "learning_rate": 4.190360856607708e-05, "loss": 0.0622, "num_input_tokens_seen": 179711232, "step": 175500 }, { "epoch": 1.623900868233362, "grad_norm": 1.165004014968872, "learning_rate": 4.1880541792380585e-05, "loss": 0.0627, "num_input_tokens_seen": 180223232, "step": 176000 }, { "epoch": 1.6285142229726612, "grad_norm": 2.1781582832336426, "learning_rate": 4.185747501868409e-05, "loss": 0.0631, "num_input_tokens_seen": 180735232, "step": 176500 }, { "epoch": 1.6331275777119605, "grad_norm": 1.5659372806549072, "learning_rate": 4.183440824498759e-05, "loss": 0.0607, "num_input_tokens_seen": 181247232, "step": 177000 }, { "epoch": 1.63774093245126, "grad_norm": 1.9345473051071167, "learning_rate": 4.181134147129109e-05, "loss": 0.0567, "num_input_tokens_seen": 181759232, "step": 177500 }, { "epoch": 1.6423542871905592, "grad_norm": 0.8415033221244812, "learning_rate": 4.17882746975946e-05, "loss": 0.06, "num_input_tokens_seen": 182271232, "step": 178000 }, { "epoch": 1.6469676419298587, "grad_norm": 0.4496413767337799, "learning_rate": 4.17652079238981e-05, "loss": 0.0583, "num_input_tokens_seen": 182783232, "step": 178500 }, { "epoch": 1.651580996669158, "grad_norm": 1.1432942152023315, "learning_rate": 4.174214115020161e-05, "loss": 0.062, "num_input_tokens_seen": 183295232, "step": 179000 }, { "epoch": 1.6561943514084572, "grad_norm": 0.4867847263813019, "learning_rate": 4.171907437650511e-05, "loss": 0.0653, "num_input_tokens_seen": 183807232, "step": 179500 }, { "epoch": 1.6608077061477564, "grad_norm": 3.039292335510254, "learning_rate": 4.169600760280861e-05, "loss": 0.0578, "num_input_tokens_seen": 184319232, "step": 180000 }, { "epoch": 1.6654210608870559, "grad_norm": 2.18542218208313, "learning_rate": 4.167294082911212e-05, "loss": 0.064, "num_input_tokens_seen": 184831232, "step": 180500 }, { "epoch": 1.6700344156263551, "grad_norm": 0.9734911918640137, "learning_rate": 4.164987405541562e-05, "loss": 0.0578, "num_input_tokens_seen": 185343232, "step": 181000 }, { "epoch": 1.6746477703656546, "grad_norm": 0.8751457929611206, "learning_rate": 4.162680728171912e-05, "loss": 0.0593, "num_input_tokens_seen": 185855232, "step": 181500 }, { "epoch": 1.6792611251049538, "grad_norm": 1.0533229112625122, "learning_rate": 4.1603740508022625e-05, "loss": 0.0601, "num_input_tokens_seen": 186367232, "step": 182000 }, { "epoch": 1.683874479844253, "grad_norm": 0.742938220500946, "learning_rate": 4.158067373432613e-05, "loss": 0.0589, "num_input_tokens_seen": 186879232, "step": 182500 }, { "epoch": 1.6884878345835523, "grad_norm": 1.432569146156311, "learning_rate": 4.155760696062963e-05, "loss": 0.061, "num_input_tokens_seen": 187391232, "step": 183000 }, { "epoch": 1.6931011893228518, "grad_norm": 2.900394916534424, "learning_rate": 4.1534540186933134e-05, "loss": 0.058, "num_input_tokens_seen": 187903232, "step": 183500 }, { "epoch": 1.6977145440621513, "grad_norm": 1.1864616870880127, "learning_rate": 4.151147341323664e-05, "loss": 0.0594, "num_input_tokens_seen": 188415232, "step": 184000 }, { "epoch": 1.7023278988014505, "grad_norm": 2.3834102153778076, "learning_rate": 4.148840663954014e-05, "loss": 0.0623, "num_input_tokens_seen": 188927232, "step": 184500 }, { "epoch": 1.7069412535407498, "grad_norm": 2.183478355407715, "learning_rate": 4.146533986584364e-05, "loss": 0.0621, "num_input_tokens_seen": 189439232, "step": 185000 }, { "epoch": 1.711554608280049, "grad_norm": 1.4946995973587036, "learning_rate": 4.144227309214715e-05, "loss": 0.0585, "num_input_tokens_seen": 189951232, "step": 185500 }, { "epoch": 1.7161679630193483, "grad_norm": 2.6389856338500977, "learning_rate": 4.141920631845066e-05, "loss": 0.0641, "num_input_tokens_seen": 190463232, "step": 186000 }, { "epoch": 1.7207813177586477, "grad_norm": 1.5870720148086548, "learning_rate": 4.139613954475416e-05, "loss": 0.0622, "num_input_tokens_seen": 190975232, "step": 186500 }, { "epoch": 1.7253946724979472, "grad_norm": 1.0115468502044678, "learning_rate": 4.137307277105766e-05, "loss": 0.0602, "num_input_tokens_seen": 191487232, "step": 187000 }, { "epoch": 1.7300080272372464, "grad_norm": 2.0021095275878906, "learning_rate": 4.1350005997361166e-05, "loss": 0.0585, "num_input_tokens_seen": 191999232, "step": 187500 }, { "epoch": 1.7346213819765457, "grad_norm": 1.7288790941238403, "learning_rate": 4.1326939223664666e-05, "loss": 0.064, "num_input_tokens_seen": 192511232, "step": 188000 }, { "epoch": 1.739234736715845, "grad_norm": 2.1877362728118896, "learning_rate": 4.130387244996817e-05, "loss": 0.061, "num_input_tokens_seen": 193023232, "step": 188500 }, { "epoch": 1.7438480914551442, "grad_norm": 2.1723220348358154, "learning_rate": 4.1280805676271674e-05, "loss": 0.0611, "num_input_tokens_seen": 193535232, "step": 189000 }, { "epoch": 1.7484614461944437, "grad_norm": 1.1203595399856567, "learning_rate": 4.1257738902575175e-05, "loss": 0.0587, "num_input_tokens_seen": 194047232, "step": 189500 }, { "epoch": 1.7530748009337431, "grad_norm": 1.7950832843780518, "learning_rate": 4.123467212887868e-05, "loss": 0.0619, "num_input_tokens_seen": 194559232, "step": 190000 }, { "epoch": 1.7576881556730424, "grad_norm": 0.8511695265769958, "learning_rate": 4.121160535518218e-05, "loss": 0.0587, "num_input_tokens_seen": 195071232, "step": 190500 }, { "epoch": 1.7623015104123416, "grad_norm": 0.49872857332229614, "learning_rate": 4.118853858148568e-05, "loss": 0.0586, "num_input_tokens_seen": 195583232, "step": 191000 }, { "epoch": 1.7669148651516409, "grad_norm": 1.272387981414795, "learning_rate": 4.116547180778919e-05, "loss": 0.062, "num_input_tokens_seen": 196095232, "step": 191500 }, { "epoch": 1.7715282198909403, "grad_norm": 3.0328872203826904, "learning_rate": 4.11424050340927e-05, "loss": 0.0561, "num_input_tokens_seen": 196607232, "step": 192000 }, { "epoch": 1.7761415746302396, "grad_norm": 1.1026365756988525, "learning_rate": 4.111933826039619e-05, "loss": 0.061, "num_input_tokens_seen": 197119232, "step": 192500 }, { "epoch": 1.780754929369539, "grad_norm": 1.523284673690796, "learning_rate": 4.10962714866997e-05, "loss": 0.0647, "num_input_tokens_seen": 197631232, "step": 193000 }, { "epoch": 1.7853682841088383, "grad_norm": 2.571349859237671, "learning_rate": 4.1073204713003207e-05, "loss": 0.0572, "num_input_tokens_seen": 198143232, "step": 193500 }, { "epoch": 1.7899816388481375, "grad_norm": 1.1206070184707642, "learning_rate": 4.105013793930671e-05, "loss": 0.065, "num_input_tokens_seen": 198655232, "step": 194000 }, { "epoch": 1.7945949935874368, "grad_norm": 1.2172856330871582, "learning_rate": 4.102707116561021e-05, "loss": 0.0624, "num_input_tokens_seen": 199167232, "step": 194500 }, { "epoch": 1.7992083483267363, "grad_norm": 1.3785135746002197, "learning_rate": 4.1004004391913715e-05, "loss": 0.0619, "num_input_tokens_seen": 199679232, "step": 195000 }, { "epoch": 1.8038217030660355, "grad_norm": 1.8791236877441406, "learning_rate": 4.0980937618217216e-05, "loss": 0.0594, "num_input_tokens_seen": 200191232, "step": 195500 }, { "epoch": 1.808435057805335, "grad_norm": 1.4721789360046387, "learning_rate": 4.0957870844520716e-05, "loss": 0.0584, "num_input_tokens_seen": 200703232, "step": 196000 }, { "epoch": 1.8130484125446342, "grad_norm": 2.4450087547302246, "learning_rate": 4.0934804070824224e-05, "loss": 0.0622, "num_input_tokens_seen": 201215232, "step": 196500 }, { "epoch": 1.8176617672839335, "grad_norm": 2.5776455402374268, "learning_rate": 4.091173729712773e-05, "loss": 0.062, "num_input_tokens_seen": 201727232, "step": 197000 }, { "epoch": 1.8222751220232327, "grad_norm": 0.703079104423523, "learning_rate": 4.088867052343123e-05, "loss": 0.063, "num_input_tokens_seen": 202239232, "step": 197500 }, { "epoch": 1.8268884767625322, "grad_norm": 3.7383570671081543, "learning_rate": 4.086560374973473e-05, "loss": 0.0621, "num_input_tokens_seen": 202751232, "step": 198000 }, { "epoch": 1.8315018315018317, "grad_norm": 1.2119007110595703, "learning_rate": 4.084253697603824e-05, "loss": 0.0638, "num_input_tokens_seen": 203263232, "step": 198500 }, { "epoch": 1.836115186241131, "grad_norm": 1.6069977283477783, "learning_rate": 4.081947020234174e-05, "loss": 0.0594, "num_input_tokens_seen": 203775232, "step": 199000 }, { "epoch": 1.8407285409804302, "grad_norm": 0.5176113843917847, "learning_rate": 4.079640342864525e-05, "loss": 0.0565, "num_input_tokens_seen": 204287232, "step": 199500 }, { "epoch": 1.8453418957197294, "grad_norm": 1.78886878490448, "learning_rate": 4.077333665494875e-05, "loss": 0.0599, "num_input_tokens_seen": 204799232, "step": 200000 }, { "epoch": 1.8499552504590286, "grad_norm": 0.8037757277488708, "learning_rate": 4.075026988125225e-05, "loss": 0.0584, "num_input_tokens_seen": 205311232, "step": 200500 }, { "epoch": 1.8545686051983281, "grad_norm": 0.8422955274581909, "learning_rate": 4.0727203107555756e-05, "loss": 0.0626, "num_input_tokens_seen": 205823232, "step": 201000 }, { "epoch": 1.8591819599376276, "grad_norm": 3.384787082672119, "learning_rate": 4.0704136333859257e-05, "loss": 0.0603, "num_input_tokens_seen": 206335232, "step": 201500 }, { "epoch": 1.8637953146769268, "grad_norm": 1.103167176246643, "learning_rate": 4.068106956016276e-05, "loss": 0.0608, "num_input_tokens_seen": 206847232, "step": 202000 }, { "epoch": 1.868408669416226, "grad_norm": 0.9550286531448364, "learning_rate": 4.0658002786466264e-05, "loss": 0.0583, "num_input_tokens_seen": 207359232, "step": 202500 }, { "epoch": 1.8730220241555253, "grad_norm": 1.2629748582839966, "learning_rate": 4.063493601276977e-05, "loss": 0.0599, "num_input_tokens_seen": 207871232, "step": 203000 }, { "epoch": 1.8776353788948248, "grad_norm": 1.8319883346557617, "learning_rate": 4.061186923907327e-05, "loss": 0.0557, "num_input_tokens_seen": 208383232, "step": 203500 }, { "epoch": 1.882248733634124, "grad_norm": 0.8122320175170898, "learning_rate": 4.058880246537677e-05, "loss": 0.0631, "num_input_tokens_seen": 208895232, "step": 204000 }, { "epoch": 1.8868620883734235, "grad_norm": 1.0240248441696167, "learning_rate": 4.056573569168028e-05, "loss": 0.0571, "num_input_tokens_seen": 209407232, "step": 204500 }, { "epoch": 1.8914754431127228, "grad_norm": 1.0079154968261719, "learning_rate": 4.054266891798378e-05, "loss": 0.0591, "num_input_tokens_seen": 209919232, "step": 205000 }, { "epoch": 1.896088797852022, "grad_norm": 0.7955754399299622, "learning_rate": 4.051960214428728e-05, "loss": 0.0579, "num_input_tokens_seen": 210431232, "step": 205500 }, { "epoch": 1.9007021525913212, "grad_norm": 2.3598215579986572, "learning_rate": 4.049653537059079e-05, "loss": 0.0578, "num_input_tokens_seen": 210943232, "step": 206000 }, { "epoch": 1.9053155073306207, "grad_norm": 2.217241048812866, "learning_rate": 4.047346859689429e-05, "loss": 0.0615, "num_input_tokens_seen": 211455232, "step": 206500 }, { "epoch": 1.90992886206992, "grad_norm": 0.9427639245986938, "learning_rate": 4.045040182319779e-05, "loss": 0.0654, "num_input_tokens_seen": 211967232, "step": 207000 }, { "epoch": 1.9145422168092194, "grad_norm": 2.3182663917541504, "learning_rate": 4.04273350495013e-05, "loss": 0.0605, "num_input_tokens_seen": 212479232, "step": 207500 }, { "epoch": 1.9191555715485187, "grad_norm": 2.283663272857666, "learning_rate": 4.0404268275804805e-05, "loss": 0.059, "num_input_tokens_seen": 212991232, "step": 208000 }, { "epoch": 1.923768926287818, "grad_norm": 0.8118070960044861, "learning_rate": 4.0381201502108305e-05, "loss": 0.0606, "num_input_tokens_seen": 213503232, "step": 208500 }, { "epoch": 1.9283822810271172, "grad_norm": 1.4257065057754517, "learning_rate": 4.0358134728411806e-05, "loss": 0.0619, "num_input_tokens_seen": 214015232, "step": 209000 }, { "epoch": 1.9329956357664166, "grad_norm": 1.2044384479522705, "learning_rate": 4.033506795471531e-05, "loss": 0.0554, "num_input_tokens_seen": 214527232, "step": 209500 }, { "epoch": 1.9376089905057161, "grad_norm": 1.2655075788497925, "learning_rate": 4.0312001181018814e-05, "loss": 0.0569, "num_input_tokens_seen": 215039232, "step": 210000 }, { "epoch": 1.9422223452450154, "grad_norm": 1.7089818716049194, "learning_rate": 4.028893440732232e-05, "loss": 0.062, "num_input_tokens_seen": 215551232, "step": 210500 }, { "epoch": 1.9468356999843146, "grad_norm": 1.0826196670532227, "learning_rate": 4.026586763362582e-05, "loss": 0.0611, "num_input_tokens_seen": 216063232, "step": 211000 }, { "epoch": 1.9514490547236139, "grad_norm": 0.5117043852806091, "learning_rate": 4.024280085992932e-05, "loss": 0.0618, "num_input_tokens_seen": 216575232, "step": 211500 }, { "epoch": 1.956062409462913, "grad_norm": 0.4635091722011566, "learning_rate": 4.021973408623283e-05, "loss": 0.0617, "num_input_tokens_seen": 217087232, "step": 212000 }, { "epoch": 1.9606757642022126, "grad_norm": 2.1524128913879395, "learning_rate": 4.019666731253634e-05, "loss": 0.0614, "num_input_tokens_seen": 217599232, "step": 212500 }, { "epoch": 1.965289118941512, "grad_norm": 1.02557373046875, "learning_rate": 4.017360053883983e-05, "loss": 0.0552, "num_input_tokens_seen": 218111232, "step": 213000 }, { "epoch": 1.9699024736808113, "grad_norm": 2.18851375579834, "learning_rate": 4.015053376514334e-05, "loss": 0.0597, "num_input_tokens_seen": 218623232, "step": 213500 }, { "epoch": 1.9745158284201105, "grad_norm": 2.4914391040802, "learning_rate": 4.0127466991446846e-05, "loss": 0.0616, "num_input_tokens_seen": 219135232, "step": 214000 }, { "epoch": 1.9791291831594098, "grad_norm": 1.8353182077407837, "learning_rate": 4.0104400217750346e-05, "loss": 0.0675, "num_input_tokens_seen": 219647232, "step": 214500 }, { "epoch": 1.983742537898709, "grad_norm": 5.431290149688721, "learning_rate": 4.008133344405385e-05, "loss": 0.0568, "num_input_tokens_seen": 220159232, "step": 215000 }, { "epoch": 1.9883558926380085, "grad_norm": 0.523113489151001, "learning_rate": 4.0058266670357354e-05, "loss": 0.0596, "num_input_tokens_seen": 220671232, "step": 215500 }, { "epoch": 1.992969247377308, "grad_norm": 0.5525696277618408, "learning_rate": 4.0035199896660855e-05, "loss": 0.0589, "num_input_tokens_seen": 221183232, "step": 216000 }, { "epoch": 1.9975826021166072, "grad_norm": 2.0920755863189697, "learning_rate": 4.0012133122964355e-05, "loss": 0.0603, "num_input_tokens_seen": 221695232, "step": 216500 }, { "epoch": 2.0, "eval_combined_score": 0.06747195769945506, "eval_loss": 0.0674719586968422, "eval_mse": 0.06747195670206793, "eval_runtime": 46.4608, "eval_samples_per_second": 2073.535, "eval_steps_per_second": 259.208, "num_input_tokens_seen": 221962752, "step": 216762 }, { "epoch": 2.0021959568559065, "grad_norm": 2.938506841659546, "learning_rate": 3.998906634926786e-05, "loss": 0.0546, "num_input_tokens_seen": 222206464, "step": 217000 }, { "epoch": 2.0068093115952057, "grad_norm": 1.5632978677749634, "learning_rate": 3.996599957557137e-05, "loss": 0.0497, "num_input_tokens_seen": 222718464, "step": 217500 }, { "epoch": 2.011422666334505, "grad_norm": 2.7584619522094727, "learning_rate": 3.994293280187487e-05, "loss": 0.0504, "num_input_tokens_seen": 223230464, "step": 218000 }, { "epoch": 2.0160360210738046, "grad_norm": 0.7712005972862244, "learning_rate": 3.991986602817837e-05, "loss": 0.0498, "num_input_tokens_seen": 223742464, "step": 218500 }, { "epoch": 2.020649375813104, "grad_norm": 2.087860584259033, "learning_rate": 3.989679925448188e-05, "loss": 0.0514, "num_input_tokens_seen": 224254464, "step": 219000 }, { "epoch": 2.025262730552403, "grad_norm": 1.5292513370513916, "learning_rate": 3.987373248078538e-05, "loss": 0.046, "num_input_tokens_seen": 224766464, "step": 219500 }, { "epoch": 2.0298760852917024, "grad_norm": 2.2876648902893066, "learning_rate": 3.985066570708888e-05, "loss": 0.0514, "num_input_tokens_seen": 225278464, "step": 220000 }, { "epoch": 2.0344894400310016, "grad_norm": 1.1318377256393433, "learning_rate": 3.982759893339239e-05, "loss": 0.0466, "num_input_tokens_seen": 225790464, "step": 220500 }, { "epoch": 2.039102794770301, "grad_norm": 0.5960507988929749, "learning_rate": 3.980453215969589e-05, "loss": 0.0484, "num_input_tokens_seen": 226302464, "step": 221000 }, { "epoch": 2.0437161495096006, "grad_norm": 1.8446494340896606, "learning_rate": 3.9781465385999395e-05, "loss": 0.0458, "num_input_tokens_seen": 226814464, "step": 221500 }, { "epoch": 2.0483295042489, "grad_norm": 1.8140873908996582, "learning_rate": 3.9758398612302896e-05, "loss": 0.0496, "num_input_tokens_seen": 227326464, "step": 222000 }, { "epoch": 2.052942858988199, "grad_norm": 0.29578447341918945, "learning_rate": 3.9735331838606396e-05, "loss": 0.0447, "num_input_tokens_seen": 227838464, "step": 222500 }, { "epoch": 2.0575562137274983, "grad_norm": 1.8332575559616089, "learning_rate": 3.9712265064909904e-05, "loss": 0.042, "num_input_tokens_seen": 228350464, "step": 223000 }, { "epoch": 2.0621695684667976, "grad_norm": 1.091813325881958, "learning_rate": 3.968919829121341e-05, "loss": 0.0526, "num_input_tokens_seen": 228862464, "step": 223500 }, { "epoch": 2.066782923206097, "grad_norm": 0.7884387373924255, "learning_rate": 3.9666131517516905e-05, "loss": 0.0455, "num_input_tokens_seen": 229374464, "step": 224000 }, { "epoch": 2.0713962779453965, "grad_norm": 2.7083017826080322, "learning_rate": 3.964306474382041e-05, "loss": 0.0457, "num_input_tokens_seen": 229886464, "step": 224500 }, { "epoch": 2.0760096326846957, "grad_norm": 3.8200302124023438, "learning_rate": 3.961999797012392e-05, "loss": 0.0459, "num_input_tokens_seen": 230398464, "step": 225000 }, { "epoch": 2.080622987423995, "grad_norm": 1.0111039876937866, "learning_rate": 3.959693119642742e-05, "loss": 0.0464, "num_input_tokens_seen": 230910464, "step": 225500 }, { "epoch": 2.0852363421632942, "grad_norm": 0.7892510890960693, "learning_rate": 3.957386442273092e-05, "loss": 0.0527, "num_input_tokens_seen": 231422464, "step": 226000 }, { "epoch": 2.0898496969025935, "grad_norm": 0.9745638370513916, "learning_rate": 3.955079764903443e-05, "loss": 0.0446, "num_input_tokens_seen": 231934464, "step": 226500 }, { "epoch": 2.0944630516418927, "grad_norm": 1.1187430620193481, "learning_rate": 3.952773087533793e-05, "loss": 0.0505, "num_input_tokens_seen": 232446464, "step": 227000 }, { "epoch": 2.0990764063811924, "grad_norm": 1.3649568557739258, "learning_rate": 3.950466410164143e-05, "loss": 0.0494, "num_input_tokens_seen": 232958464, "step": 227500 }, { "epoch": 2.1036897611204917, "grad_norm": 1.2664381265640259, "learning_rate": 3.9481597327944936e-05, "loss": 0.0425, "num_input_tokens_seen": 233470464, "step": 228000 }, { "epoch": 2.108303115859791, "grad_norm": 2.6382997035980225, "learning_rate": 3.9458530554248444e-05, "loss": 0.0469, "num_input_tokens_seen": 233982464, "step": 228500 }, { "epoch": 2.11291647059909, "grad_norm": 1.4181214570999146, "learning_rate": 3.9435463780551944e-05, "loss": 0.0465, "num_input_tokens_seen": 234494464, "step": 229000 }, { "epoch": 2.1175298253383894, "grad_norm": 1.2546645402908325, "learning_rate": 3.9412397006855445e-05, "loss": 0.0502, "num_input_tokens_seen": 235006464, "step": 229500 }, { "epoch": 2.122143180077689, "grad_norm": 3.3777077198028564, "learning_rate": 3.938933023315895e-05, "loss": 0.0513, "num_input_tokens_seen": 235518464, "step": 230000 }, { "epoch": 2.1267565348169883, "grad_norm": 1.0438088178634644, "learning_rate": 3.936626345946245e-05, "loss": 0.0452, "num_input_tokens_seen": 236030464, "step": 230500 }, { "epoch": 2.1313698895562876, "grad_norm": 3.252018928527832, "learning_rate": 3.934319668576596e-05, "loss": 0.0463, "num_input_tokens_seen": 236542464, "step": 231000 }, { "epoch": 2.135983244295587, "grad_norm": 0.6309357285499573, "learning_rate": 3.932012991206946e-05, "loss": 0.0456, "num_input_tokens_seen": 237054464, "step": 231500 }, { "epoch": 2.140596599034886, "grad_norm": 0.6404411196708679, "learning_rate": 3.929706313837296e-05, "loss": 0.0469, "num_input_tokens_seen": 237566464, "step": 232000 }, { "epoch": 2.1452099537741853, "grad_norm": 2.673940896987915, "learning_rate": 3.927399636467647e-05, "loss": 0.0495, "num_input_tokens_seen": 238078464, "step": 232500 }, { "epoch": 2.149823308513485, "grad_norm": 0.5295352935791016, "learning_rate": 3.9250929590979976e-05, "loss": 0.0488, "num_input_tokens_seen": 238590464, "step": 233000 }, { "epoch": 2.1544366632527843, "grad_norm": 2.1107120513916016, "learning_rate": 3.922786281728347e-05, "loss": 0.0471, "num_input_tokens_seen": 239102464, "step": 233500 }, { "epoch": 2.1590500179920835, "grad_norm": 0.7328481674194336, "learning_rate": 3.920479604358698e-05, "loss": 0.0482, "num_input_tokens_seen": 239614464, "step": 234000 }, { "epoch": 2.1636633727313828, "grad_norm": 0.5566291213035583, "learning_rate": 3.9181729269890485e-05, "loss": 0.0449, "num_input_tokens_seen": 240126464, "step": 234500 }, { "epoch": 2.168276727470682, "grad_norm": 2.311140537261963, "learning_rate": 3.915866249619398e-05, "loss": 0.0453, "num_input_tokens_seen": 240638464, "step": 235000 }, { "epoch": 2.1728900822099813, "grad_norm": 0.43719959259033203, "learning_rate": 3.9135595722497486e-05, "loss": 0.0484, "num_input_tokens_seen": 241150464, "step": 235500 }, { "epoch": 2.177503436949281, "grad_norm": 1.3434603214263916, "learning_rate": 3.911252894880099e-05, "loss": 0.0471, "num_input_tokens_seen": 241662464, "step": 236000 }, { "epoch": 2.18211679168858, "grad_norm": 1.4311593770980835, "learning_rate": 3.9089462175104494e-05, "loss": 0.0466, "num_input_tokens_seen": 242174464, "step": 236500 }, { "epoch": 2.1867301464278794, "grad_norm": 1.6135164499282837, "learning_rate": 3.9066395401407994e-05, "loss": 0.0459, "num_input_tokens_seen": 242686464, "step": 237000 }, { "epoch": 2.1913435011671787, "grad_norm": 0.8135620951652527, "learning_rate": 3.90433286277115e-05, "loss": 0.0484, "num_input_tokens_seen": 243198464, "step": 237500 }, { "epoch": 2.195956855906478, "grad_norm": 2.1880440711975098, "learning_rate": 3.9020261854015e-05, "loss": 0.0493, "num_input_tokens_seen": 243710464, "step": 238000 }, { "epoch": 2.200570210645777, "grad_norm": 1.676583170890808, "learning_rate": 3.899719508031851e-05, "loss": 0.0505, "num_input_tokens_seen": 244222464, "step": 238500 }, { "epoch": 2.205183565385077, "grad_norm": 2.2629077434539795, "learning_rate": 3.897412830662201e-05, "loss": 0.0501, "num_input_tokens_seen": 244734464, "step": 239000 }, { "epoch": 2.209796920124376, "grad_norm": 2.8751511573791504, "learning_rate": 3.895106153292552e-05, "loss": 0.0446, "num_input_tokens_seen": 245246464, "step": 239500 }, { "epoch": 2.2144102748636754, "grad_norm": 2.8819162845611572, "learning_rate": 3.892799475922902e-05, "loss": 0.05, "num_input_tokens_seen": 245758464, "step": 240000 }, { "epoch": 2.2190236296029746, "grad_norm": 2.6944236755371094, "learning_rate": 3.890492798553252e-05, "loss": 0.0491, "num_input_tokens_seen": 246270464, "step": 240500 }, { "epoch": 2.223636984342274, "grad_norm": 1.2675094604492188, "learning_rate": 3.8881861211836026e-05, "loss": 0.054, "num_input_tokens_seen": 246782464, "step": 241000 }, { "epoch": 2.2282503390815736, "grad_norm": 3.3482534885406494, "learning_rate": 3.885879443813953e-05, "loss": 0.0436, "num_input_tokens_seen": 247294464, "step": 241500 }, { "epoch": 2.232863693820873, "grad_norm": 4.079286575317383, "learning_rate": 3.8835727664443034e-05, "loss": 0.0451, "num_input_tokens_seen": 247806464, "step": 242000 }, { "epoch": 2.237477048560172, "grad_norm": 1.210747480392456, "learning_rate": 3.8812660890746535e-05, "loss": 0.0474, "num_input_tokens_seen": 248318464, "step": 242500 }, { "epoch": 2.2420904032994713, "grad_norm": 0.7511959671974182, "learning_rate": 3.8789594117050035e-05, "loss": 0.0548, "num_input_tokens_seen": 248830464, "step": 243000 }, { "epoch": 2.2467037580387705, "grad_norm": 2.5810165405273438, "learning_rate": 3.876652734335354e-05, "loss": 0.0501, "num_input_tokens_seen": 249342464, "step": 243500 }, { "epoch": 2.25131711277807, "grad_norm": 1.060328722000122, "learning_rate": 3.874346056965705e-05, "loss": 0.0473, "num_input_tokens_seen": 249854464, "step": 244000 }, { "epoch": 2.255930467517369, "grad_norm": 0.6183954477310181, "learning_rate": 3.8720393795960544e-05, "loss": 0.0486, "num_input_tokens_seen": 250366464, "step": 244500 }, { "epoch": 2.2605438222566687, "grad_norm": 1.4669181108474731, "learning_rate": 3.869732702226405e-05, "loss": 0.046, "num_input_tokens_seen": 250878464, "step": 245000 }, { "epoch": 2.265157176995968, "grad_norm": 0.44876328110694885, "learning_rate": 3.867426024856756e-05, "loss": 0.0494, "num_input_tokens_seen": 251390464, "step": 245500 }, { "epoch": 2.269770531735267, "grad_norm": 1.458533763885498, "learning_rate": 3.865119347487106e-05, "loss": 0.0514, "num_input_tokens_seen": 251902464, "step": 246000 }, { "epoch": 2.2743838864745665, "grad_norm": 1.5308929681777954, "learning_rate": 3.862812670117456e-05, "loss": 0.0482, "num_input_tokens_seen": 252414464, "step": 246500 }, { "epoch": 2.2789972412138657, "grad_norm": 2.227228879928589, "learning_rate": 3.860505992747807e-05, "loss": 0.0489, "num_input_tokens_seen": 252926464, "step": 247000 }, { "epoch": 2.2836105959531654, "grad_norm": 0.44453561305999756, "learning_rate": 3.858199315378157e-05, "loss": 0.0494, "num_input_tokens_seen": 253438464, "step": 247500 }, { "epoch": 2.2882239506924646, "grad_norm": 1.6029125452041626, "learning_rate": 3.855892638008507e-05, "loss": 0.0512, "num_input_tokens_seen": 253950464, "step": 248000 }, { "epoch": 2.292837305431764, "grad_norm": 0.9729604125022888, "learning_rate": 3.8535859606388576e-05, "loss": 0.0479, "num_input_tokens_seen": 254462464, "step": 248500 }, { "epoch": 2.297450660171063, "grad_norm": 2.042520046234131, "learning_rate": 3.8512792832692076e-05, "loss": 0.0505, "num_input_tokens_seen": 254974464, "step": 249000 }, { "epoch": 2.3020640149103624, "grad_norm": 0.6108492016792297, "learning_rate": 3.8489726058995583e-05, "loss": 0.0486, "num_input_tokens_seen": 255486464, "step": 249500 }, { "epoch": 2.3066773696496616, "grad_norm": 3.030125379562378, "learning_rate": 3.8466659285299084e-05, "loss": 0.0489, "num_input_tokens_seen": 255998464, "step": 250000 }, { "epoch": 2.3112907243889613, "grad_norm": 1.440781831741333, "learning_rate": 3.844359251160259e-05, "loss": 0.0486, "num_input_tokens_seen": 256510464, "step": 250500 }, { "epoch": 2.3159040791282606, "grad_norm": 2.0030038356781006, "learning_rate": 3.842052573790609e-05, "loss": 0.051, "num_input_tokens_seen": 257022464, "step": 251000 }, { "epoch": 2.32051743386756, "grad_norm": 0.7390642166137695, "learning_rate": 3.83974589642096e-05, "loss": 0.0524, "num_input_tokens_seen": 257534464, "step": 251500 }, { "epoch": 2.325130788606859, "grad_norm": 1.2793288230895996, "learning_rate": 3.83743921905131e-05, "loss": 0.0511, "num_input_tokens_seen": 258046464, "step": 252000 }, { "epoch": 2.3297441433461583, "grad_norm": 0.9258439540863037, "learning_rate": 3.83513254168166e-05, "loss": 0.0452, "num_input_tokens_seen": 258558464, "step": 252500 }, { "epoch": 2.334357498085458, "grad_norm": 1.6350897550582886, "learning_rate": 3.832825864312011e-05, "loss": 0.0512, "num_input_tokens_seen": 259070464, "step": 253000 }, { "epoch": 2.3389708528247573, "grad_norm": 0.529399037361145, "learning_rate": 3.830519186942361e-05, "loss": 0.0508, "num_input_tokens_seen": 259582464, "step": 253500 }, { "epoch": 2.3435842075640565, "grad_norm": 1.1488155126571655, "learning_rate": 3.828212509572711e-05, "loss": 0.0507, "num_input_tokens_seen": 260094464, "step": 254000 }, { "epoch": 2.3481975623033557, "grad_norm": 1.7055829763412476, "learning_rate": 3.8259058322030616e-05, "loss": 0.0512, "num_input_tokens_seen": 260606464, "step": 254500 }, { "epoch": 2.352810917042655, "grad_norm": 1.6156001091003418, "learning_rate": 3.8235991548334124e-05, "loss": 0.0475, "num_input_tokens_seen": 261118464, "step": 255000 }, { "epoch": 2.3574242717819542, "grad_norm": 1.6147477626800537, "learning_rate": 3.821292477463762e-05, "loss": 0.0486, "num_input_tokens_seen": 261630464, "step": 255500 }, { "epoch": 2.3620376265212535, "grad_norm": 2.267575979232788, "learning_rate": 3.8189858000941125e-05, "loss": 0.0531, "num_input_tokens_seen": 262142464, "step": 256000 }, { "epoch": 2.366650981260553, "grad_norm": 4.673060417175293, "learning_rate": 3.816679122724463e-05, "loss": 0.0482, "num_input_tokens_seen": 262654464, "step": 256500 }, { "epoch": 2.3712643359998524, "grad_norm": 0.9855422377586365, "learning_rate": 3.814372445354813e-05, "loss": 0.0513, "num_input_tokens_seen": 263166464, "step": 257000 }, { "epoch": 2.3758776907391517, "grad_norm": 2.0277483463287354, "learning_rate": 3.8120657679851633e-05, "loss": 0.0486, "num_input_tokens_seen": 263678464, "step": 257500 }, { "epoch": 2.380491045478451, "grad_norm": 2.461817979812622, "learning_rate": 3.809759090615514e-05, "loss": 0.0467, "num_input_tokens_seen": 264190464, "step": 258000 }, { "epoch": 2.38510440021775, "grad_norm": 1.2786630392074585, "learning_rate": 3.807452413245864e-05, "loss": 0.0449, "num_input_tokens_seen": 264702464, "step": 258500 }, { "epoch": 2.38971775495705, "grad_norm": 0.7494092583656311, "learning_rate": 3.805145735876215e-05, "loss": 0.0444, "num_input_tokens_seen": 265214464, "step": 259000 }, { "epoch": 2.394331109696349, "grad_norm": 0.7989722490310669, "learning_rate": 3.802839058506565e-05, "loss": 0.0474, "num_input_tokens_seen": 265726464, "step": 259500 }, { "epoch": 2.3989444644356483, "grad_norm": 1.17472505569458, "learning_rate": 3.800532381136916e-05, "loss": 0.0508, "num_input_tokens_seen": 266238464, "step": 260000 }, { "epoch": 2.4035578191749476, "grad_norm": 4.456437587738037, "learning_rate": 3.798225703767266e-05, "loss": 0.0536, "num_input_tokens_seen": 266750464, "step": 260500 }, { "epoch": 2.408171173914247, "grad_norm": 1.390002727508545, "learning_rate": 3.795919026397616e-05, "loss": 0.0489, "num_input_tokens_seen": 267262464, "step": 261000 }, { "epoch": 2.412784528653546, "grad_norm": 3.4362330436706543, "learning_rate": 3.7936123490279665e-05, "loss": 0.0455, "num_input_tokens_seen": 267774464, "step": 261500 }, { "epoch": 2.417397883392846, "grad_norm": 3.1407535076141357, "learning_rate": 3.7913056716583166e-05, "loss": 0.0488, "num_input_tokens_seen": 268286464, "step": 262000 }, { "epoch": 2.422011238132145, "grad_norm": 5.290740966796875, "learning_rate": 3.788998994288667e-05, "loss": 0.0529, "num_input_tokens_seen": 268798464, "step": 262500 }, { "epoch": 2.4266245928714443, "grad_norm": 0.8178442716598511, "learning_rate": 3.7866923169190174e-05, "loss": 0.0488, "num_input_tokens_seen": 269310464, "step": 263000 }, { "epoch": 2.4312379476107435, "grad_norm": 1.9484672546386719, "learning_rate": 3.7843856395493674e-05, "loss": 0.047, "num_input_tokens_seen": 269822464, "step": 263500 }, { "epoch": 2.4358513023500428, "grad_norm": 3.035595178604126, "learning_rate": 3.782078962179718e-05, "loss": 0.0465, "num_input_tokens_seen": 270334464, "step": 264000 }, { "epoch": 2.4404646570893425, "grad_norm": 1.731019377708435, "learning_rate": 3.779772284810069e-05, "loss": 0.0458, "num_input_tokens_seen": 270846464, "step": 264500 }, { "epoch": 2.4450780118286417, "grad_norm": 1.4459056854248047, "learning_rate": 3.777465607440418e-05, "loss": 0.0469, "num_input_tokens_seen": 271358464, "step": 265000 }, { "epoch": 2.449691366567941, "grad_norm": 1.475520372390747, "learning_rate": 3.775158930070769e-05, "loss": 0.0489, "num_input_tokens_seen": 271870464, "step": 265500 }, { "epoch": 2.45430472130724, "grad_norm": 1.0083856582641602, "learning_rate": 3.77285225270112e-05, "loss": 0.0474, "num_input_tokens_seen": 272382464, "step": 266000 }, { "epoch": 2.4589180760465394, "grad_norm": 1.0660340785980225, "learning_rate": 3.770545575331469e-05, "loss": 0.0531, "num_input_tokens_seen": 272894464, "step": 266500 }, { "epoch": 2.4635314307858387, "grad_norm": 2.4508252143859863, "learning_rate": 3.76823889796182e-05, "loss": 0.0484, "num_input_tokens_seen": 273406464, "step": 267000 }, { "epoch": 2.468144785525138, "grad_norm": 1.2447962760925293, "learning_rate": 3.7659322205921706e-05, "loss": 0.0543, "num_input_tokens_seen": 273918464, "step": 267500 }, { "epoch": 2.4727581402644376, "grad_norm": 0.9269862174987793, "learning_rate": 3.763625543222521e-05, "loss": 0.047, "num_input_tokens_seen": 274430464, "step": 268000 }, { "epoch": 2.477371495003737, "grad_norm": 1.8680906295776367, "learning_rate": 3.761318865852871e-05, "loss": 0.0488, "num_input_tokens_seen": 274942464, "step": 268500 }, { "epoch": 2.481984849743036, "grad_norm": 2.0206573009490967, "learning_rate": 3.7590121884832215e-05, "loss": 0.0481, "num_input_tokens_seen": 275454464, "step": 269000 }, { "epoch": 2.4865982044823354, "grad_norm": 1.7884100675582886, "learning_rate": 3.7567055111135715e-05, "loss": 0.0491, "num_input_tokens_seen": 275966464, "step": 269500 }, { "epoch": 2.4912115592216346, "grad_norm": 0.8701728582382202, "learning_rate": 3.754398833743922e-05, "loss": 0.0482, "num_input_tokens_seen": 276478464, "step": 270000 }, { "epoch": 2.4958249139609343, "grad_norm": 1.0109634399414062, "learning_rate": 3.752092156374272e-05, "loss": 0.0501, "num_input_tokens_seen": 276990464, "step": 270500 }, { "epoch": 2.5004382687002336, "grad_norm": 2.7722220420837402, "learning_rate": 3.749785479004623e-05, "loss": 0.0521, "num_input_tokens_seen": 277502464, "step": 271000 }, { "epoch": 2.505051623439533, "grad_norm": 0.6980007886886597, "learning_rate": 3.747478801634973e-05, "loss": 0.0489, "num_input_tokens_seen": 278014464, "step": 271500 }, { "epoch": 2.509664978178832, "grad_norm": 1.2792749404907227, "learning_rate": 3.745172124265324e-05, "loss": 0.0489, "num_input_tokens_seen": 278526464, "step": 272000 }, { "epoch": 2.5142783329181313, "grad_norm": 2.294569969177246, "learning_rate": 3.742865446895674e-05, "loss": 0.0499, "num_input_tokens_seen": 279038464, "step": 272500 }, { "epoch": 2.5188916876574305, "grad_norm": 0.667633593082428, "learning_rate": 3.740558769526024e-05, "loss": 0.0493, "num_input_tokens_seen": 279550464, "step": 273000 }, { "epoch": 2.52350504239673, "grad_norm": 1.3469390869140625, "learning_rate": 3.738252092156375e-05, "loss": 0.0495, "num_input_tokens_seen": 280062464, "step": 273500 }, { "epoch": 2.5281183971360295, "grad_norm": 1.247475266456604, "learning_rate": 3.735945414786725e-05, "loss": 0.0511, "num_input_tokens_seen": 280574464, "step": 274000 }, { "epoch": 2.5327317518753287, "grad_norm": 0.4033117890357971, "learning_rate": 3.733638737417075e-05, "loss": 0.0535, "num_input_tokens_seen": 281086464, "step": 274500 }, { "epoch": 2.537345106614628, "grad_norm": 1.1649394035339355, "learning_rate": 3.7313320600474255e-05, "loss": 0.0532, "num_input_tokens_seen": 281598464, "step": 275000 }, { "epoch": 2.5419584613539272, "grad_norm": 2.126436710357666, "learning_rate": 3.729025382677776e-05, "loss": 0.0485, "num_input_tokens_seen": 282110464, "step": 275500 }, { "epoch": 2.546571816093227, "grad_norm": 0.8005649447441101, "learning_rate": 3.726718705308126e-05, "loss": 0.0506, "num_input_tokens_seen": 282622464, "step": 276000 }, { "epoch": 2.551185170832526, "grad_norm": 2.3989765644073486, "learning_rate": 3.7244120279384764e-05, "loss": 0.0513, "num_input_tokens_seen": 283134464, "step": 276500 }, { "epoch": 2.5557985255718254, "grad_norm": 0.7040809988975525, "learning_rate": 3.722105350568827e-05, "loss": 0.0496, "num_input_tokens_seen": 283646464, "step": 277000 }, { "epoch": 2.5604118803111247, "grad_norm": 1.1335313320159912, "learning_rate": 3.719798673199177e-05, "loss": 0.0546, "num_input_tokens_seen": 284158464, "step": 277500 }, { "epoch": 2.565025235050424, "grad_norm": 0.9312555193901062, "learning_rate": 3.717491995829527e-05, "loss": 0.0516, "num_input_tokens_seen": 284670464, "step": 278000 }, { "epoch": 2.569638589789723, "grad_norm": 0.7695990800857544, "learning_rate": 3.715185318459878e-05, "loss": 0.0521, "num_input_tokens_seen": 285182464, "step": 278500 }, { "epoch": 2.5742519445290224, "grad_norm": 1.258518934249878, "learning_rate": 3.712878641090228e-05, "loss": 0.0499, "num_input_tokens_seen": 285694464, "step": 279000 }, { "epoch": 2.578865299268322, "grad_norm": 2.346951961517334, "learning_rate": 3.710571963720578e-05, "loss": 0.0472, "num_input_tokens_seen": 286206464, "step": 279500 }, { "epoch": 2.5834786540076213, "grad_norm": 0.8598672747612, "learning_rate": 3.708265286350929e-05, "loss": 0.0526, "num_input_tokens_seen": 286718464, "step": 280000 }, { "epoch": 2.5880920087469206, "grad_norm": 1.0490000247955322, "learning_rate": 3.705958608981279e-05, "loss": 0.0441, "num_input_tokens_seen": 287230464, "step": 280500 }, { "epoch": 2.59270536348622, "grad_norm": 0.49518364667892456, "learning_rate": 3.7036519316116296e-05, "loss": 0.0474, "num_input_tokens_seen": 287742464, "step": 281000 }, { "epoch": 2.597318718225519, "grad_norm": 1.5736312866210938, "learning_rate": 3.70134525424198e-05, "loss": 0.0509, "num_input_tokens_seen": 288254464, "step": 281500 }, { "epoch": 2.6019320729648188, "grad_norm": 2.511143445968628, "learning_rate": 3.6990385768723304e-05, "loss": 0.047, "num_input_tokens_seen": 288766464, "step": 282000 }, { "epoch": 2.606545427704118, "grad_norm": 0.9060021638870239, "learning_rate": 3.6967318995026805e-05, "loss": 0.053, "num_input_tokens_seen": 289278464, "step": 282500 }, { "epoch": 2.6111587824434173, "grad_norm": 1.4283766746520996, "learning_rate": 3.694425222133031e-05, "loss": 0.0476, "num_input_tokens_seen": 289790464, "step": 283000 }, { "epoch": 2.6157721371827165, "grad_norm": 1.5333555936813354, "learning_rate": 3.692118544763381e-05, "loss": 0.0538, "num_input_tokens_seen": 290302464, "step": 283500 }, { "epoch": 2.6203854919220158, "grad_norm": 1.615579605102539, "learning_rate": 3.689811867393731e-05, "loss": 0.0475, "num_input_tokens_seen": 290814464, "step": 284000 }, { "epoch": 2.624998846661315, "grad_norm": 1.5331679582595825, "learning_rate": 3.687505190024082e-05, "loss": 0.0471, "num_input_tokens_seen": 291326464, "step": 284500 }, { "epoch": 2.6296122014006142, "grad_norm": 2.3747360706329346, "learning_rate": 3.685198512654433e-05, "loss": 0.0477, "num_input_tokens_seen": 291838464, "step": 285000 }, { "epoch": 2.634225556139914, "grad_norm": 2.0471205711364746, "learning_rate": 3.682891835284782e-05, "loss": 0.0493, "num_input_tokens_seen": 292350464, "step": 285500 }, { "epoch": 2.638838910879213, "grad_norm": 1.0454156398773193, "learning_rate": 3.680585157915133e-05, "loss": 0.0467, "num_input_tokens_seen": 292862464, "step": 286000 }, { "epoch": 2.6434522656185124, "grad_norm": 2.0174975395202637, "learning_rate": 3.678278480545484e-05, "loss": 0.0526, "num_input_tokens_seen": 293374464, "step": 286500 }, { "epoch": 2.6480656203578117, "grad_norm": 1.8630324602127075, "learning_rate": 3.675971803175833e-05, "loss": 0.0489, "num_input_tokens_seen": 293886464, "step": 287000 }, { "epoch": 2.6526789750971114, "grad_norm": 2.270232915878296, "learning_rate": 3.673665125806184e-05, "loss": 0.0509, "num_input_tokens_seen": 294398464, "step": 287500 }, { "epoch": 2.6572923298364106, "grad_norm": 1.7369494438171387, "learning_rate": 3.6713584484365345e-05, "loss": 0.0504, "num_input_tokens_seen": 294910464, "step": 288000 }, { "epoch": 2.66190568457571, "grad_norm": 0.9229201078414917, "learning_rate": 3.6690517710668846e-05, "loss": 0.0467, "num_input_tokens_seen": 295422464, "step": 288500 }, { "epoch": 2.666519039315009, "grad_norm": 1.377439260482788, "learning_rate": 3.6667450936972346e-05, "loss": 0.0515, "num_input_tokens_seen": 295934464, "step": 289000 }, { "epoch": 2.6711323940543084, "grad_norm": 1.9601995944976807, "learning_rate": 3.6644384163275854e-05, "loss": 0.0527, "num_input_tokens_seen": 296446464, "step": 289500 }, { "epoch": 2.6757457487936076, "grad_norm": 1.4592013359069824, "learning_rate": 3.6621317389579354e-05, "loss": 0.0491, "num_input_tokens_seen": 296958464, "step": 290000 }, { "epoch": 2.680359103532907, "grad_norm": 0.35405218601226807, "learning_rate": 3.659825061588286e-05, "loss": 0.0472, "num_input_tokens_seen": 297470464, "step": 290500 }, { "epoch": 2.6849724582722065, "grad_norm": 1.9252680540084839, "learning_rate": 3.657518384218636e-05, "loss": 0.0469, "num_input_tokens_seen": 297982464, "step": 291000 }, { "epoch": 2.689585813011506, "grad_norm": 1.1235663890838623, "learning_rate": 3.655211706848987e-05, "loss": 0.0485, "num_input_tokens_seen": 298494464, "step": 291500 }, { "epoch": 2.694199167750805, "grad_norm": 0.9481515884399414, "learning_rate": 3.652905029479337e-05, "loss": 0.049, "num_input_tokens_seen": 299006464, "step": 292000 }, { "epoch": 2.6988125224901043, "grad_norm": 0.37934771180152893, "learning_rate": 3.650598352109687e-05, "loss": 0.052, "num_input_tokens_seen": 299518464, "step": 292500 }, { "epoch": 2.7034258772294035, "grad_norm": 1.1855201721191406, "learning_rate": 3.648291674740038e-05, "loss": 0.0492, "num_input_tokens_seen": 300030464, "step": 293000 }, { "epoch": 2.708039231968703, "grad_norm": 1.4538213014602661, "learning_rate": 3.645984997370388e-05, "loss": 0.0503, "num_input_tokens_seen": 300542464, "step": 293500 }, { "epoch": 2.7126525867080025, "grad_norm": 2.1017704010009766, "learning_rate": 3.6436783200007386e-05, "loss": 0.0458, "num_input_tokens_seen": 301054464, "step": 294000 }, { "epoch": 2.7172659414473017, "grad_norm": 0.6946723461151123, "learning_rate": 3.6413716426310887e-05, "loss": 0.0524, "num_input_tokens_seen": 301566464, "step": 294500 }, { "epoch": 2.721879296186601, "grad_norm": 3.0771243572235107, "learning_rate": 3.639064965261439e-05, "loss": 0.0518, "num_input_tokens_seen": 302078464, "step": 295000 }, { "epoch": 2.7264926509259, "grad_norm": 1.259162425994873, "learning_rate": 3.6367582878917895e-05, "loss": 0.0532, "num_input_tokens_seen": 302590464, "step": 295500 }, { "epoch": 2.7311060056651995, "grad_norm": 1.8771902322769165, "learning_rate": 3.63445161052214e-05, "loss": 0.0487, "num_input_tokens_seen": 303102464, "step": 296000 }, { "epoch": 2.7357193604044987, "grad_norm": 1.765956997871399, "learning_rate": 3.6321449331524896e-05, "loss": 0.0437, "num_input_tokens_seen": 303614464, "step": 296500 }, { "epoch": 2.7403327151437984, "grad_norm": 1.2610450983047485, "learning_rate": 3.62983825578284e-05, "loss": 0.044, "num_input_tokens_seen": 304126464, "step": 297000 }, { "epoch": 2.7449460698830976, "grad_norm": 4.452374458312988, "learning_rate": 3.627531578413191e-05, "loss": 0.0507, "num_input_tokens_seen": 304638464, "step": 297500 }, { "epoch": 2.749559424622397, "grad_norm": 1.082930088043213, "learning_rate": 3.625224901043541e-05, "loss": 0.0518, "num_input_tokens_seen": 305150464, "step": 298000 }, { "epoch": 2.754172779361696, "grad_norm": 0.708118200302124, "learning_rate": 3.622918223673891e-05, "loss": 0.0483, "num_input_tokens_seen": 305662464, "step": 298500 }, { "epoch": 2.758786134100996, "grad_norm": 1.1710622310638428, "learning_rate": 3.620611546304242e-05, "loss": 0.051, "num_input_tokens_seen": 306174464, "step": 299000 }, { "epoch": 2.763399488840295, "grad_norm": 2.388134002685547, "learning_rate": 3.618304868934592e-05, "loss": 0.0506, "num_input_tokens_seen": 306686464, "step": 299500 }, { "epoch": 2.7680128435795943, "grad_norm": 2.3141307830810547, "learning_rate": 3.615998191564942e-05, "loss": 0.0464, "num_input_tokens_seen": 307198464, "step": 300000 }, { "epoch": 2.7726261983188936, "grad_norm": 1.966213345527649, "learning_rate": 3.613691514195293e-05, "loss": 0.0501, "num_input_tokens_seen": 307710464, "step": 300500 }, { "epoch": 2.777239553058193, "grad_norm": 3.948702573776245, "learning_rate": 3.611384836825643e-05, "loss": 0.0495, "num_input_tokens_seen": 308222464, "step": 301000 }, { "epoch": 2.781852907797492, "grad_norm": 1.3868130445480347, "learning_rate": 3.6090781594559935e-05, "loss": 0.0471, "num_input_tokens_seen": 308734464, "step": 301500 }, { "epoch": 2.7864662625367913, "grad_norm": 1.42705500125885, "learning_rate": 3.6067714820863436e-05, "loss": 0.0474, "num_input_tokens_seen": 309246464, "step": 302000 }, { "epoch": 2.7910796172760906, "grad_norm": 1.4073491096496582, "learning_rate": 3.604464804716694e-05, "loss": 0.0459, "num_input_tokens_seen": 309758464, "step": 302500 }, { "epoch": 2.7956929720153902, "grad_norm": 1.990958333015442, "learning_rate": 3.6021581273470444e-05, "loss": 0.0461, "num_input_tokens_seen": 310270464, "step": 303000 }, { "epoch": 2.8003063267546895, "grad_norm": 2.2346065044403076, "learning_rate": 3.599851449977395e-05, "loss": 0.0534, "num_input_tokens_seen": 310782464, "step": 303500 }, { "epoch": 2.8049196814939887, "grad_norm": 1.1180897951126099, "learning_rate": 3.597544772607745e-05, "loss": 0.0459, "num_input_tokens_seen": 311294464, "step": 304000 }, { "epoch": 2.809533036233288, "grad_norm": 1.765995979309082, "learning_rate": 3.595238095238095e-05, "loss": 0.0443, "num_input_tokens_seen": 311806464, "step": 304500 }, { "epoch": 2.8141463909725877, "grad_norm": 0.6811426877975464, "learning_rate": 3.592931417868446e-05, "loss": 0.0488, "num_input_tokens_seen": 312318464, "step": 305000 }, { "epoch": 2.818759745711887, "grad_norm": 2.811584234237671, "learning_rate": 3.590624740498796e-05, "loss": 0.0517, "num_input_tokens_seen": 312830464, "step": 305500 }, { "epoch": 2.823373100451186, "grad_norm": 2.9501793384552, "learning_rate": 3.588318063129146e-05, "loss": 0.0537, "num_input_tokens_seen": 313342464, "step": 306000 }, { "epoch": 2.8279864551904854, "grad_norm": 0.9767802357673645, "learning_rate": 3.586011385759497e-05, "loss": 0.0473, "num_input_tokens_seen": 313854464, "step": 306500 }, { "epoch": 2.8325998099297847, "grad_norm": 1.463254451751709, "learning_rate": 3.5837047083898476e-05, "loss": 0.0498, "num_input_tokens_seen": 314366464, "step": 307000 }, { "epoch": 2.837213164669084, "grad_norm": 1.6375666856765747, "learning_rate": 3.581398031020197e-05, "loss": 0.0494, "num_input_tokens_seen": 314878464, "step": 307500 }, { "epoch": 2.841826519408383, "grad_norm": 6.093188285827637, "learning_rate": 3.579091353650548e-05, "loss": 0.0505, "num_input_tokens_seen": 315390464, "step": 308000 }, { "epoch": 2.846439874147683, "grad_norm": 1.2764623165130615, "learning_rate": 3.5767846762808984e-05, "loss": 0.0529, "num_input_tokens_seen": 315902464, "step": 308500 }, { "epoch": 2.851053228886982, "grad_norm": 0.9110862612724304, "learning_rate": 3.5744779989112485e-05, "loss": 0.0486, "num_input_tokens_seen": 316414464, "step": 309000 }, { "epoch": 2.8556665836262813, "grad_norm": 1.6029491424560547, "learning_rate": 3.5721713215415985e-05, "loss": 0.0524, "num_input_tokens_seen": 316926464, "step": 309500 }, { "epoch": 2.8602799383655806, "grad_norm": 1.162832498550415, "learning_rate": 3.569864644171949e-05, "loss": 0.0497, "num_input_tokens_seen": 317438464, "step": 310000 }, { "epoch": 2.8648932931048803, "grad_norm": 0.8766358494758606, "learning_rate": 3.567557966802299e-05, "loss": 0.0529, "num_input_tokens_seen": 317950464, "step": 310500 }, { "epoch": 2.8695066478441795, "grad_norm": 1.384810209274292, "learning_rate": 3.56525128943265e-05, "loss": 0.0495, "num_input_tokens_seen": 318462464, "step": 311000 }, { "epoch": 2.8741200025834788, "grad_norm": 3.1389269828796387, "learning_rate": 3.562944612063e-05, "loss": 0.0495, "num_input_tokens_seen": 318974464, "step": 311500 }, { "epoch": 2.878733357322778, "grad_norm": 2.004563570022583, "learning_rate": 3.56063793469335e-05, "loss": 0.0498, "num_input_tokens_seen": 319486464, "step": 312000 }, { "epoch": 2.8833467120620773, "grad_norm": 2.8419971466064453, "learning_rate": 3.558331257323701e-05, "loss": 0.0497, "num_input_tokens_seen": 319998464, "step": 312500 }, { "epoch": 2.8879600668013765, "grad_norm": 1.0195252895355225, "learning_rate": 3.556024579954051e-05, "loss": 0.0496, "num_input_tokens_seen": 320510464, "step": 313000 }, { "epoch": 2.8925734215406758, "grad_norm": 1.6460163593292236, "learning_rate": 3.553717902584402e-05, "loss": 0.0465, "num_input_tokens_seen": 321022464, "step": 313500 }, { "epoch": 2.897186776279975, "grad_norm": 0.9986339211463928, "learning_rate": 3.551411225214752e-05, "loss": 0.0494, "num_input_tokens_seen": 321534464, "step": 314000 }, { "epoch": 2.9018001310192747, "grad_norm": 0.7910524606704712, "learning_rate": 3.5491045478451025e-05, "loss": 0.0488, "num_input_tokens_seen": 322046464, "step": 314500 }, { "epoch": 2.906413485758574, "grad_norm": 0.8609081506729126, "learning_rate": 3.5467978704754526e-05, "loss": 0.0522, "num_input_tokens_seen": 322558464, "step": 315000 }, { "epoch": 2.911026840497873, "grad_norm": 0.49892082810401917, "learning_rate": 3.5444911931058026e-05, "loss": 0.0471, "num_input_tokens_seen": 323070464, "step": 315500 }, { "epoch": 2.9156401952371724, "grad_norm": 1.161789894104004, "learning_rate": 3.5421845157361534e-05, "loss": 0.0519, "num_input_tokens_seen": 323582464, "step": 316000 }, { "epoch": 2.920253549976472, "grad_norm": 2.9082627296447754, "learning_rate": 3.539877838366504e-05, "loss": 0.0517, "num_input_tokens_seen": 324094464, "step": 316500 }, { "epoch": 2.9248669047157714, "grad_norm": 2.1669368743896484, "learning_rate": 3.5375711609968535e-05, "loss": 0.0506, "num_input_tokens_seen": 324606464, "step": 317000 }, { "epoch": 2.9294802594550706, "grad_norm": 0.955956220626831, "learning_rate": 3.535264483627204e-05, "loss": 0.0508, "num_input_tokens_seen": 325118464, "step": 317500 }, { "epoch": 2.93409361419437, "grad_norm": 1.6256439685821533, "learning_rate": 3.532957806257555e-05, "loss": 0.0468, "num_input_tokens_seen": 325630464, "step": 318000 }, { "epoch": 2.938706968933669, "grad_norm": 1.479632019996643, "learning_rate": 3.530651128887904e-05, "loss": 0.0468, "num_input_tokens_seen": 326142464, "step": 318500 }, { "epoch": 2.9433203236729684, "grad_norm": 0.8990212082862854, "learning_rate": 3.528344451518255e-05, "loss": 0.0515, "num_input_tokens_seen": 326654464, "step": 319000 }, { "epoch": 2.9479336784122676, "grad_norm": 0.5225000381469727, "learning_rate": 3.526037774148606e-05, "loss": 0.0474, "num_input_tokens_seen": 327166464, "step": 319500 }, { "epoch": 2.9525470331515673, "grad_norm": 0.6462964415550232, "learning_rate": 3.523731096778956e-05, "loss": 0.0523, "num_input_tokens_seen": 327678464, "step": 320000 }, { "epoch": 2.9571603878908665, "grad_norm": 1.1759368181228638, "learning_rate": 3.521424419409306e-05, "loss": 0.0485, "num_input_tokens_seen": 328190464, "step": 320500 }, { "epoch": 2.961773742630166, "grad_norm": 0.6114454865455627, "learning_rate": 3.5191177420396567e-05, "loss": 0.0522, "num_input_tokens_seen": 328702464, "step": 321000 }, { "epoch": 2.966387097369465, "grad_norm": 0.8368657231330872, "learning_rate": 3.516811064670007e-05, "loss": 0.0468, "num_input_tokens_seen": 329214464, "step": 321500 }, { "epoch": 2.9710004521087643, "grad_norm": 0.39750799536705017, "learning_rate": 3.5145043873003574e-05, "loss": 0.0491, "num_input_tokens_seen": 329726464, "step": 322000 }, { "epoch": 2.975613806848064, "grad_norm": 1.4396777153015137, "learning_rate": 3.5121977099307075e-05, "loss": 0.0486, "num_input_tokens_seen": 330238464, "step": 322500 }, { "epoch": 2.9802271615873632, "grad_norm": 6.470019817352295, "learning_rate": 3.5098910325610576e-05, "loss": 0.0466, "num_input_tokens_seen": 330750464, "step": 323000 }, { "epoch": 2.9848405163266625, "grad_norm": 0.8978260159492493, "learning_rate": 3.507584355191408e-05, "loss": 0.051, "num_input_tokens_seen": 331262464, "step": 323500 }, { "epoch": 2.9894538710659617, "grad_norm": 1.2832305431365967, "learning_rate": 3.505277677821759e-05, "loss": 0.05, "num_input_tokens_seen": 331774464, "step": 324000 }, { "epoch": 2.994067225805261, "grad_norm": 1.4465861320495605, "learning_rate": 3.502971000452109e-05, "loss": 0.0491, "num_input_tokens_seen": 332286464, "step": 324500 }, { "epoch": 2.99868058054456, "grad_norm": 0.7884268164634705, "learning_rate": 3.500664323082459e-05, "loss": 0.0559, "num_input_tokens_seen": 332798464, "step": 325000 }, { "epoch": 3.0, "eval_combined_score": 0.07028037235137267, "eval_loss": 0.07028037309646606, "eval_mse": 0.07028037160627928, "eval_runtime": 46.6351, "eval_samples_per_second": 2065.784, "eval_steps_per_second": 258.239, "num_input_tokens_seen": 332944128, "step": 325143 }, { "epoch": 3.00329393528386, "grad_norm": 1.5264211893081665, "learning_rate": 3.49835764571281e-05, "loss": 0.0458, "num_input_tokens_seen": 333309696, "step": 325500 }, { "epoch": 3.007907290023159, "grad_norm": 0.4709686040878296, "learning_rate": 3.49605096834316e-05, "loss": 0.0373, "num_input_tokens_seen": 333821696, "step": 326000 }, { "epoch": 3.0125206447624584, "grad_norm": 1.1726654767990112, "learning_rate": 3.49374429097351e-05, "loss": 0.0367, "num_input_tokens_seen": 334333696, "step": 326500 }, { "epoch": 3.0171339995017576, "grad_norm": 0.5303038358688354, "learning_rate": 3.491437613603861e-05, "loss": 0.0398, "num_input_tokens_seen": 334845696, "step": 327000 }, { "epoch": 3.021747354241057, "grad_norm": 1.8502370119094849, "learning_rate": 3.4891309362342115e-05, "loss": 0.0344, "num_input_tokens_seen": 335357696, "step": 327500 }, { "epoch": 3.026360708980356, "grad_norm": 0.6410061120986938, "learning_rate": 3.486824258864561e-05, "loss": 0.0387, "num_input_tokens_seen": 335869696, "step": 328000 }, { "epoch": 3.030974063719656, "grad_norm": 2.9425787925720215, "learning_rate": 3.4845175814949116e-05, "loss": 0.0408, "num_input_tokens_seen": 336381696, "step": 328500 }, { "epoch": 3.035587418458955, "grad_norm": 3.2158591747283936, "learning_rate": 3.482210904125262e-05, "loss": 0.039, "num_input_tokens_seen": 336893696, "step": 329000 }, { "epoch": 3.0402007731982543, "grad_norm": 1.0993469953536987, "learning_rate": 3.4799042267556124e-05, "loss": 0.0427, "num_input_tokens_seen": 337405696, "step": 329500 }, { "epoch": 3.0448141279375536, "grad_norm": 0.733238697052002, "learning_rate": 3.4775975493859624e-05, "loss": 0.0364, "num_input_tokens_seen": 337917696, "step": 330000 }, { "epoch": 3.049427482676853, "grad_norm": 1.7866772413253784, "learning_rate": 3.475290872016313e-05, "loss": 0.0367, "num_input_tokens_seen": 338429696, "step": 330500 }, { "epoch": 3.054040837416152, "grad_norm": 2.1485824584960938, "learning_rate": 3.472984194646663e-05, "loss": 0.0375, "num_input_tokens_seen": 338941696, "step": 331000 }, { "epoch": 3.0586541921554518, "grad_norm": 0.9480071663856506, "learning_rate": 3.470677517277013e-05, "loss": 0.0361, "num_input_tokens_seen": 339453696, "step": 331500 }, { "epoch": 3.063267546894751, "grad_norm": 1.3875316381454468, "learning_rate": 3.468370839907364e-05, "loss": 0.04, "num_input_tokens_seen": 339965696, "step": 332000 }, { "epoch": 3.0678809016340503, "grad_norm": 1.2781360149383545, "learning_rate": 3.466064162537714e-05, "loss": 0.0407, "num_input_tokens_seen": 340477696, "step": 332500 }, { "epoch": 3.0724942563733495, "grad_norm": 1.129167079925537, "learning_rate": 3.463757485168065e-05, "loss": 0.0386, "num_input_tokens_seen": 340989696, "step": 333000 }, { "epoch": 3.0771076111126487, "grad_norm": 1.3005669116973877, "learning_rate": 3.461450807798415e-05, "loss": 0.0389, "num_input_tokens_seen": 341501696, "step": 333500 }, { "epoch": 3.0817209658519484, "grad_norm": 1.7916690111160278, "learning_rate": 3.4591441304287656e-05, "loss": 0.0357, "num_input_tokens_seen": 342013696, "step": 334000 }, { "epoch": 3.0863343205912477, "grad_norm": 0.6907594799995422, "learning_rate": 3.456837453059116e-05, "loss": 0.0408, "num_input_tokens_seen": 342525696, "step": 334500 }, { "epoch": 3.090947675330547, "grad_norm": 1.9678852558135986, "learning_rate": 3.4545307756894664e-05, "loss": 0.0394, "num_input_tokens_seen": 343037696, "step": 335000 }, { "epoch": 3.095561030069846, "grad_norm": 2.437412977218628, "learning_rate": 3.4522240983198165e-05, "loss": 0.0374, "num_input_tokens_seen": 343549696, "step": 335500 }, { "epoch": 3.1001743848091454, "grad_norm": 0.7736024260520935, "learning_rate": 3.4499174209501665e-05, "loss": 0.0398, "num_input_tokens_seen": 344061696, "step": 336000 }, { "epoch": 3.1047877395484447, "grad_norm": 1.619535207748413, "learning_rate": 3.447610743580517e-05, "loss": 0.0407, "num_input_tokens_seen": 344573696, "step": 336500 }, { "epoch": 3.1094010942877444, "grad_norm": 0.7229686975479126, "learning_rate": 3.445304066210867e-05, "loss": 0.035, "num_input_tokens_seen": 345085696, "step": 337000 }, { "epoch": 3.1140144490270436, "grad_norm": 0.757798433303833, "learning_rate": 3.4429973888412174e-05, "loss": 0.0356, "num_input_tokens_seen": 345597696, "step": 337500 }, { "epoch": 3.118627803766343, "grad_norm": 1.478723168373108, "learning_rate": 3.440690711471568e-05, "loss": 0.0375, "num_input_tokens_seen": 346109696, "step": 338000 }, { "epoch": 3.123241158505642, "grad_norm": 1.482269525527954, "learning_rate": 3.438384034101919e-05, "loss": 0.0382, "num_input_tokens_seen": 346621696, "step": 338500 }, { "epoch": 3.1278545132449413, "grad_norm": 1.0418490171432495, "learning_rate": 3.436077356732268e-05, "loss": 0.0364, "num_input_tokens_seen": 347133696, "step": 339000 }, { "epoch": 3.1324678679842406, "grad_norm": 0.8459765911102295, "learning_rate": 3.433770679362619e-05, "loss": 0.0355, "num_input_tokens_seen": 347645696, "step": 339500 }, { "epoch": 3.1370812227235403, "grad_norm": 0.91368168592453, "learning_rate": 3.43146400199297e-05, "loss": 0.0384, "num_input_tokens_seen": 348157696, "step": 340000 }, { "epoch": 3.1416945774628395, "grad_norm": 1.1992415189743042, "learning_rate": 3.42915732462332e-05, "loss": 0.0402, "num_input_tokens_seen": 348669696, "step": 340500 }, { "epoch": 3.146307932202139, "grad_norm": 1.1619198322296143, "learning_rate": 3.42685064725367e-05, "loss": 0.0401, "num_input_tokens_seen": 349181696, "step": 341000 }, { "epoch": 3.150921286941438, "grad_norm": 0.8243937492370605, "learning_rate": 3.4245439698840206e-05, "loss": 0.039, "num_input_tokens_seen": 349693696, "step": 341500 }, { "epoch": 3.1555346416807373, "grad_norm": 1.217475175857544, "learning_rate": 3.4222372925143706e-05, "loss": 0.0392, "num_input_tokens_seen": 350205696, "step": 342000 }, { "epoch": 3.1601479964200365, "grad_norm": 1.7150335311889648, "learning_rate": 3.4199306151447214e-05, "loss": 0.0352, "num_input_tokens_seen": 350717696, "step": 342500 }, { "epoch": 3.164761351159336, "grad_norm": 0.892362117767334, "learning_rate": 3.4176239377750714e-05, "loss": 0.0403, "num_input_tokens_seen": 351229696, "step": 343000 }, { "epoch": 3.1693747058986355, "grad_norm": 0.5353464484214783, "learning_rate": 3.4153172604054215e-05, "loss": 0.0378, "num_input_tokens_seen": 351741696, "step": 343500 }, { "epoch": 3.1739880606379347, "grad_norm": 1.603272557258606, "learning_rate": 3.413010583035772e-05, "loss": 0.0401, "num_input_tokens_seen": 352253696, "step": 344000 }, { "epoch": 3.178601415377234, "grad_norm": 1.0198638439178467, "learning_rate": 3.410703905666122e-05, "loss": 0.0364, "num_input_tokens_seen": 352765696, "step": 344500 }, { "epoch": 3.183214770116533, "grad_norm": 0.7820620536804199, "learning_rate": 3.408397228296473e-05, "loss": 0.038, "num_input_tokens_seen": 353277696, "step": 345000 }, { "epoch": 3.187828124855833, "grad_norm": 1.567887306213379, "learning_rate": 3.406090550926823e-05, "loss": 0.0368, "num_input_tokens_seen": 353789696, "step": 345500 }, { "epoch": 3.192441479595132, "grad_norm": 1.5703437328338623, "learning_rate": 3.403783873557174e-05, "loss": 0.0385, "num_input_tokens_seen": 354301696, "step": 346000 }, { "epoch": 3.1970548343344314, "grad_norm": 0.5745303630828857, "learning_rate": 3.401477196187524e-05, "loss": 0.0368, "num_input_tokens_seen": 354813696, "step": 346500 }, { "epoch": 3.2016681890737306, "grad_norm": 0.9760965704917908, "learning_rate": 3.399170518817874e-05, "loss": 0.0414, "num_input_tokens_seen": 355325696, "step": 347000 }, { "epoch": 3.20628154381303, "grad_norm": 1.1067168712615967, "learning_rate": 3.3968638414482246e-05, "loss": 0.0379, "num_input_tokens_seen": 355837696, "step": 347500 }, { "epoch": 3.210894898552329, "grad_norm": 1.1161097288131714, "learning_rate": 3.3945571640785754e-05, "loss": 0.0384, "num_input_tokens_seen": 356349696, "step": 348000 }, { "epoch": 3.2155082532916284, "grad_norm": 2.1467411518096924, "learning_rate": 3.392250486708925e-05, "loss": 0.0387, "num_input_tokens_seen": 356861696, "step": 348500 }, { "epoch": 3.220121608030928, "grad_norm": 1.2950456142425537, "learning_rate": 3.3899438093392755e-05, "loss": 0.0362, "num_input_tokens_seen": 357373696, "step": 349000 }, { "epoch": 3.2247349627702273, "grad_norm": 1.0559481382369995, "learning_rate": 3.387637131969626e-05, "loss": 0.0395, "num_input_tokens_seen": 357885696, "step": 349500 }, { "epoch": 3.2293483175095266, "grad_norm": 1.2557491064071655, "learning_rate": 3.385330454599976e-05, "loss": 0.0409, "num_input_tokens_seen": 358397696, "step": 350000 }, { "epoch": 3.233961672248826, "grad_norm": 0.9372035264968872, "learning_rate": 3.3830237772303264e-05, "loss": 0.0404, "num_input_tokens_seen": 358909696, "step": 350500 }, { "epoch": 3.238575026988125, "grad_norm": 0.6541593670845032, "learning_rate": 3.380717099860677e-05, "loss": 0.0376, "num_input_tokens_seen": 359421696, "step": 351000 }, { "epoch": 3.2431883817274247, "grad_norm": 0.9174505472183228, "learning_rate": 3.378410422491027e-05, "loss": 0.0403, "num_input_tokens_seen": 359933696, "step": 351500 }, { "epoch": 3.247801736466724, "grad_norm": 0.9051727056503296, "learning_rate": 3.376103745121377e-05, "loss": 0.0375, "num_input_tokens_seen": 360445696, "step": 352000 }, { "epoch": 3.2524150912060232, "grad_norm": 1.1875522136688232, "learning_rate": 3.373797067751728e-05, "loss": 0.0431, "num_input_tokens_seen": 360957696, "step": 352500 }, { "epoch": 3.2570284459453225, "grad_norm": 0.1862681657075882, "learning_rate": 3.371490390382078e-05, "loss": 0.0385, "num_input_tokens_seen": 361469696, "step": 353000 }, { "epoch": 3.2616418006846217, "grad_norm": 1.5912601947784424, "learning_rate": 3.369183713012429e-05, "loss": 0.0371, "num_input_tokens_seen": 361981696, "step": 353500 }, { "epoch": 3.266255155423921, "grad_norm": 1.4725751876831055, "learning_rate": 3.366877035642779e-05, "loss": 0.0417, "num_input_tokens_seen": 362493696, "step": 354000 }, { "epoch": 3.2708685101632207, "grad_norm": 0.7821846604347229, "learning_rate": 3.364570358273129e-05, "loss": 0.0371, "num_input_tokens_seen": 363005696, "step": 354500 }, { "epoch": 3.27548186490252, "grad_norm": 1.3403239250183105, "learning_rate": 3.3622636809034796e-05, "loss": 0.0437, "num_input_tokens_seen": 363517696, "step": 355000 }, { "epoch": 3.280095219641819, "grad_norm": 1.3142443895339966, "learning_rate": 3.35995700353383e-05, "loss": 0.0424, "num_input_tokens_seen": 364029696, "step": 355500 }, { "epoch": 3.2847085743811184, "grad_norm": 0.7003629207611084, "learning_rate": 3.3576503261641804e-05, "loss": 0.038, "num_input_tokens_seen": 364541696, "step": 356000 }, { "epoch": 3.2893219291204177, "grad_norm": 2.1016480922698975, "learning_rate": 3.3553436487945304e-05, "loss": 0.0389, "num_input_tokens_seen": 365053696, "step": 356500 }, { "epoch": 3.2939352838597173, "grad_norm": 0.9255128502845764, "learning_rate": 3.353036971424881e-05, "loss": 0.0414, "num_input_tokens_seen": 365565696, "step": 357000 }, { "epoch": 3.2985486385990166, "grad_norm": 2.0615665912628174, "learning_rate": 3.350730294055231e-05, "loss": 0.0376, "num_input_tokens_seen": 366077696, "step": 357500 }, { "epoch": 3.303161993338316, "grad_norm": 0.5057035088539124, "learning_rate": 3.348423616685581e-05, "loss": 0.0441, "num_input_tokens_seen": 366589696, "step": 358000 }, { "epoch": 3.307775348077615, "grad_norm": 2.8129680156707764, "learning_rate": 3.346116939315932e-05, "loss": 0.0368, "num_input_tokens_seen": 367101696, "step": 358500 }, { "epoch": 3.3123887028169143, "grad_norm": 2.223184823989868, "learning_rate": 3.343810261946283e-05, "loss": 0.0423, "num_input_tokens_seen": 367613696, "step": 359000 }, { "epoch": 3.3170020575562136, "grad_norm": 1.127394199371338, "learning_rate": 3.341503584576632e-05, "loss": 0.0397, "num_input_tokens_seen": 368125696, "step": 359500 }, { "epoch": 3.321615412295513, "grad_norm": 2.887812376022339, "learning_rate": 3.339196907206983e-05, "loss": 0.0379, "num_input_tokens_seen": 368637696, "step": 360000 }, { "epoch": 3.3262287670348125, "grad_norm": 1.08502197265625, "learning_rate": 3.3368902298373336e-05, "loss": 0.0421, "num_input_tokens_seen": 369149696, "step": 360500 }, { "epoch": 3.3308421217741118, "grad_norm": 1.0474424362182617, "learning_rate": 3.334583552467684e-05, "loss": 0.04, "num_input_tokens_seen": 369661696, "step": 361000 }, { "epoch": 3.335455476513411, "grad_norm": 0.7261756658554077, "learning_rate": 3.332276875098034e-05, "loss": 0.0409, "num_input_tokens_seen": 370173696, "step": 361500 }, { "epoch": 3.3400688312527103, "grad_norm": 0.6790010929107666, "learning_rate": 3.3299701977283845e-05, "loss": 0.0403, "num_input_tokens_seen": 370685696, "step": 362000 }, { "epoch": 3.3446821859920095, "grad_norm": 1.7215800285339355, "learning_rate": 3.3276635203587345e-05, "loss": 0.0411, "num_input_tokens_seen": 371197696, "step": 362500 }, { "epoch": 3.349295540731309, "grad_norm": 1.112464189529419, "learning_rate": 3.325356842989085e-05, "loss": 0.0421, "num_input_tokens_seen": 371709696, "step": 363000 }, { "epoch": 3.3539088954706084, "grad_norm": 1.0138994455337524, "learning_rate": 3.323050165619435e-05, "loss": 0.0369, "num_input_tokens_seen": 372221696, "step": 363500 }, { "epoch": 3.3585222502099077, "grad_norm": 0.584247887134552, "learning_rate": 3.3207434882497854e-05, "loss": 0.0402, "num_input_tokens_seen": 372733696, "step": 364000 }, { "epoch": 3.363135604949207, "grad_norm": 1.9375905990600586, "learning_rate": 3.318436810880136e-05, "loss": 0.0359, "num_input_tokens_seen": 373245696, "step": 364500 }, { "epoch": 3.367748959688506, "grad_norm": 1.225064992904663, "learning_rate": 3.316130133510486e-05, "loss": 0.0378, "num_input_tokens_seen": 373757696, "step": 365000 }, { "epoch": 3.3723623144278054, "grad_norm": 1.0532304048538208, "learning_rate": 3.313823456140836e-05, "loss": 0.0422, "num_input_tokens_seen": 374269696, "step": 365500 }, { "epoch": 3.376975669167105, "grad_norm": 0.950737714767456, "learning_rate": 3.311516778771187e-05, "loss": 0.0385, "num_input_tokens_seen": 374781696, "step": 366000 }, { "epoch": 3.3815890239064044, "grad_norm": 0.340679794549942, "learning_rate": 3.309210101401538e-05, "loss": 0.0364, "num_input_tokens_seen": 375293696, "step": 366500 }, { "epoch": 3.3862023786457036, "grad_norm": 4.747739791870117, "learning_rate": 3.306903424031888e-05, "loss": 0.0354, "num_input_tokens_seen": 375805696, "step": 367000 }, { "epoch": 3.390815733385003, "grad_norm": 1.7227208614349365, "learning_rate": 3.304596746662238e-05, "loss": 0.0413, "num_input_tokens_seen": 376317696, "step": 367500 }, { "epoch": 3.395429088124302, "grad_norm": 1.4410547018051147, "learning_rate": 3.3022900692925886e-05, "loss": 0.0359, "num_input_tokens_seen": 376829696, "step": 368000 }, { "epoch": 3.400042442863602, "grad_norm": 0.847284197807312, "learning_rate": 3.2999833919229386e-05, "loss": 0.0437, "num_input_tokens_seen": 377341696, "step": 368500 }, { "epoch": 3.404655797602901, "grad_norm": 1.7439848184585571, "learning_rate": 3.297676714553289e-05, "loss": 0.0362, "num_input_tokens_seen": 377853696, "step": 369000 }, { "epoch": 3.4092691523422003, "grad_norm": 0.6023704409599304, "learning_rate": 3.2953700371836394e-05, "loss": 0.0418, "num_input_tokens_seen": 378365696, "step": 369500 }, { "epoch": 3.4138825070814995, "grad_norm": 0.3590753972530365, "learning_rate": 3.29306335981399e-05, "loss": 0.0402, "num_input_tokens_seen": 378877696, "step": 370000 }, { "epoch": 3.418495861820799, "grad_norm": 1.0211530923843384, "learning_rate": 3.2907566824443395e-05, "loss": 0.0374, "num_input_tokens_seen": 379389696, "step": 370500 }, { "epoch": 3.423109216560098, "grad_norm": 0.9513002038002014, "learning_rate": 3.28845000507469e-05, "loss": 0.0401, "num_input_tokens_seen": 379901696, "step": 371000 }, { "epoch": 3.4277225712993973, "grad_norm": 1.0161465406417847, "learning_rate": 3.286143327705041e-05, "loss": 0.0403, "num_input_tokens_seen": 380413696, "step": 371500 }, { "epoch": 3.432335926038697, "grad_norm": 1.2249014377593994, "learning_rate": 3.283836650335391e-05, "loss": 0.0401, "num_input_tokens_seen": 380925696, "step": 372000 }, { "epoch": 3.436949280777996, "grad_norm": 1.3249224424362183, "learning_rate": 3.281529972965741e-05, "loss": 0.0414, "num_input_tokens_seen": 381437696, "step": 372500 }, { "epoch": 3.4415626355172955, "grad_norm": 3.6392204761505127, "learning_rate": 3.279223295596092e-05, "loss": 0.0367, "num_input_tokens_seen": 381949696, "step": 373000 }, { "epoch": 3.4461759902565947, "grad_norm": 0.9922639727592468, "learning_rate": 3.276916618226442e-05, "loss": 0.0418, "num_input_tokens_seen": 382461696, "step": 373500 }, { "epoch": 3.450789344995894, "grad_norm": 2.1645193099975586, "learning_rate": 3.2746099408567926e-05, "loss": 0.0382, "num_input_tokens_seen": 382973696, "step": 374000 }, { "epoch": 3.4554026997351937, "grad_norm": 2.5222291946411133, "learning_rate": 3.272303263487143e-05, "loss": 0.0399, "num_input_tokens_seen": 383485696, "step": 374500 }, { "epoch": 3.460016054474493, "grad_norm": 2.2609009742736816, "learning_rate": 3.269996586117493e-05, "loss": 0.0395, "num_input_tokens_seen": 383997696, "step": 375000 }, { "epoch": 3.464629409213792, "grad_norm": 3.2856132984161377, "learning_rate": 3.2676899087478435e-05, "loss": 0.0391, "num_input_tokens_seen": 384509696, "step": 375500 }, { "epoch": 3.4692427639530914, "grad_norm": 0.6138939261436462, "learning_rate": 3.265383231378194e-05, "loss": 0.0398, "num_input_tokens_seen": 385021696, "step": 376000 }, { "epoch": 3.4738561186923906, "grad_norm": 1.3824810981750488, "learning_rate": 3.263076554008544e-05, "loss": 0.0374, "num_input_tokens_seen": 385533696, "step": 376500 }, { "epoch": 3.47846947343169, "grad_norm": 1.539600133895874, "learning_rate": 3.2607698766388943e-05, "loss": 0.0397, "num_input_tokens_seen": 386045696, "step": 377000 }, { "epoch": 3.483082828170989, "grad_norm": 0.7915021181106567, "learning_rate": 3.258463199269245e-05, "loss": 0.0408, "num_input_tokens_seen": 386557696, "step": 377500 }, { "epoch": 3.487696182910289, "grad_norm": 1.5975933074951172, "learning_rate": 3.256156521899595e-05, "loss": 0.0382, "num_input_tokens_seen": 387069696, "step": 378000 }, { "epoch": 3.492309537649588, "grad_norm": 1.8749665021896362, "learning_rate": 3.253849844529945e-05, "loss": 0.0407, "num_input_tokens_seen": 387581696, "step": 378500 }, { "epoch": 3.4969228923888873, "grad_norm": 1.7674627304077148, "learning_rate": 3.251543167160296e-05, "loss": 0.04, "num_input_tokens_seen": 388093696, "step": 379000 }, { "epoch": 3.5015362471281866, "grad_norm": 0.8147306442260742, "learning_rate": 3.249236489790646e-05, "loss": 0.04, "num_input_tokens_seen": 388605696, "step": 379500 }, { "epoch": 3.5061496018674863, "grad_norm": 0.7411497235298157, "learning_rate": 3.246929812420996e-05, "loss": 0.0394, "num_input_tokens_seen": 389117696, "step": 380000 }, { "epoch": 3.5107629566067855, "grad_norm": 1.145559549331665, "learning_rate": 3.244623135051347e-05, "loss": 0.0432, "num_input_tokens_seen": 389629696, "step": 380500 }, { "epoch": 3.5153763113460847, "grad_norm": 1.1018445491790771, "learning_rate": 3.2423164576816975e-05, "loss": 0.0426, "num_input_tokens_seen": 390141696, "step": 381000 }, { "epoch": 3.519989666085384, "grad_norm": 5.711886882781982, "learning_rate": 3.2400097803120476e-05, "loss": 0.0362, "num_input_tokens_seen": 390653696, "step": 381500 }, { "epoch": 3.5246030208246832, "grad_norm": 5.521966934204102, "learning_rate": 3.2377031029423976e-05, "loss": 0.0445, "num_input_tokens_seen": 391165696, "step": 382000 }, { "epoch": 3.5292163755639825, "grad_norm": 1.7097331285476685, "learning_rate": 3.2353964255727484e-05, "loss": 0.0394, "num_input_tokens_seen": 391677696, "step": 382500 }, { "epoch": 3.5338297303032817, "grad_norm": 2.794013023376465, "learning_rate": 3.2330897482030984e-05, "loss": 0.0418, "num_input_tokens_seen": 392189696, "step": 383000 }, { "epoch": 3.5384430850425814, "grad_norm": 0.8009048700332642, "learning_rate": 3.2307830708334485e-05, "loss": 0.0402, "num_input_tokens_seen": 392701696, "step": 383500 }, { "epoch": 3.5430564397818807, "grad_norm": 1.5974643230438232, "learning_rate": 3.228476393463799e-05, "loss": 0.0403, "num_input_tokens_seen": 393213696, "step": 384000 }, { "epoch": 3.54766979452118, "grad_norm": 2.538250207901001, "learning_rate": 3.226169716094149e-05, "loss": 0.0401, "num_input_tokens_seen": 393725696, "step": 384500 }, { "epoch": 3.552283149260479, "grad_norm": 1.2976337671279907, "learning_rate": 3.2238630387245e-05, "loss": 0.0379, "num_input_tokens_seen": 394237696, "step": 385000 }, { "epoch": 3.5568965039997784, "grad_norm": 1.1865109205245972, "learning_rate": 3.22155636135485e-05, "loss": 0.04, "num_input_tokens_seen": 394749696, "step": 385500 }, { "epoch": 3.561509858739078, "grad_norm": 0.36470434069633484, "learning_rate": 3.2192496839852e-05, "loss": 0.0399, "num_input_tokens_seen": 395261696, "step": 386000 }, { "epoch": 3.5661232134783774, "grad_norm": 2.1635212898254395, "learning_rate": 3.216943006615551e-05, "loss": 0.0403, "num_input_tokens_seen": 395773696, "step": 386500 }, { "epoch": 3.5707365682176766, "grad_norm": 1.7805256843566895, "learning_rate": 3.2146363292459016e-05, "loss": 0.0391, "num_input_tokens_seen": 396285696, "step": 387000 }, { "epoch": 3.575349922956976, "grad_norm": 1.5320919752120972, "learning_rate": 3.212329651876252e-05, "loss": 0.0417, "num_input_tokens_seen": 396797696, "step": 387500 }, { "epoch": 3.579963277696275, "grad_norm": 3.523890733718872, "learning_rate": 3.210022974506602e-05, "loss": 0.0394, "num_input_tokens_seen": 397309696, "step": 388000 }, { "epoch": 3.5845766324355743, "grad_norm": 1.2910226583480835, "learning_rate": 3.2077162971369525e-05, "loss": 0.0397, "num_input_tokens_seen": 397821696, "step": 388500 }, { "epoch": 3.5891899871748736, "grad_norm": 1.5501660108566284, "learning_rate": 3.2054096197673025e-05, "loss": 0.0396, "num_input_tokens_seen": 398333696, "step": 389000 }, { "epoch": 3.5938033419141733, "grad_norm": 1.1182091236114502, "learning_rate": 3.2031029423976526e-05, "loss": 0.0421, "num_input_tokens_seen": 398845696, "step": 389500 }, { "epoch": 3.5984166966534725, "grad_norm": 1.5010899305343628, "learning_rate": 3.200796265028003e-05, "loss": 0.038, "num_input_tokens_seen": 399357696, "step": 390000 }, { "epoch": 3.6030300513927718, "grad_norm": 0.4965997040271759, "learning_rate": 3.198489587658354e-05, "loss": 0.0392, "num_input_tokens_seen": 399869696, "step": 390500 }, { "epoch": 3.607643406132071, "grad_norm": 0.735758364200592, "learning_rate": 3.1961829102887034e-05, "loss": 0.0375, "num_input_tokens_seen": 400381696, "step": 391000 }, { "epoch": 3.6122567608713707, "grad_norm": 0.9119324684143066, "learning_rate": 3.193876232919054e-05, "loss": 0.0397, "num_input_tokens_seen": 400893696, "step": 391500 }, { "epoch": 3.61687011561067, "grad_norm": 1.0355151891708374, "learning_rate": 3.191569555549405e-05, "loss": 0.0379, "num_input_tokens_seen": 401405696, "step": 392000 }, { "epoch": 3.621483470349969, "grad_norm": 1.574038028717041, "learning_rate": 3.189262878179755e-05, "loss": 0.0398, "num_input_tokens_seen": 401917696, "step": 392500 }, { "epoch": 3.6260968250892684, "grad_norm": 1.9339407682418823, "learning_rate": 3.186956200810105e-05, "loss": 0.0366, "num_input_tokens_seen": 402429696, "step": 393000 }, { "epoch": 3.6307101798285677, "grad_norm": 1.808971643447876, "learning_rate": 3.184649523440456e-05, "loss": 0.0433, "num_input_tokens_seen": 402941696, "step": 393500 }, { "epoch": 3.635323534567867, "grad_norm": 0.8877146244049072, "learning_rate": 3.182342846070806e-05, "loss": 0.0402, "num_input_tokens_seen": 403453696, "step": 394000 }, { "epoch": 3.639936889307166, "grad_norm": 1.4622044563293457, "learning_rate": 3.1800361687011566e-05, "loss": 0.0429, "num_input_tokens_seen": 403965696, "step": 394500 }, { "epoch": 3.6445502440464654, "grad_norm": 1.1509592533111572, "learning_rate": 3.1777294913315066e-05, "loss": 0.0378, "num_input_tokens_seen": 404477696, "step": 395000 }, { "epoch": 3.649163598785765, "grad_norm": 1.6934188604354858, "learning_rate": 3.175422813961857e-05, "loss": 0.0395, "num_input_tokens_seen": 404989696, "step": 395500 }, { "epoch": 3.6537769535250644, "grad_norm": 2.861666202545166, "learning_rate": 3.1731161365922074e-05, "loss": 0.0382, "num_input_tokens_seen": 405501696, "step": 396000 }, { "epoch": 3.6583903082643636, "grad_norm": 1.3087468147277832, "learning_rate": 3.1708094592225575e-05, "loss": 0.0387, "num_input_tokens_seen": 406013696, "step": 396500 }, { "epoch": 3.663003663003663, "grad_norm": 0.8184057474136353, "learning_rate": 3.1685027818529075e-05, "loss": 0.0436, "num_input_tokens_seen": 406525696, "step": 397000 }, { "epoch": 3.6676170177429626, "grad_norm": 1.3447506427764893, "learning_rate": 3.166196104483258e-05, "loss": 0.0387, "num_input_tokens_seen": 407037696, "step": 397500 }, { "epoch": 3.672230372482262, "grad_norm": 1.8640304803848267, "learning_rate": 3.163889427113609e-05, "loss": 0.0427, "num_input_tokens_seen": 407549696, "step": 398000 }, { "epoch": 3.676843727221561, "grad_norm": 6.683871746063232, "learning_rate": 3.161582749743959e-05, "loss": 0.0413, "num_input_tokens_seen": 408061696, "step": 398500 }, { "epoch": 3.6814570819608603, "grad_norm": 0.6029996275901794, "learning_rate": 3.159276072374309e-05, "loss": 0.0428, "num_input_tokens_seen": 408573696, "step": 399000 }, { "epoch": 3.6860704367001595, "grad_norm": 0.6650155782699585, "learning_rate": 3.15696939500466e-05, "loss": 0.0376, "num_input_tokens_seen": 409085696, "step": 399500 }, { "epoch": 3.690683791439459, "grad_norm": 0.6915871500968933, "learning_rate": 3.15466271763501e-05, "loss": 0.04, "num_input_tokens_seen": 409597696, "step": 400000 }, { "epoch": 3.695297146178758, "grad_norm": 0.9651739597320557, "learning_rate": 3.15235604026536e-05, "loss": 0.0388, "num_input_tokens_seen": 410109696, "step": 400500 }, { "epoch": 3.6999105009180577, "grad_norm": 1.2852321863174438, "learning_rate": 3.150049362895711e-05, "loss": 0.0436, "num_input_tokens_seen": 410621696, "step": 401000 }, { "epoch": 3.704523855657357, "grad_norm": 1.250339150428772, "learning_rate": 3.1477426855260614e-05, "loss": 0.0371, "num_input_tokens_seen": 411133696, "step": 401500 }, { "epoch": 3.7091372103966562, "grad_norm": 0.9992502927780151, "learning_rate": 3.1454360081564115e-05, "loss": 0.0413, "num_input_tokens_seen": 411645696, "step": 402000 }, { "epoch": 3.7137505651359555, "grad_norm": 3.6451685428619385, "learning_rate": 3.1431293307867615e-05, "loss": 0.0425, "num_input_tokens_seen": 412157696, "step": 402500 }, { "epoch": 3.718363919875255, "grad_norm": 0.49393585324287415, "learning_rate": 3.140822653417112e-05, "loss": 0.0414, "num_input_tokens_seen": 412669696, "step": 403000 }, { "epoch": 3.7229772746145544, "grad_norm": 1.5764920711517334, "learning_rate": 3.1385159760474623e-05, "loss": 0.0373, "num_input_tokens_seen": 413181696, "step": 403500 }, { "epoch": 3.7275906293538537, "grad_norm": 2.7465178966522217, "learning_rate": 3.1362092986778124e-05, "loss": 0.0418, "num_input_tokens_seen": 413693696, "step": 404000 }, { "epoch": 3.732203984093153, "grad_norm": 2.4784648418426514, "learning_rate": 3.133902621308163e-05, "loss": 0.0373, "num_input_tokens_seen": 414205696, "step": 404500 }, { "epoch": 3.736817338832452, "grad_norm": 1.1435418128967285, "learning_rate": 3.131595943938513e-05, "loss": 0.0393, "num_input_tokens_seen": 414717696, "step": 405000 }, { "epoch": 3.7414306935717514, "grad_norm": 3.1641488075256348, "learning_rate": 3.129289266568864e-05, "loss": 0.0378, "num_input_tokens_seen": 415229696, "step": 405500 }, { "epoch": 3.7460440483110506, "grad_norm": 1.299619436264038, "learning_rate": 3.126982589199214e-05, "loss": 0.0376, "num_input_tokens_seen": 415741696, "step": 406000 }, { "epoch": 3.75065740305035, "grad_norm": 1.7014168500900269, "learning_rate": 3.124675911829564e-05, "loss": 0.0448, "num_input_tokens_seen": 416253696, "step": 406500 }, { "epoch": 3.7552707577896496, "grad_norm": 1.5592892169952393, "learning_rate": 3.122369234459915e-05, "loss": 0.038, "num_input_tokens_seen": 416765696, "step": 407000 }, { "epoch": 3.759884112528949, "grad_norm": 0.6049352884292603, "learning_rate": 3.1200625570902655e-05, "loss": 0.039, "num_input_tokens_seen": 417277696, "step": 407500 }, { "epoch": 3.764497467268248, "grad_norm": 0.6392286419868469, "learning_rate": 3.117755879720615e-05, "loss": 0.04, "num_input_tokens_seen": 417789696, "step": 408000 }, { "epoch": 3.7691108220075473, "grad_norm": 3.689347505569458, "learning_rate": 3.1154492023509656e-05, "loss": 0.0385, "num_input_tokens_seen": 418301696, "step": 408500 }, { "epoch": 3.773724176746847, "grad_norm": 0.8414890766143799, "learning_rate": 3.1131425249813164e-05, "loss": 0.0366, "num_input_tokens_seen": 418813696, "step": 409000 }, { "epoch": 3.7783375314861463, "grad_norm": 5.263124465942383, "learning_rate": 3.1108358476116664e-05, "loss": 0.0406, "num_input_tokens_seen": 419325696, "step": 409500 }, { "epoch": 3.7829508862254455, "grad_norm": 1.395107626914978, "learning_rate": 3.1085291702420165e-05, "loss": 0.0375, "num_input_tokens_seen": 419837696, "step": 410000 }, { "epoch": 3.7875642409647448, "grad_norm": 1.189859390258789, "learning_rate": 3.106222492872367e-05, "loss": 0.0373, "num_input_tokens_seen": 420349696, "step": 410500 }, { "epoch": 3.792177595704044, "grad_norm": 0.5523993372917175, "learning_rate": 3.103915815502717e-05, "loss": 0.0386, "num_input_tokens_seen": 420861696, "step": 411000 }, { "epoch": 3.7967909504433432, "grad_norm": 0.6239033341407776, "learning_rate": 3.1016091381330673e-05, "loss": 0.0369, "num_input_tokens_seen": 421373696, "step": 411500 }, { "epoch": 3.8014043051826425, "grad_norm": 2.072326421737671, "learning_rate": 3.099302460763418e-05, "loss": 0.0435, "num_input_tokens_seen": 421885696, "step": 412000 }, { "epoch": 3.806017659921942, "grad_norm": 2.074704647064209, "learning_rate": 3.096995783393769e-05, "loss": 0.04, "num_input_tokens_seen": 422397696, "step": 412500 }, { "epoch": 3.8106310146612414, "grad_norm": 1.9311884641647339, "learning_rate": 3.094689106024119e-05, "loss": 0.0428, "num_input_tokens_seen": 422909696, "step": 413000 }, { "epoch": 3.8152443694005407, "grad_norm": 1.3210355043411255, "learning_rate": 3.092382428654469e-05, "loss": 0.0429, "num_input_tokens_seen": 423421696, "step": 413500 }, { "epoch": 3.81985772413984, "grad_norm": 3.048222064971924, "learning_rate": 3.09007575128482e-05, "loss": 0.0418, "num_input_tokens_seen": 423933696, "step": 414000 }, { "epoch": 3.824471078879139, "grad_norm": 0.8300300240516663, "learning_rate": 3.08776907391517e-05, "loss": 0.0408, "num_input_tokens_seen": 424445696, "step": 414500 }, { "epoch": 3.829084433618439, "grad_norm": 0.6099697947502136, "learning_rate": 3.0854623965455205e-05, "loss": 0.0453, "num_input_tokens_seen": 424957696, "step": 415000 }, { "epoch": 3.833697788357738, "grad_norm": 1.205819845199585, "learning_rate": 3.0831557191758705e-05, "loss": 0.0379, "num_input_tokens_seen": 425469696, "step": 415500 }, { "epoch": 3.8383111430970374, "grad_norm": 2.9948160648345947, "learning_rate": 3.0808490418062206e-05, "loss": 0.0406, "num_input_tokens_seen": 425981696, "step": 416000 }, { "epoch": 3.8429244978363366, "grad_norm": 1.0202473402023315, "learning_rate": 3.078542364436571e-05, "loss": 0.0446, "num_input_tokens_seen": 426493696, "step": 416500 }, { "epoch": 3.847537852575636, "grad_norm": 1.2540485858917236, "learning_rate": 3.0762356870669214e-05, "loss": 0.0431, "num_input_tokens_seen": 427005696, "step": 417000 }, { "epoch": 3.852151207314935, "grad_norm": 1.10784113407135, "learning_rate": 3.0739290096972714e-05, "loss": 0.0403, "num_input_tokens_seen": 427517696, "step": 417500 }, { "epoch": 3.8567645620542343, "grad_norm": 1.326798439025879, "learning_rate": 3.071622332327622e-05, "loss": 0.0392, "num_input_tokens_seen": 428029696, "step": 418000 }, { "epoch": 3.861377916793534, "grad_norm": 0.7203147411346436, "learning_rate": 3.069315654957973e-05, "loss": 0.0412, "num_input_tokens_seen": 428541696, "step": 418500 }, { "epoch": 3.8659912715328333, "grad_norm": 2.017019510269165, "learning_rate": 3.067008977588323e-05, "loss": 0.0397, "num_input_tokens_seen": 429053696, "step": 419000 }, { "epoch": 3.8706046262721325, "grad_norm": 1.9709299802780151, "learning_rate": 3.064702300218673e-05, "loss": 0.0382, "num_input_tokens_seen": 429565696, "step": 419500 }, { "epoch": 3.875217981011432, "grad_norm": 3.0947420597076416, "learning_rate": 3.062395622849024e-05, "loss": 0.037, "num_input_tokens_seen": 430077696, "step": 420000 }, { "epoch": 3.8798313357507315, "grad_norm": 1.6916519403457642, "learning_rate": 3.060088945479374e-05, "loss": 0.038, "num_input_tokens_seen": 430589696, "step": 420500 }, { "epoch": 3.8844446904900307, "grad_norm": 2.846257209777832, "learning_rate": 3.057782268109724e-05, "loss": 0.0415, "num_input_tokens_seen": 431101696, "step": 421000 }, { "epoch": 3.88905804522933, "grad_norm": 0.8271204233169556, "learning_rate": 3.0554755907400746e-05, "loss": 0.0428, "num_input_tokens_seen": 431613696, "step": 421500 }, { "epoch": 3.893671399968629, "grad_norm": 1.4244275093078613, "learning_rate": 3.0531689133704247e-05, "loss": 0.042, "num_input_tokens_seen": 432125696, "step": 422000 }, { "epoch": 3.8982847547079285, "grad_norm": 1.629799485206604, "learning_rate": 3.050862236000775e-05, "loss": 0.038, "num_input_tokens_seen": 432637696, "step": 422500 }, { "epoch": 3.9028981094472277, "grad_norm": 1.1674317121505737, "learning_rate": 3.0485555586311255e-05, "loss": 0.0408, "num_input_tokens_seen": 433149696, "step": 423000 }, { "epoch": 3.907511464186527, "grad_norm": 0.816435694694519, "learning_rate": 3.046248881261476e-05, "loss": 0.0395, "num_input_tokens_seen": 433661696, "step": 423500 }, { "epoch": 3.9121248189258266, "grad_norm": 0.8461304903030396, "learning_rate": 3.0439422038918262e-05, "loss": 0.0414, "num_input_tokens_seen": 434173696, "step": 424000 }, { "epoch": 3.916738173665126, "grad_norm": 1.0469881296157837, "learning_rate": 3.0416355265221763e-05, "loss": 0.0403, "num_input_tokens_seen": 434685696, "step": 424500 }, { "epoch": 3.921351528404425, "grad_norm": 2.0151569843292236, "learning_rate": 3.0393288491525267e-05, "loss": 0.0411, "num_input_tokens_seen": 435197696, "step": 425000 }, { "epoch": 3.9259648831437244, "grad_norm": 1.178753137588501, "learning_rate": 3.0370221717828774e-05, "loss": 0.0415, "num_input_tokens_seen": 435709696, "step": 425500 }, { "epoch": 3.9305782378830236, "grad_norm": 0.6420595049858093, "learning_rate": 3.034715494413228e-05, "loss": 0.0433, "num_input_tokens_seen": 436221696, "step": 426000 }, { "epoch": 3.9351915926223233, "grad_norm": 1.1695127487182617, "learning_rate": 3.0324088170435776e-05, "loss": 0.0415, "num_input_tokens_seen": 436733696, "step": 426500 }, { "epoch": 3.9398049473616226, "grad_norm": 0.9923868179321289, "learning_rate": 3.0301021396739283e-05, "loss": 0.0412, "num_input_tokens_seen": 437245696, "step": 427000 }, { "epoch": 3.944418302100922, "grad_norm": 0.8079075217247009, "learning_rate": 3.0277954623042787e-05, "loss": 0.0401, "num_input_tokens_seen": 437757696, "step": 427500 }, { "epoch": 3.949031656840221, "grad_norm": 2.699918746948242, "learning_rate": 3.025488784934629e-05, "loss": 0.04, "num_input_tokens_seen": 438269696, "step": 428000 }, { "epoch": 3.9536450115795203, "grad_norm": 0.577458381652832, "learning_rate": 3.023182107564979e-05, "loss": 0.0404, "num_input_tokens_seen": 438781696, "step": 428500 }, { "epoch": 3.9582583663188196, "grad_norm": 0.6960185170173645, "learning_rate": 3.0208754301953295e-05, "loss": 0.0393, "num_input_tokens_seen": 439293696, "step": 429000 }, { "epoch": 3.962871721058119, "grad_norm": 1.2610116004943848, "learning_rate": 3.01856875282568e-05, "loss": 0.0385, "num_input_tokens_seen": 439805696, "step": 429500 }, { "epoch": 3.9674850757974185, "grad_norm": 1.0515618324279785, "learning_rate": 3.01626207545603e-05, "loss": 0.0386, "num_input_tokens_seen": 440317696, "step": 430000 }, { "epoch": 3.9720984305367177, "grad_norm": 0.9695286154747009, "learning_rate": 3.0139553980863804e-05, "loss": 0.0425, "num_input_tokens_seen": 440829696, "step": 430500 }, { "epoch": 3.976711785276017, "grad_norm": 1.542039155960083, "learning_rate": 3.011648720716731e-05, "loss": 0.0392, "num_input_tokens_seen": 441341696, "step": 431000 }, { "epoch": 3.9813251400153162, "grad_norm": 1.2009466886520386, "learning_rate": 3.0093420433470815e-05, "loss": 0.043, "num_input_tokens_seen": 441853696, "step": 431500 }, { "epoch": 3.985938494754616, "grad_norm": 1.8694528341293335, "learning_rate": 3.0070353659774312e-05, "loss": 0.0396, "num_input_tokens_seen": 442365696, "step": 432000 }, { "epoch": 3.990551849493915, "grad_norm": 1.2931849956512451, "learning_rate": 3.004728688607782e-05, "loss": 0.0382, "num_input_tokens_seen": 442877696, "step": 432500 }, { "epoch": 3.9951652042332144, "grad_norm": 0.953074038028717, "learning_rate": 3.0024220112381324e-05, "loss": 0.0429, "num_input_tokens_seen": 443389696, "step": 433000 }, { "epoch": 3.9997785589725137, "grad_norm": 2.807677745819092, "learning_rate": 3.0001153338684828e-05, "loss": 0.0387, "num_input_tokens_seen": 443901696, "step": 433500 }, { "epoch": 4.0, "eval_combined_score": 0.06748922723897993, "eval_loss": 0.06748922914266586, "eval_mse": 0.06748922533529399, "eval_runtime": 49.5025, "eval_samples_per_second": 1946.123, "eval_steps_per_second": 243.281, "num_input_tokens_seen": 443925504, "step": 433524 }, { "epoch": 4.004391913711813, "grad_norm": 0.2404492050409317, "learning_rate": 2.997808656498833e-05, "loss": 0.0308, "num_input_tokens_seen": 444412928, "step": 434000 }, { "epoch": 4.009005268451112, "grad_norm": 1.2364345788955688, "learning_rate": 2.9955019791291832e-05, "loss": 0.0297, "num_input_tokens_seen": 444924928, "step": 434500 }, { "epoch": 4.013618623190411, "grad_norm": 0.9113791584968567, "learning_rate": 2.9931953017595336e-05, "loss": 0.0287, "num_input_tokens_seen": 445436928, "step": 435000 }, { "epoch": 4.018231977929711, "grad_norm": 1.880218267440796, "learning_rate": 2.9908886243898837e-05, "loss": 0.0294, "num_input_tokens_seen": 445948928, "step": 435500 }, { "epoch": 4.02284533266901, "grad_norm": 1.7842798233032227, "learning_rate": 2.988581947020234e-05, "loss": 0.0288, "num_input_tokens_seen": 446460928, "step": 436000 }, { "epoch": 4.027458687408309, "grad_norm": 0.5358702540397644, "learning_rate": 2.9862752696505848e-05, "loss": 0.0305, "num_input_tokens_seen": 446972928, "step": 436500 }, { "epoch": 4.032072042147609, "grad_norm": 0.7529350519180298, "learning_rate": 2.9839685922809352e-05, "loss": 0.029, "num_input_tokens_seen": 447484928, "step": 437000 }, { "epoch": 4.0366853968869085, "grad_norm": 0.6187124848365784, "learning_rate": 2.981661914911285e-05, "loss": 0.0303, "num_input_tokens_seen": 447996928, "step": 437500 }, { "epoch": 4.041298751626208, "grad_norm": 1.1267274618148804, "learning_rate": 2.9793552375416357e-05, "loss": 0.0292, "num_input_tokens_seen": 448508928, "step": 438000 }, { "epoch": 4.045912106365507, "grad_norm": 1.6049976348876953, "learning_rate": 2.977048560171986e-05, "loss": 0.0292, "num_input_tokens_seen": 449020928, "step": 438500 }, { "epoch": 4.050525461104806, "grad_norm": 3.9203622341156006, "learning_rate": 2.9747418828023365e-05, "loss": 0.0312, "num_input_tokens_seen": 449532928, "step": 439000 }, { "epoch": 4.0551388158441055, "grad_norm": 0.6487706899642944, "learning_rate": 2.9724352054326865e-05, "loss": 0.029, "num_input_tokens_seen": 450044928, "step": 439500 }, { "epoch": 4.059752170583405, "grad_norm": 0.9871296882629395, "learning_rate": 2.970128528063037e-05, "loss": 0.0299, "num_input_tokens_seen": 450556928, "step": 440000 }, { "epoch": 4.064365525322704, "grad_norm": 0.4027337431907654, "learning_rate": 2.9678218506933873e-05, "loss": 0.0287, "num_input_tokens_seen": 451068928, "step": 440500 }, { "epoch": 4.068978880062003, "grad_norm": 1.1440553665161133, "learning_rate": 2.965515173323738e-05, "loss": 0.0313, "num_input_tokens_seen": 451580928, "step": 441000 }, { "epoch": 4.0735922348013025, "grad_norm": 0.5619149208068848, "learning_rate": 2.9632084959540878e-05, "loss": 0.0334, "num_input_tokens_seen": 452092928, "step": 441500 }, { "epoch": 4.078205589540602, "grad_norm": 3.5681047439575195, "learning_rate": 2.9609018185844385e-05, "loss": 0.0301, "num_input_tokens_seen": 452604928, "step": 442000 }, { "epoch": 4.082818944279902, "grad_norm": 1.2567273378372192, "learning_rate": 2.958595141214789e-05, "loss": 0.0317, "num_input_tokens_seen": 453116928, "step": 442500 }, { "epoch": 4.087432299019201, "grad_norm": 1.553036093711853, "learning_rate": 2.956288463845139e-05, "loss": 0.0296, "num_input_tokens_seen": 453628928, "step": 443000 }, { "epoch": 4.0920456537585, "grad_norm": 0.8509573340415955, "learning_rate": 2.9539817864754894e-05, "loss": 0.0325, "num_input_tokens_seen": 454140928, "step": 443500 }, { "epoch": 4.0966590084978, "grad_norm": 1.0355197191238403, "learning_rate": 2.9516751091058398e-05, "loss": 0.0346, "num_input_tokens_seen": 454652928, "step": 444000 }, { "epoch": 4.101272363237099, "grad_norm": 1.49540376663208, "learning_rate": 2.94936843173619e-05, "loss": 0.0335, "num_input_tokens_seen": 455164928, "step": 444500 }, { "epoch": 4.105885717976398, "grad_norm": 1.6079996824264526, "learning_rate": 2.9470617543665402e-05, "loss": 0.0311, "num_input_tokens_seen": 455676928, "step": 445000 }, { "epoch": 4.110499072715697, "grad_norm": 0.5073397159576416, "learning_rate": 2.9447550769968906e-05, "loss": 0.0308, "num_input_tokens_seen": 456188928, "step": 445500 }, { "epoch": 4.115112427454997, "grad_norm": 1.6608948707580566, "learning_rate": 2.942448399627241e-05, "loss": 0.0302, "num_input_tokens_seen": 456700928, "step": 446000 }, { "epoch": 4.119725782194296, "grad_norm": 0.9647392630577087, "learning_rate": 2.9401417222575917e-05, "loss": 0.0311, "num_input_tokens_seen": 457212928, "step": 446500 }, { "epoch": 4.124339136933595, "grad_norm": 0.6390677690505981, "learning_rate": 2.9378350448879415e-05, "loss": 0.0305, "num_input_tokens_seen": 457724928, "step": 447000 }, { "epoch": 4.128952491672894, "grad_norm": 1.7215697765350342, "learning_rate": 2.9355283675182922e-05, "loss": 0.0328, "num_input_tokens_seen": 458236928, "step": 447500 }, { "epoch": 4.133565846412194, "grad_norm": 1.1551854610443115, "learning_rate": 2.9332216901486426e-05, "loss": 0.0313, "num_input_tokens_seen": 458748928, "step": 448000 }, { "epoch": 4.138179201151494, "grad_norm": 1.6345293521881104, "learning_rate": 2.9309150127789927e-05, "loss": 0.0311, "num_input_tokens_seen": 459260928, "step": 448500 }, { "epoch": 4.142792555890793, "grad_norm": 1.5224887132644653, "learning_rate": 2.928608335409343e-05, "loss": 0.0307, "num_input_tokens_seen": 459772928, "step": 449000 }, { "epoch": 4.147405910630092, "grad_norm": 1.6716899871826172, "learning_rate": 2.9263016580396934e-05, "loss": 0.0346, "num_input_tokens_seen": 460284928, "step": 449500 }, { "epoch": 4.1520192653693915, "grad_norm": 2.299623489379883, "learning_rate": 2.923994980670044e-05, "loss": 0.0301, "num_input_tokens_seen": 460796928, "step": 450000 }, { "epoch": 4.156632620108691, "grad_norm": 0.7651464343070984, "learning_rate": 2.921688303300394e-05, "loss": 0.0308, "num_input_tokens_seen": 461308928, "step": 450500 }, { "epoch": 4.16124597484799, "grad_norm": 1.1913387775421143, "learning_rate": 2.9193816259307443e-05, "loss": 0.0312, "num_input_tokens_seen": 461820928, "step": 451000 }, { "epoch": 4.165859329587289, "grad_norm": 1.0334786176681519, "learning_rate": 2.9170749485610947e-05, "loss": 0.0335, "num_input_tokens_seen": 462332928, "step": 451500 }, { "epoch": 4.1704726843265885, "grad_norm": 1.9780852794647217, "learning_rate": 2.9147682711914454e-05, "loss": 0.0344, "num_input_tokens_seen": 462844928, "step": 452000 }, { "epoch": 4.175086039065888, "grad_norm": 0.8200696706771851, "learning_rate": 2.912461593821795e-05, "loss": 0.033, "num_input_tokens_seen": 463356928, "step": 452500 }, { "epoch": 4.179699393805187, "grad_norm": 1.0019230842590332, "learning_rate": 2.910154916452146e-05, "loss": 0.0303, "num_input_tokens_seen": 463868928, "step": 453000 }, { "epoch": 4.184312748544486, "grad_norm": 2.18719744682312, "learning_rate": 2.9078482390824963e-05, "loss": 0.03, "num_input_tokens_seen": 464380928, "step": 453500 }, { "epoch": 4.1889261032837855, "grad_norm": 1.2453852891921997, "learning_rate": 2.9055415617128467e-05, "loss": 0.0306, "num_input_tokens_seen": 464892928, "step": 454000 }, { "epoch": 4.193539458023086, "grad_norm": 2.0544652938842773, "learning_rate": 2.9032348843431967e-05, "loss": 0.0331, "num_input_tokens_seen": 465404928, "step": 454500 }, { "epoch": 4.198152812762385, "grad_norm": 5.509039878845215, "learning_rate": 2.900928206973547e-05, "loss": 0.0308, "num_input_tokens_seen": 465916928, "step": 455000 }, { "epoch": 4.202766167501684, "grad_norm": 0.6365485787391663, "learning_rate": 2.8986215296038975e-05, "loss": 0.0322, "num_input_tokens_seen": 466428928, "step": 455500 }, { "epoch": 4.207379522240983, "grad_norm": 0.8369764685630798, "learning_rate": 2.8963148522342476e-05, "loss": 0.0311, "num_input_tokens_seen": 466940928, "step": 456000 }, { "epoch": 4.211992876980283, "grad_norm": 1.3454687595367432, "learning_rate": 2.894008174864598e-05, "loss": 0.0317, "num_input_tokens_seen": 467452928, "step": 456500 }, { "epoch": 4.216606231719582, "grad_norm": 1.042900800704956, "learning_rate": 2.8917014974949487e-05, "loss": 0.0304, "num_input_tokens_seen": 467964928, "step": 457000 }, { "epoch": 4.221219586458881, "grad_norm": 2.2044434547424316, "learning_rate": 2.889394820125299e-05, "loss": 0.0309, "num_input_tokens_seen": 468476928, "step": 457500 }, { "epoch": 4.22583294119818, "grad_norm": 1.4156602621078491, "learning_rate": 2.887088142755649e-05, "loss": 0.0325, "num_input_tokens_seen": 468988928, "step": 458000 }, { "epoch": 4.23044629593748, "grad_norm": 1.4290229082107544, "learning_rate": 2.8847814653859996e-05, "loss": 0.034, "num_input_tokens_seen": 469500928, "step": 458500 }, { "epoch": 4.235059650676779, "grad_norm": 0.8856704235076904, "learning_rate": 2.88247478801635e-05, "loss": 0.0301, "num_input_tokens_seen": 470012928, "step": 459000 }, { "epoch": 4.239673005416078, "grad_norm": 1.0637128353118896, "learning_rate": 2.8801681106467004e-05, "loss": 0.0315, "num_input_tokens_seen": 470524928, "step": 459500 }, { "epoch": 4.244286360155378, "grad_norm": 0.9506544470787048, "learning_rate": 2.8778614332770504e-05, "loss": 0.0298, "num_input_tokens_seen": 471036928, "step": 460000 }, { "epoch": 4.248899714894677, "grad_norm": 1.05034339427948, "learning_rate": 2.8755547559074008e-05, "loss": 0.0331, "num_input_tokens_seen": 471548928, "step": 460500 }, { "epoch": 4.253513069633977, "grad_norm": 1.1537014245986938, "learning_rate": 2.8732480785377512e-05, "loss": 0.0309, "num_input_tokens_seen": 472060928, "step": 461000 }, { "epoch": 4.258126424373276, "grad_norm": 0.42139768600463867, "learning_rate": 2.8709414011681013e-05, "loss": 0.032, "num_input_tokens_seen": 472572928, "step": 461500 }, { "epoch": 4.262739779112575, "grad_norm": 2.2188069820404053, "learning_rate": 2.8686347237984517e-05, "loss": 0.0301, "num_input_tokens_seen": 473084928, "step": 462000 }, { "epoch": 4.267353133851874, "grad_norm": 1.293926477432251, "learning_rate": 2.8663280464288024e-05, "loss": 0.0323, "num_input_tokens_seen": 473596928, "step": 462500 }, { "epoch": 4.271966488591174, "grad_norm": 1.7295567989349365, "learning_rate": 2.8640213690591528e-05, "loss": 0.0286, "num_input_tokens_seen": 474108928, "step": 463000 }, { "epoch": 4.276579843330473, "grad_norm": 1.3442994356155396, "learning_rate": 2.8617146916895025e-05, "loss": 0.0312, "num_input_tokens_seen": 474620928, "step": 463500 }, { "epoch": 4.281193198069772, "grad_norm": 1.4000321626663208, "learning_rate": 2.8594080143198533e-05, "loss": 0.0326, "num_input_tokens_seen": 475132928, "step": 464000 }, { "epoch": 4.285806552809071, "grad_norm": 1.4646140336990356, "learning_rate": 2.8571013369502037e-05, "loss": 0.0316, "num_input_tokens_seen": 475644928, "step": 464500 }, { "epoch": 4.290419907548371, "grad_norm": 1.296420931816101, "learning_rate": 2.854794659580554e-05, "loss": 0.0328, "num_input_tokens_seen": 476156928, "step": 465000 }, { "epoch": 4.295033262287671, "grad_norm": 0.947172999382019, "learning_rate": 2.852487982210904e-05, "loss": 0.031, "num_input_tokens_seen": 476668928, "step": 465500 }, { "epoch": 4.29964661702697, "grad_norm": 0.6631402969360352, "learning_rate": 2.8501813048412545e-05, "loss": 0.0291, "num_input_tokens_seen": 477180928, "step": 466000 }, { "epoch": 4.304259971766269, "grad_norm": 0.5878441333770752, "learning_rate": 2.847874627471605e-05, "loss": 0.0316, "num_input_tokens_seen": 477692928, "step": 466500 }, { "epoch": 4.3088733265055685, "grad_norm": 1.32041335105896, "learning_rate": 2.8455679501019557e-05, "loss": 0.0296, "num_input_tokens_seen": 478204928, "step": 467000 }, { "epoch": 4.313486681244868, "grad_norm": 0.7355374097824097, "learning_rate": 2.8432612727323054e-05, "loss": 0.0322, "num_input_tokens_seen": 478716928, "step": 467500 }, { "epoch": 4.318100035984167, "grad_norm": 0.5715786218643188, "learning_rate": 2.840954595362656e-05, "loss": 0.0345, "num_input_tokens_seen": 479228928, "step": 468000 }, { "epoch": 4.322713390723466, "grad_norm": 0.873299777507782, "learning_rate": 2.8386479179930065e-05, "loss": 0.0341, "num_input_tokens_seen": 479740928, "step": 468500 }, { "epoch": 4.3273267454627655, "grad_norm": 0.4993022382259369, "learning_rate": 2.8363412406233562e-05, "loss": 0.0347, "num_input_tokens_seen": 480252928, "step": 469000 }, { "epoch": 4.331940100202065, "grad_norm": 1.0970638990402222, "learning_rate": 2.834034563253707e-05, "loss": 0.0297, "num_input_tokens_seen": 480764928, "step": 469500 }, { "epoch": 4.336553454941364, "grad_norm": 1.030454158782959, "learning_rate": 2.8317278858840574e-05, "loss": 0.0309, "num_input_tokens_seen": 481276928, "step": 470000 }, { "epoch": 4.341166809680663, "grad_norm": 2.224727153778076, "learning_rate": 2.8294212085144078e-05, "loss": 0.0319, "num_input_tokens_seen": 481788928, "step": 470500 }, { "epoch": 4.3457801644199625, "grad_norm": 0.8922818899154663, "learning_rate": 2.8271145311447578e-05, "loss": 0.0324, "num_input_tokens_seen": 482300928, "step": 471000 }, { "epoch": 4.350393519159263, "grad_norm": 1.355394721031189, "learning_rate": 2.8248078537751082e-05, "loss": 0.0322, "num_input_tokens_seen": 482812928, "step": 471500 }, { "epoch": 4.355006873898562, "grad_norm": 1.3697582483291626, "learning_rate": 2.8225011764054586e-05, "loss": 0.0307, "num_input_tokens_seen": 483324928, "step": 472000 }, { "epoch": 4.359620228637861, "grad_norm": 0.8543123006820679, "learning_rate": 2.8201944990358093e-05, "loss": 0.0308, "num_input_tokens_seen": 483836928, "step": 472500 }, { "epoch": 4.36423358337716, "grad_norm": 1.2586286067962646, "learning_rate": 2.817887821666159e-05, "loss": 0.0334, "num_input_tokens_seen": 484348928, "step": 473000 }, { "epoch": 4.36884693811646, "grad_norm": 1.0295668840408325, "learning_rate": 2.8155811442965098e-05, "loss": 0.0324, "num_input_tokens_seen": 484860928, "step": 473500 }, { "epoch": 4.373460292855759, "grad_norm": 1.3368573188781738, "learning_rate": 2.8132744669268602e-05, "loss": 0.0303, "num_input_tokens_seen": 485372928, "step": 474000 }, { "epoch": 4.378073647595058, "grad_norm": 0.5129613280296326, "learning_rate": 2.81096778955721e-05, "loss": 0.031, "num_input_tokens_seen": 485884928, "step": 474500 }, { "epoch": 4.382687002334357, "grad_norm": 0.7094746828079224, "learning_rate": 2.8086611121875606e-05, "loss": 0.0305, "num_input_tokens_seen": 486396928, "step": 475000 }, { "epoch": 4.387300357073657, "grad_norm": 1.2379733324050903, "learning_rate": 2.806354434817911e-05, "loss": 0.035, "num_input_tokens_seen": 486908928, "step": 475500 }, { "epoch": 4.391913711812956, "grad_norm": 0.9573284387588501, "learning_rate": 2.8040477574482614e-05, "loss": 0.0321, "num_input_tokens_seen": 487420928, "step": 476000 }, { "epoch": 4.396527066552255, "grad_norm": 0.8460474014282227, "learning_rate": 2.8017410800786115e-05, "loss": 0.032, "num_input_tokens_seen": 487932928, "step": 476500 }, { "epoch": 4.401140421291554, "grad_norm": 0.5795192122459412, "learning_rate": 2.799434402708962e-05, "loss": 0.0331, "num_input_tokens_seen": 488444928, "step": 477000 }, { "epoch": 4.4057537760308545, "grad_norm": 2.4742841720581055, "learning_rate": 2.7971277253393123e-05, "loss": 0.0287, "num_input_tokens_seen": 488956928, "step": 477500 }, { "epoch": 4.410367130770154, "grad_norm": 2.2295806407928467, "learning_rate": 2.794821047969663e-05, "loss": 0.0343, "num_input_tokens_seen": 489468928, "step": 478000 }, { "epoch": 4.414980485509453, "grad_norm": 1.4073495864868164, "learning_rate": 2.7925143706000128e-05, "loss": 0.0335, "num_input_tokens_seen": 489980928, "step": 478500 }, { "epoch": 4.419593840248752, "grad_norm": 1.378461480140686, "learning_rate": 2.7902076932303635e-05, "loss": 0.0343, "num_input_tokens_seen": 490492928, "step": 479000 }, { "epoch": 4.4242071949880515, "grad_norm": 0.6204975247383118, "learning_rate": 2.787901015860714e-05, "loss": 0.0323, "num_input_tokens_seen": 491004928, "step": 479500 }, { "epoch": 4.428820549727351, "grad_norm": 1.0409677028656006, "learning_rate": 2.7855943384910643e-05, "loss": 0.0325, "num_input_tokens_seen": 491516928, "step": 480000 }, { "epoch": 4.43343390446665, "grad_norm": 1.2104921340942383, "learning_rate": 2.7832876611214143e-05, "loss": 0.0339, "num_input_tokens_seen": 492028928, "step": 480500 }, { "epoch": 4.438047259205949, "grad_norm": 2.0074825286865234, "learning_rate": 2.7809809837517647e-05, "loss": 0.0322, "num_input_tokens_seen": 492540928, "step": 481000 }, { "epoch": 4.4426606139452485, "grad_norm": 0.8541880249977112, "learning_rate": 2.778674306382115e-05, "loss": 0.0299, "num_input_tokens_seen": 493052928, "step": 481500 }, { "epoch": 4.447273968684548, "grad_norm": 2.382373332977295, "learning_rate": 2.7763676290124652e-05, "loss": 0.0303, "num_input_tokens_seen": 493564928, "step": 482000 }, { "epoch": 4.451887323423847, "grad_norm": 0.8820599317550659, "learning_rate": 2.7740609516428156e-05, "loss": 0.0306, "num_input_tokens_seen": 494076928, "step": 482500 }, { "epoch": 4.456500678163147, "grad_norm": 0.6329056620597839, "learning_rate": 2.771754274273166e-05, "loss": 0.0313, "num_input_tokens_seen": 494588928, "step": 483000 }, { "epoch": 4.461114032902446, "grad_norm": 0.7391223311424255, "learning_rate": 2.7694475969035167e-05, "loss": 0.0342, "num_input_tokens_seen": 495100928, "step": 483500 }, { "epoch": 4.465727387641746, "grad_norm": 0.6143118143081665, "learning_rate": 2.7671409195338664e-05, "loss": 0.0324, "num_input_tokens_seen": 495612928, "step": 484000 }, { "epoch": 4.470340742381045, "grad_norm": 2.01242733001709, "learning_rate": 2.7648342421642172e-05, "loss": 0.029, "num_input_tokens_seen": 496124928, "step": 484500 }, { "epoch": 4.474954097120344, "grad_norm": 0.9278964996337891, "learning_rate": 2.7625275647945676e-05, "loss": 0.0362, "num_input_tokens_seen": 496636928, "step": 485000 }, { "epoch": 4.479567451859643, "grad_norm": 1.0499247312545776, "learning_rate": 2.760220887424918e-05, "loss": 0.033, "num_input_tokens_seen": 497148928, "step": 485500 }, { "epoch": 4.484180806598943, "grad_norm": 1.7017521858215332, "learning_rate": 2.757914210055268e-05, "loss": 0.0304, "num_input_tokens_seen": 497660928, "step": 486000 }, { "epoch": 4.488794161338242, "grad_norm": 2.3478429317474365, "learning_rate": 2.7556075326856184e-05, "loss": 0.0329, "num_input_tokens_seen": 498172928, "step": 486500 }, { "epoch": 4.493407516077541, "grad_norm": 3.133190155029297, "learning_rate": 2.7533008553159688e-05, "loss": 0.0341, "num_input_tokens_seen": 498684928, "step": 487000 }, { "epoch": 4.49802087081684, "grad_norm": 0.5625250339508057, "learning_rate": 2.750994177946319e-05, "loss": 0.0328, "num_input_tokens_seen": 499196928, "step": 487500 }, { "epoch": 4.50263422555614, "grad_norm": 1.0259020328521729, "learning_rate": 2.7486875005766693e-05, "loss": 0.0318, "num_input_tokens_seen": 499708928, "step": 488000 }, { "epoch": 4.50724758029544, "grad_norm": 0.48490577936172485, "learning_rate": 2.7463808232070197e-05, "loss": 0.0335, "num_input_tokens_seen": 500220928, "step": 488500 }, { "epoch": 4.511860935034738, "grad_norm": 0.40793031454086304, "learning_rate": 2.7440741458373704e-05, "loss": 0.0312, "num_input_tokens_seen": 500732928, "step": 489000 }, { "epoch": 4.516474289774038, "grad_norm": 1.1319341659545898, "learning_rate": 2.74176746846772e-05, "loss": 0.0327, "num_input_tokens_seen": 501244928, "step": 489500 }, { "epoch": 4.5210876445133374, "grad_norm": 1.9659985303878784, "learning_rate": 2.739460791098071e-05, "loss": 0.0338, "num_input_tokens_seen": 501756928, "step": 490000 }, { "epoch": 4.525700999252637, "grad_norm": 0.5315821766853333, "learning_rate": 2.7371541137284213e-05, "loss": 0.0336, "num_input_tokens_seen": 502268928, "step": 490500 }, { "epoch": 4.530314353991936, "grad_norm": 0.47908708453178406, "learning_rate": 2.7348474363587717e-05, "loss": 0.0295, "num_input_tokens_seen": 502780928, "step": 491000 }, { "epoch": 4.534927708731235, "grad_norm": 0.9557788968086243, "learning_rate": 2.7325407589891217e-05, "loss": 0.0314, "num_input_tokens_seen": 503292928, "step": 491500 }, { "epoch": 4.539541063470534, "grad_norm": 1.229929804801941, "learning_rate": 2.730234081619472e-05, "loss": 0.03, "num_input_tokens_seen": 503804928, "step": 492000 }, { "epoch": 4.544154418209834, "grad_norm": 2.0131001472473145, "learning_rate": 2.7279274042498225e-05, "loss": 0.0334, "num_input_tokens_seen": 504316928, "step": 492500 }, { "epoch": 4.548767772949133, "grad_norm": 1.8093568086624146, "learning_rate": 2.7256207268801732e-05, "loss": 0.0315, "num_input_tokens_seen": 504828928, "step": 493000 }, { "epoch": 4.553381127688432, "grad_norm": 3.043375253677368, "learning_rate": 2.723314049510523e-05, "loss": 0.0336, "num_input_tokens_seen": 505340928, "step": 493500 }, { "epoch": 4.557994482427731, "grad_norm": 1.5375556945800781, "learning_rate": 2.7210073721408734e-05, "loss": 0.0334, "num_input_tokens_seen": 505852928, "step": 494000 }, { "epoch": 4.562607837167031, "grad_norm": 1.2980600595474243, "learning_rate": 2.718700694771224e-05, "loss": 0.0315, "num_input_tokens_seen": 506364928, "step": 494500 }, { "epoch": 4.567221191906331, "grad_norm": 1.3334441184997559, "learning_rate": 2.7163940174015738e-05, "loss": 0.0345, "num_input_tokens_seen": 506876928, "step": 495000 }, { "epoch": 4.57183454664563, "grad_norm": 10.070221900939941, "learning_rate": 2.7140873400319246e-05, "loss": 0.034, "num_input_tokens_seen": 507388928, "step": 495500 }, { "epoch": 4.576447901384929, "grad_norm": 9.152368545532227, "learning_rate": 2.711780662662275e-05, "loss": 0.0316, "num_input_tokens_seen": 507900928, "step": 496000 }, { "epoch": 4.5810612561242285, "grad_norm": 2.569089651107788, "learning_rate": 2.7094739852926254e-05, "loss": 0.0336, "num_input_tokens_seen": 508412928, "step": 496500 }, { "epoch": 4.585674610863528, "grad_norm": 0.7014693021774292, "learning_rate": 2.7071673079229754e-05, "loss": 0.0316, "num_input_tokens_seen": 508924928, "step": 497000 }, { "epoch": 4.590287965602827, "grad_norm": 1.182787537574768, "learning_rate": 2.7048606305533258e-05, "loss": 0.0315, "num_input_tokens_seen": 509436928, "step": 497500 }, { "epoch": 4.594901320342126, "grad_norm": 0.6506703495979309, "learning_rate": 2.7025539531836762e-05, "loss": 0.0328, "num_input_tokens_seen": 509948928, "step": 498000 }, { "epoch": 4.5995146750814255, "grad_norm": 0.5681861639022827, "learning_rate": 2.700247275814027e-05, "loss": 0.0343, "num_input_tokens_seen": 510460928, "step": 498500 }, { "epoch": 4.604128029820725, "grad_norm": 1.2895385026931763, "learning_rate": 2.6979405984443767e-05, "loss": 0.0332, "num_input_tokens_seen": 510972928, "step": 499000 }, { "epoch": 4.608741384560024, "grad_norm": 1.2549630403518677, "learning_rate": 2.6956339210747274e-05, "loss": 0.032, "num_input_tokens_seen": 511484928, "step": 499500 }, { "epoch": 4.613354739299323, "grad_norm": 1.486061692237854, "learning_rate": 2.6933272437050778e-05, "loss": 0.0331, "num_input_tokens_seen": 511996928, "step": 500000 }, { "epoch": 4.617968094038623, "grad_norm": 1.0897846221923828, "learning_rate": 2.6910205663354275e-05, "loss": 0.0327, "num_input_tokens_seen": 512508928, "step": 500500 }, { "epoch": 4.622581448777923, "grad_norm": 0.9600527286529541, "learning_rate": 2.6887138889657782e-05, "loss": 0.0326, "num_input_tokens_seen": 513020928, "step": 501000 }, { "epoch": 4.627194803517222, "grad_norm": 3.943963050842285, "learning_rate": 2.6864072115961286e-05, "loss": 0.0283, "num_input_tokens_seen": 513532928, "step": 501500 }, { "epoch": 4.631808158256521, "grad_norm": 1.1537055969238281, "learning_rate": 2.684100534226479e-05, "loss": 0.0353, "num_input_tokens_seen": 514044928, "step": 502000 }, { "epoch": 4.63642151299582, "grad_norm": 2.200751543045044, "learning_rate": 2.681793856856829e-05, "loss": 0.0323, "num_input_tokens_seen": 514556928, "step": 502500 }, { "epoch": 4.64103486773512, "grad_norm": 1.1844205856323242, "learning_rate": 2.6794871794871795e-05, "loss": 0.0308, "num_input_tokens_seen": 515068928, "step": 503000 }, { "epoch": 4.645648222474419, "grad_norm": 4.328240871429443, "learning_rate": 2.67718050211753e-05, "loss": 0.0337, "num_input_tokens_seen": 515580928, "step": 503500 }, { "epoch": 4.650261577213718, "grad_norm": 1.1905447244644165, "learning_rate": 2.6748738247478806e-05, "loss": 0.0335, "num_input_tokens_seen": 516092928, "step": 504000 }, { "epoch": 4.654874931953017, "grad_norm": 0.4069402813911438, "learning_rate": 2.6725671473782303e-05, "loss": 0.034, "num_input_tokens_seen": 516604928, "step": 504500 }, { "epoch": 4.659488286692317, "grad_norm": 0.7860555648803711, "learning_rate": 2.670260470008581e-05, "loss": 0.034, "num_input_tokens_seen": 517116928, "step": 505000 }, { "epoch": 4.664101641431616, "grad_norm": 0.5769841074943542, "learning_rate": 2.6679537926389315e-05, "loss": 0.033, "num_input_tokens_seen": 517628928, "step": 505500 }, { "epoch": 4.668714996170916, "grad_norm": 1.5153945684432983, "learning_rate": 2.665647115269282e-05, "loss": 0.031, "num_input_tokens_seen": 518140928, "step": 506000 }, { "epoch": 4.673328350910215, "grad_norm": 1.6713037490844727, "learning_rate": 2.663340437899632e-05, "loss": 0.037, "num_input_tokens_seen": 518652928, "step": 506500 }, { "epoch": 4.6779417056495145, "grad_norm": 1.2307850122451782, "learning_rate": 2.6610337605299823e-05, "loss": 0.0318, "num_input_tokens_seen": 519164928, "step": 507000 }, { "epoch": 4.682555060388814, "grad_norm": 1.2771391868591309, "learning_rate": 2.6587270831603327e-05, "loss": 0.0292, "num_input_tokens_seen": 519676928, "step": 507500 }, { "epoch": 4.687168415128113, "grad_norm": 1.468724012374878, "learning_rate": 2.6564204057906828e-05, "loss": 0.0314, "num_input_tokens_seen": 520188928, "step": 508000 }, { "epoch": 4.691781769867412, "grad_norm": 0.9526101350784302, "learning_rate": 2.6541137284210332e-05, "loss": 0.033, "num_input_tokens_seen": 520700928, "step": 508500 }, { "epoch": 4.6963951246067115, "grad_norm": 0.8857848048210144, "learning_rate": 2.6518070510513836e-05, "loss": 0.0333, "num_input_tokens_seen": 521212928, "step": 509000 }, { "epoch": 4.701008479346011, "grad_norm": 1.5435466766357422, "learning_rate": 2.6495003736817343e-05, "loss": 0.0319, "num_input_tokens_seen": 521724928, "step": 509500 }, { "epoch": 4.70562183408531, "grad_norm": 0.6249234676361084, "learning_rate": 2.647193696312084e-05, "loss": 0.0307, "num_input_tokens_seen": 522236928, "step": 510000 }, { "epoch": 4.710235188824609, "grad_norm": 0.7634549140930176, "learning_rate": 2.6448870189424348e-05, "loss": 0.0325, "num_input_tokens_seen": 522748928, "step": 510500 }, { "epoch": 4.7148485435639085, "grad_norm": 0.8510231375694275, "learning_rate": 2.6425803415727852e-05, "loss": 0.0323, "num_input_tokens_seen": 523260928, "step": 511000 }, { "epoch": 4.719461898303209, "grad_norm": 0.797269344329834, "learning_rate": 2.6402736642031356e-05, "loss": 0.035, "num_input_tokens_seen": 523772928, "step": 511500 }, { "epoch": 4.724075253042507, "grad_norm": 1.6006139516830444, "learning_rate": 2.6379669868334856e-05, "loss": 0.0311, "num_input_tokens_seen": 524284928, "step": 512000 }, { "epoch": 4.728688607781807, "grad_norm": 0.5628824234008789, "learning_rate": 2.635660309463836e-05, "loss": 0.0298, "num_input_tokens_seen": 524796928, "step": 512500 }, { "epoch": 4.733301962521106, "grad_norm": 1.2842258214950562, "learning_rate": 2.6333536320941864e-05, "loss": 0.0329, "num_input_tokens_seen": 525308928, "step": 513000 }, { "epoch": 4.737915317260406, "grad_norm": 1.3331750631332397, "learning_rate": 2.6310469547245365e-05, "loss": 0.0346, "num_input_tokens_seen": 525820928, "step": 513500 }, { "epoch": 4.742528671999705, "grad_norm": 2.3819310665130615, "learning_rate": 2.628740277354887e-05, "loss": 0.0339, "num_input_tokens_seen": 526332928, "step": 514000 }, { "epoch": 4.747142026739004, "grad_norm": 0.8976543545722961, "learning_rate": 2.6264335999852373e-05, "loss": 0.035, "num_input_tokens_seen": 526844928, "step": 514500 }, { "epoch": 4.751755381478303, "grad_norm": 2.7922868728637695, "learning_rate": 2.624126922615588e-05, "loss": 0.0344, "num_input_tokens_seen": 527356928, "step": 515000 }, { "epoch": 4.756368736217603, "grad_norm": 1.2664451599121094, "learning_rate": 2.6218202452459377e-05, "loss": 0.033, "num_input_tokens_seen": 527868928, "step": 515500 }, { "epoch": 4.760982090956902, "grad_norm": 1.8173182010650635, "learning_rate": 2.6195135678762885e-05, "loss": 0.033, "num_input_tokens_seen": 528380928, "step": 516000 }, { "epoch": 4.765595445696201, "grad_norm": 1.2038295269012451, "learning_rate": 2.617206890506639e-05, "loss": 0.0329, "num_input_tokens_seen": 528892928, "step": 516500 }, { "epoch": 4.7702088004355, "grad_norm": 1.3875302076339722, "learning_rate": 2.6149002131369893e-05, "loss": 0.0337, "num_input_tokens_seen": 529404928, "step": 517000 }, { "epoch": 4.7748221551748, "grad_norm": 0.6060103178024292, "learning_rate": 2.6125935357673393e-05, "loss": 0.0331, "num_input_tokens_seen": 529916928, "step": 517500 }, { "epoch": 4.7794355099141, "grad_norm": 3.217010259628296, "learning_rate": 2.6102868583976897e-05, "loss": 0.0365, "num_input_tokens_seen": 530428928, "step": 518000 }, { "epoch": 4.784048864653399, "grad_norm": 1.3630263805389404, "learning_rate": 2.60798018102804e-05, "loss": 0.0352, "num_input_tokens_seen": 530940928, "step": 518500 }, { "epoch": 4.788662219392698, "grad_norm": 1.875205397605896, "learning_rate": 2.605673503658391e-05, "loss": 0.0312, "num_input_tokens_seen": 531452928, "step": 519000 }, { "epoch": 4.7932755741319975, "grad_norm": 1.0889365673065186, "learning_rate": 2.6033668262887406e-05, "loss": 0.032, "num_input_tokens_seen": 531964928, "step": 519500 }, { "epoch": 4.797888928871297, "grad_norm": 1.8945229053497314, "learning_rate": 2.601060148919091e-05, "loss": 0.0318, "num_input_tokens_seen": 532476928, "step": 520000 }, { "epoch": 4.802502283610596, "grad_norm": 0.8704883456230164, "learning_rate": 2.5987534715494417e-05, "loss": 0.0353, "num_input_tokens_seen": 532988928, "step": 520500 }, { "epoch": 4.807115638349895, "grad_norm": 0.5920878052711487, "learning_rate": 2.5964467941797914e-05, "loss": 0.0352, "num_input_tokens_seen": 533500928, "step": 521000 }, { "epoch": 4.811728993089194, "grad_norm": 1.7447361946105957, "learning_rate": 2.594140116810142e-05, "loss": 0.0333, "num_input_tokens_seen": 534012928, "step": 521500 }, { "epoch": 4.816342347828494, "grad_norm": 2.5715444087982178, "learning_rate": 2.5918334394404926e-05, "loss": 0.0331, "num_input_tokens_seen": 534524928, "step": 522000 }, { "epoch": 4.820955702567793, "grad_norm": 1.5223846435546875, "learning_rate": 2.589526762070843e-05, "loss": 0.0326, "num_input_tokens_seen": 535036928, "step": 522500 }, { "epoch": 4.825569057307092, "grad_norm": 1.0512726306915283, "learning_rate": 2.587220084701193e-05, "loss": 0.0312, "num_input_tokens_seen": 535548928, "step": 523000 }, { "epoch": 4.830182412046392, "grad_norm": 1.2424243688583374, "learning_rate": 2.5849134073315434e-05, "loss": 0.0356, "num_input_tokens_seen": 536060928, "step": 523500 }, { "epoch": 4.834795766785692, "grad_norm": 1.2689915895462036, "learning_rate": 2.5826067299618938e-05, "loss": 0.0317, "num_input_tokens_seen": 536572928, "step": 524000 }, { "epoch": 4.839409121524991, "grad_norm": 0.5996227860450745, "learning_rate": 2.5803000525922445e-05, "loss": 0.0318, "num_input_tokens_seen": 537084928, "step": 524500 }, { "epoch": 4.84402247626429, "grad_norm": 1.7113879919052124, "learning_rate": 2.5779933752225943e-05, "loss": 0.0322, "num_input_tokens_seen": 537596928, "step": 525000 }, { "epoch": 4.848635831003589, "grad_norm": 5.173702239990234, "learning_rate": 2.5756866978529447e-05, "loss": 0.0338, "num_input_tokens_seen": 538108928, "step": 525500 }, { "epoch": 4.8532491857428885, "grad_norm": 2.208484172821045, "learning_rate": 2.5733800204832954e-05, "loss": 0.0335, "num_input_tokens_seen": 538620928, "step": 526000 }, { "epoch": 4.857862540482188, "grad_norm": 0.7695846557617188, "learning_rate": 2.571073343113645e-05, "loss": 0.0323, "num_input_tokens_seen": 539132928, "step": 526500 }, { "epoch": 4.862475895221487, "grad_norm": 0.6419717073440552, "learning_rate": 2.568766665743996e-05, "loss": 0.0313, "num_input_tokens_seen": 539644928, "step": 527000 }, { "epoch": 4.867089249960786, "grad_norm": 0.4510629177093506, "learning_rate": 2.5664599883743462e-05, "loss": 0.0323, "num_input_tokens_seen": 540156928, "step": 527500 }, { "epoch": 4.8717026047000855, "grad_norm": 0.6697828769683838, "learning_rate": 2.5641533110046966e-05, "loss": 0.0306, "num_input_tokens_seen": 540668928, "step": 528000 }, { "epoch": 4.876315959439385, "grad_norm": 0.30349186062812805, "learning_rate": 2.5618466336350467e-05, "loss": 0.0354, "num_input_tokens_seen": 541180928, "step": 528500 }, { "epoch": 4.880929314178685, "grad_norm": 0.9010013937950134, "learning_rate": 2.559539956265397e-05, "loss": 0.0334, "num_input_tokens_seen": 541692928, "step": 529000 }, { "epoch": 4.885542668917984, "grad_norm": 5.212312698364258, "learning_rate": 2.5572332788957475e-05, "loss": 0.0338, "num_input_tokens_seen": 542204928, "step": 529500 }, { "epoch": 4.890156023657283, "grad_norm": 0.5742513537406921, "learning_rate": 2.5549266015260982e-05, "loss": 0.0331, "num_input_tokens_seen": 542716928, "step": 530000 }, { "epoch": 4.894769378396583, "grad_norm": 1.1083173751831055, "learning_rate": 2.552619924156448e-05, "loss": 0.0332, "num_input_tokens_seen": 543228928, "step": 530500 }, { "epoch": 4.899382733135882, "grad_norm": 2.323056697845459, "learning_rate": 2.5503132467867983e-05, "loss": 0.0316, "num_input_tokens_seen": 543740928, "step": 531000 }, { "epoch": 4.903996087875181, "grad_norm": 0.8404493927955627, "learning_rate": 2.548006569417149e-05, "loss": 0.0325, "num_input_tokens_seen": 544252928, "step": 531500 }, { "epoch": 4.90860944261448, "grad_norm": 0.7807884216308594, "learning_rate": 2.5456998920474995e-05, "loss": 0.034, "num_input_tokens_seen": 544764928, "step": 532000 }, { "epoch": 4.91322279735378, "grad_norm": 1.5149301290512085, "learning_rate": 2.5433932146778495e-05, "loss": 0.0329, "num_input_tokens_seen": 545276928, "step": 532500 }, { "epoch": 4.917836152093079, "grad_norm": 2.3330907821655273, "learning_rate": 2.5410865373082e-05, "loss": 0.0315, "num_input_tokens_seen": 545788928, "step": 533000 }, { "epoch": 4.922449506832378, "grad_norm": 0.9304101467132568, "learning_rate": 2.5387798599385503e-05, "loss": 0.0316, "num_input_tokens_seen": 546300928, "step": 533500 }, { "epoch": 4.927062861571677, "grad_norm": 1.3839999437332153, "learning_rate": 2.5364731825689004e-05, "loss": 0.0339, "num_input_tokens_seen": 546812928, "step": 534000 }, { "epoch": 4.931676216310977, "grad_norm": 1.3032892942428589, "learning_rate": 2.5341665051992508e-05, "loss": 0.0377, "num_input_tokens_seen": 547324928, "step": 534500 }, { "epoch": 4.936289571050276, "grad_norm": 0.5184182524681091, "learning_rate": 2.5318598278296012e-05, "loss": 0.0327, "num_input_tokens_seen": 547836928, "step": 535000 }, { "epoch": 4.940902925789576, "grad_norm": 4.176392078399658, "learning_rate": 2.529553150459952e-05, "loss": 0.0311, "num_input_tokens_seen": 548348928, "step": 535500 }, { "epoch": 4.945516280528875, "grad_norm": 1.8942577838897705, "learning_rate": 2.5272464730903016e-05, "loss": 0.0312, "num_input_tokens_seen": 548860928, "step": 536000 }, { "epoch": 4.9501296352681745, "grad_norm": 0.4011167585849762, "learning_rate": 2.524939795720652e-05, "loss": 0.0297, "num_input_tokens_seen": 549372928, "step": 536500 }, { "epoch": 4.954742990007474, "grad_norm": 1.2499672174453735, "learning_rate": 2.5226331183510028e-05, "loss": 0.0351, "num_input_tokens_seen": 549884928, "step": 537000 }, { "epoch": 4.959356344746773, "grad_norm": 1.7503982782363892, "learning_rate": 2.520326440981353e-05, "loss": 0.0346, "num_input_tokens_seen": 550396928, "step": 537500 }, { "epoch": 4.963969699486072, "grad_norm": 0.9771599173545837, "learning_rate": 2.5180197636117032e-05, "loss": 0.0344, "num_input_tokens_seen": 550908928, "step": 538000 }, { "epoch": 4.9685830542253715, "grad_norm": 1.7374619245529175, "learning_rate": 2.5157130862420536e-05, "loss": 0.0328, "num_input_tokens_seen": 551420928, "step": 538500 }, { "epoch": 4.973196408964671, "grad_norm": 2.459627866744995, "learning_rate": 2.513406408872404e-05, "loss": 0.0304, "num_input_tokens_seen": 551932928, "step": 539000 }, { "epoch": 4.97780976370397, "grad_norm": 1.0150238275527954, "learning_rate": 2.511099731502754e-05, "loss": 0.0341, "num_input_tokens_seen": 552444928, "step": 539500 }, { "epoch": 4.982423118443269, "grad_norm": 0.5386485457420349, "learning_rate": 2.5087930541331045e-05, "loss": 0.0323, "num_input_tokens_seen": 552956928, "step": 540000 }, { "epoch": 4.9870364731825685, "grad_norm": 2.0339949131011963, "learning_rate": 2.506486376763455e-05, "loss": 0.0308, "num_input_tokens_seen": 553468928, "step": 540500 }, { "epoch": 4.991649827921869, "grad_norm": 0.7838632464408875, "learning_rate": 2.5041796993938056e-05, "loss": 0.0335, "num_input_tokens_seen": 553980928, "step": 541000 }, { "epoch": 4.996263182661168, "grad_norm": 1.2253855466842651, "learning_rate": 2.5018730220241553e-05, "loss": 0.0325, "num_input_tokens_seen": 554492928, "step": 541500 }, { "epoch": 5.0, "eval_combined_score": 0.0704431934497777, "eval_loss": 0.07044319063425064, "eval_mse": 0.07044319626530475, "eval_runtime": 45.8855, "eval_samples_per_second": 2099.529, "eval_steps_per_second": 262.457, "num_input_tokens_seen": 554906880, "step": 541905 }, { "epoch": 5.000876537400467, "grad_norm": 1.9685852527618408, "learning_rate": 2.499566344654506e-05, "loss": 0.0352, "num_input_tokens_seen": 555004160, "step": 542000 }, { "epoch": 5.005489892139766, "grad_norm": 1.419827938079834, "learning_rate": 2.4972596672848565e-05, "loss": 0.0302, "num_input_tokens_seen": 555516160, "step": 542500 }, { "epoch": 5.010103246879066, "grad_norm": 3.999183177947998, "learning_rate": 2.4949529899152065e-05, "loss": 0.0242, "num_input_tokens_seen": 556028160, "step": 543000 }, { "epoch": 5.014716601618365, "grad_norm": 1.758694052696228, "learning_rate": 2.4926463125455573e-05, "loss": 0.0261, "num_input_tokens_seen": 556540160, "step": 543500 }, { "epoch": 5.019329956357664, "grad_norm": 1.1982614994049072, "learning_rate": 2.4903396351759073e-05, "loss": 0.0245, "num_input_tokens_seen": 557052160, "step": 544000 }, { "epoch": 5.023943311096963, "grad_norm": 0.8155698180198669, "learning_rate": 2.4880329578062577e-05, "loss": 0.0252, "num_input_tokens_seen": 557564160, "step": 544500 }, { "epoch": 5.028556665836263, "grad_norm": 0.5454326272010803, "learning_rate": 2.485726280436608e-05, "loss": 0.0243, "num_input_tokens_seen": 558076160, "step": 545000 }, { "epoch": 5.033170020575562, "grad_norm": 0.35681942105293274, "learning_rate": 2.4834196030669585e-05, "loss": 0.0259, "num_input_tokens_seen": 558588160, "step": 545500 }, { "epoch": 5.037783375314861, "grad_norm": 1.3723911046981812, "learning_rate": 2.4811129256973086e-05, "loss": 0.0254, "num_input_tokens_seen": 559100160, "step": 546000 }, { "epoch": 5.042396730054161, "grad_norm": 2.3160240650177, "learning_rate": 2.478806248327659e-05, "loss": 0.0238, "num_input_tokens_seen": 559612160, "step": 546500 }, { "epoch": 5.0470100847934605, "grad_norm": 0.447410523891449, "learning_rate": 2.4764995709580094e-05, "loss": 0.0245, "num_input_tokens_seen": 560124160, "step": 547000 }, { "epoch": 5.05162343953276, "grad_norm": 1.798653483390808, "learning_rate": 2.4741928935883598e-05, "loss": 0.0264, "num_input_tokens_seen": 560636160, "step": 547500 }, { "epoch": 5.056236794272059, "grad_norm": 0.5568801164627075, "learning_rate": 2.47188621621871e-05, "loss": 0.0262, "num_input_tokens_seen": 561148160, "step": 548000 }, { "epoch": 5.060850149011358, "grad_norm": 0.5296237468719482, "learning_rate": 2.4695795388490602e-05, "loss": 0.0257, "num_input_tokens_seen": 561660160, "step": 548500 }, { "epoch": 5.0654635037506575, "grad_norm": 1.8144594430923462, "learning_rate": 2.467272861479411e-05, "loss": 0.0244, "num_input_tokens_seen": 562172160, "step": 549000 }, { "epoch": 5.070076858489957, "grad_norm": 1.125553846359253, "learning_rate": 2.464966184109761e-05, "loss": 0.0278, "num_input_tokens_seen": 562684160, "step": 549500 }, { "epoch": 5.074690213229256, "grad_norm": 1.2279289960861206, "learning_rate": 2.4626595067401114e-05, "loss": 0.0254, "num_input_tokens_seen": 563196160, "step": 550000 }, { "epoch": 5.079303567968555, "grad_norm": 1.1253972053527832, "learning_rate": 2.4603528293704618e-05, "loss": 0.0273, "num_input_tokens_seen": 563708160, "step": 550500 }, { "epoch": 5.0839169227078544, "grad_norm": 1.958179235458374, "learning_rate": 2.4580461520008122e-05, "loss": 0.0227, "num_input_tokens_seen": 564220160, "step": 551000 }, { "epoch": 5.088530277447154, "grad_norm": 1.6592975854873657, "learning_rate": 2.4557394746311622e-05, "loss": 0.0265, "num_input_tokens_seen": 564732160, "step": 551500 }, { "epoch": 5.093143632186453, "grad_norm": 0.9499948024749756, "learning_rate": 2.453432797261513e-05, "loss": 0.0257, "num_input_tokens_seen": 565244160, "step": 552000 }, { "epoch": 5.097756986925753, "grad_norm": 0.7857697606086731, "learning_rate": 2.451126119891863e-05, "loss": 0.0256, "num_input_tokens_seen": 565756160, "step": 552500 }, { "epoch": 5.102370341665052, "grad_norm": 1.4605727195739746, "learning_rate": 2.4488194425222134e-05, "loss": 0.0241, "num_input_tokens_seen": 566268160, "step": 553000 }, { "epoch": 5.106983696404352, "grad_norm": 1.2469509840011597, "learning_rate": 2.446512765152564e-05, "loss": 0.0248, "num_input_tokens_seen": 566780160, "step": 553500 }, { "epoch": 5.111597051143651, "grad_norm": 1.826318383216858, "learning_rate": 2.444206087782914e-05, "loss": 0.0288, "num_input_tokens_seen": 567292160, "step": 554000 }, { "epoch": 5.11621040588295, "grad_norm": 4.358790397644043, "learning_rate": 2.4418994104132646e-05, "loss": 0.0248, "num_input_tokens_seen": 567804160, "step": 554500 }, { "epoch": 5.120823760622249, "grad_norm": 1.07144033908844, "learning_rate": 2.4395927330436147e-05, "loss": 0.0266, "num_input_tokens_seen": 568316160, "step": 555000 }, { "epoch": 5.125437115361549, "grad_norm": 1.7916905879974365, "learning_rate": 2.437286055673965e-05, "loss": 0.0268, "num_input_tokens_seen": 568828160, "step": 555500 }, { "epoch": 5.130050470100848, "grad_norm": 0.9158410429954529, "learning_rate": 2.4349793783043155e-05, "loss": 0.0263, "num_input_tokens_seen": 569340160, "step": 556000 }, { "epoch": 5.134663824840147, "grad_norm": 0.7724267244338989, "learning_rate": 2.432672700934666e-05, "loss": 0.0244, "num_input_tokens_seen": 569852160, "step": 556500 }, { "epoch": 5.139277179579446, "grad_norm": 0.48507311940193176, "learning_rate": 2.430366023565016e-05, "loss": 0.0274, "num_input_tokens_seen": 570364160, "step": 557000 }, { "epoch": 5.1438905343187455, "grad_norm": 0.6313498616218567, "learning_rate": 2.4280593461953667e-05, "loss": 0.0239, "num_input_tokens_seen": 570876160, "step": 557500 }, { "epoch": 5.148503889058045, "grad_norm": 0.987579345703125, "learning_rate": 2.4257526688257167e-05, "loss": 0.0255, "num_input_tokens_seen": 571388160, "step": 558000 }, { "epoch": 5.153117243797345, "grad_norm": 1.7795839309692383, "learning_rate": 2.423445991456067e-05, "loss": 0.0245, "num_input_tokens_seen": 571900160, "step": 558500 }, { "epoch": 5.157730598536644, "grad_norm": 1.233028531074524, "learning_rate": 2.4211393140864175e-05, "loss": 0.0272, "num_input_tokens_seen": 572412160, "step": 559000 }, { "epoch": 5.162343953275943, "grad_norm": 0.9197332262992859, "learning_rate": 2.4188326367167676e-05, "loss": 0.0243, "num_input_tokens_seen": 572924160, "step": 559500 }, { "epoch": 5.166957308015243, "grad_norm": 5.717777252197266, "learning_rate": 2.4165259593471183e-05, "loss": 0.024, "num_input_tokens_seen": 573436160, "step": 560000 }, { "epoch": 5.171570662754542, "grad_norm": 0.8062294721603394, "learning_rate": 2.4142192819774684e-05, "loss": 0.025, "num_input_tokens_seen": 573948160, "step": 560500 }, { "epoch": 5.176184017493841, "grad_norm": 1.5993818044662476, "learning_rate": 2.4119126046078188e-05, "loss": 0.0276, "num_input_tokens_seen": 574460160, "step": 561000 }, { "epoch": 5.18079737223314, "grad_norm": 1.086608648300171, "learning_rate": 2.4096059272381692e-05, "loss": 0.0237, "num_input_tokens_seen": 574972160, "step": 561500 }, { "epoch": 5.18541072697244, "grad_norm": 0.5633468627929688, "learning_rate": 2.4072992498685196e-05, "loss": 0.0267, "num_input_tokens_seen": 575484160, "step": 562000 }, { "epoch": 5.190024081711739, "grad_norm": 0.9681257605552673, "learning_rate": 2.4049925724988696e-05, "loss": 0.0247, "num_input_tokens_seen": 575996160, "step": 562500 }, { "epoch": 5.194637436451038, "grad_norm": 0.5693821907043457, "learning_rate": 2.4026858951292204e-05, "loss": 0.0262, "num_input_tokens_seen": 576508160, "step": 563000 }, { "epoch": 5.199250791190337, "grad_norm": 0.5459065437316895, "learning_rate": 2.4003792177595704e-05, "loss": 0.0246, "num_input_tokens_seen": 577020160, "step": 563500 }, { "epoch": 5.2038641459296375, "grad_norm": 0.8124216198921204, "learning_rate": 2.3980725403899208e-05, "loss": 0.0261, "num_input_tokens_seen": 577532160, "step": 564000 }, { "epoch": 5.208477500668937, "grad_norm": 2.0479400157928467, "learning_rate": 2.3957658630202712e-05, "loss": 0.0263, "num_input_tokens_seen": 578044160, "step": 564500 }, { "epoch": 5.213090855408236, "grad_norm": 0.4062500596046448, "learning_rate": 2.3934591856506216e-05, "loss": 0.0293, "num_input_tokens_seen": 578556160, "step": 565000 }, { "epoch": 5.217704210147535, "grad_norm": 0.6792827844619751, "learning_rate": 2.391152508280972e-05, "loss": 0.0243, "num_input_tokens_seen": 579068160, "step": 565500 }, { "epoch": 5.2223175648868345, "grad_norm": 1.978621482849121, "learning_rate": 2.388845830911322e-05, "loss": 0.0242, "num_input_tokens_seen": 579580160, "step": 566000 }, { "epoch": 5.226930919626134, "grad_norm": 1.0961169004440308, "learning_rate": 2.3865391535416725e-05, "loss": 0.0264, "num_input_tokens_seen": 580092160, "step": 566500 }, { "epoch": 5.231544274365433, "grad_norm": 2.3269541263580322, "learning_rate": 2.384232476172023e-05, "loss": 0.0246, "num_input_tokens_seen": 580604160, "step": 567000 }, { "epoch": 5.236157629104732, "grad_norm": 0.545312762260437, "learning_rate": 2.3819257988023733e-05, "loss": 0.0259, "num_input_tokens_seen": 581116160, "step": 567500 }, { "epoch": 5.2407709838440315, "grad_norm": 0.7577276825904846, "learning_rate": 2.3796191214327233e-05, "loss": 0.026, "num_input_tokens_seen": 581628160, "step": 568000 }, { "epoch": 5.245384338583331, "grad_norm": 0.5405977964401245, "learning_rate": 2.377312444063074e-05, "loss": 0.0232, "num_input_tokens_seen": 582140160, "step": 568500 }, { "epoch": 5.24999769332263, "grad_norm": 0.5924959182739258, "learning_rate": 2.375005766693424e-05, "loss": 0.0264, "num_input_tokens_seen": 582652160, "step": 569000 }, { "epoch": 5.25461104806193, "grad_norm": 1.2683016061782837, "learning_rate": 2.3726990893237745e-05, "loss": 0.0262, "num_input_tokens_seen": 583164160, "step": 569500 }, { "epoch": 5.259224402801229, "grad_norm": 1.1642249822616577, "learning_rate": 2.370392411954125e-05, "loss": 0.0263, "num_input_tokens_seen": 583676160, "step": 570000 }, { "epoch": 5.263837757540529, "grad_norm": 1.1712781190872192, "learning_rate": 2.3680857345844753e-05, "loss": 0.0254, "num_input_tokens_seen": 584188160, "step": 570500 }, { "epoch": 5.268451112279828, "grad_norm": 1.0108134746551514, "learning_rate": 2.3657790572148257e-05, "loss": 0.0256, "num_input_tokens_seen": 584700160, "step": 571000 }, { "epoch": 5.273064467019127, "grad_norm": 2.7338948249816895, "learning_rate": 2.363472379845176e-05, "loss": 0.0275, "num_input_tokens_seen": 585212160, "step": 571500 }, { "epoch": 5.277677821758426, "grad_norm": 0.6406319737434387, "learning_rate": 2.361165702475526e-05, "loss": 0.0238, "num_input_tokens_seen": 585724160, "step": 572000 }, { "epoch": 5.282291176497726, "grad_norm": 1.551131010055542, "learning_rate": 2.3588590251058766e-05, "loss": 0.0261, "num_input_tokens_seen": 586236160, "step": 572500 }, { "epoch": 5.286904531237025, "grad_norm": 0.41061103343963623, "learning_rate": 2.356552347736227e-05, "loss": 0.0281, "num_input_tokens_seen": 586748160, "step": 573000 }, { "epoch": 5.291517885976324, "grad_norm": 0.7769986987113953, "learning_rate": 2.354245670366577e-05, "loss": 0.0251, "num_input_tokens_seen": 587260160, "step": 573500 }, { "epoch": 5.296131240715623, "grad_norm": 1.0587828159332275, "learning_rate": 2.3519389929969277e-05, "loss": 0.024, "num_input_tokens_seen": 587772160, "step": 574000 }, { "epoch": 5.300744595454923, "grad_norm": 0.7457670569419861, "learning_rate": 2.3496323156272778e-05, "loss": 0.0258, "num_input_tokens_seen": 588284160, "step": 574500 }, { "epoch": 5.305357950194222, "grad_norm": 1.7087829113006592, "learning_rate": 2.3473256382576282e-05, "loss": 0.0265, "num_input_tokens_seen": 588796160, "step": 575000 }, { "epoch": 5.309971304933521, "grad_norm": 1.6121881008148193, "learning_rate": 2.3450189608879786e-05, "loss": 0.0236, "num_input_tokens_seen": 589308160, "step": 575500 }, { "epoch": 5.314584659672821, "grad_norm": 1.585402011871338, "learning_rate": 2.342712283518329e-05, "loss": 0.0253, "num_input_tokens_seen": 589820160, "step": 576000 }, { "epoch": 5.3191980144121205, "grad_norm": 2.160334348678589, "learning_rate": 2.3404056061486794e-05, "loss": 0.0266, "num_input_tokens_seen": 590332160, "step": 576500 }, { "epoch": 5.32381136915142, "grad_norm": 0.304321825504303, "learning_rate": 2.3380989287790298e-05, "loss": 0.0268, "num_input_tokens_seen": 590844160, "step": 577000 }, { "epoch": 5.328424723890719, "grad_norm": 0.9023957848548889, "learning_rate": 2.33579225140938e-05, "loss": 0.0258, "num_input_tokens_seen": 591356160, "step": 577500 }, { "epoch": 5.333038078630018, "grad_norm": 0.5087705254554749, "learning_rate": 2.3334855740397306e-05, "loss": 0.026, "num_input_tokens_seen": 591868160, "step": 578000 }, { "epoch": 5.3376514333693175, "grad_norm": 1.3647748231887817, "learning_rate": 2.3311788966700806e-05, "loss": 0.0268, "num_input_tokens_seen": 592380160, "step": 578500 }, { "epoch": 5.342264788108617, "grad_norm": 1.011982798576355, "learning_rate": 2.328872219300431e-05, "loss": 0.0267, "num_input_tokens_seen": 592892160, "step": 579000 }, { "epoch": 5.346878142847916, "grad_norm": 1.695412516593933, "learning_rate": 2.3265655419307814e-05, "loss": 0.0244, "num_input_tokens_seen": 593404160, "step": 579500 }, { "epoch": 5.351491497587215, "grad_norm": 2.6255669593811035, "learning_rate": 2.3242588645611315e-05, "loss": 0.0279, "num_input_tokens_seen": 593916160, "step": 580000 }, { "epoch": 5.3561048523265145, "grad_norm": 1.49470055103302, "learning_rate": 2.321952187191482e-05, "loss": 0.0255, "num_input_tokens_seen": 594428160, "step": 580500 }, { "epoch": 5.360718207065814, "grad_norm": 5.862457275390625, "learning_rate": 2.3196455098218323e-05, "loss": 0.0272, "num_input_tokens_seen": 594940160, "step": 581000 }, { "epoch": 5.365331561805114, "grad_norm": 1.1416678428649902, "learning_rate": 2.3173388324521827e-05, "loss": 0.0257, "num_input_tokens_seen": 595452160, "step": 581500 }, { "epoch": 5.369944916544413, "grad_norm": 1.0137473344802856, "learning_rate": 2.315032155082533e-05, "loss": 0.0278, "num_input_tokens_seen": 595964160, "step": 582000 }, { "epoch": 5.374558271283712, "grad_norm": 1.037350058555603, "learning_rate": 2.3127254777128835e-05, "loss": 0.0242, "num_input_tokens_seen": 596476160, "step": 582500 }, { "epoch": 5.379171626023012, "grad_norm": 0.5939755439758301, "learning_rate": 2.3104188003432335e-05, "loss": 0.0253, "num_input_tokens_seen": 596988160, "step": 583000 }, { "epoch": 5.383784980762311, "grad_norm": 0.8637872934341431, "learning_rate": 2.3081121229735843e-05, "loss": 0.0294, "num_input_tokens_seen": 597500160, "step": 583500 }, { "epoch": 5.38839833550161, "grad_norm": 0.6153502464294434, "learning_rate": 2.3058054456039343e-05, "loss": 0.0252, "num_input_tokens_seen": 598012160, "step": 584000 }, { "epoch": 5.393011690240909, "grad_norm": 0.7826283574104309, "learning_rate": 2.3034987682342847e-05, "loss": 0.0242, "num_input_tokens_seen": 598524160, "step": 584500 }, { "epoch": 5.397625044980209, "grad_norm": 0.8609397411346436, "learning_rate": 2.301192090864635e-05, "loss": 0.0281, "num_input_tokens_seen": 599036160, "step": 585000 }, { "epoch": 5.402238399719508, "grad_norm": 1.031718134880066, "learning_rate": 2.2988854134949852e-05, "loss": 0.0264, "num_input_tokens_seen": 599548160, "step": 585500 }, { "epoch": 5.406851754458807, "grad_norm": 4.244394779205322, "learning_rate": 2.296578736125336e-05, "loss": 0.0284, "num_input_tokens_seen": 600060160, "step": 586000 }, { "epoch": 5.411465109198106, "grad_norm": 0.6755638122558594, "learning_rate": 2.294272058755686e-05, "loss": 0.0256, "num_input_tokens_seen": 600572160, "step": 586500 }, { "epoch": 5.416078463937406, "grad_norm": 0.5303651690483093, "learning_rate": 2.2919653813860364e-05, "loss": 0.0272, "num_input_tokens_seen": 601084160, "step": 587000 }, { "epoch": 5.420691818676706, "grad_norm": 0.8649631142616272, "learning_rate": 2.2896587040163868e-05, "loss": 0.0245, "num_input_tokens_seen": 601596160, "step": 587500 }, { "epoch": 5.425305173416005, "grad_norm": 0.5191958546638489, "learning_rate": 2.287352026646737e-05, "loss": 0.0271, "num_input_tokens_seen": 602108160, "step": 588000 }, { "epoch": 5.429918528155304, "grad_norm": 1.2616572380065918, "learning_rate": 2.2850453492770872e-05, "loss": 0.0271, "num_input_tokens_seen": 602620160, "step": 588500 }, { "epoch": 5.434531882894603, "grad_norm": 0.8619266152381897, "learning_rate": 2.282738671907438e-05, "loss": 0.0262, "num_input_tokens_seen": 603132160, "step": 589000 }, { "epoch": 5.439145237633903, "grad_norm": 0.7039788961410522, "learning_rate": 2.280431994537788e-05, "loss": 0.0247, "num_input_tokens_seen": 603644160, "step": 589500 }, { "epoch": 5.443758592373202, "grad_norm": 2.772310495376587, "learning_rate": 2.2781253171681384e-05, "loss": 0.0267, "num_input_tokens_seen": 604156160, "step": 590000 }, { "epoch": 5.448371947112501, "grad_norm": 0.5451655387878418, "learning_rate": 2.2758186397984888e-05, "loss": 0.0261, "num_input_tokens_seen": 604668160, "step": 590500 }, { "epoch": 5.4529853018518, "grad_norm": 0.8995614647865295, "learning_rate": 2.2735119624288392e-05, "loss": 0.024, "num_input_tokens_seen": 605180160, "step": 591000 }, { "epoch": 5.4575986565911, "grad_norm": 1.981187105178833, "learning_rate": 2.2712052850591896e-05, "loss": 0.0263, "num_input_tokens_seen": 605692160, "step": 591500 }, { "epoch": 5.462212011330399, "grad_norm": 0.7811481952667236, "learning_rate": 2.2688986076895397e-05, "loss": 0.0286, "num_input_tokens_seen": 606204160, "step": 592000 }, { "epoch": 5.466825366069698, "grad_norm": 2.7757558822631836, "learning_rate": 2.26659193031989e-05, "loss": 0.0253, "num_input_tokens_seen": 606716160, "step": 592500 }, { "epoch": 5.471438720808998, "grad_norm": 1.9782260656356812, "learning_rate": 2.2642852529502405e-05, "loss": 0.0277, "num_input_tokens_seen": 607228160, "step": 593000 }, { "epoch": 5.4760520755482975, "grad_norm": 2.8401777744293213, "learning_rate": 2.261978575580591e-05, "loss": 0.0255, "num_input_tokens_seen": 607740160, "step": 593500 }, { "epoch": 5.480665430287597, "grad_norm": 0.5879292488098145, "learning_rate": 2.259671898210941e-05, "loss": 0.027, "num_input_tokens_seen": 608252160, "step": 594000 }, { "epoch": 5.485278785026896, "grad_norm": 1.1103825569152832, "learning_rate": 2.2573652208412917e-05, "loss": 0.0258, "num_input_tokens_seen": 608764160, "step": 594500 }, { "epoch": 5.489892139766195, "grad_norm": 1.002668857574463, "learning_rate": 2.2550585434716417e-05, "loss": 0.0276, "num_input_tokens_seen": 609276160, "step": 595000 }, { "epoch": 5.4945054945054945, "grad_norm": 0.5841794013977051, "learning_rate": 2.252751866101992e-05, "loss": 0.0272, "num_input_tokens_seen": 609788160, "step": 595500 }, { "epoch": 5.499118849244794, "grad_norm": 0.6137141585350037, "learning_rate": 2.2504451887323425e-05, "loss": 0.0269, "num_input_tokens_seen": 610300160, "step": 596000 }, { "epoch": 5.503732203984093, "grad_norm": 0.6018849015235901, "learning_rate": 2.248138511362693e-05, "loss": 0.0279, "num_input_tokens_seen": 610812160, "step": 596500 }, { "epoch": 5.508345558723392, "grad_norm": 1.4851562976837158, "learning_rate": 2.2458318339930433e-05, "loss": 0.0268, "num_input_tokens_seen": 611324160, "step": 597000 }, { "epoch": 5.5129589134626915, "grad_norm": 1.9454591274261475, "learning_rate": 2.2435251566233937e-05, "loss": 0.0258, "num_input_tokens_seen": 611836160, "step": 597500 }, { "epoch": 5.517572268201991, "grad_norm": 1.9615495204925537, "learning_rate": 2.2412184792537438e-05, "loss": 0.0257, "num_input_tokens_seen": 612348160, "step": 598000 }, { "epoch": 5.52218562294129, "grad_norm": 1.1803622245788574, "learning_rate": 2.238911801884094e-05, "loss": 0.0256, "num_input_tokens_seen": 612860160, "step": 598500 }, { "epoch": 5.52679897768059, "grad_norm": 0.7780105471611023, "learning_rate": 2.2366051245144445e-05, "loss": 0.0264, "num_input_tokens_seen": 613372160, "step": 599000 }, { "epoch": 5.531412332419889, "grad_norm": 0.5582423806190491, "learning_rate": 2.2342984471447946e-05, "loss": 0.0279, "num_input_tokens_seen": 613884160, "step": 599500 }, { "epoch": 5.536025687159189, "grad_norm": 1.4547449350357056, "learning_rate": 2.2319917697751453e-05, "loss": 0.0273, "num_input_tokens_seen": 614396160, "step": 600000 }, { "epoch": 5.540639041898488, "grad_norm": 1.0105394124984741, "learning_rate": 2.2296850924054954e-05, "loss": 0.0251, "num_input_tokens_seen": 614908160, "step": 600500 }, { "epoch": 5.545252396637787, "grad_norm": 0.7775139212608337, "learning_rate": 2.2273784150358458e-05, "loss": 0.0258, "num_input_tokens_seen": 615420160, "step": 601000 }, { "epoch": 5.549865751377086, "grad_norm": 0.40573227405548096, "learning_rate": 2.2250717376661962e-05, "loss": 0.0268, "num_input_tokens_seen": 615932160, "step": 601500 }, { "epoch": 5.554479106116386, "grad_norm": 1.130553126335144, "learning_rate": 2.2227650602965466e-05, "loss": 0.0255, "num_input_tokens_seen": 616444160, "step": 602000 }, { "epoch": 5.559092460855685, "grad_norm": 1.0450289249420166, "learning_rate": 2.220458382926897e-05, "loss": 0.0266, "num_input_tokens_seen": 616956160, "step": 602500 }, { "epoch": 5.563705815594984, "grad_norm": 0.7919219136238098, "learning_rate": 2.2181517055572474e-05, "loss": 0.0269, "num_input_tokens_seen": 617468160, "step": 603000 }, { "epoch": 5.568319170334283, "grad_norm": 0.7787536382675171, "learning_rate": 2.2158450281875974e-05, "loss": 0.0305, "num_input_tokens_seen": 617980160, "step": 603500 }, { "epoch": 5.572932525073583, "grad_norm": 1.2866960763931274, "learning_rate": 2.2135383508179482e-05, "loss": 0.028, "num_input_tokens_seen": 618492160, "step": 604000 }, { "epoch": 5.577545879812883, "grad_norm": 1.9128954410552979, "learning_rate": 2.2112316734482982e-05, "loss": 0.0276, "num_input_tokens_seen": 619004160, "step": 604500 }, { "epoch": 5.582159234552182, "grad_norm": 1.13468337059021, "learning_rate": 2.2089249960786483e-05, "loss": 0.027, "num_input_tokens_seen": 619516160, "step": 605000 }, { "epoch": 5.586772589291481, "grad_norm": 1.4375085830688477, "learning_rate": 2.206618318708999e-05, "loss": 0.0232, "num_input_tokens_seen": 620028160, "step": 605500 }, { "epoch": 5.5913859440307805, "grad_norm": 0.722649872303009, "learning_rate": 2.204311641339349e-05, "loss": 0.0259, "num_input_tokens_seen": 620540160, "step": 606000 }, { "epoch": 5.59599929877008, "grad_norm": 0.8669957518577576, "learning_rate": 2.2020049639696995e-05, "loss": 0.0262, "num_input_tokens_seen": 621052160, "step": 606500 }, { "epoch": 5.600612653509379, "grad_norm": 0.8053223490715027, "learning_rate": 2.19969828660005e-05, "loss": 0.0302, "num_input_tokens_seen": 621564160, "step": 607000 }, { "epoch": 5.605226008248678, "grad_norm": 1.0647988319396973, "learning_rate": 2.1973916092304003e-05, "loss": 0.0263, "num_input_tokens_seen": 622076160, "step": 607500 }, { "epoch": 5.6098393629879775, "grad_norm": 1.0449702739715576, "learning_rate": 2.1950849318607507e-05, "loss": 0.0292, "num_input_tokens_seen": 622588160, "step": 608000 }, { "epoch": 5.614452717727277, "grad_norm": 0.8551065921783447, "learning_rate": 2.192778254491101e-05, "loss": 0.026, "num_input_tokens_seen": 623100160, "step": 608500 }, { "epoch": 5.619066072466576, "grad_norm": 0.9317313432693481, "learning_rate": 2.190471577121451e-05, "loss": 0.0273, "num_input_tokens_seen": 623612160, "step": 609000 }, { "epoch": 5.623679427205875, "grad_norm": 1.1779793500900269, "learning_rate": 2.188164899751802e-05, "loss": 0.0267, "num_input_tokens_seen": 624124160, "step": 609500 }, { "epoch": 5.628292781945175, "grad_norm": 0.7221566438674927, "learning_rate": 2.185858222382152e-05, "loss": 0.0263, "num_input_tokens_seen": 624636160, "step": 610000 }, { "epoch": 5.632906136684475, "grad_norm": 1.5405559539794922, "learning_rate": 2.1835515450125023e-05, "loss": 0.0242, "num_input_tokens_seen": 625148160, "step": 610500 }, { "epoch": 5.637519491423774, "grad_norm": 1.2586696147918701, "learning_rate": 2.1812448676428527e-05, "loss": 0.0259, "num_input_tokens_seen": 625660160, "step": 611000 }, { "epoch": 5.642132846163073, "grad_norm": 1.4537557363510132, "learning_rate": 2.1789381902732028e-05, "loss": 0.0254, "num_input_tokens_seen": 626172160, "step": 611500 }, { "epoch": 5.646746200902372, "grad_norm": 0.7319709658622742, "learning_rate": 2.1766315129035532e-05, "loss": 0.0286, "num_input_tokens_seen": 626684160, "step": 612000 }, { "epoch": 5.651359555641672, "grad_norm": 0.6492053866386414, "learning_rate": 2.1743248355339036e-05, "loss": 0.0266, "num_input_tokens_seen": 627196160, "step": 612500 }, { "epoch": 5.655972910380971, "grad_norm": 1.0684195756912231, "learning_rate": 2.172018158164254e-05, "loss": 0.0239, "num_input_tokens_seen": 627708160, "step": 613000 }, { "epoch": 5.66058626512027, "grad_norm": 1.018306851387024, "learning_rate": 2.1697114807946044e-05, "loss": 0.027, "num_input_tokens_seen": 628220160, "step": 613500 }, { "epoch": 5.665199619859569, "grad_norm": 0.5089601278305054, "learning_rate": 2.1674048034249548e-05, "loss": 0.0258, "num_input_tokens_seen": 628732160, "step": 614000 }, { "epoch": 5.669812974598869, "grad_norm": 1.606461763381958, "learning_rate": 2.1650981260553048e-05, "loss": 0.0267, "num_input_tokens_seen": 629244160, "step": 614500 }, { "epoch": 5.674426329338168, "grad_norm": 1.479805588722229, "learning_rate": 2.1627914486856556e-05, "loss": 0.0266, "num_input_tokens_seen": 629756160, "step": 615000 }, { "epoch": 5.679039684077467, "grad_norm": 2.971240758895874, "learning_rate": 2.1604847713160056e-05, "loss": 0.0285, "num_input_tokens_seen": 630268160, "step": 615500 }, { "epoch": 5.683653038816766, "grad_norm": 0.5969455242156982, "learning_rate": 2.158178093946356e-05, "loss": 0.0265, "num_input_tokens_seen": 630780160, "step": 616000 }, { "epoch": 5.6882663935560664, "grad_norm": 0.7076913118362427, "learning_rate": 2.1558714165767064e-05, "loss": 0.024, "num_input_tokens_seen": 631292160, "step": 616500 }, { "epoch": 5.692879748295366, "grad_norm": 0.8780455589294434, "learning_rate": 2.1535647392070568e-05, "loss": 0.0253, "num_input_tokens_seen": 631804160, "step": 617000 }, { "epoch": 5.697493103034665, "grad_norm": 3.569014549255371, "learning_rate": 2.151258061837407e-05, "loss": 0.0252, "num_input_tokens_seen": 632316160, "step": 617500 }, { "epoch": 5.702106457773964, "grad_norm": 0.9523796439170837, "learning_rate": 2.1489513844677573e-05, "loss": 0.0275, "num_input_tokens_seen": 632828160, "step": 618000 }, { "epoch": 5.706719812513263, "grad_norm": 0.6151872873306274, "learning_rate": 2.1466447070981077e-05, "loss": 0.0272, "num_input_tokens_seen": 633340160, "step": 618500 }, { "epoch": 5.711333167252563, "grad_norm": 4.095676422119141, "learning_rate": 2.144338029728458e-05, "loss": 0.0309, "num_input_tokens_seen": 633852160, "step": 619000 }, { "epoch": 5.715946521991862, "grad_norm": 1.5436087846755981, "learning_rate": 2.1420313523588085e-05, "loss": 0.0237, "num_input_tokens_seen": 634364160, "step": 619500 }, { "epoch": 5.720559876731161, "grad_norm": 0.722958505153656, "learning_rate": 2.1397246749891585e-05, "loss": 0.024, "num_input_tokens_seen": 634876160, "step": 620000 }, { "epoch": 5.72517323147046, "grad_norm": 1.9889734983444214, "learning_rate": 2.1374179976195092e-05, "loss": 0.026, "num_input_tokens_seen": 635388160, "step": 620500 }, { "epoch": 5.72978658620976, "grad_norm": 1.8848015069961548, "learning_rate": 2.1351113202498593e-05, "loss": 0.0295, "num_input_tokens_seen": 635900160, "step": 621000 }, { "epoch": 5.734399940949059, "grad_norm": 1.4463508129119873, "learning_rate": 2.1328046428802097e-05, "loss": 0.0286, "num_input_tokens_seen": 636412160, "step": 621500 }, { "epoch": 5.739013295688359, "grad_norm": 2.2826876640319824, "learning_rate": 2.13049796551056e-05, "loss": 0.0278, "num_input_tokens_seen": 636924160, "step": 622000 }, { "epoch": 5.743626650427658, "grad_norm": 0.8323870897293091, "learning_rate": 2.1281912881409105e-05, "loss": 0.0247, "num_input_tokens_seen": 637436160, "step": 622500 }, { "epoch": 5.7482400051669575, "grad_norm": 1.4278696775436401, "learning_rate": 2.1258846107712606e-05, "loss": 0.0269, "num_input_tokens_seen": 637948160, "step": 623000 }, { "epoch": 5.752853359906257, "grad_norm": 0.425340473651886, "learning_rate": 2.1235779334016113e-05, "loss": 0.0263, "num_input_tokens_seen": 638460160, "step": 623500 }, { "epoch": 5.757466714645556, "grad_norm": 0.6665620803833008, "learning_rate": 2.1212712560319614e-05, "loss": 0.0286, "num_input_tokens_seen": 638972160, "step": 624000 }, { "epoch": 5.762080069384855, "grad_norm": 1.1083565950393677, "learning_rate": 2.1189645786623117e-05, "loss": 0.0251, "num_input_tokens_seen": 639484160, "step": 624500 }, { "epoch": 5.7666934241241545, "grad_norm": 1.5361641645431519, "learning_rate": 2.116657901292662e-05, "loss": 0.0238, "num_input_tokens_seen": 639996160, "step": 625000 }, { "epoch": 5.771306778863454, "grad_norm": 1.897976040840149, "learning_rate": 2.1143512239230122e-05, "loss": 0.0252, "num_input_tokens_seen": 640508160, "step": 625500 }, { "epoch": 5.775920133602753, "grad_norm": 1.181335687637329, "learning_rate": 2.112044546553363e-05, "loss": 0.0274, "num_input_tokens_seen": 641020160, "step": 626000 }, { "epoch": 5.780533488342052, "grad_norm": 1.2350566387176514, "learning_rate": 2.109737869183713e-05, "loss": 0.0269, "num_input_tokens_seen": 641532160, "step": 626500 }, { "epoch": 5.7851468430813515, "grad_norm": 0.9288113713264465, "learning_rate": 2.1074311918140634e-05, "loss": 0.0266, "num_input_tokens_seen": 642044160, "step": 627000 }, { "epoch": 5.789760197820652, "grad_norm": 1.3695634603500366, "learning_rate": 2.1051245144444138e-05, "loss": 0.0281, "num_input_tokens_seen": 642556160, "step": 627500 }, { "epoch": 5.794373552559951, "grad_norm": 1.5921497344970703, "learning_rate": 2.1028178370747642e-05, "loss": 0.0271, "num_input_tokens_seen": 643068160, "step": 628000 }, { "epoch": 5.79898690729925, "grad_norm": 0.9547250866889954, "learning_rate": 2.1005111597051146e-05, "loss": 0.0246, "num_input_tokens_seen": 643580160, "step": 628500 }, { "epoch": 5.803600262038549, "grad_norm": 0.702260434627533, "learning_rate": 2.098204482335465e-05, "loss": 0.0286, "num_input_tokens_seen": 644092160, "step": 629000 }, { "epoch": 5.808213616777849, "grad_norm": 1.7382519245147705, "learning_rate": 2.095897804965815e-05, "loss": 0.0247, "num_input_tokens_seen": 644604160, "step": 629500 }, { "epoch": 5.812826971517148, "grad_norm": 0.724609911441803, "learning_rate": 2.0935911275961654e-05, "loss": 0.0266, "num_input_tokens_seen": 645116160, "step": 630000 }, { "epoch": 5.817440326256447, "grad_norm": 0.8976930379867554, "learning_rate": 2.091284450226516e-05, "loss": 0.0261, "num_input_tokens_seen": 645628160, "step": 630500 }, { "epoch": 5.822053680995746, "grad_norm": 2.6822431087493896, "learning_rate": 2.088977772856866e-05, "loss": 0.0269, "num_input_tokens_seen": 646140160, "step": 631000 }, { "epoch": 5.826667035735046, "grad_norm": 0.9543342590332031, "learning_rate": 2.0866710954872166e-05, "loss": 0.0255, "num_input_tokens_seen": 646652160, "step": 631500 }, { "epoch": 5.831280390474345, "grad_norm": 1.0366599559783936, "learning_rate": 2.0843644181175667e-05, "loss": 0.0265, "num_input_tokens_seen": 647164160, "step": 632000 }, { "epoch": 5.835893745213644, "grad_norm": 2.613006830215454, "learning_rate": 2.082057740747917e-05, "loss": 0.0264, "num_input_tokens_seen": 647676160, "step": 632500 }, { "epoch": 5.840507099952944, "grad_norm": 0.2824631631374359, "learning_rate": 2.0797510633782675e-05, "loss": 0.0288, "num_input_tokens_seen": 648188160, "step": 633000 }, { "epoch": 5.845120454692243, "grad_norm": 3.399728298187256, "learning_rate": 2.077444386008618e-05, "loss": 0.0268, "num_input_tokens_seen": 648700160, "step": 633500 }, { "epoch": 5.849733809431543, "grad_norm": 0.7402966022491455, "learning_rate": 2.0751377086389683e-05, "loss": 0.0248, "num_input_tokens_seen": 649212160, "step": 634000 }, { "epoch": 5.854347164170842, "grad_norm": 0.7553480267524719, "learning_rate": 2.0728310312693187e-05, "loss": 0.0277, "num_input_tokens_seen": 649724160, "step": 634500 }, { "epoch": 5.858960518910141, "grad_norm": 3.4398159980773926, "learning_rate": 2.0705243538996687e-05, "loss": 0.0266, "num_input_tokens_seen": 650236160, "step": 635000 }, { "epoch": 5.8635738736494405, "grad_norm": 0.5711115598678589, "learning_rate": 2.0682176765300195e-05, "loss": 0.0241, "num_input_tokens_seen": 650748160, "step": 635500 }, { "epoch": 5.86818722838874, "grad_norm": 0.7952388525009155, "learning_rate": 2.0659109991603695e-05, "loss": 0.0275, "num_input_tokens_seen": 651260160, "step": 636000 }, { "epoch": 5.872800583128039, "grad_norm": 1.0399372577667236, "learning_rate": 2.06360432179072e-05, "loss": 0.0248, "num_input_tokens_seen": 651772160, "step": 636500 }, { "epoch": 5.877413937867338, "grad_norm": 1.6778496503829956, "learning_rate": 2.0612976444210703e-05, "loss": 0.0248, "num_input_tokens_seen": 652284160, "step": 637000 }, { "epoch": 5.8820272926066375, "grad_norm": 1.3442925214767456, "learning_rate": 2.0589909670514204e-05, "loss": 0.0271, "num_input_tokens_seen": 652796160, "step": 637500 }, { "epoch": 5.886640647345937, "grad_norm": 1.1822031736373901, "learning_rate": 2.0566842896817708e-05, "loss": 0.0256, "num_input_tokens_seen": 653308160, "step": 638000 }, { "epoch": 5.891254002085236, "grad_norm": 1.5322853326797485, "learning_rate": 2.0543776123121212e-05, "loss": 0.0269, "num_input_tokens_seen": 653820160, "step": 638500 }, { "epoch": 5.895867356824535, "grad_norm": 1.6025440692901611, "learning_rate": 2.0520709349424716e-05, "loss": 0.0281, "num_input_tokens_seen": 654332160, "step": 639000 }, { "epoch": 5.900480711563835, "grad_norm": 0.7516422867774963, "learning_rate": 2.049764257572822e-05, "loss": 0.0293, "num_input_tokens_seen": 654844160, "step": 639500 }, { "epoch": 5.905094066303135, "grad_norm": 0.7684640884399414, "learning_rate": 2.0474575802031724e-05, "loss": 0.0258, "num_input_tokens_seen": 655356160, "step": 640000 }, { "epoch": 5.909707421042434, "grad_norm": 1.2843828201293945, "learning_rate": 2.0451509028335224e-05, "loss": 0.0252, "num_input_tokens_seen": 655868160, "step": 640500 }, { "epoch": 5.914320775781733, "grad_norm": 1.0203999280929565, "learning_rate": 2.042844225463873e-05, "loss": 0.0268, "num_input_tokens_seen": 656380160, "step": 641000 }, { "epoch": 5.918934130521032, "grad_norm": 2.00242280960083, "learning_rate": 2.0405375480942232e-05, "loss": 0.0285, "num_input_tokens_seen": 656892160, "step": 641500 }, { "epoch": 5.923547485260332, "grad_norm": 1.0357120037078857, "learning_rate": 2.0382308707245736e-05, "loss": 0.0239, "num_input_tokens_seen": 657404160, "step": 642000 }, { "epoch": 5.928160839999631, "grad_norm": 1.1826400756835938, "learning_rate": 2.035924193354924e-05, "loss": 0.0268, "num_input_tokens_seen": 657916160, "step": 642500 }, { "epoch": 5.93277419473893, "grad_norm": 1.5662238597869873, "learning_rate": 2.0336175159852744e-05, "loss": 0.0259, "num_input_tokens_seen": 658428160, "step": 643000 }, { "epoch": 5.937387549478229, "grad_norm": 3.335893392562866, "learning_rate": 2.0313108386156245e-05, "loss": 0.0273, "num_input_tokens_seen": 658940160, "step": 643500 }, { "epoch": 5.942000904217529, "grad_norm": 0.7126489281654358, "learning_rate": 2.029004161245975e-05, "loss": 0.0268, "num_input_tokens_seen": 659452160, "step": 644000 }, { "epoch": 5.946614258956828, "grad_norm": 1.0062040090560913, "learning_rate": 2.0266974838763253e-05, "loss": 0.0276, "num_input_tokens_seen": 659964160, "step": 644500 }, { "epoch": 5.951227613696128, "grad_norm": 1.2691099643707275, "learning_rate": 2.0243908065066757e-05, "loss": 0.0295, "num_input_tokens_seen": 660476160, "step": 645000 }, { "epoch": 5.955840968435427, "grad_norm": 0.9768707752227783, "learning_rate": 2.022084129137026e-05, "loss": 0.0287, "num_input_tokens_seen": 660988160, "step": 645500 }, { "epoch": 5.9604543231747265, "grad_norm": 1.5846303701400757, "learning_rate": 2.019777451767376e-05, "loss": 0.028, "num_input_tokens_seen": 661500160, "step": 646000 }, { "epoch": 5.965067677914026, "grad_norm": 0.556376576423645, "learning_rate": 2.017470774397727e-05, "loss": 0.029, "num_input_tokens_seen": 662012160, "step": 646500 }, { "epoch": 5.969681032653325, "grad_norm": 1.8407984972000122, "learning_rate": 2.015164097028077e-05, "loss": 0.0278, "num_input_tokens_seen": 662524160, "step": 647000 }, { "epoch": 5.974294387392624, "grad_norm": 2.419261932373047, "learning_rate": 2.0128574196584273e-05, "loss": 0.0264, "num_input_tokens_seen": 663036160, "step": 647500 }, { "epoch": 5.978907742131923, "grad_norm": 1.3140838146209717, "learning_rate": 2.0105507422887777e-05, "loss": 0.0269, "num_input_tokens_seen": 663548160, "step": 648000 }, { "epoch": 5.983521096871223, "grad_norm": 1.3511277437210083, "learning_rate": 2.008244064919128e-05, "loss": 0.0258, "num_input_tokens_seen": 664060160, "step": 648500 }, { "epoch": 5.988134451610522, "grad_norm": 0.9623832106590271, "learning_rate": 2.005937387549478e-05, "loss": 0.0258, "num_input_tokens_seen": 664572160, "step": 649000 }, { "epoch": 5.992747806349821, "grad_norm": 1.2604849338531494, "learning_rate": 2.003630710179829e-05, "loss": 0.0257, "num_input_tokens_seen": 665084160, "step": 649500 }, { "epoch": 5.99736116108912, "grad_norm": 0.5637773871421814, "learning_rate": 2.001324032810179e-05, "loss": 0.0276, "num_input_tokens_seen": 665596160, "step": 650000 }, { "epoch": 6.0, "eval_combined_score": 0.06719425867896905, "eval_loss": 0.06719426065683365, "eval_mse": 0.06719425670110447, "eval_runtime": 46.0502, "eval_samples_per_second": 2092.023, "eval_steps_per_second": 261.519, "num_input_tokens_seen": 665888256, "step": 650286 }, { "epoch": 6.00197451582842, "grad_norm": 0.7754026055335999, "learning_rate": 1.9990173554405293e-05, "loss": 0.0243, "num_input_tokens_seen": 666107392, "step": 650500 }, { "epoch": 6.00658787056772, "grad_norm": 3.4056851863861084, "learning_rate": 1.9967106780708797e-05, "loss": 0.0198, "num_input_tokens_seen": 666619392, "step": 651000 }, { "epoch": 6.011201225307019, "grad_norm": 0.7338670492172241, "learning_rate": 1.9944040007012298e-05, "loss": 0.0194, "num_input_tokens_seen": 667131392, "step": 651500 }, { "epoch": 6.015814580046318, "grad_norm": 0.9775220155715942, "learning_rate": 1.9920973233315805e-05, "loss": 0.0218, "num_input_tokens_seen": 667643392, "step": 652000 }, { "epoch": 6.0204279347856176, "grad_norm": 0.6513090133666992, "learning_rate": 1.9897906459619306e-05, "loss": 0.0225, "num_input_tokens_seen": 668155392, "step": 652500 }, { "epoch": 6.025041289524917, "grad_norm": 1.0997514724731445, "learning_rate": 1.987483968592281e-05, "loss": 0.0218, "num_input_tokens_seen": 668667392, "step": 653000 }, { "epoch": 6.029654644264216, "grad_norm": 1.8776363134384155, "learning_rate": 1.9851772912226314e-05, "loss": 0.0232, "num_input_tokens_seen": 669179392, "step": 653500 }, { "epoch": 6.034267999003515, "grad_norm": 1.0117559432983398, "learning_rate": 1.9828706138529818e-05, "loss": 0.0195, "num_input_tokens_seen": 669691392, "step": 654000 }, { "epoch": 6.0388813537428145, "grad_norm": 1.839374303817749, "learning_rate": 1.980563936483332e-05, "loss": 0.0206, "num_input_tokens_seen": 670203392, "step": 654500 }, { "epoch": 6.043494708482114, "grad_norm": 1.1383150815963745, "learning_rate": 1.9782572591136826e-05, "loss": 0.02, "num_input_tokens_seen": 670715392, "step": 655000 }, { "epoch": 6.048108063221413, "grad_norm": 0.6940335631370544, "learning_rate": 1.9759505817440326e-05, "loss": 0.0218, "num_input_tokens_seen": 671227392, "step": 655500 }, { "epoch": 6.052721417960712, "grad_norm": 0.9437240958213806, "learning_rate": 1.973643904374383e-05, "loss": 0.0214, "num_input_tokens_seen": 671739392, "step": 656000 }, { "epoch": 6.057334772700012, "grad_norm": 1.297887921333313, "learning_rate": 1.9713372270047334e-05, "loss": 0.0197, "num_input_tokens_seen": 672251392, "step": 656500 }, { "epoch": 6.061948127439312, "grad_norm": 1.1121424436569214, "learning_rate": 1.9690305496350835e-05, "loss": 0.0228, "num_input_tokens_seen": 672763392, "step": 657000 }, { "epoch": 6.066561482178611, "grad_norm": 1.2576148509979248, "learning_rate": 1.9667238722654342e-05, "loss": 0.0202, "num_input_tokens_seen": 673275392, "step": 657500 }, { "epoch": 6.07117483691791, "grad_norm": 0.9484318494796753, "learning_rate": 1.9644171948957843e-05, "loss": 0.0208, "num_input_tokens_seen": 673787392, "step": 658000 }, { "epoch": 6.075788191657209, "grad_norm": 1.5170820951461792, "learning_rate": 1.9621105175261347e-05, "loss": 0.0216, "num_input_tokens_seen": 674299392, "step": 658500 }, { "epoch": 6.080401546396509, "grad_norm": 1.5162551403045654, "learning_rate": 1.959803840156485e-05, "loss": 0.0209, "num_input_tokens_seen": 674811392, "step": 659000 }, { "epoch": 6.085014901135808, "grad_norm": 1.1097129583358765, "learning_rate": 1.9574971627868355e-05, "loss": 0.0211, "num_input_tokens_seen": 675323392, "step": 659500 }, { "epoch": 6.089628255875107, "grad_norm": 1.9856687784194946, "learning_rate": 1.9551904854171855e-05, "loss": 0.0205, "num_input_tokens_seen": 675835392, "step": 660000 }, { "epoch": 6.094241610614406, "grad_norm": 0.447665810585022, "learning_rate": 1.9528838080475363e-05, "loss": 0.0215, "num_input_tokens_seen": 676347392, "step": 660500 }, { "epoch": 6.098854965353706, "grad_norm": 0.6140983700752258, "learning_rate": 1.9505771306778863e-05, "loss": 0.0204, "num_input_tokens_seen": 676859392, "step": 661000 }, { "epoch": 6.103468320093005, "grad_norm": 0.6753659844398499, "learning_rate": 1.9482704533082367e-05, "loss": 0.0234, "num_input_tokens_seen": 677371392, "step": 661500 }, { "epoch": 6.108081674832304, "grad_norm": 0.5752419233322144, "learning_rate": 1.945963775938587e-05, "loss": 0.0202, "num_input_tokens_seen": 677883392, "step": 662000 }, { "epoch": 6.112695029571604, "grad_norm": 0.8498187065124512, "learning_rate": 1.9436570985689375e-05, "loss": 0.0207, "num_input_tokens_seen": 678395392, "step": 662500 }, { "epoch": 6.1173083843109035, "grad_norm": 0.8756592273712158, "learning_rate": 1.941350421199288e-05, "loss": 0.0193, "num_input_tokens_seen": 678907392, "step": 663000 }, { "epoch": 6.121921739050203, "grad_norm": 2.693408250808716, "learning_rate": 1.939043743829638e-05, "loss": 0.0192, "num_input_tokens_seen": 679419392, "step": 663500 }, { "epoch": 6.126535093789502, "grad_norm": 1.2562410831451416, "learning_rate": 1.9367370664599884e-05, "loss": 0.0214, "num_input_tokens_seen": 679931392, "step": 664000 }, { "epoch": 6.131148448528801, "grad_norm": 1.662607192993164, "learning_rate": 1.9344303890903388e-05, "loss": 0.0202, "num_input_tokens_seen": 680443392, "step": 664500 }, { "epoch": 6.1357618032681005, "grad_norm": 0.8095691800117493, "learning_rate": 1.932123711720689e-05, "loss": 0.0212, "num_input_tokens_seen": 680955392, "step": 665000 }, { "epoch": 6.1403751580074, "grad_norm": 0.5978444218635559, "learning_rate": 1.9298170343510392e-05, "loss": 0.0211, "num_input_tokens_seen": 681467392, "step": 665500 }, { "epoch": 6.144988512746699, "grad_norm": 0.5060915946960449, "learning_rate": 1.92751035698139e-05, "loss": 0.0227, "num_input_tokens_seen": 681979392, "step": 666000 }, { "epoch": 6.149601867485998, "grad_norm": 0.9484182596206665, "learning_rate": 1.92520367961174e-05, "loss": 0.0203, "num_input_tokens_seen": 682491392, "step": 666500 }, { "epoch": 6.1542152222252975, "grad_norm": 1.3608324527740479, "learning_rate": 1.9228970022420904e-05, "loss": 0.0222, "num_input_tokens_seen": 683003392, "step": 667000 }, { "epoch": 6.158828576964597, "grad_norm": 0.9933167099952698, "learning_rate": 1.9205903248724408e-05, "loss": 0.0193, "num_input_tokens_seen": 683515392, "step": 667500 }, { "epoch": 6.163441931703897, "grad_norm": 1.8458038568496704, "learning_rate": 1.9182836475027912e-05, "loss": 0.0208, "num_input_tokens_seen": 684027392, "step": 668000 }, { "epoch": 6.168055286443196, "grad_norm": 0.9922088384628296, "learning_rate": 1.9159769701331416e-05, "loss": 0.0227, "num_input_tokens_seen": 684539392, "step": 668500 }, { "epoch": 6.172668641182495, "grad_norm": 0.7523616552352905, "learning_rate": 1.913670292763492e-05, "loss": 0.0214, "num_input_tokens_seen": 685051392, "step": 669000 }, { "epoch": 6.177281995921795, "grad_norm": 1.4571471214294434, "learning_rate": 1.911363615393842e-05, "loss": 0.0213, "num_input_tokens_seen": 685563392, "step": 669500 }, { "epoch": 6.181895350661094, "grad_norm": 1.6645666360855103, "learning_rate": 1.9090569380241925e-05, "loss": 0.0231, "num_input_tokens_seen": 686075392, "step": 670000 }, { "epoch": 6.186508705400393, "grad_norm": 0.5746430158615112, "learning_rate": 1.906750260654543e-05, "loss": 0.0214, "num_input_tokens_seen": 686587392, "step": 670500 }, { "epoch": 6.191122060139692, "grad_norm": 0.6545117497444153, "learning_rate": 1.9044435832848933e-05, "loss": 0.0203, "num_input_tokens_seen": 687099392, "step": 671000 }, { "epoch": 6.195735414878992, "grad_norm": 0.6282312273979187, "learning_rate": 1.9021369059152436e-05, "loss": 0.0222, "num_input_tokens_seen": 687611392, "step": 671500 }, { "epoch": 6.200348769618291, "grad_norm": 0.7718172073364258, "learning_rate": 1.8998302285455937e-05, "loss": 0.0211, "num_input_tokens_seen": 688123392, "step": 672000 }, { "epoch": 6.20496212435759, "grad_norm": 1.4277899265289307, "learning_rate": 1.897523551175944e-05, "loss": 0.0207, "num_input_tokens_seen": 688635392, "step": 672500 }, { "epoch": 6.209575479096889, "grad_norm": 0.5869673490524292, "learning_rate": 1.8952168738062945e-05, "loss": 0.0201, "num_input_tokens_seen": 689147392, "step": 673000 }, { "epoch": 6.214188833836189, "grad_norm": 0.7148327231407166, "learning_rate": 1.892910196436645e-05, "loss": 0.0217, "num_input_tokens_seen": 689659392, "step": 673500 }, { "epoch": 6.218802188575489, "grad_norm": 1.9917762279510498, "learning_rate": 1.8906035190669953e-05, "loss": 0.0201, "num_input_tokens_seen": 690171392, "step": 674000 }, { "epoch": 6.223415543314788, "grad_norm": 1.030920386314392, "learning_rate": 1.8882968416973457e-05, "loss": 0.0217, "num_input_tokens_seen": 690683392, "step": 674500 }, { "epoch": 6.228028898054087, "grad_norm": 0.6258344054222107, "learning_rate": 1.8859901643276958e-05, "loss": 0.0205, "num_input_tokens_seen": 691195392, "step": 675000 }, { "epoch": 6.2326422527933865, "grad_norm": 2.0319483280181885, "learning_rate": 1.8836834869580465e-05, "loss": 0.0208, "num_input_tokens_seen": 691707392, "step": 675500 }, { "epoch": 6.237255607532686, "grad_norm": 0.5357654094696045, "learning_rate": 1.8813768095883965e-05, "loss": 0.021, "num_input_tokens_seen": 692219392, "step": 676000 }, { "epoch": 6.241868962271985, "grad_norm": 2.2843759059906006, "learning_rate": 1.879070132218747e-05, "loss": 0.0224, "num_input_tokens_seen": 692731392, "step": 676500 }, { "epoch": 6.246482317011284, "grad_norm": 0.7464880347251892, "learning_rate": 1.8767634548490973e-05, "loss": 0.0204, "num_input_tokens_seen": 693243392, "step": 677000 }, { "epoch": 6.2510956717505834, "grad_norm": 1.1594797372817993, "learning_rate": 1.8744567774794474e-05, "loss": 0.0219, "num_input_tokens_seen": 693755392, "step": 677500 }, { "epoch": 6.255709026489883, "grad_norm": 2.049744129180908, "learning_rate": 1.872150100109798e-05, "loss": 0.0189, "num_input_tokens_seen": 694267392, "step": 678000 }, { "epoch": 6.260322381229182, "grad_norm": 2.227196216583252, "learning_rate": 1.8698434227401482e-05, "loss": 0.0215, "num_input_tokens_seen": 694779392, "step": 678500 }, { "epoch": 6.264935735968481, "grad_norm": 1.209151268005371, "learning_rate": 1.8675367453704986e-05, "loss": 0.0207, "num_input_tokens_seen": 695291392, "step": 679000 }, { "epoch": 6.26954909070778, "grad_norm": 0.6479954123497009, "learning_rate": 1.865230068000849e-05, "loss": 0.0229, "num_input_tokens_seen": 695803392, "step": 679500 }, { "epoch": 6.274162445447081, "grad_norm": 0.5225302577018738, "learning_rate": 1.8629233906311994e-05, "loss": 0.0207, "num_input_tokens_seen": 696315392, "step": 680000 }, { "epoch": 6.27877580018638, "grad_norm": 0.8142069578170776, "learning_rate": 1.8606167132615494e-05, "loss": 0.0242, "num_input_tokens_seen": 696827392, "step": 680500 }, { "epoch": 6.283389154925679, "grad_norm": 2.5518014430999756, "learning_rate": 1.8583100358919002e-05, "loss": 0.0221, "num_input_tokens_seen": 697339392, "step": 681000 }, { "epoch": 6.288002509664978, "grad_norm": 0.609211266040802, "learning_rate": 1.8560033585222502e-05, "loss": 0.0238, "num_input_tokens_seen": 697851392, "step": 681500 }, { "epoch": 6.292615864404278, "grad_norm": 0.6666821837425232, "learning_rate": 1.8536966811526006e-05, "loss": 0.0235, "num_input_tokens_seen": 698363392, "step": 682000 }, { "epoch": 6.297229219143577, "grad_norm": 2.551591396331787, "learning_rate": 1.851390003782951e-05, "loss": 0.0233, "num_input_tokens_seen": 698875392, "step": 682500 }, { "epoch": 6.301842573882876, "grad_norm": 1.171808123588562, "learning_rate": 1.849083326413301e-05, "loss": 0.0211, "num_input_tokens_seen": 699387392, "step": 683000 }, { "epoch": 6.306455928622175, "grad_norm": 1.9758840799331665, "learning_rate": 1.8467766490436518e-05, "loss": 0.0212, "num_input_tokens_seen": 699899392, "step": 683500 }, { "epoch": 6.3110692833614745, "grad_norm": 0.7469502091407776, "learning_rate": 1.844469971674002e-05, "loss": 0.0217, "num_input_tokens_seen": 700411392, "step": 684000 }, { "epoch": 6.315682638100774, "grad_norm": 0.9809781908988953, "learning_rate": 1.8421632943043523e-05, "loss": 0.0229, "num_input_tokens_seen": 700923392, "step": 684500 }, { "epoch": 6.320295992840073, "grad_norm": 0.9586873650550842, "learning_rate": 1.8398566169347027e-05, "loss": 0.021, "num_input_tokens_seen": 701435392, "step": 685000 }, { "epoch": 6.324909347579373, "grad_norm": 8.868587493896484, "learning_rate": 1.837549939565053e-05, "loss": 0.0225, "num_input_tokens_seen": 701947392, "step": 685500 }, { "epoch": 6.329522702318672, "grad_norm": 1.1265676021575928, "learning_rate": 1.835243262195403e-05, "loss": 0.0212, "num_input_tokens_seen": 702459392, "step": 686000 }, { "epoch": 6.334136057057972, "grad_norm": 1.0341181755065918, "learning_rate": 1.832936584825754e-05, "loss": 0.0232, "num_input_tokens_seen": 702971392, "step": 686500 }, { "epoch": 6.338749411797271, "grad_norm": 0.3800777196884155, "learning_rate": 1.830629907456104e-05, "loss": 0.0213, "num_input_tokens_seen": 703483392, "step": 687000 }, { "epoch": 6.34336276653657, "grad_norm": 0.7369467616081238, "learning_rate": 1.8283232300864543e-05, "loss": 0.0234, "num_input_tokens_seen": 703995392, "step": 687500 }, { "epoch": 6.347976121275869, "grad_norm": 1.0980653762817383, "learning_rate": 1.8260165527168047e-05, "loss": 0.02, "num_input_tokens_seen": 704507392, "step": 688000 }, { "epoch": 6.352589476015169, "grad_norm": 17.581872940063477, "learning_rate": 1.823709875347155e-05, "loss": 0.0233, "num_input_tokens_seen": 705019392, "step": 688500 }, { "epoch": 6.357202830754468, "grad_norm": 0.5301328301429749, "learning_rate": 1.8214031979775055e-05, "loss": 0.0226, "num_input_tokens_seen": 705531392, "step": 689000 }, { "epoch": 6.361816185493767, "grad_norm": 0.44786104559898376, "learning_rate": 1.8190965206078556e-05, "loss": 0.0216, "num_input_tokens_seen": 706043392, "step": 689500 }, { "epoch": 6.366429540233066, "grad_norm": 2.587684154510498, "learning_rate": 1.816789843238206e-05, "loss": 0.0232, "num_input_tokens_seen": 706555392, "step": 690000 }, { "epoch": 6.371042894972366, "grad_norm": 1.0485097169876099, "learning_rate": 1.8144831658685564e-05, "loss": 0.0198, "num_input_tokens_seen": 707067392, "step": 690500 }, { "epoch": 6.375656249711666, "grad_norm": 0.38697299361228943, "learning_rate": 1.8121764884989068e-05, "loss": 0.0224, "num_input_tokens_seen": 707579392, "step": 691000 }, { "epoch": 6.380269604450965, "grad_norm": 1.7703328132629395, "learning_rate": 1.8098698111292568e-05, "loss": 0.0205, "num_input_tokens_seen": 708091392, "step": 691500 }, { "epoch": 6.384882959190264, "grad_norm": 0.5361246466636658, "learning_rate": 1.8075631337596076e-05, "loss": 0.0217, "num_input_tokens_seen": 708603392, "step": 692000 }, { "epoch": 6.3894963139295635, "grad_norm": 0.7262565493583679, "learning_rate": 1.8052564563899576e-05, "loss": 0.0231, "num_input_tokens_seen": 709115392, "step": 692500 }, { "epoch": 6.394109668668863, "grad_norm": 0.5426166653633118, "learning_rate": 1.802949779020308e-05, "loss": 0.0229, "num_input_tokens_seen": 709627392, "step": 693000 }, { "epoch": 6.398723023408162, "grad_norm": 0.9370472431182861, "learning_rate": 1.8006431016506584e-05, "loss": 0.0231, "num_input_tokens_seen": 710139392, "step": 693500 }, { "epoch": 6.403336378147461, "grad_norm": 1.1743369102478027, "learning_rate": 1.7983364242810088e-05, "loss": 0.0221, "num_input_tokens_seen": 710651392, "step": 694000 }, { "epoch": 6.4079497328867605, "grad_norm": 1.1654258966445923, "learning_rate": 1.7960297469113592e-05, "loss": 0.0239, "num_input_tokens_seen": 711163392, "step": 694500 }, { "epoch": 6.41256308762606, "grad_norm": 0.9082449078559875, "learning_rate": 1.7937230695417096e-05, "loss": 0.0206, "num_input_tokens_seen": 711675392, "step": 695000 }, { "epoch": 6.417176442365359, "grad_norm": 0.7706845998764038, "learning_rate": 1.7914163921720597e-05, "loss": 0.0225, "num_input_tokens_seen": 712187392, "step": 695500 }, { "epoch": 6.421789797104658, "grad_norm": 0.8697851896286011, "learning_rate": 1.78910971480241e-05, "loss": 0.0196, "num_input_tokens_seen": 712699392, "step": 696000 }, { "epoch": 6.4264031518439575, "grad_norm": 0.8328973054885864, "learning_rate": 1.7868030374327605e-05, "loss": 0.0191, "num_input_tokens_seen": 713211392, "step": 696500 }, { "epoch": 6.431016506583257, "grad_norm": 7.328830242156982, "learning_rate": 1.7844963600631105e-05, "loss": 0.0219, "num_input_tokens_seen": 713723392, "step": 697000 }, { "epoch": 6.435629861322557, "grad_norm": 0.9811331629753113, "learning_rate": 1.7821896826934612e-05, "loss": 0.0225, "num_input_tokens_seen": 714235392, "step": 697500 }, { "epoch": 6.440243216061856, "grad_norm": 2.4249658584594727, "learning_rate": 1.7798830053238113e-05, "loss": 0.0199, "num_input_tokens_seen": 714747392, "step": 698000 }, { "epoch": 6.444856570801155, "grad_norm": 1.6844923496246338, "learning_rate": 1.7775763279541617e-05, "loss": 0.0214, "num_input_tokens_seen": 715259392, "step": 698500 }, { "epoch": 6.449469925540455, "grad_norm": 2.2441189289093018, "learning_rate": 1.775269650584512e-05, "loss": 0.0205, "num_input_tokens_seen": 715771392, "step": 699000 }, { "epoch": 6.454083280279754, "grad_norm": 0.4577130973339081, "learning_rate": 1.7729629732148625e-05, "loss": 0.022, "num_input_tokens_seen": 716283392, "step": 699500 }, { "epoch": 6.458696635019053, "grad_norm": 1.2576284408569336, "learning_rate": 1.770656295845213e-05, "loss": 0.021, "num_input_tokens_seen": 716795392, "step": 700000 }, { "epoch": 6.463309989758352, "grad_norm": 1.3181337118148804, "learning_rate": 1.7683496184755633e-05, "loss": 0.0228, "num_input_tokens_seen": 717307392, "step": 700500 }, { "epoch": 6.467923344497652, "grad_norm": 0.6435089707374573, "learning_rate": 1.7660429411059133e-05, "loss": 0.0213, "num_input_tokens_seen": 717819392, "step": 701000 }, { "epoch": 6.472536699236951, "grad_norm": 1.2723332643508911, "learning_rate": 1.763736263736264e-05, "loss": 0.0197, "num_input_tokens_seen": 718331392, "step": 701500 }, { "epoch": 6.47715005397625, "grad_norm": 5.60179328918457, "learning_rate": 1.761429586366614e-05, "loss": 0.0246, "num_input_tokens_seen": 718843392, "step": 702000 }, { "epoch": 6.481763408715549, "grad_norm": 1.1845461130142212, "learning_rate": 1.7591229089969642e-05, "loss": 0.0208, "num_input_tokens_seen": 719355392, "step": 702500 }, { "epoch": 6.4863767634548495, "grad_norm": 0.9325453042984009, "learning_rate": 1.756816231627315e-05, "loss": 0.0214, "num_input_tokens_seen": 719867392, "step": 703000 }, { "epoch": 6.490990118194149, "grad_norm": 1.919224500656128, "learning_rate": 1.754509554257665e-05, "loss": 0.0208, "num_input_tokens_seen": 720379392, "step": 703500 }, { "epoch": 6.495603472933448, "grad_norm": 0.8646382093429565, "learning_rate": 1.7522028768880154e-05, "loss": 0.0227, "num_input_tokens_seen": 720891392, "step": 704000 }, { "epoch": 6.500216827672747, "grad_norm": 0.6728546619415283, "learning_rate": 1.7498961995183658e-05, "loss": 0.0214, "num_input_tokens_seen": 721403392, "step": 704500 }, { "epoch": 6.5048301824120465, "grad_norm": 1.701745629310608, "learning_rate": 1.7475895221487162e-05, "loss": 0.0198, "num_input_tokens_seen": 721915392, "step": 705000 }, { "epoch": 6.509443537151346, "grad_norm": 1.382514476776123, "learning_rate": 1.7452828447790666e-05, "loss": 0.0229, "num_input_tokens_seen": 722427392, "step": 705500 }, { "epoch": 6.514056891890645, "grad_norm": 1.366165041923523, "learning_rate": 1.742976167409417e-05, "loss": 0.0244, "num_input_tokens_seen": 722939392, "step": 706000 }, { "epoch": 6.518670246629944, "grad_norm": 0.727484405040741, "learning_rate": 1.740669490039767e-05, "loss": 0.0215, "num_input_tokens_seen": 723451392, "step": 706500 }, { "epoch": 6.5232836013692435, "grad_norm": 0.9992395043373108, "learning_rate": 1.7383628126701178e-05, "loss": 0.0203, "num_input_tokens_seen": 723963392, "step": 707000 }, { "epoch": 6.527896956108543, "grad_norm": 1.4681673049926758, "learning_rate": 1.736056135300468e-05, "loss": 0.0236, "num_input_tokens_seen": 724475392, "step": 707500 }, { "epoch": 6.532510310847842, "grad_norm": 0.6639313101768494, "learning_rate": 1.7337494579308182e-05, "loss": 0.0196, "num_input_tokens_seen": 724987392, "step": 708000 }, { "epoch": 6.537123665587142, "grad_norm": 1.4685230255126953, "learning_rate": 1.7314427805611686e-05, "loss": 0.0231, "num_input_tokens_seen": 725499392, "step": 708500 }, { "epoch": 6.541737020326441, "grad_norm": 0.711995542049408, "learning_rate": 1.729136103191519e-05, "loss": 0.0217, "num_input_tokens_seen": 726011392, "step": 709000 }, { "epoch": 6.546350375065741, "grad_norm": 0.849071204662323, "learning_rate": 1.726829425821869e-05, "loss": 0.0219, "num_input_tokens_seen": 726523392, "step": 709500 }, { "epoch": 6.55096372980504, "grad_norm": 0.7562097311019897, "learning_rate": 1.7245227484522195e-05, "loss": 0.0203, "num_input_tokens_seen": 727035392, "step": 710000 }, { "epoch": 6.555577084544339, "grad_norm": 1.556663155555725, "learning_rate": 1.72221607108257e-05, "loss": 0.0224, "num_input_tokens_seen": 727547392, "step": 710500 }, { "epoch": 6.560190439283638, "grad_norm": 3.2554850578308105, "learning_rate": 1.7199093937129203e-05, "loss": 0.022, "num_input_tokens_seen": 728059392, "step": 711000 }, { "epoch": 6.564803794022938, "grad_norm": 1.4903610944747925, "learning_rate": 1.7176027163432707e-05, "loss": 0.0204, "num_input_tokens_seen": 728571392, "step": 711500 }, { "epoch": 6.569417148762237, "grad_norm": 1.828810691833496, "learning_rate": 1.7152960389736207e-05, "loss": 0.0233, "num_input_tokens_seen": 729083392, "step": 712000 }, { "epoch": 6.574030503501536, "grad_norm": 0.5452165603637695, "learning_rate": 1.7129893616039715e-05, "loss": 0.0216, "num_input_tokens_seen": 729595392, "step": 712500 }, { "epoch": 6.578643858240835, "grad_norm": 1.4269682168960571, "learning_rate": 1.7106826842343215e-05, "loss": 0.0222, "num_input_tokens_seen": 730107392, "step": 713000 }, { "epoch": 6.5832572129801346, "grad_norm": 0.5227313041687012, "learning_rate": 1.708376006864672e-05, "loss": 0.0201, "num_input_tokens_seen": 730619392, "step": 713500 }, { "epoch": 6.587870567719435, "grad_norm": 0.8635200262069702, "learning_rate": 1.7060693294950223e-05, "loss": 0.0208, "num_input_tokens_seen": 731131392, "step": 714000 }, { "epoch": 6.592483922458733, "grad_norm": 1.070576548576355, "learning_rate": 1.7037626521253727e-05, "loss": 0.0213, "num_input_tokens_seen": 731643392, "step": 714500 }, { "epoch": 6.597097277198033, "grad_norm": 21.42013931274414, "learning_rate": 1.7014559747557228e-05, "loss": 0.0214, "num_input_tokens_seen": 732155392, "step": 715000 }, { "epoch": 6.601710631937332, "grad_norm": 1.3582208156585693, "learning_rate": 1.6991492973860735e-05, "loss": 0.0222, "num_input_tokens_seen": 732667392, "step": 715500 }, { "epoch": 6.606323986676632, "grad_norm": 1.3939865827560425, "learning_rate": 1.6968426200164236e-05, "loss": 0.0212, "num_input_tokens_seen": 733179392, "step": 716000 }, { "epoch": 6.610937341415931, "grad_norm": 1.0751606225967407, "learning_rate": 1.694535942646774e-05, "loss": 0.0225, "num_input_tokens_seen": 733691392, "step": 716500 }, { "epoch": 6.61555069615523, "grad_norm": 1.630864143371582, "learning_rate": 1.6922292652771244e-05, "loss": 0.0215, "num_input_tokens_seen": 734203392, "step": 717000 }, { "epoch": 6.620164050894529, "grad_norm": 0.7903428077697754, "learning_rate": 1.6899225879074744e-05, "loss": 0.0204, "num_input_tokens_seen": 734715392, "step": 717500 }, { "epoch": 6.624777405633829, "grad_norm": 0.9173442125320435, "learning_rate": 1.687615910537825e-05, "loss": 0.0205, "num_input_tokens_seen": 735227392, "step": 718000 }, { "epoch": 6.629390760373128, "grad_norm": 0.4864923059940338, "learning_rate": 1.6853092331681752e-05, "loss": 0.0222, "num_input_tokens_seen": 735739392, "step": 718500 }, { "epoch": 6.634004115112427, "grad_norm": 2.9184951782226562, "learning_rate": 1.6830025557985256e-05, "loss": 0.0213, "num_input_tokens_seen": 736251392, "step": 719000 }, { "epoch": 6.638617469851726, "grad_norm": 0.9503863453865051, "learning_rate": 1.680695878428876e-05, "loss": 0.0213, "num_input_tokens_seen": 736763392, "step": 719500 }, { "epoch": 6.643230824591026, "grad_norm": 1.129035234451294, "learning_rate": 1.6783892010592264e-05, "loss": 0.0217, "num_input_tokens_seen": 737275392, "step": 720000 }, { "epoch": 6.647844179330326, "grad_norm": 0.7650052309036255, "learning_rate": 1.6760825236895768e-05, "loss": 0.0209, "num_input_tokens_seen": 737787392, "step": 720500 }, { "epoch": 6.652457534069625, "grad_norm": 1.070244312286377, "learning_rate": 1.6737758463199272e-05, "loss": 0.0201, "num_input_tokens_seen": 738299392, "step": 721000 }, { "epoch": 6.657070888808924, "grad_norm": 1.1811015605926514, "learning_rate": 1.6714691689502773e-05, "loss": 0.0233, "num_input_tokens_seen": 738811392, "step": 721500 }, { "epoch": 6.6616842435482235, "grad_norm": 1.0393638610839844, "learning_rate": 1.6691624915806277e-05, "loss": 0.0227, "num_input_tokens_seen": 739323392, "step": 722000 }, { "epoch": 6.666297598287523, "grad_norm": 1.2030943632125854, "learning_rate": 1.666855814210978e-05, "loss": 0.0234, "num_input_tokens_seen": 739835392, "step": 722500 }, { "epoch": 6.670910953026822, "grad_norm": 0.676896870136261, "learning_rate": 1.664549136841328e-05, "loss": 0.0214, "num_input_tokens_seen": 740347392, "step": 723000 }, { "epoch": 6.675524307766121, "grad_norm": 0.9208011031150818, "learning_rate": 1.662242459471679e-05, "loss": 0.0235, "num_input_tokens_seen": 740859392, "step": 723500 }, { "epoch": 6.6801376625054205, "grad_norm": 0.5715643763542175, "learning_rate": 1.659935782102029e-05, "loss": 0.0192, "num_input_tokens_seen": 741371392, "step": 724000 }, { "epoch": 6.68475101724472, "grad_norm": 3.038097381591797, "learning_rate": 1.6576291047323793e-05, "loss": 0.0227, "num_input_tokens_seen": 741883392, "step": 724500 }, { "epoch": 6.689364371984019, "grad_norm": 0.7479985952377319, "learning_rate": 1.6553224273627297e-05, "loss": 0.022, "num_input_tokens_seen": 742395392, "step": 725000 }, { "epoch": 6.693977726723318, "grad_norm": 0.4049575626850128, "learning_rate": 1.65301574999308e-05, "loss": 0.0225, "num_input_tokens_seen": 742907392, "step": 725500 }, { "epoch": 6.698591081462618, "grad_norm": 1.12605881690979, "learning_rate": 1.6507090726234305e-05, "loss": 0.0227, "num_input_tokens_seen": 743419392, "step": 726000 }, { "epoch": 6.703204436201918, "grad_norm": 0.9142519235610962, "learning_rate": 1.648402395253781e-05, "loss": 0.0225, "num_input_tokens_seen": 743931392, "step": 726500 }, { "epoch": 6.707817790941217, "grad_norm": 2.4688339233398438, "learning_rate": 1.646095717884131e-05, "loss": 0.0231, "num_input_tokens_seen": 744443392, "step": 727000 }, { "epoch": 6.712431145680516, "grad_norm": 0.49617233872413635, "learning_rate": 1.6437890405144817e-05, "loss": 0.0215, "num_input_tokens_seen": 744955392, "step": 727500 }, { "epoch": 6.717044500419815, "grad_norm": 2.4510884284973145, "learning_rate": 1.6414823631448317e-05, "loss": 0.022, "num_input_tokens_seen": 745467392, "step": 728000 }, { "epoch": 6.721657855159115, "grad_norm": 0.6233497262001038, "learning_rate": 1.639175685775182e-05, "loss": 0.0209, "num_input_tokens_seen": 745979392, "step": 728500 }, { "epoch": 6.726271209898414, "grad_norm": 1.1352206468582153, "learning_rate": 1.6368690084055325e-05, "loss": 0.0194, "num_input_tokens_seen": 746491392, "step": 729000 }, { "epoch": 6.730884564637713, "grad_norm": 0.4292503297328949, "learning_rate": 1.6345623310358826e-05, "loss": 0.0213, "num_input_tokens_seen": 747003392, "step": 729500 }, { "epoch": 6.735497919377012, "grad_norm": 0.7327638864517212, "learning_rate": 1.632255653666233e-05, "loss": 0.0216, "num_input_tokens_seen": 747515392, "step": 730000 }, { "epoch": 6.740111274116312, "grad_norm": 1.2657952308654785, "learning_rate": 1.6299489762965834e-05, "loss": 0.021, "num_input_tokens_seen": 748027392, "step": 730500 }, { "epoch": 6.744724628855611, "grad_norm": 2.1072635650634766, "learning_rate": 1.6276422989269338e-05, "loss": 0.0205, "num_input_tokens_seen": 748539392, "step": 731000 }, { "epoch": 6.749337983594911, "grad_norm": 0.5420140027999878, "learning_rate": 1.6253356215572842e-05, "loss": 0.0217, "num_input_tokens_seen": 749051392, "step": 731500 }, { "epoch": 6.75395133833421, "grad_norm": 0.9647169709205627, "learning_rate": 1.6230289441876346e-05, "loss": 0.023, "num_input_tokens_seen": 749563392, "step": 732000 }, { "epoch": 6.7585646930735095, "grad_norm": 0.5795858502388, "learning_rate": 1.6207222668179846e-05, "loss": 0.0207, "num_input_tokens_seen": 750075392, "step": 732500 }, { "epoch": 6.763178047812809, "grad_norm": 0.776720404624939, "learning_rate": 1.6184155894483354e-05, "loss": 0.0239, "num_input_tokens_seen": 750587392, "step": 733000 }, { "epoch": 6.767791402552108, "grad_norm": 3.4119088649749756, "learning_rate": 1.6161089120786854e-05, "loss": 0.0234, "num_input_tokens_seen": 751099392, "step": 733500 }, { "epoch": 6.772404757291407, "grad_norm": 0.5689214468002319, "learning_rate": 1.6138022347090358e-05, "loss": 0.021, "num_input_tokens_seen": 751611392, "step": 734000 }, { "epoch": 6.7770181120307065, "grad_norm": 0.6440141201019287, "learning_rate": 1.6114955573393862e-05, "loss": 0.0228, "num_input_tokens_seen": 752123392, "step": 734500 }, { "epoch": 6.781631466770006, "grad_norm": 0.5016751289367676, "learning_rate": 1.6091888799697366e-05, "loss": 0.023, "num_input_tokens_seen": 752635392, "step": 735000 }, { "epoch": 6.786244821509305, "grad_norm": 0.6144362092018127, "learning_rate": 1.6068822026000867e-05, "loss": 0.0227, "num_input_tokens_seen": 753147392, "step": 735500 }, { "epoch": 6.790858176248604, "grad_norm": 0.356981486082077, "learning_rate": 1.604575525230437e-05, "loss": 0.0204, "num_input_tokens_seen": 753659392, "step": 736000 }, { "epoch": 6.7954715309879035, "grad_norm": 0.6662021279335022, "learning_rate": 1.6022688478607875e-05, "loss": 0.0222, "num_input_tokens_seen": 754171392, "step": 736500 }, { "epoch": 6.800084885727204, "grad_norm": 1.0647578239440918, "learning_rate": 1.599962170491138e-05, "loss": 0.0207, "num_input_tokens_seen": 754683392, "step": 737000 }, { "epoch": 6.804698240466502, "grad_norm": 0.8494476675987244, "learning_rate": 1.5976554931214883e-05, "loss": 0.0208, "num_input_tokens_seen": 755195392, "step": 737500 }, { "epoch": 6.809311595205802, "grad_norm": 1.5736192464828491, "learning_rate": 1.5953488157518383e-05, "loss": 0.0245, "num_input_tokens_seen": 755707392, "step": 738000 }, { "epoch": 6.813924949945101, "grad_norm": 1.5811710357666016, "learning_rate": 1.593042138382189e-05, "loss": 0.0198, "num_input_tokens_seen": 756219392, "step": 738500 }, { "epoch": 6.818538304684401, "grad_norm": 0.7430917024612427, "learning_rate": 1.590735461012539e-05, "loss": 0.0238, "num_input_tokens_seen": 756731392, "step": 739000 }, { "epoch": 6.8231516594237, "grad_norm": 0.346450537443161, "learning_rate": 1.5884287836428895e-05, "loss": 0.0207, "num_input_tokens_seen": 757243392, "step": 739500 }, { "epoch": 6.827765014162999, "grad_norm": 5.301863670349121, "learning_rate": 1.58612210627324e-05, "loss": 0.0228, "num_input_tokens_seen": 757755392, "step": 740000 }, { "epoch": 6.832378368902298, "grad_norm": 0.9501894116401672, "learning_rate": 1.5838154289035903e-05, "loss": 0.0217, "num_input_tokens_seen": 758267392, "step": 740500 }, { "epoch": 6.836991723641598, "grad_norm": 0.4030236601829529, "learning_rate": 1.5815087515339404e-05, "loss": 0.0233, "num_input_tokens_seen": 758779392, "step": 741000 }, { "epoch": 6.841605078380897, "grad_norm": 3.976102352142334, "learning_rate": 1.579202074164291e-05, "loss": 0.0229, "num_input_tokens_seen": 759291392, "step": 741500 }, { "epoch": 6.846218433120196, "grad_norm": 1.0763275623321533, "learning_rate": 1.576895396794641e-05, "loss": 0.0247, "num_input_tokens_seen": 759803392, "step": 742000 }, { "epoch": 6.850831787859495, "grad_norm": 1.278295636177063, "learning_rate": 1.5745887194249916e-05, "loss": 0.0207, "num_input_tokens_seen": 760315392, "step": 742500 }, { "epoch": 6.855445142598795, "grad_norm": 1.3523164987564087, "learning_rate": 1.572282042055342e-05, "loss": 0.0215, "num_input_tokens_seen": 760827392, "step": 743000 }, { "epoch": 6.860058497338095, "grad_norm": 2.487576484680176, "learning_rate": 1.569975364685692e-05, "loss": 0.0197, "num_input_tokens_seen": 761339392, "step": 743500 }, { "epoch": 6.864671852077394, "grad_norm": 0.43189629912376404, "learning_rate": 1.5676686873160428e-05, "loss": 0.0209, "num_input_tokens_seen": 761851392, "step": 744000 }, { "epoch": 6.869285206816693, "grad_norm": 1.3960847854614258, "learning_rate": 1.5653620099463928e-05, "loss": 0.0211, "num_input_tokens_seen": 762363392, "step": 744500 }, { "epoch": 6.873898561555992, "grad_norm": 0.642167866230011, "learning_rate": 1.5630553325767432e-05, "loss": 0.0228, "num_input_tokens_seen": 762875392, "step": 745000 }, { "epoch": 6.878511916295292, "grad_norm": 0.7163909673690796, "learning_rate": 1.5607486552070936e-05, "loss": 0.0225, "num_input_tokens_seen": 763387392, "step": 745500 }, { "epoch": 6.883125271034591, "grad_norm": 0.8028944134712219, "learning_rate": 1.558441977837444e-05, "loss": 0.0217, "num_input_tokens_seen": 763899392, "step": 746000 }, { "epoch": 6.88773862577389, "grad_norm": 0.8963446617126465, "learning_rate": 1.556135300467794e-05, "loss": 0.0233, "num_input_tokens_seen": 764411392, "step": 746500 }, { "epoch": 6.892351980513189, "grad_norm": 1.2736632823944092, "learning_rate": 1.5538286230981448e-05, "loss": 0.0198, "num_input_tokens_seen": 764923392, "step": 747000 }, { "epoch": 6.896965335252489, "grad_norm": 1.9002121686935425, "learning_rate": 1.551521945728495e-05, "loss": 0.0204, "num_input_tokens_seen": 765435392, "step": 747500 }, { "epoch": 6.901578689991788, "grad_norm": 1.7518917322158813, "learning_rate": 1.5492152683588453e-05, "loss": 0.0212, "num_input_tokens_seen": 765947392, "step": 748000 }, { "epoch": 6.906192044731087, "grad_norm": 0.5055529475212097, "learning_rate": 1.5469085909891956e-05, "loss": 0.0223, "num_input_tokens_seen": 766459392, "step": 748500 }, { "epoch": 6.910805399470387, "grad_norm": 1.280887246131897, "learning_rate": 1.5446019136195457e-05, "loss": 0.0215, "num_input_tokens_seen": 766971392, "step": 749000 }, { "epoch": 6.9154187542096865, "grad_norm": 1.3082467317581177, "learning_rate": 1.5422952362498964e-05, "loss": 0.0239, "num_input_tokens_seen": 767483392, "step": 749500 }, { "epoch": 6.920032108948986, "grad_norm": 0.4849281907081604, "learning_rate": 1.5399885588802465e-05, "loss": 0.0219, "num_input_tokens_seen": 767995392, "step": 750000 }, { "epoch": 6.924645463688285, "grad_norm": 1.54342520236969, "learning_rate": 1.537681881510597e-05, "loss": 0.0212, "num_input_tokens_seen": 768507392, "step": 750500 }, { "epoch": 6.929258818427584, "grad_norm": 1.441550850868225, "learning_rate": 1.5353752041409473e-05, "loss": 0.0232, "num_input_tokens_seen": 769019392, "step": 751000 }, { "epoch": 6.9338721731668835, "grad_norm": 1.3304697275161743, "learning_rate": 1.5330685267712977e-05, "loss": 0.0229, "num_input_tokens_seen": 769531392, "step": 751500 }, { "epoch": 6.938485527906183, "grad_norm": 1.3655359745025635, "learning_rate": 1.5307618494016477e-05, "loss": 0.0232, "num_input_tokens_seen": 770043392, "step": 752000 }, { "epoch": 6.943098882645482, "grad_norm": 1.3380628824234009, "learning_rate": 1.5284551720319985e-05, "loss": 0.0208, "num_input_tokens_seen": 770555392, "step": 752500 }, { "epoch": 6.947712237384781, "grad_norm": 0.7669854164123535, "learning_rate": 1.5261484946623485e-05, "loss": 0.0219, "num_input_tokens_seen": 771067392, "step": 753000 }, { "epoch": 6.9523255921240805, "grad_norm": 0.653236985206604, "learning_rate": 1.5238418172926991e-05, "loss": 0.0197, "num_input_tokens_seen": 771579392, "step": 753500 }, { "epoch": 6.95693894686338, "grad_norm": 0.7252629995346069, "learning_rate": 1.5215351399230493e-05, "loss": 0.0231, "num_input_tokens_seen": 772091392, "step": 754000 }, { "epoch": 6.96155230160268, "grad_norm": 0.7869466543197632, "learning_rate": 1.5192284625533997e-05, "loss": 0.0219, "num_input_tokens_seen": 772603392, "step": 754500 }, { "epoch": 6.966165656341978, "grad_norm": 1.048891544342041, "learning_rate": 1.51692178518375e-05, "loss": 0.0246, "num_input_tokens_seen": 773115392, "step": 755000 }, { "epoch": 6.970779011081278, "grad_norm": 0.7492154836654663, "learning_rate": 1.5146151078141002e-05, "loss": 0.0207, "num_input_tokens_seen": 773627392, "step": 755500 }, { "epoch": 6.975392365820578, "grad_norm": 1.5296510457992554, "learning_rate": 1.5123084304444508e-05, "loss": 0.023, "num_input_tokens_seen": 774139392, "step": 756000 }, { "epoch": 6.980005720559877, "grad_norm": 0.6391850113868713, "learning_rate": 1.510001753074801e-05, "loss": 0.0211, "num_input_tokens_seen": 774651392, "step": 756500 }, { "epoch": 6.984619075299176, "grad_norm": 1.2069010734558105, "learning_rate": 1.5076950757051514e-05, "loss": 0.0207, "num_input_tokens_seen": 775163392, "step": 757000 }, { "epoch": 6.989232430038475, "grad_norm": 2.368687629699707, "learning_rate": 1.5053883983355016e-05, "loss": 0.024, "num_input_tokens_seen": 775675392, "step": 757500 }, { "epoch": 6.993845784777775, "grad_norm": 1.284287452697754, "learning_rate": 1.5030817209658522e-05, "loss": 0.0198, "num_input_tokens_seen": 776187392, "step": 758000 }, { "epoch": 6.998459139517074, "grad_norm": 5.402317523956299, "learning_rate": 1.5007750435962022e-05, "loss": 0.025, "num_input_tokens_seen": 776699392, "step": 758500 }, { "epoch": 7.0, "eval_combined_score": 0.06412914552577642, "eval_loss": 0.06412914395332336, "eval_mse": 0.06412914709822949, "eval_runtime": 47.0336, "eval_samples_per_second": 2048.28, "eval_steps_per_second": 256.051, "num_input_tokens_seen": 776869632, "step": 758667 }, { "epoch": 7.003072494256373, "grad_norm": 1.346767783164978, "learning_rate": 1.4984683662265528e-05, "loss": 0.018, "num_input_tokens_seen": 777210624, "step": 759000 }, { "epoch": 7.007685848995672, "grad_norm": 0.9796298146247864, "learning_rate": 1.496161688856903e-05, "loss": 0.0184, "num_input_tokens_seen": 777722624, "step": 759500 }, { "epoch": 7.012299203734972, "grad_norm": 1.2551716566085815, "learning_rate": 1.4938550114872534e-05, "loss": 0.0173, "num_input_tokens_seen": 778234624, "step": 760000 }, { "epoch": 7.016912558474272, "grad_norm": 0.8987337946891785, "learning_rate": 1.4915483341176037e-05, "loss": 0.0177, "num_input_tokens_seen": 778746624, "step": 760500 }, { "epoch": 7.021525913213571, "grad_norm": 0.38303157687187195, "learning_rate": 1.4892416567479542e-05, "loss": 0.0179, "num_input_tokens_seen": 779258624, "step": 761000 }, { "epoch": 7.02613926795287, "grad_norm": 1.3380213975906372, "learning_rate": 1.4869349793783044e-05, "loss": 0.0183, "num_input_tokens_seen": 779770624, "step": 761500 }, { "epoch": 7.0307526226921695, "grad_norm": 2.466179609298706, "learning_rate": 1.4846283020086547e-05, "loss": 0.0175, "num_input_tokens_seen": 780282624, "step": 762000 }, { "epoch": 7.035365977431469, "grad_norm": 0.4640190303325653, "learning_rate": 1.482321624639005e-05, "loss": 0.0168, "num_input_tokens_seen": 780794624, "step": 762500 }, { "epoch": 7.039979332170768, "grad_norm": 0.6390454173088074, "learning_rate": 1.4800149472693553e-05, "loss": 0.0165, "num_input_tokens_seen": 781306624, "step": 763000 }, { "epoch": 7.044592686910067, "grad_norm": 0.9119462966918945, "learning_rate": 1.4777082698997059e-05, "loss": 0.0162, "num_input_tokens_seen": 781818624, "step": 763500 }, { "epoch": 7.0492060416493665, "grad_norm": 1.088921070098877, "learning_rate": 1.475401592530056e-05, "loss": 0.0182, "num_input_tokens_seen": 782330624, "step": 764000 }, { "epoch": 7.053819396388666, "grad_norm": 0.5869113802909851, "learning_rate": 1.4730949151604065e-05, "loss": 0.0191, "num_input_tokens_seen": 782842624, "step": 764500 }, { "epoch": 7.058432751127965, "grad_norm": 1.6925584077835083, "learning_rate": 1.4707882377907567e-05, "loss": 0.0187, "num_input_tokens_seen": 783354624, "step": 765000 }, { "epoch": 7.063046105867264, "grad_norm": 1.0733281373977661, "learning_rate": 1.4684815604211071e-05, "loss": 0.0171, "num_input_tokens_seen": 783866624, "step": 765500 }, { "epoch": 7.0676594606065635, "grad_norm": 0.3278258442878723, "learning_rate": 1.4661748830514573e-05, "loss": 0.0183, "num_input_tokens_seen": 784378624, "step": 766000 }, { "epoch": 7.072272815345864, "grad_norm": 2.2622592449188232, "learning_rate": 1.4638682056818079e-05, "loss": 0.0171, "num_input_tokens_seen": 784890624, "step": 766500 }, { "epoch": 7.076886170085163, "grad_norm": 0.846518337726593, "learning_rate": 1.4615615283121581e-05, "loss": 0.0178, "num_input_tokens_seen": 785402624, "step": 767000 }, { "epoch": 7.081499524824462, "grad_norm": 0.9698590636253357, "learning_rate": 1.4592548509425085e-05, "loss": 0.0173, "num_input_tokens_seen": 785914624, "step": 767500 }, { "epoch": 7.086112879563761, "grad_norm": 0.5238065123558044, "learning_rate": 1.4569481735728588e-05, "loss": 0.0165, "num_input_tokens_seen": 786426624, "step": 768000 }, { "epoch": 7.090726234303061, "grad_norm": 0.7391173839569092, "learning_rate": 1.454641496203209e-05, "loss": 0.0193, "num_input_tokens_seen": 786938624, "step": 768500 }, { "epoch": 7.09533958904236, "grad_norm": 0.8646796941757202, "learning_rate": 1.4523348188335596e-05, "loss": 0.0182, "num_input_tokens_seen": 787450624, "step": 769000 }, { "epoch": 7.099952943781659, "grad_norm": 0.5301780700683594, "learning_rate": 1.4500281414639096e-05, "loss": 0.017, "num_input_tokens_seen": 787962624, "step": 769500 }, { "epoch": 7.104566298520958, "grad_norm": 2.3351125717163086, "learning_rate": 1.4477214640942602e-05, "loss": 0.0182, "num_input_tokens_seen": 788474624, "step": 770000 }, { "epoch": 7.109179653260258, "grad_norm": 0.59925377368927, "learning_rate": 1.4454147867246104e-05, "loss": 0.0164, "num_input_tokens_seen": 788986624, "step": 770500 }, { "epoch": 7.113793007999557, "grad_norm": 0.5372639298439026, "learning_rate": 1.4431081093549608e-05, "loss": 0.0181, "num_input_tokens_seen": 789498624, "step": 771000 }, { "epoch": 7.118406362738856, "grad_norm": 1.028199553489685, "learning_rate": 1.440801431985311e-05, "loss": 0.0179, "num_input_tokens_seen": 790010624, "step": 771500 }, { "epoch": 7.123019717478156, "grad_norm": 0.32566505670547485, "learning_rate": 1.4384947546156616e-05, "loss": 0.0177, "num_input_tokens_seen": 790522624, "step": 772000 }, { "epoch": 7.1276330722174555, "grad_norm": 1.434348702430725, "learning_rate": 1.4361880772460118e-05, "loss": 0.0195, "num_input_tokens_seen": 791034624, "step": 772500 }, { "epoch": 7.132246426956755, "grad_norm": 1.0634896755218506, "learning_rate": 1.4338813998763622e-05, "loss": 0.0172, "num_input_tokens_seen": 791546624, "step": 773000 }, { "epoch": 7.136859781696054, "grad_norm": 1.0522830486297607, "learning_rate": 1.4315747225067125e-05, "loss": 0.017, "num_input_tokens_seen": 792058624, "step": 773500 }, { "epoch": 7.141473136435353, "grad_norm": 1.2891104221343994, "learning_rate": 1.429268045137063e-05, "loss": 0.0173, "num_input_tokens_seen": 792570624, "step": 774000 }, { "epoch": 7.1460864911746524, "grad_norm": 0.5944826006889343, "learning_rate": 1.4269613677674132e-05, "loss": 0.0168, "num_input_tokens_seen": 793082624, "step": 774500 }, { "epoch": 7.150699845913952, "grad_norm": 1.0896071195602417, "learning_rate": 1.4246546903977635e-05, "loss": 0.0195, "num_input_tokens_seen": 793594624, "step": 775000 }, { "epoch": 7.155313200653251, "grad_norm": 0.5116850137710571, "learning_rate": 1.4223480130281139e-05, "loss": 0.0181, "num_input_tokens_seen": 794106624, "step": 775500 }, { "epoch": 7.15992655539255, "grad_norm": 0.6353034377098083, "learning_rate": 1.4200413356584641e-05, "loss": 0.015, "num_input_tokens_seen": 794618624, "step": 776000 }, { "epoch": 7.164539910131849, "grad_norm": 2.1156020164489746, "learning_rate": 1.4177346582888145e-05, "loss": 0.0181, "num_input_tokens_seen": 795130624, "step": 776500 }, { "epoch": 7.169153264871149, "grad_norm": 0.4953656494617462, "learning_rate": 1.4154279809191647e-05, "loss": 0.0174, "num_input_tokens_seen": 795642624, "step": 777000 }, { "epoch": 7.173766619610448, "grad_norm": 0.39725926518440247, "learning_rate": 1.4131213035495153e-05, "loss": 0.0178, "num_input_tokens_seen": 796154624, "step": 777500 }, { "epoch": 7.178379974349748, "grad_norm": 0.7973536849021912, "learning_rate": 1.4108146261798655e-05, "loss": 0.0192, "num_input_tokens_seen": 796666624, "step": 778000 }, { "epoch": 7.182993329089047, "grad_norm": 0.27644041180610657, "learning_rate": 1.4085079488102159e-05, "loss": 0.0187, "num_input_tokens_seen": 797178624, "step": 778500 }, { "epoch": 7.1876066838283466, "grad_norm": 0.5681914687156677, "learning_rate": 1.4062012714405661e-05, "loss": 0.0175, "num_input_tokens_seen": 797690624, "step": 779000 }, { "epoch": 7.192220038567646, "grad_norm": 0.19514349102973938, "learning_rate": 1.4038945940709167e-05, "loss": 0.0176, "num_input_tokens_seen": 798202624, "step": 779500 }, { "epoch": 7.196833393306945, "grad_norm": 1.4721050262451172, "learning_rate": 1.401587916701267e-05, "loss": 0.0188, "num_input_tokens_seen": 798714624, "step": 780000 }, { "epoch": 7.201446748046244, "grad_norm": 0.7421937584877014, "learning_rate": 1.3992812393316173e-05, "loss": 0.0188, "num_input_tokens_seen": 799226624, "step": 780500 }, { "epoch": 7.2060601027855435, "grad_norm": 0.12846527993679047, "learning_rate": 1.3969745619619676e-05, "loss": 0.018, "num_input_tokens_seen": 799738624, "step": 781000 }, { "epoch": 7.210673457524843, "grad_norm": 0.8358561992645264, "learning_rate": 1.3946678845923178e-05, "loss": 0.018, "num_input_tokens_seen": 800250624, "step": 781500 }, { "epoch": 7.215286812264142, "grad_norm": 1.0720690488815308, "learning_rate": 1.3923612072226684e-05, "loss": 0.0161, "num_input_tokens_seen": 800762624, "step": 782000 }, { "epoch": 7.219900167003441, "grad_norm": 0.4553976356983185, "learning_rate": 1.3900545298530184e-05, "loss": 0.0188, "num_input_tokens_seen": 801274624, "step": 782500 }, { "epoch": 7.2245135217427405, "grad_norm": 1.1510006189346313, "learning_rate": 1.387747852483369e-05, "loss": 0.018, "num_input_tokens_seen": 801786624, "step": 783000 }, { "epoch": 7.22912687648204, "grad_norm": 1.1483092308044434, "learning_rate": 1.3854411751137192e-05, "loss": 0.0171, "num_input_tokens_seen": 802298624, "step": 783500 }, { "epoch": 7.23374023122134, "grad_norm": 0.4925529658794403, "learning_rate": 1.3831344977440696e-05, "loss": 0.0173, "num_input_tokens_seen": 802810624, "step": 784000 }, { "epoch": 7.238353585960639, "grad_norm": 0.3787945508956909, "learning_rate": 1.3808278203744198e-05, "loss": 0.0188, "num_input_tokens_seen": 803322624, "step": 784500 }, { "epoch": 7.242966940699938, "grad_norm": 0.6160422563552856, "learning_rate": 1.3785211430047704e-05, "loss": 0.018, "num_input_tokens_seen": 803834624, "step": 785000 }, { "epoch": 7.247580295439238, "grad_norm": 1.1294529438018799, "learning_rate": 1.3762144656351206e-05, "loss": 0.02, "num_input_tokens_seen": 804346624, "step": 785500 }, { "epoch": 7.252193650178537, "grad_norm": 0.6138213872909546, "learning_rate": 1.373907788265471e-05, "loss": 0.0175, "num_input_tokens_seen": 804858624, "step": 786000 }, { "epoch": 7.256807004917836, "grad_norm": 0.5684888362884521, "learning_rate": 1.3716011108958212e-05, "loss": 0.0166, "num_input_tokens_seen": 805370624, "step": 786500 }, { "epoch": 7.261420359657135, "grad_norm": 0.7051540613174438, "learning_rate": 1.3692944335261718e-05, "loss": 0.0161, "num_input_tokens_seen": 805882624, "step": 787000 }, { "epoch": 7.266033714396435, "grad_norm": 0.7892741560935974, "learning_rate": 1.366987756156522e-05, "loss": 0.0193, "num_input_tokens_seen": 806394624, "step": 787500 }, { "epoch": 7.270647069135734, "grad_norm": 1.084768533706665, "learning_rate": 1.3646810787868721e-05, "loss": 0.0178, "num_input_tokens_seen": 806906624, "step": 788000 }, { "epoch": 7.275260423875033, "grad_norm": 1.111611008644104, "learning_rate": 1.3623744014172227e-05, "loss": 0.0181, "num_input_tokens_seen": 807418624, "step": 788500 }, { "epoch": 7.279873778614332, "grad_norm": 1.2572911977767944, "learning_rate": 1.3600677240475729e-05, "loss": 0.0216, "num_input_tokens_seen": 807930624, "step": 789000 }, { "epoch": 7.2844871333536325, "grad_norm": 1.4147090911865234, "learning_rate": 1.3577610466779233e-05, "loss": 0.0174, "num_input_tokens_seen": 808442624, "step": 789500 }, { "epoch": 7.289100488092932, "grad_norm": 1.129238247871399, "learning_rate": 1.3554543693082735e-05, "loss": 0.0174, "num_input_tokens_seen": 808954624, "step": 790000 }, { "epoch": 7.293713842832231, "grad_norm": 0.7517364621162415, "learning_rate": 1.3531476919386241e-05, "loss": 0.0198, "num_input_tokens_seen": 809466624, "step": 790500 }, { "epoch": 7.29832719757153, "grad_norm": 2.005709171295166, "learning_rate": 1.3508410145689743e-05, "loss": 0.019, "num_input_tokens_seen": 809978624, "step": 791000 }, { "epoch": 7.3029405523108295, "grad_norm": 0.5718657374382019, "learning_rate": 1.3485343371993247e-05, "loss": 0.0188, "num_input_tokens_seen": 810490624, "step": 791500 }, { "epoch": 7.307553907050129, "grad_norm": 2.84344744682312, "learning_rate": 1.346227659829675e-05, "loss": 0.018, "num_input_tokens_seen": 811002624, "step": 792000 }, { "epoch": 7.312167261789428, "grad_norm": 1.8831250667572021, "learning_rate": 1.3439209824600255e-05, "loss": 0.0198, "num_input_tokens_seen": 811514624, "step": 792500 }, { "epoch": 7.316780616528727, "grad_norm": 0.42998257279396057, "learning_rate": 1.3416143050903757e-05, "loss": 0.0182, "num_input_tokens_seen": 812026624, "step": 793000 }, { "epoch": 7.3213939712680265, "grad_norm": 0.4875911474227905, "learning_rate": 1.3393076277207261e-05, "loss": 0.0202, "num_input_tokens_seen": 812538624, "step": 793500 }, { "epoch": 7.326007326007326, "grad_norm": 0.6313169002532959, "learning_rate": 1.3370009503510764e-05, "loss": 0.0185, "num_input_tokens_seen": 813050624, "step": 794000 }, { "epoch": 7.330620680746625, "grad_norm": 0.5315720438957214, "learning_rate": 1.3346942729814266e-05, "loss": 0.0175, "num_input_tokens_seen": 813562624, "step": 794500 }, { "epoch": 7.335234035485925, "grad_norm": 0.636077344417572, "learning_rate": 1.332387595611777e-05, "loss": 0.0187, "num_input_tokens_seen": 814074624, "step": 795000 }, { "epoch": 7.339847390225224, "grad_norm": 1.2620755434036255, "learning_rate": 1.3300809182421272e-05, "loss": 0.0185, "num_input_tokens_seen": 814586624, "step": 795500 }, { "epoch": 7.344460744964524, "grad_norm": 0.40610164403915405, "learning_rate": 1.3277742408724778e-05, "loss": 0.0179, "num_input_tokens_seen": 815098624, "step": 796000 }, { "epoch": 7.349074099703823, "grad_norm": 0.5910019278526306, "learning_rate": 1.325467563502828e-05, "loss": 0.0191, "num_input_tokens_seen": 815610624, "step": 796500 }, { "epoch": 7.353687454443122, "grad_norm": 0.9699934720993042, "learning_rate": 1.3231608861331784e-05, "loss": 0.0173, "num_input_tokens_seen": 816122624, "step": 797000 }, { "epoch": 7.358300809182421, "grad_norm": 0.5334429740905762, "learning_rate": 1.3208542087635286e-05, "loss": 0.0185, "num_input_tokens_seen": 816634624, "step": 797500 }, { "epoch": 7.362914163921721, "grad_norm": 0.47226250171661377, "learning_rate": 1.3185475313938792e-05, "loss": 0.0181, "num_input_tokens_seen": 817146624, "step": 798000 }, { "epoch": 7.36752751866102, "grad_norm": 3.1056435108184814, "learning_rate": 1.3162408540242294e-05, "loss": 0.0189, "num_input_tokens_seen": 817658624, "step": 798500 }, { "epoch": 7.372140873400319, "grad_norm": 0.8559852838516235, "learning_rate": 1.3139341766545798e-05, "loss": 0.0186, "num_input_tokens_seen": 818170624, "step": 799000 }, { "epoch": 7.376754228139618, "grad_norm": 0.5092094540596008, "learning_rate": 1.31162749928493e-05, "loss": 0.0182, "num_input_tokens_seen": 818682624, "step": 799500 }, { "epoch": 7.381367582878918, "grad_norm": 0.7403343915939331, "learning_rate": 1.3093208219152806e-05, "loss": 0.0187, "num_input_tokens_seen": 819194624, "step": 800000 }, { "epoch": 7.385980937618217, "grad_norm": 1.0396490097045898, "learning_rate": 1.3070141445456308e-05, "loss": 0.0176, "num_input_tokens_seen": 819706624, "step": 800500 }, { "epoch": 7.390594292357516, "grad_norm": 1.229277491569519, "learning_rate": 1.3047074671759809e-05, "loss": 0.0166, "num_input_tokens_seen": 820218624, "step": 801000 }, { "epoch": 7.395207647096816, "grad_norm": 1.870112419128418, "learning_rate": 1.3024007898063315e-05, "loss": 0.0184, "num_input_tokens_seen": 820730624, "step": 801500 }, { "epoch": 7.3998210018361155, "grad_norm": 2.495352029800415, "learning_rate": 1.3000941124366817e-05, "loss": 0.0176, "num_input_tokens_seen": 821242624, "step": 802000 }, { "epoch": 7.404434356575415, "grad_norm": 1.2543821334838867, "learning_rate": 1.2977874350670321e-05, "loss": 0.0187, "num_input_tokens_seen": 821754624, "step": 802500 }, { "epoch": 7.409047711314714, "grad_norm": 0.9267345666885376, "learning_rate": 1.2954807576973823e-05, "loss": 0.0167, "num_input_tokens_seen": 822266624, "step": 803000 }, { "epoch": 7.413661066054013, "grad_norm": 0.7813261151313782, "learning_rate": 1.2931740803277329e-05, "loss": 0.0178, "num_input_tokens_seen": 822778624, "step": 803500 }, { "epoch": 7.4182744207933125, "grad_norm": 2.1433377265930176, "learning_rate": 1.2908674029580831e-05, "loss": 0.0185, "num_input_tokens_seen": 823290624, "step": 804000 }, { "epoch": 7.422887775532612, "grad_norm": 0.4169975519180298, "learning_rate": 1.2885607255884335e-05, "loss": 0.0174, "num_input_tokens_seen": 823802624, "step": 804500 }, { "epoch": 7.427501130271911, "grad_norm": 0.7654904723167419, "learning_rate": 1.2862540482187837e-05, "loss": 0.0189, "num_input_tokens_seen": 824314624, "step": 805000 }, { "epoch": 7.43211448501121, "grad_norm": 0.7712762355804443, "learning_rate": 1.2839473708491343e-05, "loss": 0.0169, "num_input_tokens_seen": 824826624, "step": 805500 }, { "epoch": 7.436727839750509, "grad_norm": 1.179842233657837, "learning_rate": 1.2816406934794845e-05, "loss": 0.0169, "num_input_tokens_seen": 825338624, "step": 806000 }, { "epoch": 7.441341194489809, "grad_norm": 1.1706069707870483, "learning_rate": 1.279334016109835e-05, "loss": 0.0191, "num_input_tokens_seen": 825850624, "step": 806500 }, { "epoch": 7.445954549229109, "grad_norm": 1.7458144426345825, "learning_rate": 1.2770273387401852e-05, "loss": 0.0183, "num_input_tokens_seen": 826362624, "step": 807000 }, { "epoch": 7.450567903968408, "grad_norm": 0.8518096804618835, "learning_rate": 1.2747206613705354e-05, "loss": 0.0181, "num_input_tokens_seen": 826874624, "step": 807500 }, { "epoch": 7.455181258707707, "grad_norm": 0.6776919960975647, "learning_rate": 1.2724139840008858e-05, "loss": 0.0167, "num_input_tokens_seen": 827386624, "step": 808000 }, { "epoch": 7.459794613447007, "grad_norm": 1.8147574663162231, "learning_rate": 1.270107306631236e-05, "loss": 0.0172, "num_input_tokens_seen": 827898624, "step": 808500 }, { "epoch": 7.464407968186306, "grad_norm": 0.730553150177002, "learning_rate": 1.2678006292615866e-05, "loss": 0.0163, "num_input_tokens_seen": 828410624, "step": 809000 }, { "epoch": 7.469021322925605, "grad_norm": 0.5966499447822571, "learning_rate": 1.2654939518919368e-05, "loss": 0.0185, "num_input_tokens_seen": 828922624, "step": 809500 }, { "epoch": 7.473634677664904, "grad_norm": 0.5111476182937622, "learning_rate": 1.2631872745222872e-05, "loss": 0.0181, "num_input_tokens_seen": 829434624, "step": 810000 }, { "epoch": 7.4782480324042035, "grad_norm": 1.1634365320205688, "learning_rate": 1.2608805971526374e-05, "loss": 0.0162, "num_input_tokens_seen": 829946624, "step": 810500 }, { "epoch": 7.482861387143503, "grad_norm": 1.030910611152649, "learning_rate": 1.258573919782988e-05, "loss": 0.0184, "num_input_tokens_seen": 830458624, "step": 811000 }, { "epoch": 7.487474741882802, "grad_norm": 1.035938024520874, "learning_rate": 1.2562672424133382e-05, "loss": 0.0177, "num_input_tokens_seen": 830970624, "step": 811500 }, { "epoch": 7.492088096622101, "grad_norm": 1.1685384511947632, "learning_rate": 1.2539605650436886e-05, "loss": 0.0173, "num_input_tokens_seen": 831482624, "step": 812000 }, { "epoch": 7.496701451361401, "grad_norm": 0.8186880946159363, "learning_rate": 1.2516538876740388e-05, "loss": 0.0175, "num_input_tokens_seen": 831994624, "step": 812500 }, { "epoch": 7.501314806100701, "grad_norm": 1.2309128046035767, "learning_rate": 1.2493472103043892e-05, "loss": 0.0187, "num_input_tokens_seen": 832506624, "step": 813000 }, { "epoch": 7.50592816084, "grad_norm": 0.9243940114974976, "learning_rate": 1.2470405329347395e-05, "loss": 0.0208, "num_input_tokens_seen": 833018624, "step": 813500 }, { "epoch": 7.510541515579299, "grad_norm": 1.5183156728744507, "learning_rate": 1.2447338555650899e-05, "loss": 0.0188, "num_input_tokens_seen": 833530624, "step": 814000 }, { "epoch": 7.515154870318598, "grad_norm": 0.7042239904403687, "learning_rate": 1.2424271781954403e-05, "loss": 0.0175, "num_input_tokens_seen": 834042624, "step": 814500 }, { "epoch": 7.519768225057898, "grad_norm": 0.7798308730125427, "learning_rate": 1.2401205008257907e-05, "loss": 0.0203, "num_input_tokens_seen": 834554624, "step": 815000 }, { "epoch": 7.524381579797197, "grad_norm": 0.6466756463050842, "learning_rate": 1.2378138234561409e-05, "loss": 0.019, "num_input_tokens_seen": 835066624, "step": 815500 }, { "epoch": 7.528994934536496, "grad_norm": 1.0861841440200806, "learning_rate": 1.2355071460864913e-05, "loss": 0.0177, "num_input_tokens_seen": 835578624, "step": 816000 }, { "epoch": 7.533608289275795, "grad_norm": 2.7624402046203613, "learning_rate": 1.2332004687168417e-05, "loss": 0.0175, "num_input_tokens_seen": 836090624, "step": 816500 }, { "epoch": 7.538221644015095, "grad_norm": 1.2840367555618286, "learning_rate": 1.2308937913471919e-05, "loss": 0.018, "num_input_tokens_seen": 836602624, "step": 817000 }, { "epoch": 7.542834998754394, "grad_norm": 0.6789388656616211, "learning_rate": 1.2285871139775421e-05, "loss": 0.0183, "num_input_tokens_seen": 837114624, "step": 817500 }, { "epoch": 7.547448353493694, "grad_norm": 0.5279095768928528, "learning_rate": 1.2262804366078925e-05, "loss": 0.0184, "num_input_tokens_seen": 837626624, "step": 818000 }, { "epoch": 7.552061708232992, "grad_norm": 0.5110554099082947, "learning_rate": 1.223973759238243e-05, "loss": 0.0178, "num_input_tokens_seen": 838138624, "step": 818500 }, { "epoch": 7.5566750629722925, "grad_norm": 1.535260796546936, "learning_rate": 1.2216670818685932e-05, "loss": 0.0189, "num_input_tokens_seen": 838650624, "step": 819000 }, { "epoch": 7.561288417711592, "grad_norm": 3.005444049835205, "learning_rate": 1.2193604044989436e-05, "loss": 0.0194, "num_input_tokens_seen": 839162624, "step": 819500 }, { "epoch": 7.565901772450891, "grad_norm": 0.3890930712223053, "learning_rate": 1.217053727129294e-05, "loss": 0.0182, "num_input_tokens_seen": 839674624, "step": 820000 }, { "epoch": 7.57051512719019, "grad_norm": 3.0413002967834473, "learning_rate": 1.2147470497596444e-05, "loss": 0.0179, "num_input_tokens_seen": 840186624, "step": 820500 }, { "epoch": 7.5751284819294895, "grad_norm": 0.33747154474258423, "learning_rate": 1.2124403723899946e-05, "loss": 0.0182, "num_input_tokens_seen": 840698624, "step": 821000 }, { "epoch": 7.579741836668789, "grad_norm": 0.7888673543930054, "learning_rate": 1.210133695020345e-05, "loss": 0.0158, "num_input_tokens_seen": 841210624, "step": 821500 }, { "epoch": 7.584355191408088, "grad_norm": 0.5673322081565857, "learning_rate": 1.2078270176506954e-05, "loss": 0.0175, "num_input_tokens_seen": 841722624, "step": 822000 }, { "epoch": 7.588968546147387, "grad_norm": 7.8960700035095215, "learning_rate": 1.2055203402810456e-05, "loss": 0.0171, "num_input_tokens_seen": 842234624, "step": 822500 }, { "epoch": 7.5935819008866865, "grad_norm": 0.6810684204101562, "learning_rate": 1.203213662911396e-05, "loss": 0.0169, "num_input_tokens_seen": 842746624, "step": 823000 }, { "epoch": 7.598195255625986, "grad_norm": 0.88917076587677, "learning_rate": 1.2009069855417462e-05, "loss": 0.0176, "num_input_tokens_seen": 843258624, "step": 823500 }, { "epoch": 7.602808610365285, "grad_norm": 0.7236852049827576, "learning_rate": 1.1986003081720966e-05, "loss": 0.0178, "num_input_tokens_seen": 843770624, "step": 824000 }, { "epoch": 7.607421965104585, "grad_norm": 2.4100208282470703, "learning_rate": 1.196293630802447e-05, "loss": 0.0183, "num_input_tokens_seen": 844282624, "step": 824500 }, { "epoch": 7.612035319843884, "grad_norm": 0.9818079471588135, "learning_rate": 1.1939869534327972e-05, "loss": 0.0184, "num_input_tokens_seen": 844794624, "step": 825000 }, { "epoch": 7.616648674583184, "grad_norm": 5.109523773193359, "learning_rate": 1.1916802760631476e-05, "loss": 0.0182, "num_input_tokens_seen": 845306624, "step": 825500 }, { "epoch": 7.621262029322483, "grad_norm": 1.1535288095474243, "learning_rate": 1.189373598693498e-05, "loss": 0.018, "num_input_tokens_seen": 845818624, "step": 826000 }, { "epoch": 7.625875384061782, "grad_norm": 1.0759390592575073, "learning_rate": 1.1870669213238483e-05, "loss": 0.018, "num_input_tokens_seen": 846330624, "step": 826500 }, { "epoch": 7.630488738801081, "grad_norm": 0.9492645263671875, "learning_rate": 1.1847602439541987e-05, "loss": 0.0162, "num_input_tokens_seen": 846842624, "step": 827000 }, { "epoch": 7.635102093540381, "grad_norm": 0.5077918767929077, "learning_rate": 1.182453566584549e-05, "loss": 0.018, "num_input_tokens_seen": 847354624, "step": 827500 }, { "epoch": 7.63971544827968, "grad_norm": 0.5069125890731812, "learning_rate": 1.1801468892148995e-05, "loss": 0.0181, "num_input_tokens_seen": 847866624, "step": 828000 }, { "epoch": 7.644328803018979, "grad_norm": 0.35941779613494873, "learning_rate": 1.1778402118452497e-05, "loss": 0.0187, "num_input_tokens_seen": 848378624, "step": 828500 }, { "epoch": 7.648942157758278, "grad_norm": 0.7320166230201721, "learning_rate": 1.1755335344756001e-05, "loss": 0.0166, "num_input_tokens_seen": 848890624, "step": 829000 }, { "epoch": 7.653555512497578, "grad_norm": 0.4909152686595917, "learning_rate": 1.1732268571059505e-05, "loss": 0.0174, "num_input_tokens_seen": 849402624, "step": 829500 }, { "epoch": 7.658168867236878, "grad_norm": 0.5299736857414246, "learning_rate": 1.1709201797363007e-05, "loss": 0.017, "num_input_tokens_seen": 849914624, "step": 830000 }, { "epoch": 7.662782221976177, "grad_norm": 1.6265432834625244, "learning_rate": 1.168613502366651e-05, "loss": 0.0192, "num_input_tokens_seen": 850426624, "step": 830500 }, { "epoch": 7.667395576715476, "grad_norm": 1.0842050313949585, "learning_rate": 1.1663068249970013e-05, "loss": 0.0174, "num_input_tokens_seen": 850938624, "step": 831000 }, { "epoch": 7.6720089314547755, "grad_norm": 0.46629172563552856, "learning_rate": 1.1640001476273517e-05, "loss": 0.019, "num_input_tokens_seen": 851450624, "step": 831500 }, { "epoch": 7.676622286194075, "grad_norm": 0.786178469657898, "learning_rate": 1.161693470257702e-05, "loss": 0.0174, "num_input_tokens_seen": 851962624, "step": 832000 }, { "epoch": 7.681235640933374, "grad_norm": 0.9928342700004578, "learning_rate": 1.1593867928880524e-05, "loss": 0.0187, "num_input_tokens_seen": 852474624, "step": 832500 }, { "epoch": 7.685848995672673, "grad_norm": 0.19910675287246704, "learning_rate": 1.1570801155184028e-05, "loss": 0.0165, "num_input_tokens_seen": 852986624, "step": 833000 }, { "epoch": 7.6904623504119725, "grad_norm": 0.44422009587287903, "learning_rate": 1.1547734381487532e-05, "loss": 0.019, "num_input_tokens_seen": 853498624, "step": 833500 }, { "epoch": 7.695075705151272, "grad_norm": 1.4326293468475342, "learning_rate": 1.1524667607791034e-05, "loss": 0.0189, "num_input_tokens_seen": 854010624, "step": 834000 }, { "epoch": 7.699689059890571, "grad_norm": 2.208235263824463, "learning_rate": 1.1501600834094538e-05, "loss": 0.0181, "num_input_tokens_seen": 854522624, "step": 834500 }, { "epoch": 7.70430241462987, "grad_norm": 1.5056183338165283, "learning_rate": 1.1478534060398042e-05, "loss": 0.0164, "num_input_tokens_seen": 855034624, "step": 835000 }, { "epoch": 7.70891576936917, "grad_norm": 0.991448700428009, "learning_rate": 1.1455467286701544e-05, "loss": 0.0188, "num_input_tokens_seen": 855546624, "step": 835500 }, { "epoch": 7.71352912410847, "grad_norm": 0.48746320605278015, "learning_rate": 1.1432400513005048e-05, "loss": 0.0187, "num_input_tokens_seen": 856058624, "step": 836000 }, { "epoch": 7.718142478847769, "grad_norm": 0.7954283356666565, "learning_rate": 1.140933373930855e-05, "loss": 0.0186, "num_input_tokens_seen": 856570624, "step": 836500 }, { "epoch": 7.722755833587068, "grad_norm": 0.3314274251461029, "learning_rate": 1.1386266965612054e-05, "loss": 0.0183, "num_input_tokens_seen": 857082624, "step": 837000 }, { "epoch": 7.727369188326367, "grad_norm": 0.40846577286720276, "learning_rate": 1.1363200191915556e-05, "loss": 0.0188, "num_input_tokens_seen": 857594624, "step": 837500 }, { "epoch": 7.731982543065667, "grad_norm": 0.5026475787162781, "learning_rate": 1.134013341821906e-05, "loss": 0.0174, "num_input_tokens_seen": 858106624, "step": 838000 }, { "epoch": 7.736595897804966, "grad_norm": 0.7746123671531677, "learning_rate": 1.1317066644522564e-05, "loss": 0.0195, "num_input_tokens_seen": 858618624, "step": 838500 }, { "epoch": 7.741209252544265, "grad_norm": 0.835455060005188, "learning_rate": 1.1293999870826068e-05, "loss": 0.0183, "num_input_tokens_seen": 859130624, "step": 839000 }, { "epoch": 7.745822607283564, "grad_norm": 1.107001781463623, "learning_rate": 1.127093309712957e-05, "loss": 0.0201, "num_input_tokens_seen": 859642624, "step": 839500 }, { "epoch": 7.750435962022864, "grad_norm": 0.31434282660484314, "learning_rate": 1.1247866323433075e-05, "loss": 0.0195, "num_input_tokens_seen": 860154624, "step": 840000 }, { "epoch": 7.755049316762163, "grad_norm": 0.7980784773826599, "learning_rate": 1.1224799549736579e-05, "loss": 0.0173, "num_input_tokens_seen": 860666624, "step": 840500 }, { "epoch": 7.759662671501462, "grad_norm": 0.6341221332550049, "learning_rate": 1.1201732776040081e-05, "loss": 0.0156, "num_input_tokens_seen": 861178624, "step": 841000 }, { "epoch": 7.764276026240761, "grad_norm": 1.298004388809204, "learning_rate": 1.1178666002343585e-05, "loss": 0.0182, "num_input_tokens_seen": 861690624, "step": 841500 }, { "epoch": 7.768889380980061, "grad_norm": 0.6212522983551025, "learning_rate": 1.1155599228647089e-05, "loss": 0.0183, "num_input_tokens_seen": 862202624, "step": 842000 }, { "epoch": 7.773502735719361, "grad_norm": 1.0448174476623535, "learning_rate": 1.1132532454950593e-05, "loss": 0.0167, "num_input_tokens_seen": 862714624, "step": 842500 }, { "epoch": 7.77811609045866, "grad_norm": 0.4349260628223419, "learning_rate": 1.1109465681254095e-05, "loss": 0.0182, "num_input_tokens_seen": 863226624, "step": 843000 }, { "epoch": 7.782729445197959, "grad_norm": 0.5279752016067505, "learning_rate": 1.1086398907557597e-05, "loss": 0.0191, "num_input_tokens_seen": 863738624, "step": 843500 }, { "epoch": 7.787342799937258, "grad_norm": 2.5519967079162598, "learning_rate": 1.1063332133861101e-05, "loss": 0.0188, "num_input_tokens_seen": 864250624, "step": 844000 }, { "epoch": 7.791956154676558, "grad_norm": 1.002515435218811, "learning_rate": 1.1040265360164605e-05, "loss": 0.0181, "num_input_tokens_seen": 864762624, "step": 844500 }, { "epoch": 7.796569509415857, "grad_norm": 1.0723029375076294, "learning_rate": 1.1017198586468108e-05, "loss": 0.0172, "num_input_tokens_seen": 865274624, "step": 845000 }, { "epoch": 7.801182864155156, "grad_norm": 0.492806613445282, "learning_rate": 1.0994131812771612e-05, "loss": 0.0184, "num_input_tokens_seen": 865786624, "step": 845500 }, { "epoch": 7.805796218894455, "grad_norm": 2.1584246158599854, "learning_rate": 1.0971065039075116e-05, "loss": 0.0195, "num_input_tokens_seen": 866298624, "step": 846000 }, { "epoch": 7.810409573633755, "grad_norm": 0.9871762990951538, "learning_rate": 1.094799826537862e-05, "loss": 0.0172, "num_input_tokens_seen": 866810624, "step": 846500 }, { "epoch": 7.815022928373054, "grad_norm": 1.234832525253296, "learning_rate": 1.0924931491682122e-05, "loss": 0.0178, "num_input_tokens_seen": 867322624, "step": 847000 }, { "epoch": 7.819636283112354, "grad_norm": 0.8536167144775391, "learning_rate": 1.0901864717985626e-05, "loss": 0.0189, "num_input_tokens_seen": 867834624, "step": 847500 }, { "epoch": 7.824249637851653, "grad_norm": 0.5045762658119202, "learning_rate": 1.087879794428913e-05, "loss": 0.0165, "num_input_tokens_seen": 868346624, "step": 848000 }, { "epoch": 7.8288629925909525, "grad_norm": 0.539504885673523, "learning_rate": 1.0855731170592632e-05, "loss": 0.0189, "num_input_tokens_seen": 868858624, "step": 848500 }, { "epoch": 7.833476347330252, "grad_norm": 0.6124027967453003, "learning_rate": 1.0832664396896136e-05, "loss": 0.0181, "num_input_tokens_seen": 869370624, "step": 849000 }, { "epoch": 7.838089702069551, "grad_norm": 0.5063890814781189, "learning_rate": 1.0809597623199638e-05, "loss": 0.0166, "num_input_tokens_seen": 869882624, "step": 849500 }, { "epoch": 7.84270305680885, "grad_norm": 0.4935370087623596, "learning_rate": 1.0786530849503142e-05, "loss": 0.0182, "num_input_tokens_seen": 870394624, "step": 850000 }, { "epoch": 7.8473164115481495, "grad_norm": 1.3337877988815308, "learning_rate": 1.0763464075806644e-05, "loss": 0.0167, "num_input_tokens_seen": 870906624, "step": 850500 }, { "epoch": 7.851929766287449, "grad_norm": 0.5984758734703064, "learning_rate": 1.0740397302110148e-05, "loss": 0.0181, "num_input_tokens_seen": 871418624, "step": 851000 }, { "epoch": 7.856543121026748, "grad_norm": 0.6499104499816895, "learning_rate": 1.0717330528413652e-05, "loss": 0.0176, "num_input_tokens_seen": 871930624, "step": 851500 }, { "epoch": 7.861156475766047, "grad_norm": 0.5723326206207275, "learning_rate": 1.0694263754717156e-05, "loss": 0.0173, "num_input_tokens_seen": 872442624, "step": 852000 }, { "epoch": 7.8657698305053465, "grad_norm": 0.6458103060722351, "learning_rate": 1.0671196981020659e-05, "loss": 0.0185, "num_input_tokens_seen": 872954624, "step": 852500 }, { "epoch": 7.870383185244647, "grad_norm": 0.6607184410095215, "learning_rate": 1.0648130207324163e-05, "loss": 0.0174, "num_input_tokens_seen": 873466624, "step": 853000 }, { "epoch": 7.874996539983946, "grad_norm": 0.7945510745048523, "learning_rate": 1.0625063433627667e-05, "loss": 0.0178, "num_input_tokens_seen": 873978624, "step": 853500 }, { "epoch": 7.879609894723245, "grad_norm": 0.9480940103530884, "learning_rate": 1.0601996659931169e-05, "loss": 0.019, "num_input_tokens_seen": 874490624, "step": 854000 }, { "epoch": 7.884223249462544, "grad_norm": 0.5195125937461853, "learning_rate": 1.0578929886234673e-05, "loss": 0.017, "num_input_tokens_seen": 875002624, "step": 854500 }, { "epoch": 7.888836604201844, "grad_norm": 0.3116241693496704, "learning_rate": 1.0555863112538177e-05, "loss": 0.0189, "num_input_tokens_seen": 875514624, "step": 855000 }, { "epoch": 7.893449958941143, "grad_norm": 0.8278101086616516, "learning_rate": 1.053279633884168e-05, "loss": 0.0177, "num_input_tokens_seen": 876026624, "step": 855500 }, { "epoch": 7.898063313680442, "grad_norm": 0.6848555207252502, "learning_rate": 1.0509729565145181e-05, "loss": 0.0186, "num_input_tokens_seen": 876538624, "step": 856000 }, { "epoch": 7.902676668419741, "grad_norm": 0.9749637842178345, "learning_rate": 1.0486662791448685e-05, "loss": 0.0214, "num_input_tokens_seen": 877050624, "step": 856500 }, { "epoch": 7.907290023159041, "grad_norm": 2.486924648284912, "learning_rate": 1.046359601775219e-05, "loss": 0.0194, "num_input_tokens_seen": 877562624, "step": 857000 }, { "epoch": 7.91190337789834, "grad_norm": 0.8250918388366699, "learning_rate": 1.0440529244055693e-05, "loss": 0.0187, "num_input_tokens_seen": 878074624, "step": 857500 }, { "epoch": 7.916516732637639, "grad_norm": 1.9874022006988525, "learning_rate": 1.0417462470359196e-05, "loss": 0.0181, "num_input_tokens_seen": 878586624, "step": 858000 }, { "epoch": 7.921130087376939, "grad_norm": 1.451173186302185, "learning_rate": 1.03943956966627e-05, "loss": 0.0191, "num_input_tokens_seen": 879098624, "step": 858500 }, { "epoch": 7.925743442116238, "grad_norm": 3.8313064575195312, "learning_rate": 1.0371328922966204e-05, "loss": 0.0188, "num_input_tokens_seen": 879610624, "step": 859000 }, { "epoch": 7.930356796855538, "grad_norm": 0.9106965661048889, "learning_rate": 1.0348262149269706e-05, "loss": 0.0164, "num_input_tokens_seen": 880122624, "step": 859500 }, { "epoch": 7.934970151594837, "grad_norm": 0.9856759905815125, "learning_rate": 1.032519537557321e-05, "loss": 0.0189, "num_input_tokens_seen": 880634624, "step": 860000 }, { "epoch": 7.939583506334136, "grad_norm": 1.1179744005203247, "learning_rate": 1.0302128601876714e-05, "loss": 0.0186, "num_input_tokens_seen": 881146624, "step": 860500 }, { "epoch": 7.9441968610734355, "grad_norm": 0.8333301544189453, "learning_rate": 1.0279061828180218e-05, "loss": 0.0182, "num_input_tokens_seen": 881658624, "step": 861000 }, { "epoch": 7.948810215812735, "grad_norm": 0.4756206274032593, "learning_rate": 1.025599505448372e-05, "loss": 0.0187, "num_input_tokens_seen": 882170624, "step": 861500 }, { "epoch": 7.953423570552034, "grad_norm": 1.3627671003341675, "learning_rate": 1.0232928280787224e-05, "loss": 0.018, "num_input_tokens_seen": 882682624, "step": 862000 }, { "epoch": 7.958036925291333, "grad_norm": 1.3066837787628174, "learning_rate": 1.0209861507090726e-05, "loss": 0.0193, "num_input_tokens_seen": 883194624, "step": 862500 }, { "epoch": 7.9626502800306325, "grad_norm": 0.46038496494293213, "learning_rate": 1.018679473339423e-05, "loss": 0.018, "num_input_tokens_seen": 883706624, "step": 863000 }, { "epoch": 7.967263634769932, "grad_norm": 0.67403644323349, "learning_rate": 1.0163727959697732e-05, "loss": 0.0188, "num_input_tokens_seen": 884218624, "step": 863500 }, { "epoch": 7.971876989509231, "grad_norm": 0.7785734534263611, "learning_rate": 1.0140661186001236e-05, "loss": 0.0169, "num_input_tokens_seen": 884730624, "step": 864000 }, { "epoch": 7.97649034424853, "grad_norm": 0.8497280478477478, "learning_rate": 1.011759441230474e-05, "loss": 0.0189, "num_input_tokens_seen": 885242624, "step": 864500 }, { "epoch": 7.98110369898783, "grad_norm": 4.073908805847168, "learning_rate": 1.0094527638608243e-05, "loss": 0.0183, "num_input_tokens_seen": 885754624, "step": 865000 }, { "epoch": 7.98571705372713, "grad_norm": 0.7901633977890015, "learning_rate": 1.0071460864911747e-05, "loss": 0.0177, "num_input_tokens_seen": 886266624, "step": 865500 }, { "epoch": 7.990330408466429, "grad_norm": 2.1585545539855957, "learning_rate": 1.004839409121525e-05, "loss": 0.0172, "num_input_tokens_seen": 886778624, "step": 866000 }, { "epoch": 7.994943763205728, "grad_norm": 0.6002645492553711, "learning_rate": 1.0025327317518755e-05, "loss": 0.0193, "num_input_tokens_seen": 887290624, "step": 866500 }, { "epoch": 7.999557117945027, "grad_norm": 0.602433443069458, "learning_rate": 1.0002260543822257e-05, "loss": 0.0182, "num_input_tokens_seen": 887802624, "step": 867000 }, { "epoch": 8.0, "eval_combined_score": 0.0675718570300666, "eval_loss": 0.06757185608148575, "eval_mse": 0.06757185797864745, "eval_runtime": 46.9325, "eval_samples_per_second": 2052.691, "eval_steps_per_second": 256.602, "num_input_tokens_seen": 887851008, "step": 867048 }, { "epoch": 8.004170472684327, "grad_norm": 1.5062319040298462, "learning_rate": 9.97919377012576e-06, "loss": 0.0154, "num_input_tokens_seen": 888313856, "step": 867500 }, { "epoch": 8.008783827423626, "grad_norm": 0.3685579001903534, "learning_rate": 9.956126996429265e-06, "loss": 0.0156, "num_input_tokens_seen": 888825856, "step": 868000 }, { "epoch": 8.013397182162926, "grad_norm": 0.5031562447547913, "learning_rate": 9.933060222732767e-06, "loss": 0.0147, "num_input_tokens_seen": 889337856, "step": 868500 }, { "epoch": 8.018010536902224, "grad_norm": 1.041576623916626, "learning_rate": 9.90999344903627e-06, "loss": 0.0158, "num_input_tokens_seen": 889849856, "step": 869000 }, { "epoch": 8.022623891641524, "grad_norm": 0.6168863773345947, "learning_rate": 9.886926675339773e-06, "loss": 0.0149, "num_input_tokens_seen": 890361856, "step": 869500 }, { "epoch": 8.027237246380823, "grad_norm": 1.0457834005355835, "learning_rate": 9.863859901643277e-06, "loss": 0.0155, "num_input_tokens_seen": 890873856, "step": 870000 }, { "epoch": 8.031850601120123, "grad_norm": 0.7371172904968262, "learning_rate": 9.840793127946781e-06, "loss": 0.0156, "num_input_tokens_seen": 891385856, "step": 870500 }, { "epoch": 8.036463955859421, "grad_norm": 1.857638955116272, "learning_rate": 9.817726354250284e-06, "loss": 0.0136, "num_input_tokens_seen": 891897856, "step": 871000 }, { "epoch": 8.041077310598721, "grad_norm": 1.3631207942962646, "learning_rate": 9.794659580553788e-06, "loss": 0.0157, "num_input_tokens_seen": 892409856, "step": 871500 }, { "epoch": 8.04569066533802, "grad_norm": 1.4387595653533936, "learning_rate": 9.771592806857291e-06, "loss": 0.0155, "num_input_tokens_seen": 892921856, "step": 872000 }, { "epoch": 8.05030402007732, "grad_norm": 0.44265180826187134, "learning_rate": 9.748526033160794e-06, "loss": 0.0151, "num_input_tokens_seen": 893433856, "step": 872500 }, { "epoch": 8.054917374816618, "grad_norm": 0.7352337837219238, "learning_rate": 9.725459259464298e-06, "loss": 0.0156, "num_input_tokens_seen": 893945856, "step": 873000 }, { "epoch": 8.059530729555918, "grad_norm": 0.6806060075759888, "learning_rate": 9.702392485767802e-06, "loss": 0.0139, "num_input_tokens_seen": 894457856, "step": 873500 }, { "epoch": 8.064144084295219, "grad_norm": 0.7403847575187683, "learning_rate": 9.679325712071306e-06, "loss": 0.0164, "num_input_tokens_seen": 894969856, "step": 874000 }, { "epoch": 8.068757439034517, "grad_norm": 1.1141221523284912, "learning_rate": 9.656258938374808e-06, "loss": 0.0148, "num_input_tokens_seen": 895481856, "step": 874500 }, { "epoch": 8.073370793773817, "grad_norm": 0.983514130115509, "learning_rate": 9.633192164678312e-06, "loss": 0.0154, "num_input_tokens_seen": 895993856, "step": 875000 }, { "epoch": 8.077984148513115, "grad_norm": 0.4191863536834717, "learning_rate": 9.610125390981814e-06, "loss": 0.0171, "num_input_tokens_seen": 896505856, "step": 875500 }, { "epoch": 8.082597503252416, "grad_norm": 0.4481130540370941, "learning_rate": 9.587058617285318e-06, "loss": 0.0134, "num_input_tokens_seen": 897017856, "step": 876000 }, { "epoch": 8.087210857991714, "grad_norm": 0.7153156995773315, "learning_rate": 9.56399184358882e-06, "loss": 0.0152, "num_input_tokens_seen": 897529856, "step": 876500 }, { "epoch": 8.091824212731014, "grad_norm": 1.7068063020706177, "learning_rate": 9.540925069892324e-06, "loss": 0.0163, "num_input_tokens_seen": 898041856, "step": 877000 }, { "epoch": 8.096437567470312, "grad_norm": 0.5899567008018494, "learning_rate": 9.517858296195828e-06, "loss": 0.0158, "num_input_tokens_seen": 898553856, "step": 877500 }, { "epoch": 8.101050922209613, "grad_norm": 0.9179006218910217, "learning_rate": 9.49479152249933e-06, "loss": 0.0143, "num_input_tokens_seen": 899065856, "step": 878000 }, { "epoch": 8.105664276948911, "grad_norm": 0.7641995549201965, "learning_rate": 9.471724748802835e-06, "loss": 0.0158, "num_input_tokens_seen": 899577856, "step": 878500 }, { "epoch": 8.110277631688211, "grad_norm": 0.8679375648498535, "learning_rate": 9.448657975106339e-06, "loss": 0.0156, "num_input_tokens_seen": 900089856, "step": 879000 }, { "epoch": 8.114890986427511, "grad_norm": 0.981959342956543, "learning_rate": 9.425591201409843e-06, "loss": 0.0163, "num_input_tokens_seen": 900601856, "step": 879500 }, { "epoch": 8.11950434116681, "grad_norm": 0.5581063032150269, "learning_rate": 9.402524427713345e-06, "loss": 0.0153, "num_input_tokens_seen": 901113856, "step": 880000 }, { "epoch": 8.12411769590611, "grad_norm": 0.4459242522716522, "learning_rate": 9.379457654016849e-06, "loss": 0.0139, "num_input_tokens_seen": 901625856, "step": 880500 }, { "epoch": 8.128731050645408, "grad_norm": 0.5052184462547302, "learning_rate": 9.356390880320353e-06, "loss": 0.0137, "num_input_tokens_seen": 902137856, "step": 881000 }, { "epoch": 8.133344405384708, "grad_norm": 2.326282024383545, "learning_rate": 9.333324106623855e-06, "loss": 0.0145, "num_input_tokens_seen": 902649856, "step": 881500 }, { "epoch": 8.137957760124007, "grad_norm": 0.3621096909046173, "learning_rate": 9.310257332927357e-06, "loss": 0.015, "num_input_tokens_seen": 903161856, "step": 882000 }, { "epoch": 8.142571114863307, "grad_norm": 0.599589467048645, "learning_rate": 9.287190559230861e-06, "loss": 0.0133, "num_input_tokens_seen": 903673856, "step": 882500 }, { "epoch": 8.147184469602605, "grad_norm": 0.6334195733070374, "learning_rate": 9.264123785534365e-06, "loss": 0.014, "num_input_tokens_seen": 904185856, "step": 883000 }, { "epoch": 8.151797824341905, "grad_norm": 0.5166653990745544, "learning_rate": 9.241057011837868e-06, "loss": 0.0159, "num_input_tokens_seen": 904697856, "step": 883500 }, { "epoch": 8.156411179081203, "grad_norm": 0.9574226140975952, "learning_rate": 9.217990238141372e-06, "loss": 0.0147, "num_input_tokens_seen": 905209856, "step": 884000 }, { "epoch": 8.161024533820504, "grad_norm": 0.7625335454940796, "learning_rate": 9.194923464444876e-06, "loss": 0.0172, "num_input_tokens_seen": 905721856, "step": 884500 }, { "epoch": 8.165637888559804, "grad_norm": 0.5956442356109619, "learning_rate": 9.17185669074838e-06, "loss": 0.0146, "num_input_tokens_seen": 906233856, "step": 885000 }, { "epoch": 8.170251243299102, "grad_norm": 0.6293473243713379, "learning_rate": 9.148789917051882e-06, "loss": 0.0142, "num_input_tokens_seen": 906745856, "step": 885500 }, { "epoch": 8.174864598038402, "grad_norm": 1.834021806716919, "learning_rate": 9.125723143355386e-06, "loss": 0.0148, "num_input_tokens_seen": 907257856, "step": 886000 }, { "epoch": 8.1794779527777, "grad_norm": 0.4335891008377075, "learning_rate": 9.10265636965889e-06, "loss": 0.0161, "num_input_tokens_seen": 907769856, "step": 886500 }, { "epoch": 8.184091307517, "grad_norm": 0.573677659034729, "learning_rate": 9.079589595962392e-06, "loss": 0.0142, "num_input_tokens_seen": 908281856, "step": 887000 }, { "epoch": 8.1887046622563, "grad_norm": 0.7976333498954773, "learning_rate": 9.056522822265896e-06, "loss": 0.016, "num_input_tokens_seen": 908793856, "step": 887500 }, { "epoch": 8.1933180169956, "grad_norm": 1.0269770622253418, "learning_rate": 9.0334560485694e-06, "loss": 0.0155, "num_input_tokens_seen": 909305856, "step": 888000 }, { "epoch": 8.197931371734898, "grad_norm": 0.9196085333824158, "learning_rate": 9.010389274872902e-06, "loss": 0.0161, "num_input_tokens_seen": 909817856, "step": 888500 }, { "epoch": 8.202544726474198, "grad_norm": 0.9371418952941895, "learning_rate": 8.987322501176406e-06, "loss": 0.0169, "num_input_tokens_seen": 910329856, "step": 889000 }, { "epoch": 8.207158081213496, "grad_norm": 0.5787968635559082, "learning_rate": 8.964255727479908e-06, "loss": 0.0151, "num_input_tokens_seen": 910841856, "step": 889500 }, { "epoch": 8.211771435952796, "grad_norm": 0.44304850697517395, "learning_rate": 8.941188953783412e-06, "loss": 0.0151, "num_input_tokens_seen": 911353856, "step": 890000 }, { "epoch": 8.216384790692096, "grad_norm": 1.7044280767440796, "learning_rate": 8.918122180086916e-06, "loss": 0.0165, "num_input_tokens_seen": 911865856, "step": 890500 }, { "epoch": 8.220998145431395, "grad_norm": 0.6133010983467102, "learning_rate": 8.895055406390419e-06, "loss": 0.0157, "num_input_tokens_seen": 912377856, "step": 891000 }, { "epoch": 8.225611500170695, "grad_norm": 2.290767192840576, "learning_rate": 8.871988632693923e-06, "loss": 0.016, "num_input_tokens_seen": 912889856, "step": 891500 }, { "epoch": 8.230224854909993, "grad_norm": 0.47266674041748047, "learning_rate": 8.848921858997427e-06, "loss": 0.0152, "num_input_tokens_seen": 913401856, "step": 892000 }, { "epoch": 8.234838209649293, "grad_norm": 0.7107419967651367, "learning_rate": 8.82585508530093e-06, "loss": 0.0148, "num_input_tokens_seen": 913913856, "step": 892500 }, { "epoch": 8.239451564388592, "grad_norm": 0.29794007539749146, "learning_rate": 8.802788311604433e-06, "loss": 0.0158, "num_input_tokens_seen": 914425856, "step": 893000 }, { "epoch": 8.244064919127892, "grad_norm": 0.9938859939575195, "learning_rate": 8.779721537907937e-06, "loss": 0.0138, "num_input_tokens_seen": 914937856, "step": 893500 }, { "epoch": 8.24867827386719, "grad_norm": 0.9996763467788696, "learning_rate": 8.75665476421144e-06, "loss": 0.0163, "num_input_tokens_seen": 915449856, "step": 894000 }, { "epoch": 8.25329162860649, "grad_norm": 0.8853555917739868, "learning_rate": 8.733587990514943e-06, "loss": 0.016, "num_input_tokens_seen": 915961856, "step": 894500 }, { "epoch": 8.257904983345789, "grad_norm": 0.5720754861831665, "learning_rate": 8.710521216818445e-06, "loss": 0.0147, "num_input_tokens_seen": 916473856, "step": 895000 }, { "epoch": 8.262518338085089, "grad_norm": 0.7386252880096436, "learning_rate": 8.68745444312195e-06, "loss": 0.0156, "num_input_tokens_seen": 916985856, "step": 895500 }, { "epoch": 8.267131692824387, "grad_norm": 0.5073798298835754, "learning_rate": 8.664387669425453e-06, "loss": 0.0152, "num_input_tokens_seen": 917497856, "step": 896000 }, { "epoch": 8.271745047563687, "grad_norm": 2.3658652305603027, "learning_rate": 8.641320895728956e-06, "loss": 0.0173, "num_input_tokens_seen": 918009856, "step": 896500 }, { "epoch": 8.276358402302987, "grad_norm": 0.4761596620082855, "learning_rate": 8.61825412203246e-06, "loss": 0.0145, "num_input_tokens_seen": 918521856, "step": 897000 }, { "epoch": 8.280971757042286, "grad_norm": 0.5883774161338806, "learning_rate": 8.595187348335963e-06, "loss": 0.0153, "num_input_tokens_seen": 919033856, "step": 897500 }, { "epoch": 8.285585111781586, "grad_norm": 0.9515103101730347, "learning_rate": 8.572120574639467e-06, "loss": 0.015, "num_input_tokens_seen": 919545856, "step": 898000 }, { "epoch": 8.290198466520884, "grad_norm": 0.5109001398086548, "learning_rate": 8.54905380094297e-06, "loss": 0.0164, "num_input_tokens_seen": 920057856, "step": 898500 }, { "epoch": 8.294811821260184, "grad_norm": 0.8202781081199646, "learning_rate": 8.525987027246474e-06, "loss": 0.0151, "num_input_tokens_seen": 920569856, "step": 899000 }, { "epoch": 8.299425175999483, "grad_norm": 1.913580060005188, "learning_rate": 8.502920253549978e-06, "loss": 0.0155, "num_input_tokens_seen": 921081856, "step": 899500 }, { "epoch": 8.304038530738783, "grad_norm": 0.6409407258033752, "learning_rate": 8.47985347985348e-06, "loss": 0.013, "num_input_tokens_seen": 921593856, "step": 900000 }, { "epoch": 8.308651885478081, "grad_norm": 0.4128231108188629, "learning_rate": 8.456786706156984e-06, "loss": 0.0148, "num_input_tokens_seen": 922105856, "step": 900500 }, { "epoch": 8.313265240217381, "grad_norm": 2.3555517196655273, "learning_rate": 8.433719932460488e-06, "loss": 0.0142, "num_input_tokens_seen": 922617856, "step": 901000 }, { "epoch": 8.31787859495668, "grad_norm": 1.5205661058425903, "learning_rate": 8.41065315876399e-06, "loss": 0.016, "num_input_tokens_seen": 923129856, "step": 901500 }, { "epoch": 8.32249194969598, "grad_norm": 0.8352044224739075, "learning_rate": 8.387586385067492e-06, "loss": 0.0154, "num_input_tokens_seen": 923641856, "step": 902000 }, { "epoch": 8.32710530443528, "grad_norm": 0.256552129983902, "learning_rate": 8.364519611370996e-06, "loss": 0.0151, "num_input_tokens_seen": 924153856, "step": 902500 }, { "epoch": 8.331718659174578, "grad_norm": 0.9458514451980591, "learning_rate": 8.3414528376745e-06, "loss": 0.0162, "num_input_tokens_seen": 924665856, "step": 903000 }, { "epoch": 8.336332013913879, "grad_norm": 0.9356163740158081, "learning_rate": 8.318386063978004e-06, "loss": 0.0144, "num_input_tokens_seen": 925177856, "step": 903500 }, { "epoch": 8.340945368653177, "grad_norm": 0.6801881790161133, "learning_rate": 8.295319290281507e-06, "loss": 0.0143, "num_input_tokens_seen": 925689856, "step": 904000 }, { "epoch": 8.345558723392477, "grad_norm": 1.2119888067245483, "learning_rate": 8.27225251658501e-06, "loss": 0.0147, "num_input_tokens_seen": 926201856, "step": 904500 }, { "epoch": 8.350172078131775, "grad_norm": 0.6034347414970398, "learning_rate": 8.249185742888515e-06, "loss": 0.0147, "num_input_tokens_seen": 926713856, "step": 905000 }, { "epoch": 8.354785432871076, "grad_norm": 0.47974085807800293, "learning_rate": 8.226118969192017e-06, "loss": 0.0173, "num_input_tokens_seen": 927225856, "step": 905500 }, { "epoch": 8.359398787610374, "grad_norm": 0.7787156105041504, "learning_rate": 8.20305219549552e-06, "loss": 0.0161, "num_input_tokens_seen": 927737856, "step": 906000 }, { "epoch": 8.364012142349674, "grad_norm": 0.8252438306808472, "learning_rate": 8.179985421799025e-06, "loss": 0.0156, "num_input_tokens_seen": 928249856, "step": 906500 }, { "epoch": 8.368625497088972, "grad_norm": 1.7516320943832397, "learning_rate": 8.156918648102529e-06, "loss": 0.0142, "num_input_tokens_seen": 928761856, "step": 907000 }, { "epoch": 8.373238851828273, "grad_norm": 0.9089247584342957, "learning_rate": 8.133851874406031e-06, "loss": 0.0179, "num_input_tokens_seen": 929273856, "step": 907500 }, { "epoch": 8.377852206567571, "grad_norm": 0.5961917042732239, "learning_rate": 8.110785100709533e-06, "loss": 0.0147, "num_input_tokens_seen": 929785856, "step": 908000 }, { "epoch": 8.382465561306871, "grad_norm": 0.9045282602310181, "learning_rate": 8.087718327013037e-06, "loss": 0.015, "num_input_tokens_seen": 930297856, "step": 908500 }, { "epoch": 8.387078916046171, "grad_norm": 2.7716050148010254, "learning_rate": 8.064651553316541e-06, "loss": 0.0166, "num_input_tokens_seen": 930809856, "step": 909000 }, { "epoch": 8.39169227078547, "grad_norm": 0.8180987238883972, "learning_rate": 8.041584779620044e-06, "loss": 0.0142, "num_input_tokens_seen": 931321856, "step": 909500 }, { "epoch": 8.39630562552477, "grad_norm": 0.8871789574623108, "learning_rate": 8.018518005923548e-06, "loss": 0.014, "num_input_tokens_seen": 931833856, "step": 910000 }, { "epoch": 8.400918980264068, "grad_norm": 0.9161932468414307, "learning_rate": 7.995451232227051e-06, "loss": 0.0153, "num_input_tokens_seen": 932345856, "step": 910500 }, { "epoch": 8.405532335003368, "grad_norm": 0.6723649501800537, "learning_rate": 7.972384458530555e-06, "loss": 0.0167, "num_input_tokens_seen": 932857856, "step": 911000 }, { "epoch": 8.410145689742667, "grad_norm": 0.57211834192276, "learning_rate": 7.949317684834058e-06, "loss": 0.015, "num_input_tokens_seen": 933369856, "step": 911500 }, { "epoch": 8.414759044481967, "grad_norm": 0.7815681099891663, "learning_rate": 7.926250911137562e-06, "loss": 0.0155, "num_input_tokens_seen": 933881856, "step": 912000 }, { "epoch": 8.419372399221265, "grad_norm": 1.4835954904556274, "learning_rate": 7.903184137441066e-06, "loss": 0.0175, "num_input_tokens_seen": 934393856, "step": 912500 }, { "epoch": 8.423985753960565, "grad_norm": 0.6556302905082703, "learning_rate": 7.880117363744568e-06, "loss": 0.0157, "num_input_tokens_seen": 934905856, "step": 913000 }, { "epoch": 8.428599108699864, "grad_norm": 0.3592114746570587, "learning_rate": 7.857050590048072e-06, "loss": 0.0148, "num_input_tokens_seen": 935417856, "step": 913500 }, { "epoch": 8.433212463439164, "grad_norm": 1.0812350511550903, "learning_rate": 7.833983816351576e-06, "loss": 0.0152, "num_input_tokens_seen": 935929856, "step": 914000 }, { "epoch": 8.437825818178464, "grad_norm": 0.5357770919799805, "learning_rate": 7.810917042655078e-06, "loss": 0.0151, "num_input_tokens_seen": 936441856, "step": 914500 }, { "epoch": 8.442439172917762, "grad_norm": 1.2673269510269165, "learning_rate": 7.78785026895858e-06, "loss": 0.0145, "num_input_tokens_seen": 936953856, "step": 915000 }, { "epoch": 8.447052527657062, "grad_norm": 1.7254928350448608, "learning_rate": 7.764783495262084e-06, "loss": 0.0165, "num_input_tokens_seen": 937465856, "step": 915500 }, { "epoch": 8.45166588239636, "grad_norm": 0.740627646446228, "learning_rate": 7.741716721565588e-06, "loss": 0.0145, "num_input_tokens_seen": 937977856, "step": 916000 }, { "epoch": 8.45627923713566, "grad_norm": 0.8942471146583557, "learning_rate": 7.718649947869092e-06, "loss": 0.0164, "num_input_tokens_seen": 938489856, "step": 916500 }, { "epoch": 8.46089259187496, "grad_norm": 0.5979003310203552, "learning_rate": 7.695583174172595e-06, "loss": 0.0152, "num_input_tokens_seen": 939001856, "step": 917000 }, { "epoch": 8.46550594661426, "grad_norm": 0.690619945526123, "learning_rate": 7.672516400476099e-06, "loss": 0.014, "num_input_tokens_seen": 939513856, "step": 917500 }, { "epoch": 8.470119301353558, "grad_norm": 0.9563241004943848, "learning_rate": 7.649449626779603e-06, "loss": 0.0151, "num_input_tokens_seen": 940025856, "step": 918000 }, { "epoch": 8.474732656092858, "grad_norm": 0.7812721729278564, "learning_rate": 7.626382853083106e-06, "loss": 0.0169, "num_input_tokens_seen": 940537856, "step": 918500 }, { "epoch": 8.479346010832156, "grad_norm": 0.7864488959312439, "learning_rate": 7.603316079386609e-06, "loss": 0.0165, "num_input_tokens_seen": 941049856, "step": 919000 }, { "epoch": 8.483959365571456, "grad_norm": 0.41324466466903687, "learning_rate": 7.580249305690113e-06, "loss": 0.0148, "num_input_tokens_seen": 941561856, "step": 919500 }, { "epoch": 8.488572720310756, "grad_norm": 1.0213603973388672, "learning_rate": 7.557182531993616e-06, "loss": 0.0141, "num_input_tokens_seen": 942073856, "step": 920000 }, { "epoch": 8.493186075050055, "grad_norm": 0.9692112803459167, "learning_rate": 7.53411575829712e-06, "loss": 0.0162, "num_input_tokens_seen": 942585856, "step": 920500 }, { "epoch": 8.497799429789355, "grad_norm": 0.9468556642532349, "learning_rate": 7.511048984600621e-06, "loss": 0.015, "num_input_tokens_seen": 943097856, "step": 921000 }, { "epoch": 8.502412784528653, "grad_norm": 1.1541293859481812, "learning_rate": 7.487982210904125e-06, "loss": 0.0154, "num_input_tokens_seen": 943609856, "step": 921500 }, { "epoch": 8.507026139267953, "grad_norm": 0.6092996597290039, "learning_rate": 7.464915437207628e-06, "loss": 0.0145, "num_input_tokens_seen": 944121856, "step": 922000 }, { "epoch": 8.511639494007252, "grad_norm": 2.1357691287994385, "learning_rate": 7.441848663511132e-06, "loss": 0.0158, "num_input_tokens_seen": 944633856, "step": 922500 }, { "epoch": 8.516252848746552, "grad_norm": 0.8940873146057129, "learning_rate": 7.4187818898146355e-06, "loss": 0.0158, "num_input_tokens_seen": 945145856, "step": 923000 }, { "epoch": 8.52086620348585, "grad_norm": 0.44890737533569336, "learning_rate": 7.395715116118139e-06, "loss": 0.0183, "num_input_tokens_seen": 945657856, "step": 923500 }, { "epoch": 8.52547955822515, "grad_norm": 0.6357942223548889, "learning_rate": 7.372648342421643e-06, "loss": 0.0158, "num_input_tokens_seen": 946169856, "step": 924000 }, { "epoch": 8.530092912964449, "grad_norm": 1.20125150680542, "learning_rate": 7.349581568725146e-06, "loss": 0.0138, "num_input_tokens_seen": 946681856, "step": 924500 }, { "epoch": 8.534706267703749, "grad_norm": 1.3115291595458984, "learning_rate": 7.32651479502865e-06, "loss": 0.0156, "num_input_tokens_seen": 947193856, "step": 925000 }, { "epoch": 8.539319622443049, "grad_norm": 1.5604932308197021, "learning_rate": 7.303448021332153e-06, "loss": 0.0152, "num_input_tokens_seen": 947705856, "step": 925500 }, { "epoch": 8.543932977182347, "grad_norm": 0.5092642307281494, "learning_rate": 7.280381247635657e-06, "loss": 0.0159, "num_input_tokens_seen": 948217856, "step": 926000 }, { "epoch": 8.548546331921647, "grad_norm": 0.914828896522522, "learning_rate": 7.25731447393916e-06, "loss": 0.0138, "num_input_tokens_seen": 948729856, "step": 926500 }, { "epoch": 8.553159686660946, "grad_norm": 0.554459810256958, "learning_rate": 7.234247700242663e-06, "loss": 0.0155, "num_input_tokens_seen": 949241856, "step": 927000 }, { "epoch": 8.557773041400246, "grad_norm": 0.48894843459129333, "learning_rate": 7.211180926546165e-06, "loss": 0.0142, "num_input_tokens_seen": 949753856, "step": 927500 }, { "epoch": 8.562386396139544, "grad_norm": 1.2641159296035767, "learning_rate": 7.188114152849669e-06, "loss": 0.0163, "num_input_tokens_seen": 950265856, "step": 928000 }, { "epoch": 8.566999750878844, "grad_norm": 0.9658982157707214, "learning_rate": 7.165047379153172e-06, "loss": 0.0138, "num_input_tokens_seen": 950777856, "step": 928500 }, { "epoch": 8.571613105618143, "grad_norm": 1.2537494897842407, "learning_rate": 7.141980605456676e-06, "loss": 0.016, "num_input_tokens_seen": 951289856, "step": 929000 }, { "epoch": 8.576226460357443, "grad_norm": 2.147233009338379, "learning_rate": 7.1189138317601795e-06, "loss": 0.0155, "num_input_tokens_seen": 951801856, "step": 929500 }, { "epoch": 8.580839815096741, "grad_norm": 1.6873968839645386, "learning_rate": 7.095847058063683e-06, "loss": 0.0155, "num_input_tokens_seen": 952313856, "step": 930000 }, { "epoch": 8.585453169836041, "grad_norm": 1.5905687808990479, "learning_rate": 7.072780284367187e-06, "loss": 0.016, "num_input_tokens_seen": 952825856, "step": 930500 }, { "epoch": 8.590066524575342, "grad_norm": 0.8234834671020508, "learning_rate": 7.04971351067069e-06, "loss": 0.0142, "num_input_tokens_seen": 953337856, "step": 931000 }, { "epoch": 8.59467987931464, "grad_norm": 1.0002344846725464, "learning_rate": 7.026646736974194e-06, "loss": 0.0152, "num_input_tokens_seen": 953849856, "step": 931500 }, { "epoch": 8.59929323405394, "grad_norm": 4.079251289367676, "learning_rate": 7.003579963277697e-06, "loss": 0.0146, "num_input_tokens_seen": 954361856, "step": 932000 }, { "epoch": 8.603906588793238, "grad_norm": 0.8030288815498352, "learning_rate": 6.980513189581201e-06, "loss": 0.0145, "num_input_tokens_seen": 954873856, "step": 932500 }, { "epoch": 8.608519943532539, "grad_norm": 0.8186569213867188, "learning_rate": 6.957446415884704e-06, "loss": 0.0161, "num_input_tokens_seen": 955385856, "step": 933000 }, { "epoch": 8.613133298271837, "grad_norm": 0.680074155330658, "learning_rate": 6.934379642188207e-06, "loss": 0.0155, "num_input_tokens_seen": 955897856, "step": 933500 }, { "epoch": 8.617746653011137, "grad_norm": 1.1147595643997192, "learning_rate": 6.911312868491709e-06, "loss": 0.0171, "num_input_tokens_seen": 956409856, "step": 934000 }, { "epoch": 8.622360007750435, "grad_norm": 1.0557124614715576, "learning_rate": 6.888246094795213e-06, "loss": 0.0155, "num_input_tokens_seen": 956921856, "step": 934500 }, { "epoch": 8.626973362489736, "grad_norm": 0.5240976214408875, "learning_rate": 6.865179321098716e-06, "loss": 0.0144, "num_input_tokens_seen": 957433856, "step": 935000 }, { "epoch": 8.631586717229034, "grad_norm": 0.6534589529037476, "learning_rate": 6.8421125474022195e-06, "loss": 0.0167, "num_input_tokens_seen": 957945856, "step": 935500 }, { "epoch": 8.636200071968334, "grad_norm": 0.33386147022247314, "learning_rate": 6.8190457737057235e-06, "loss": 0.0128, "num_input_tokens_seen": 958457856, "step": 936000 }, { "epoch": 8.640813426707634, "grad_norm": 1.6744736433029175, "learning_rate": 6.795979000009227e-06, "loss": 0.0159, "num_input_tokens_seen": 958969856, "step": 936500 }, { "epoch": 8.645426781446933, "grad_norm": 6.504983425140381, "learning_rate": 6.7729122263127306e-06, "loss": 0.0182, "num_input_tokens_seen": 959481856, "step": 937000 }, { "epoch": 8.650040136186233, "grad_norm": 1.2921936511993408, "learning_rate": 6.749845452616234e-06, "loss": 0.0164, "num_input_tokens_seen": 959993856, "step": 937500 }, { "epoch": 8.654653490925531, "grad_norm": 1.5937762260437012, "learning_rate": 6.726778678919738e-06, "loss": 0.0156, "num_input_tokens_seen": 960505856, "step": 938000 }, { "epoch": 8.659266845664831, "grad_norm": 0.9005319476127625, "learning_rate": 6.703711905223241e-06, "loss": 0.0165, "num_input_tokens_seen": 961017856, "step": 938500 }, { "epoch": 8.66388020040413, "grad_norm": 1.019418716430664, "learning_rate": 6.680645131526744e-06, "loss": 0.0162, "num_input_tokens_seen": 961529856, "step": 939000 }, { "epoch": 8.66849355514343, "grad_norm": 0.5105811953544617, "learning_rate": 6.657578357830248e-06, "loss": 0.0152, "num_input_tokens_seen": 962041856, "step": 939500 }, { "epoch": 8.673106909882728, "grad_norm": 0.6588147282600403, "learning_rate": 6.634511584133751e-06, "loss": 0.0173, "num_input_tokens_seen": 962553856, "step": 940000 }, { "epoch": 8.677720264622028, "grad_norm": 0.5775207877159119, "learning_rate": 6.611444810437253e-06, "loss": 0.0158, "num_input_tokens_seen": 963065856, "step": 940500 }, { "epoch": 8.682333619361327, "grad_norm": 1.1807801723480225, "learning_rate": 6.588378036740757e-06, "loss": 0.015, "num_input_tokens_seen": 963577856, "step": 941000 }, { "epoch": 8.686946974100627, "grad_norm": 0.7394533157348633, "learning_rate": 6.56531126304426e-06, "loss": 0.0149, "num_input_tokens_seen": 964089856, "step": 941500 }, { "epoch": 8.691560328839925, "grad_norm": 0.5393823385238647, "learning_rate": 6.5422444893477635e-06, "loss": 0.0159, "num_input_tokens_seen": 964601856, "step": 942000 }, { "epoch": 8.696173683579225, "grad_norm": 1.1270785331726074, "learning_rate": 6.5191777156512675e-06, "loss": 0.0145, "num_input_tokens_seen": 965113856, "step": 942500 }, { "epoch": 8.700787038318525, "grad_norm": 1.156285047531128, "learning_rate": 6.496110941954771e-06, "loss": 0.0147, "num_input_tokens_seen": 965625856, "step": 943000 }, { "epoch": 8.705400393057824, "grad_norm": 0.3501507639884949, "learning_rate": 6.4730441682582746e-06, "loss": 0.0167, "num_input_tokens_seen": 966137856, "step": 943500 }, { "epoch": 8.710013747797124, "grad_norm": 0.7830114960670471, "learning_rate": 6.449977394561778e-06, "loss": 0.0155, "num_input_tokens_seen": 966649856, "step": 944000 }, { "epoch": 8.714627102536422, "grad_norm": 0.9424002766609192, "learning_rate": 6.426910620865282e-06, "loss": 0.0159, "num_input_tokens_seen": 967161856, "step": 944500 }, { "epoch": 8.719240457275722, "grad_norm": 1.7092015743255615, "learning_rate": 6.403843847168785e-06, "loss": 0.0159, "num_input_tokens_seen": 967673856, "step": 945000 }, { "epoch": 8.72385381201502, "grad_norm": 0.3808750808238983, "learning_rate": 6.380777073472288e-06, "loss": 0.0157, "num_input_tokens_seen": 968185856, "step": 945500 }, { "epoch": 8.72846716675432, "grad_norm": 0.8436591625213623, "learning_rate": 6.357710299775792e-06, "loss": 0.015, "num_input_tokens_seen": 968697856, "step": 946000 }, { "epoch": 8.73308052149362, "grad_norm": 0.48995792865753174, "learning_rate": 6.334643526079295e-06, "loss": 0.014, "num_input_tokens_seen": 969209856, "step": 946500 }, { "epoch": 8.73769387623292, "grad_norm": 0.6074419021606445, "learning_rate": 6.311576752382799e-06, "loss": 0.0163, "num_input_tokens_seen": 969721856, "step": 947000 }, { "epoch": 8.742307230972218, "grad_norm": 1.1008994579315186, "learning_rate": 6.2885099786863e-06, "loss": 0.014, "num_input_tokens_seen": 970233856, "step": 947500 }, { "epoch": 8.746920585711518, "grad_norm": 0.4239863157272339, "learning_rate": 6.265443204989804e-06, "loss": 0.0152, "num_input_tokens_seen": 970745856, "step": 948000 }, { "epoch": 8.751533940450816, "grad_norm": 0.8348074555397034, "learning_rate": 6.242376431293308e-06, "loss": 0.0159, "num_input_tokens_seen": 971257856, "step": 948500 }, { "epoch": 8.756147295190116, "grad_norm": 0.9429554343223572, "learning_rate": 6.2193096575968115e-06, "loss": 0.016, "num_input_tokens_seen": 971769856, "step": 949000 }, { "epoch": 8.760760649929416, "grad_norm": 0.8379220366477966, "learning_rate": 6.196242883900315e-06, "loss": 0.0144, "num_input_tokens_seen": 972281856, "step": 949500 }, { "epoch": 8.765374004668715, "grad_norm": 0.543300211429596, "learning_rate": 6.1731761102038186e-06, "loss": 0.0168, "num_input_tokens_seen": 972793856, "step": 950000 }, { "epoch": 8.769987359408015, "grad_norm": 1.0430985689163208, "learning_rate": 6.150109336507322e-06, "loss": 0.0148, "num_input_tokens_seen": 973305856, "step": 950500 }, { "epoch": 8.774600714147313, "grad_norm": 1.5497344732284546, "learning_rate": 6.127042562810825e-06, "loss": 0.0154, "num_input_tokens_seen": 973817856, "step": 951000 }, { "epoch": 8.779214068886613, "grad_norm": 0.5469529628753662, "learning_rate": 6.103975789114329e-06, "loss": 0.0136, "num_input_tokens_seen": 974329856, "step": 951500 }, { "epoch": 8.783827423625912, "grad_norm": 1.1605631113052368, "learning_rate": 6.080909015417832e-06, "loss": 0.0143, "num_input_tokens_seen": 974841856, "step": 952000 }, { "epoch": 8.788440778365212, "grad_norm": 0.4232845604419708, "learning_rate": 6.057842241721335e-06, "loss": 0.015, "num_input_tokens_seen": 975353856, "step": 952500 }, { "epoch": 8.79305413310451, "grad_norm": 0.9222050905227661, "learning_rate": 6.034775468024838e-06, "loss": 0.0142, "num_input_tokens_seen": 975865856, "step": 953000 }, { "epoch": 8.79766748784381, "grad_norm": 0.6866771578788757, "learning_rate": 6.011708694328342e-06, "loss": 0.0149, "num_input_tokens_seen": 976377856, "step": 953500 }, { "epoch": 8.802280842583109, "grad_norm": 0.7165865302085876, "learning_rate": 5.988641920631845e-06, "loss": 0.0153, "num_input_tokens_seen": 976889856, "step": 954000 }, { "epoch": 8.806894197322409, "grad_norm": 0.8396665453910828, "learning_rate": 5.965575146935349e-06, "loss": 0.0141, "num_input_tokens_seen": 977401856, "step": 954500 }, { "epoch": 8.811507552061709, "grad_norm": 0.6975528597831726, "learning_rate": 5.942508373238852e-06, "loss": 0.0147, "num_input_tokens_seen": 977913856, "step": 955000 }, { "epoch": 8.816120906801007, "grad_norm": 0.8357110619544983, "learning_rate": 5.9194415995423555e-06, "loss": 0.0148, "num_input_tokens_seen": 978425856, "step": 955500 }, { "epoch": 8.820734261540307, "grad_norm": 0.9856480956077576, "learning_rate": 5.896374825845859e-06, "loss": 0.0155, "num_input_tokens_seen": 978937856, "step": 956000 }, { "epoch": 8.825347616279606, "grad_norm": 1.2731949090957642, "learning_rate": 5.8733080521493626e-06, "loss": 0.0155, "num_input_tokens_seen": 979449856, "step": 956500 }, { "epoch": 8.829960971018906, "grad_norm": 0.7930001020431519, "learning_rate": 5.850241278452866e-06, "loss": 0.0143, "num_input_tokens_seen": 979961856, "step": 957000 }, { "epoch": 8.834574325758204, "grad_norm": 0.7619320154190063, "learning_rate": 5.827174504756369e-06, "loss": 0.016, "num_input_tokens_seen": 980473856, "step": 957500 }, { "epoch": 8.839187680497504, "grad_norm": 0.7133992314338684, "learning_rate": 5.804107731059873e-06, "loss": 0.0164, "num_input_tokens_seen": 980985856, "step": 958000 }, { "epoch": 8.843801035236803, "grad_norm": 0.42310747504234314, "learning_rate": 5.781040957363375e-06, "loss": 0.0146, "num_input_tokens_seen": 981497856, "step": 958500 }, { "epoch": 8.848414389976103, "grad_norm": 0.3348715305328369, "learning_rate": 5.757974183666879e-06, "loss": 0.0162, "num_input_tokens_seen": 982009856, "step": 959000 }, { "epoch": 8.853027744715401, "grad_norm": 0.6126227974891663, "learning_rate": 5.734907409970382e-06, "loss": 0.0141, "num_input_tokens_seen": 982521856, "step": 959500 }, { "epoch": 8.857641099454701, "grad_norm": 0.6455732583999634, "learning_rate": 5.711840636273886e-06, "loss": 0.0154, "num_input_tokens_seen": 983033856, "step": 960000 }, { "epoch": 8.862254454194002, "grad_norm": 1.075323224067688, "learning_rate": 5.688773862577389e-06, "loss": 0.016, "num_input_tokens_seen": 983545856, "step": 960500 }, { "epoch": 8.8668678089333, "grad_norm": 0.8069124817848206, "learning_rate": 5.665707088880893e-06, "loss": 0.0149, "num_input_tokens_seen": 984057856, "step": 961000 }, { "epoch": 8.8714811636726, "grad_norm": 0.9779102206230164, "learning_rate": 5.642640315184396e-06, "loss": 0.0154, "num_input_tokens_seen": 984569856, "step": 961500 }, { "epoch": 8.876094518411898, "grad_norm": 0.8441368937492371, "learning_rate": 5.6195735414878994e-06, "loss": 0.0154, "num_input_tokens_seen": 985081856, "step": 962000 }, { "epoch": 8.880707873151199, "grad_norm": 0.44055867195129395, "learning_rate": 5.5965067677914026e-06, "loss": 0.014, "num_input_tokens_seen": 985593856, "step": 962500 }, { "epoch": 8.885321227890497, "grad_norm": 1.1985424757003784, "learning_rate": 5.5734399940949065e-06, "loss": 0.0156, "num_input_tokens_seen": 986105856, "step": 963000 }, { "epoch": 8.889934582629797, "grad_norm": 1.8032441139221191, "learning_rate": 5.55037322039841e-06, "loss": 0.017, "num_input_tokens_seen": 986617856, "step": 963500 }, { "epoch": 8.894547937369095, "grad_norm": 2.679948329925537, "learning_rate": 5.527306446701913e-06, "loss": 0.016, "num_input_tokens_seen": 987129856, "step": 964000 }, { "epoch": 8.899161292108396, "grad_norm": 1.422170639038086, "learning_rate": 5.504239673005417e-06, "loss": 0.0152, "num_input_tokens_seen": 987641856, "step": 964500 }, { "epoch": 8.903774646847694, "grad_norm": 0.785531759262085, "learning_rate": 5.481172899308919e-06, "loss": 0.0157, "num_input_tokens_seen": 988153856, "step": 965000 }, { "epoch": 8.908388001586994, "grad_norm": 0.813910961151123, "learning_rate": 5.458106125612423e-06, "loss": 0.0167, "num_input_tokens_seen": 988665856, "step": 965500 }, { "epoch": 8.913001356326294, "grad_norm": 0.6769202351570129, "learning_rate": 5.435039351915926e-06, "loss": 0.0159, "num_input_tokens_seen": 989177856, "step": 966000 }, { "epoch": 8.917614711065593, "grad_norm": 2.5310189723968506, "learning_rate": 5.41197257821943e-06, "loss": 0.0152, "num_input_tokens_seen": 989689856, "step": 966500 }, { "epoch": 8.922228065804893, "grad_norm": 0.5400819182395935, "learning_rate": 5.388905804522933e-06, "loss": 0.0152, "num_input_tokens_seen": 990201856, "step": 967000 }, { "epoch": 8.926841420544191, "grad_norm": 0.33608752489089966, "learning_rate": 5.365839030826437e-06, "loss": 0.0153, "num_input_tokens_seen": 990713856, "step": 967500 }, { "epoch": 8.931454775283491, "grad_norm": 0.6144788265228271, "learning_rate": 5.34277225712994e-06, "loss": 0.0151, "num_input_tokens_seen": 991225856, "step": 968000 }, { "epoch": 8.93606813002279, "grad_norm": 0.8687652349472046, "learning_rate": 5.3197054834334434e-06, "loss": 0.016, "num_input_tokens_seen": 991737856, "step": 968500 }, { "epoch": 8.94068148476209, "grad_norm": 0.9648618698120117, "learning_rate": 5.2966387097369466e-06, "loss": 0.0166, "num_input_tokens_seen": 992249856, "step": 969000 }, { "epoch": 8.945294839501388, "grad_norm": 0.6023857593536377, "learning_rate": 5.27357193604045e-06, "loss": 0.0144, "num_input_tokens_seen": 992761856, "step": 969500 }, { "epoch": 8.949908194240688, "grad_norm": 1.8448054790496826, "learning_rate": 5.250505162343954e-06, "loss": 0.0155, "num_input_tokens_seen": 993273856, "step": 970000 }, { "epoch": 8.954521548979987, "grad_norm": 0.6951389312744141, "learning_rate": 5.227438388647457e-06, "loss": 0.0154, "num_input_tokens_seen": 993785856, "step": 970500 }, { "epoch": 8.959134903719287, "grad_norm": 0.5784729719161987, "learning_rate": 5.204371614950961e-06, "loss": 0.0154, "num_input_tokens_seen": 994297856, "step": 971000 }, { "epoch": 8.963748258458587, "grad_norm": 1.4732640981674194, "learning_rate": 5.181304841254463e-06, "loss": 0.0147, "num_input_tokens_seen": 994809856, "step": 971500 }, { "epoch": 8.968361613197885, "grad_norm": 0.9267556667327881, "learning_rate": 5.158238067557967e-06, "loss": 0.0149, "num_input_tokens_seen": 995321856, "step": 972000 }, { "epoch": 8.972974967937185, "grad_norm": 0.3285810053348541, "learning_rate": 5.13517129386147e-06, "loss": 0.0146, "num_input_tokens_seen": 995833856, "step": 972500 }, { "epoch": 8.977588322676484, "grad_norm": 1.0577844381332397, "learning_rate": 5.112104520164974e-06, "loss": 0.0142, "num_input_tokens_seen": 996345856, "step": 973000 }, { "epoch": 8.982201677415784, "grad_norm": 0.40497535467147827, "learning_rate": 5.089037746468477e-06, "loss": 0.0157, "num_input_tokens_seen": 996857856, "step": 973500 }, { "epoch": 8.986815032155082, "grad_norm": 0.6067364811897278, "learning_rate": 5.065970972771981e-06, "loss": 0.0155, "num_input_tokens_seen": 997369856, "step": 974000 }, { "epoch": 8.991428386894382, "grad_norm": 0.5121076703071594, "learning_rate": 5.042904199075484e-06, "loss": 0.0145, "num_input_tokens_seen": 997881856, "step": 974500 }, { "epoch": 8.99604174163368, "grad_norm": 1.0173983573913574, "learning_rate": 5.0198374253789874e-06, "loss": 0.0154, "num_input_tokens_seen": 998393856, "step": 975000 }, { "epoch": 9.0, "eval_combined_score": 0.06468997752487994, "eval_loss": 0.06468997895717621, "eval_mse": 0.06468997609258367, "eval_runtime": 45.8521, "eval_samples_per_second": 2101.059, "eval_steps_per_second": 262.649, "num_input_tokens_seen": 998832384, "step": 975429 }, { "epoch": 9.00065509637298, "grad_norm": 0.4236084818840027, "learning_rate": 4.9967706516824906e-06, "loss": 0.0159, "num_input_tokens_seen": 998905088, "step": 975500 }, { "epoch": 9.00526845111228, "grad_norm": 0.6183050870895386, "learning_rate": 4.973703877985994e-06, "loss": 0.0138, "num_input_tokens_seen": 999417088, "step": 976000 }, { "epoch": 9.00988180585158, "grad_norm": 3.3244409561157227, "learning_rate": 4.950637104289498e-06, "loss": 0.0136, "num_input_tokens_seen": 999929088, "step": 976500 }, { "epoch": 9.014495160590878, "grad_norm": 0.5056183934211731, "learning_rate": 4.927570330593001e-06, "loss": 0.0133, "num_input_tokens_seen": 1000441088, "step": 977000 }, { "epoch": 9.019108515330178, "grad_norm": 0.6775535941123962, "learning_rate": 4.904503556896505e-06, "loss": 0.0135, "num_input_tokens_seen": 1000953088, "step": 977500 }, { "epoch": 9.023721870069478, "grad_norm": 0.4014028012752533, "learning_rate": 4.881436783200007e-06, "loss": 0.013, "num_input_tokens_seen": 1001465088, "step": 978000 }, { "epoch": 9.028335224808776, "grad_norm": 0.6904358863830566, "learning_rate": 4.858370009503511e-06, "loss": 0.0128, "num_input_tokens_seen": 1001977088, "step": 978500 }, { "epoch": 9.032948579548076, "grad_norm": 1.717046856880188, "learning_rate": 4.835303235807014e-06, "loss": 0.0135, "num_input_tokens_seen": 1002489088, "step": 979000 }, { "epoch": 9.037561934287375, "grad_norm": 1.1280878782272339, "learning_rate": 4.812236462110518e-06, "loss": 0.0141, "num_input_tokens_seen": 1003001088, "step": 979500 }, { "epoch": 9.042175289026675, "grad_norm": 0.9828783869743347, "learning_rate": 4.789169688414021e-06, "loss": 0.0139, "num_input_tokens_seen": 1003513088, "step": 980000 }, { "epoch": 9.046788643765973, "grad_norm": 0.9039996266365051, "learning_rate": 4.766102914717524e-06, "loss": 0.013, "num_input_tokens_seen": 1004025088, "step": 980500 }, { "epoch": 9.051401998505273, "grad_norm": 0.760273277759552, "learning_rate": 4.743036141021028e-06, "loss": 0.0129, "num_input_tokens_seen": 1004537088, "step": 981000 }, { "epoch": 9.056015353244572, "grad_norm": 0.6820119619369507, "learning_rate": 4.719969367324531e-06, "loss": 0.0138, "num_input_tokens_seen": 1005049088, "step": 981500 }, { "epoch": 9.060628707983872, "grad_norm": 0.8274890780448914, "learning_rate": 4.6969025936280346e-06, "loss": 0.0137, "num_input_tokens_seen": 1005561088, "step": 982000 }, { "epoch": 9.06524206272317, "grad_norm": 0.43844661116600037, "learning_rate": 4.673835819931538e-06, "loss": 0.0133, "num_input_tokens_seen": 1006073088, "step": 982500 }, { "epoch": 9.06985541746247, "grad_norm": 1.0397804975509644, "learning_rate": 4.650769046235042e-06, "loss": 0.0131, "num_input_tokens_seen": 1006585088, "step": 983000 }, { "epoch": 9.07446877220177, "grad_norm": 1.1185849905014038, "learning_rate": 4.627702272538545e-06, "loss": 0.0139, "num_input_tokens_seen": 1007097088, "step": 983500 }, { "epoch": 9.079082126941069, "grad_norm": 0.4616248607635498, "learning_rate": 4.604635498842049e-06, "loss": 0.0136, "num_input_tokens_seen": 1007609088, "step": 984000 }, { "epoch": 9.083695481680369, "grad_norm": 0.4887053966522217, "learning_rate": 4.581568725145551e-06, "loss": 0.0119, "num_input_tokens_seen": 1008121088, "step": 984500 }, { "epoch": 9.088308836419667, "grad_norm": 0.9657731056213379, "learning_rate": 4.558501951449055e-06, "loss": 0.0134, "num_input_tokens_seen": 1008633088, "step": 985000 }, { "epoch": 9.092922191158967, "grad_norm": 0.6589749455451965, "learning_rate": 4.535435177752558e-06, "loss": 0.0137, "num_input_tokens_seen": 1009145088, "step": 985500 }, { "epoch": 9.097535545898266, "grad_norm": 1.095737338066101, "learning_rate": 4.512368404056062e-06, "loss": 0.0136, "num_input_tokens_seen": 1009657088, "step": 986000 }, { "epoch": 9.102148900637566, "grad_norm": 0.9578360915184021, "learning_rate": 4.489301630359565e-06, "loss": 0.0144, "num_input_tokens_seen": 1010169088, "step": 986500 }, { "epoch": 9.106762255376864, "grad_norm": 1.0494704246520996, "learning_rate": 4.466234856663068e-06, "loss": 0.0143, "num_input_tokens_seen": 1010681088, "step": 987000 }, { "epoch": 9.111375610116164, "grad_norm": 0.3351483643054962, "learning_rate": 4.443168082966572e-06, "loss": 0.0143, "num_input_tokens_seen": 1011193088, "step": 987500 }, { "epoch": 9.115988964855463, "grad_norm": 1.107553482055664, "learning_rate": 4.4201013092700746e-06, "loss": 0.0158, "num_input_tokens_seen": 1011705088, "step": 988000 }, { "epoch": 9.120602319594763, "grad_norm": 0.8427937626838684, "learning_rate": 4.3970345355735785e-06, "loss": 0.0134, "num_input_tokens_seen": 1012217088, "step": 988500 }, { "epoch": 9.125215674334063, "grad_norm": 0.5374360084533691, "learning_rate": 4.373967761877082e-06, "loss": 0.0121, "num_input_tokens_seen": 1012729088, "step": 989000 }, { "epoch": 9.129829029073361, "grad_norm": 1.2801436185836792, "learning_rate": 4.350900988180586e-06, "loss": 0.0134, "num_input_tokens_seen": 1013241088, "step": 989500 }, { "epoch": 9.134442383812662, "grad_norm": 2.0048415660858154, "learning_rate": 4.327834214484089e-06, "loss": 0.0132, "num_input_tokens_seen": 1013753088, "step": 990000 }, { "epoch": 9.13905573855196, "grad_norm": 1.3461086750030518, "learning_rate": 4.304767440787593e-06, "loss": 0.0136, "num_input_tokens_seen": 1014265088, "step": 990500 }, { "epoch": 9.14366909329126, "grad_norm": 0.5770676732063293, "learning_rate": 4.281700667091096e-06, "loss": 0.0142, "num_input_tokens_seen": 1014777088, "step": 991000 }, { "epoch": 9.148282448030558, "grad_norm": 0.7648055553436279, "learning_rate": 4.258633893394599e-06, "loss": 0.0134, "num_input_tokens_seen": 1015289088, "step": 991500 }, { "epoch": 9.152895802769859, "grad_norm": 0.8219977021217346, "learning_rate": 4.235567119698102e-06, "loss": 0.0139, "num_input_tokens_seen": 1015801088, "step": 992000 }, { "epoch": 9.157509157509157, "grad_norm": 0.2618965804576874, "learning_rate": 4.212500346001605e-06, "loss": 0.0146, "num_input_tokens_seen": 1016313088, "step": 992500 }, { "epoch": 9.162122512248457, "grad_norm": 0.580898642539978, "learning_rate": 4.189433572305109e-06, "loss": 0.0131, "num_input_tokens_seen": 1016825088, "step": 993000 }, { "epoch": 9.166735866987755, "grad_norm": 1.426604151725769, "learning_rate": 4.166366798608612e-06, "loss": 0.0137, "num_input_tokens_seen": 1017337088, "step": 993500 }, { "epoch": 9.171349221727056, "grad_norm": 0.4607691764831543, "learning_rate": 4.143300024912116e-06, "loss": 0.0126, "num_input_tokens_seen": 1017849088, "step": 994000 }, { "epoch": 9.175962576466354, "grad_norm": 0.5528801083564758, "learning_rate": 4.1202332512156186e-06, "loss": 0.0125, "num_input_tokens_seen": 1018361088, "step": 994500 }, { "epoch": 9.180575931205654, "grad_norm": 0.24360989034175873, "learning_rate": 4.0971664775191225e-06, "loss": 0.0117, "num_input_tokens_seen": 1018873088, "step": 995000 }, { "epoch": 9.185189285944954, "grad_norm": 0.5846107602119446, "learning_rate": 4.074099703822626e-06, "loss": 0.0141, "num_input_tokens_seen": 1019385088, "step": 995500 }, { "epoch": 9.189802640684253, "grad_norm": 0.8627530932426453, "learning_rate": 4.05103293012613e-06, "loss": 0.0129, "num_input_tokens_seen": 1019897088, "step": 996000 }, { "epoch": 9.194415995423553, "grad_norm": 0.7435634732246399, "learning_rate": 4.027966156429633e-06, "loss": 0.0139, "num_input_tokens_seen": 1020409088, "step": 996500 }, { "epoch": 9.199029350162851, "grad_norm": 0.6394104957580566, "learning_rate": 4.004899382733137e-06, "loss": 0.0146, "num_input_tokens_seen": 1020921088, "step": 997000 }, { "epoch": 9.203642704902151, "grad_norm": 0.4735194444656372, "learning_rate": 3.98183260903664e-06, "loss": 0.0133, "num_input_tokens_seen": 1021433088, "step": 997500 }, { "epoch": 9.20825605964145, "grad_norm": 0.9603920578956604, "learning_rate": 3.958765835340143e-06, "loss": 0.013, "num_input_tokens_seen": 1021945088, "step": 998000 }, { "epoch": 9.21286941438075, "grad_norm": 1.0817182064056396, "learning_rate": 3.935699061643646e-06, "loss": 0.0117, "num_input_tokens_seen": 1022457088, "step": 998500 }, { "epoch": 9.217482769120048, "grad_norm": 0.5785081386566162, "learning_rate": 3.912632287947149e-06, "loss": 0.0127, "num_input_tokens_seen": 1022969088, "step": 999000 }, { "epoch": 9.222096123859348, "grad_norm": 0.34806227684020996, "learning_rate": 3.889565514250653e-06, "loss": 0.0129, "num_input_tokens_seen": 1023481088, "step": 999500 }, { "epoch": 9.226709478598647, "grad_norm": 0.8392277359962463, "learning_rate": 3.866498740554156e-06, "loss": 0.0128, "num_input_tokens_seen": 1023993088, "step": 1000000 }, { "epoch": 9.231322833337947, "grad_norm": 0.34862348437309265, "learning_rate": 3.84343196685766e-06, "loss": 0.0147, "num_input_tokens_seen": 1024505088, "step": 1000500 }, { "epoch": 9.235936188077247, "grad_norm": 0.8864858150482178, "learning_rate": 3.8203651931611626e-06, "loss": 0.013, "num_input_tokens_seen": 1025017088, "step": 1001000 }, { "epoch": 9.240549542816545, "grad_norm": 0.7740064263343811, "learning_rate": 3.797298419464666e-06, "loss": 0.0128, "num_input_tokens_seen": 1025529088, "step": 1001500 }, { "epoch": 9.245162897555845, "grad_norm": 0.21236860752105713, "learning_rate": 3.7742316457681697e-06, "loss": 0.013, "num_input_tokens_seen": 1026041088, "step": 1002000 }, { "epoch": 9.249776252295144, "grad_norm": 0.5248683094978333, "learning_rate": 3.751164872071673e-06, "loss": 0.0137, "num_input_tokens_seen": 1026553088, "step": 1002500 }, { "epoch": 9.254389607034444, "grad_norm": 0.49671700596809387, "learning_rate": 3.7280980983751767e-06, "loss": 0.0129, "num_input_tokens_seen": 1027065088, "step": 1003000 }, { "epoch": 9.259002961773742, "grad_norm": 0.7748130559921265, "learning_rate": 3.7050313246786803e-06, "loss": 0.013, "num_input_tokens_seen": 1027577088, "step": 1003500 }, { "epoch": 9.263616316513042, "grad_norm": 0.5696319341659546, "learning_rate": 3.681964550982184e-06, "loss": 0.0128, "num_input_tokens_seen": 1028089088, "step": 1004000 }, { "epoch": 9.26822967125234, "grad_norm": 1.47969651222229, "learning_rate": 3.6588977772856865e-06, "loss": 0.0134, "num_input_tokens_seen": 1028601088, "step": 1004500 }, { "epoch": 9.27284302599164, "grad_norm": 0.6833159923553467, "learning_rate": 3.63583100358919e-06, "loss": 0.0145, "num_input_tokens_seen": 1029113088, "step": 1005000 }, { "epoch": 9.27745638073094, "grad_norm": 0.9838703870773315, "learning_rate": 3.6127642298926936e-06, "loss": 0.0141, "num_input_tokens_seen": 1029625088, "step": 1005500 }, { "epoch": 9.28206973547024, "grad_norm": 0.5185501575469971, "learning_rate": 3.589697456196197e-06, "loss": 0.0144, "num_input_tokens_seen": 1030137088, "step": 1006000 }, { "epoch": 9.28668309020954, "grad_norm": 0.6044150590896606, "learning_rate": 3.5666306824997003e-06, "loss": 0.0132, "num_input_tokens_seen": 1030649088, "step": 1006500 }, { "epoch": 9.291296444948838, "grad_norm": 0.5589469075202942, "learning_rate": 3.543563908803204e-06, "loss": 0.0137, "num_input_tokens_seen": 1031161088, "step": 1007000 }, { "epoch": 9.295909799688138, "grad_norm": 0.8428828120231628, "learning_rate": 3.5204971351067066e-06, "loss": 0.0132, "num_input_tokens_seen": 1031673088, "step": 1007500 }, { "epoch": 9.300523154427436, "grad_norm": 1.0949701070785522, "learning_rate": 3.49743036141021e-06, "loss": 0.0132, "num_input_tokens_seen": 1032185088, "step": 1008000 }, { "epoch": 9.305136509166736, "grad_norm": 0.48161888122558594, "learning_rate": 3.4743635877137136e-06, "loss": 0.0138, "num_input_tokens_seen": 1032697088, "step": 1008500 }, { "epoch": 9.309749863906035, "grad_norm": 1.4229580163955688, "learning_rate": 3.451296814017217e-06, "loss": 0.013, "num_input_tokens_seen": 1033209088, "step": 1009000 }, { "epoch": 9.314363218645335, "grad_norm": 1.3797547817230225, "learning_rate": 3.4282300403207207e-06, "loss": 0.0143, "num_input_tokens_seen": 1033721088, "step": 1009500 }, { "epoch": 9.318976573384633, "grad_norm": 0.764750599861145, "learning_rate": 3.4051632666242243e-06, "loss": 0.0124, "num_input_tokens_seen": 1034233088, "step": 1010000 }, { "epoch": 9.323589928123933, "grad_norm": 1.4155054092407227, "learning_rate": 3.382096492927728e-06, "loss": 0.0143, "num_input_tokens_seen": 1034745088, "step": 1010500 }, { "epoch": 9.328203282863232, "grad_norm": 0.5639691352844238, "learning_rate": 3.3590297192312305e-06, "loss": 0.0123, "num_input_tokens_seen": 1035257088, "step": 1011000 }, { "epoch": 9.332816637602532, "grad_norm": 1.6954376697540283, "learning_rate": 3.335962945534734e-06, "loss": 0.0158, "num_input_tokens_seen": 1035769088, "step": 1011500 }, { "epoch": 9.337429992341832, "grad_norm": 1.096420168876648, "learning_rate": 3.3128961718382376e-06, "loss": 0.0134, "num_input_tokens_seen": 1036281088, "step": 1012000 }, { "epoch": 9.34204334708113, "grad_norm": 0.7063207626342773, "learning_rate": 3.2898293981417408e-06, "loss": 0.0142, "num_input_tokens_seen": 1036793088, "step": 1012500 }, { "epoch": 9.34665670182043, "grad_norm": 1.40740966796875, "learning_rate": 3.2667626244452443e-06, "loss": 0.0133, "num_input_tokens_seen": 1037305088, "step": 1013000 }, { "epoch": 9.351270056559729, "grad_norm": 1.0713701248168945, "learning_rate": 3.243695850748748e-06, "loss": 0.0122, "num_input_tokens_seen": 1037817088, "step": 1013500 }, { "epoch": 9.355883411299029, "grad_norm": 0.41992899775505066, "learning_rate": 3.2206290770522505e-06, "loss": 0.0114, "num_input_tokens_seen": 1038329088, "step": 1014000 }, { "epoch": 9.360496766038327, "grad_norm": 0.42630577087402344, "learning_rate": 3.197562303355754e-06, "loss": 0.0147, "num_input_tokens_seen": 1038841088, "step": 1014500 }, { "epoch": 9.365110120777628, "grad_norm": 1.1027462482452393, "learning_rate": 3.1744955296592576e-06, "loss": 0.0131, "num_input_tokens_seen": 1039353088, "step": 1015000 }, { "epoch": 9.369723475516926, "grad_norm": 0.5520905256271362, "learning_rate": 3.151428755962761e-06, "loss": 0.0139, "num_input_tokens_seen": 1039865088, "step": 1015500 }, { "epoch": 9.374336830256226, "grad_norm": 0.46760430932044983, "learning_rate": 3.1283619822662647e-06, "loss": 0.0142, "num_input_tokens_seen": 1040377088, "step": 1016000 }, { "epoch": 9.378950184995524, "grad_norm": 0.5815434455871582, "learning_rate": 3.105295208569768e-06, "loss": 0.0142, "num_input_tokens_seen": 1040889088, "step": 1016500 }, { "epoch": 9.383563539734824, "grad_norm": 1.3620293140411377, "learning_rate": 3.0822284348732714e-06, "loss": 0.0139, "num_input_tokens_seen": 1041401088, "step": 1017000 }, { "epoch": 9.388176894474123, "grad_norm": 0.8543253540992737, "learning_rate": 3.059161661176775e-06, "loss": 0.0128, "num_input_tokens_seen": 1041913088, "step": 1017500 }, { "epoch": 9.392790249213423, "grad_norm": 1.2159240245819092, "learning_rate": 3.036094887480278e-06, "loss": 0.0138, "num_input_tokens_seen": 1042425088, "step": 1018000 }, { "epoch": 9.397403603952723, "grad_norm": 0.7059375643730164, "learning_rate": 3.013028113783781e-06, "loss": 0.0137, "num_input_tokens_seen": 1042937088, "step": 1018500 }, { "epoch": 9.402016958692021, "grad_norm": 0.45824775099754333, "learning_rate": 2.9899613400872847e-06, "loss": 0.0135, "num_input_tokens_seen": 1043449088, "step": 1019000 }, { "epoch": 9.406630313431322, "grad_norm": 0.6606787443161011, "learning_rate": 2.9668945663907883e-06, "loss": 0.0138, "num_input_tokens_seen": 1043961088, "step": 1019500 }, { "epoch": 9.41124366817062, "grad_norm": 0.8153837323188782, "learning_rate": 2.9438277926942914e-06, "loss": 0.0125, "num_input_tokens_seen": 1044473088, "step": 1020000 }, { "epoch": 9.41585702290992, "grad_norm": 0.4770793318748474, "learning_rate": 2.920761018997795e-06, "loss": 0.0135, "num_input_tokens_seen": 1044985088, "step": 1020500 }, { "epoch": 9.420470377649218, "grad_norm": 1.226976990699768, "learning_rate": 2.8976942453012985e-06, "loss": 0.0132, "num_input_tokens_seen": 1045497088, "step": 1021000 }, { "epoch": 9.425083732388519, "grad_norm": 0.3825905919075012, "learning_rate": 2.8746274716048016e-06, "loss": 0.0125, "num_input_tokens_seen": 1046009088, "step": 1021500 }, { "epoch": 9.429697087127817, "grad_norm": 0.6580853462219238, "learning_rate": 2.851560697908305e-06, "loss": 0.014, "num_input_tokens_seen": 1046521088, "step": 1022000 }, { "epoch": 9.434310441867117, "grad_norm": 1.0704902410507202, "learning_rate": 2.8284939242118087e-06, "loss": 0.013, "num_input_tokens_seen": 1047033088, "step": 1022500 }, { "epoch": 9.438923796606415, "grad_norm": 1.5487003326416016, "learning_rate": 2.805427150515312e-06, "loss": 0.0156, "num_input_tokens_seen": 1047545088, "step": 1023000 }, { "epoch": 9.443537151345716, "grad_norm": 0.4171670079231262, "learning_rate": 2.7823603768188154e-06, "loss": 0.0127, "num_input_tokens_seen": 1048057088, "step": 1023500 }, { "epoch": 9.448150506085016, "grad_norm": 1.1898133754730225, "learning_rate": 2.7592936031223185e-06, "loss": 0.0157, "num_input_tokens_seen": 1048569088, "step": 1024000 }, { "epoch": 9.452763860824314, "grad_norm": 0.4748603105545044, "learning_rate": 2.7362268294258216e-06, "loss": 0.0132, "num_input_tokens_seen": 1049081088, "step": 1024500 }, { "epoch": 9.457377215563614, "grad_norm": 1.6988264322280884, "learning_rate": 2.713160055729325e-06, "loss": 0.0139, "num_input_tokens_seen": 1049593088, "step": 1025000 }, { "epoch": 9.461990570302913, "grad_norm": 1.1586196422576904, "learning_rate": 2.6900932820328287e-06, "loss": 0.0144, "num_input_tokens_seen": 1050105088, "step": 1025500 }, { "epoch": 9.466603925042213, "grad_norm": 1.3323612213134766, "learning_rate": 2.6670265083363323e-06, "loss": 0.0117, "num_input_tokens_seen": 1050617088, "step": 1026000 }, { "epoch": 9.471217279781511, "grad_norm": 0.6006079316139221, "learning_rate": 2.6439597346398354e-06, "loss": 0.0147, "num_input_tokens_seen": 1051129088, "step": 1026500 }, { "epoch": 9.475830634520811, "grad_norm": 0.9578723907470703, "learning_rate": 2.620892960943339e-06, "loss": 0.0122, "num_input_tokens_seen": 1051641088, "step": 1027000 }, { "epoch": 9.48044398926011, "grad_norm": 0.9589295983314514, "learning_rate": 2.5978261872468425e-06, "loss": 0.0136, "num_input_tokens_seen": 1052153088, "step": 1027500 }, { "epoch": 9.48505734399941, "grad_norm": 1.320854663848877, "learning_rate": 2.5747594135503456e-06, "loss": 0.0166, "num_input_tokens_seen": 1052665088, "step": 1028000 }, { "epoch": 9.489670698738708, "grad_norm": 0.5850228071212769, "learning_rate": 2.551692639853849e-06, "loss": 0.0127, "num_input_tokens_seen": 1053177088, "step": 1028500 }, { "epoch": 9.494284053478008, "grad_norm": 0.4947618544101715, "learning_rate": 2.5286258661573527e-06, "loss": 0.013, "num_input_tokens_seen": 1053689088, "step": 1029000 }, { "epoch": 9.498897408217307, "grad_norm": 1.5554652214050293, "learning_rate": 2.505559092460856e-06, "loss": 0.0114, "num_input_tokens_seen": 1054201088, "step": 1029500 }, { "epoch": 9.503510762956607, "grad_norm": 0.7134987711906433, "learning_rate": 2.482492318764359e-06, "loss": 0.0131, "num_input_tokens_seen": 1054713088, "step": 1030000 }, { "epoch": 9.508124117695907, "grad_norm": 0.6300977468490601, "learning_rate": 2.4594255450678625e-06, "loss": 0.0132, "num_input_tokens_seen": 1055225088, "step": 1030500 }, { "epoch": 9.512737472435205, "grad_norm": 0.30723100900650024, "learning_rate": 2.4363587713713656e-06, "loss": 0.012, "num_input_tokens_seen": 1055737088, "step": 1031000 }, { "epoch": 9.517350827174505, "grad_norm": 0.5518991947174072, "learning_rate": 2.413291997674869e-06, "loss": 0.0125, "num_input_tokens_seen": 1056249088, "step": 1031500 }, { "epoch": 9.521964181913804, "grad_norm": 0.48715853691101074, "learning_rate": 2.3902252239783727e-06, "loss": 0.0147, "num_input_tokens_seen": 1056761088, "step": 1032000 }, { "epoch": 9.526577536653104, "grad_norm": 0.9060729742050171, "learning_rate": 2.3671584502818763e-06, "loss": 0.0138, "num_input_tokens_seen": 1057273088, "step": 1032500 }, { "epoch": 9.531190891392402, "grad_norm": 0.6399810910224915, "learning_rate": 2.3440916765853794e-06, "loss": 0.0154, "num_input_tokens_seen": 1057785088, "step": 1033000 }, { "epoch": 9.535804246131702, "grad_norm": 0.8663894534111023, "learning_rate": 2.321024902888883e-06, "loss": 0.0128, "num_input_tokens_seen": 1058297088, "step": 1033500 }, { "epoch": 9.540417600871, "grad_norm": 1.554218053817749, "learning_rate": 2.2979581291923865e-06, "loss": 0.013, "num_input_tokens_seen": 1058809088, "step": 1034000 }, { "epoch": 9.5450309556103, "grad_norm": 0.5967795848846436, "learning_rate": 2.2748913554958896e-06, "loss": 0.0136, "num_input_tokens_seen": 1059321088, "step": 1034500 }, { "epoch": 9.5496443103496, "grad_norm": 0.7761898040771484, "learning_rate": 2.251824581799393e-06, "loss": 0.0137, "num_input_tokens_seen": 1059833088, "step": 1035000 }, { "epoch": 9.5542576650889, "grad_norm": 0.4565838873386383, "learning_rate": 2.2287578081028963e-06, "loss": 0.0137, "num_input_tokens_seen": 1060345088, "step": 1035500 }, { "epoch": 9.5588710198282, "grad_norm": 1.4918292760849, "learning_rate": 2.2056910344063994e-06, "loss": 0.0154, "num_input_tokens_seen": 1060857088, "step": 1036000 }, { "epoch": 9.563484374567498, "grad_norm": 1.143227458000183, "learning_rate": 2.182624260709903e-06, "loss": 0.0126, "num_input_tokens_seen": 1061369088, "step": 1036500 }, { "epoch": 9.568097729306798, "grad_norm": 0.4711507558822632, "learning_rate": 2.1595574870134065e-06, "loss": 0.0139, "num_input_tokens_seen": 1061881088, "step": 1037000 }, { "epoch": 9.572711084046096, "grad_norm": 1.8225018978118896, "learning_rate": 2.1364907133169096e-06, "loss": 0.0148, "num_input_tokens_seen": 1062393088, "step": 1037500 }, { "epoch": 9.577324438785396, "grad_norm": 1.6516982316970825, "learning_rate": 2.113423939620413e-06, "loss": 0.0139, "num_input_tokens_seen": 1062905088, "step": 1038000 }, { "epoch": 9.581937793524695, "grad_norm": 0.6592885255813599, "learning_rate": 2.0903571659239167e-06, "loss": 0.0135, "num_input_tokens_seen": 1063417088, "step": 1038500 }, { "epoch": 9.586551148263995, "grad_norm": 0.9162536263465881, "learning_rate": 2.0672903922274203e-06, "loss": 0.0143, "num_input_tokens_seen": 1063929088, "step": 1039000 }, { "epoch": 9.591164503003293, "grad_norm": 1.3136478662490845, "learning_rate": 2.0442236185309234e-06, "loss": 0.0145, "num_input_tokens_seen": 1064441088, "step": 1039500 }, { "epoch": 9.595777857742593, "grad_norm": 0.8929975032806396, "learning_rate": 2.021156844834427e-06, "loss": 0.0124, "num_input_tokens_seen": 1064953088, "step": 1040000 }, { "epoch": 9.600391212481892, "grad_norm": 0.6862032413482666, "learning_rate": 1.9980900711379305e-06, "loss": 0.0116, "num_input_tokens_seen": 1065465088, "step": 1040500 }, { "epoch": 9.605004567221192, "grad_norm": 1.4420340061187744, "learning_rate": 1.9750232974414336e-06, "loss": 0.0131, "num_input_tokens_seen": 1065977088, "step": 1041000 }, { "epoch": 9.609617921960492, "grad_norm": 0.6107918620109558, "learning_rate": 1.9519565237449367e-06, "loss": 0.0128, "num_input_tokens_seen": 1066489088, "step": 1041500 }, { "epoch": 9.61423127669979, "grad_norm": 0.8065725564956665, "learning_rate": 1.9288897500484403e-06, "loss": 0.0136, "num_input_tokens_seen": 1067001088, "step": 1042000 }, { "epoch": 9.61884463143909, "grad_norm": 1.1736738681793213, "learning_rate": 1.9058229763519436e-06, "loss": 0.0142, "num_input_tokens_seen": 1067513088, "step": 1042500 }, { "epoch": 9.623457986178389, "grad_norm": 3.729763984680176, "learning_rate": 1.882756202655447e-06, "loss": 0.0131, "num_input_tokens_seen": 1068025088, "step": 1043000 }, { "epoch": 9.628071340917689, "grad_norm": 0.39236801862716675, "learning_rate": 1.8596894289589505e-06, "loss": 0.014, "num_input_tokens_seen": 1068537088, "step": 1043500 }, { "epoch": 9.632684695656987, "grad_norm": 1.0780402421951294, "learning_rate": 1.8366226552624536e-06, "loss": 0.0112, "num_input_tokens_seen": 1069049088, "step": 1044000 }, { "epoch": 9.637298050396288, "grad_norm": 0.5110656023025513, "learning_rate": 1.8135558815659572e-06, "loss": 0.0127, "num_input_tokens_seen": 1069561088, "step": 1044500 }, { "epoch": 9.641911405135586, "grad_norm": 0.23593804240226746, "learning_rate": 1.7904891078694607e-06, "loss": 0.0131, "num_input_tokens_seen": 1070073088, "step": 1045000 }, { "epoch": 9.646524759874886, "grad_norm": 0.9505711793899536, "learning_rate": 1.767422334172964e-06, "loss": 0.0125, "num_input_tokens_seen": 1070585088, "step": 1045500 }, { "epoch": 9.651138114614184, "grad_norm": 0.9649909138679504, "learning_rate": 1.7443555604764672e-06, "loss": 0.0153, "num_input_tokens_seen": 1071097088, "step": 1046000 }, { "epoch": 9.655751469353484, "grad_norm": 0.29947414994239807, "learning_rate": 1.7212887867799707e-06, "loss": 0.014, "num_input_tokens_seen": 1071609088, "step": 1046500 }, { "epoch": 9.660364824092785, "grad_norm": 0.9218162298202515, "learning_rate": 1.6982220130834743e-06, "loss": 0.0141, "num_input_tokens_seen": 1072121088, "step": 1047000 }, { "epoch": 9.664978178832083, "grad_norm": 1.3005330562591553, "learning_rate": 1.6751552393869774e-06, "loss": 0.0145, "num_input_tokens_seen": 1072633088, "step": 1047500 }, { "epoch": 9.669591533571383, "grad_norm": 1.300002932548523, "learning_rate": 1.652088465690481e-06, "loss": 0.0137, "num_input_tokens_seen": 1073145088, "step": 1048000 }, { "epoch": 9.674204888310681, "grad_norm": 0.6326736211776733, "learning_rate": 1.6290216919939843e-06, "loss": 0.0163, "num_input_tokens_seen": 1073657088, "step": 1048500 }, { "epoch": 9.678818243049982, "grad_norm": 0.865162193775177, "learning_rate": 1.6059549182974874e-06, "loss": 0.0137, "num_input_tokens_seen": 1074169088, "step": 1049000 }, { "epoch": 9.68343159778928, "grad_norm": 0.6226495504379272, "learning_rate": 1.582888144600991e-06, "loss": 0.013, "num_input_tokens_seen": 1074681088, "step": 1049500 }, { "epoch": 9.68804495252858, "grad_norm": 1.6454648971557617, "learning_rate": 1.5598213709044945e-06, "loss": 0.012, "num_input_tokens_seen": 1075193088, "step": 1050000 }, { "epoch": 9.692658307267878, "grad_norm": 0.4671117663383484, "learning_rate": 1.5367545972079978e-06, "loss": 0.0133, "num_input_tokens_seen": 1075705088, "step": 1050500 }, { "epoch": 9.697271662007179, "grad_norm": 0.9937256574630737, "learning_rate": 1.5136878235115012e-06, "loss": 0.0142, "num_input_tokens_seen": 1076217088, "step": 1051000 }, { "epoch": 9.701885016746477, "grad_norm": 0.976679265499115, "learning_rate": 1.4906210498150045e-06, "loss": 0.0134, "num_input_tokens_seen": 1076729088, "step": 1051500 }, { "epoch": 9.706498371485777, "grad_norm": 0.5003361701965332, "learning_rate": 1.4675542761185078e-06, "loss": 0.0139, "num_input_tokens_seen": 1077241088, "step": 1052000 }, { "epoch": 9.711111726225077, "grad_norm": 0.7003839015960693, "learning_rate": 1.4444875024220114e-06, "loss": 0.013, "num_input_tokens_seen": 1077753088, "step": 1052500 }, { "epoch": 9.715725080964376, "grad_norm": 0.6862497925758362, "learning_rate": 1.4214207287255147e-06, "loss": 0.0132, "num_input_tokens_seen": 1078265088, "step": 1053000 }, { "epoch": 9.720338435703676, "grad_norm": 0.26981067657470703, "learning_rate": 1.398353955029018e-06, "loss": 0.0124, "num_input_tokens_seen": 1078777088, "step": 1053500 }, { "epoch": 9.724951790442974, "grad_norm": 0.6135255694389343, "learning_rate": 1.3752871813325216e-06, "loss": 0.0133, "num_input_tokens_seen": 1079289088, "step": 1054000 }, { "epoch": 9.729565145182274, "grad_norm": 0.6279376149177551, "learning_rate": 1.3522204076360247e-06, "loss": 0.014, "num_input_tokens_seen": 1079801088, "step": 1054500 }, { "epoch": 9.734178499921573, "grad_norm": 1.5329886674880981, "learning_rate": 1.329153633939528e-06, "loss": 0.0152, "num_input_tokens_seen": 1080313088, "step": 1055000 }, { "epoch": 9.738791854660873, "grad_norm": 1.2570598125457764, "learning_rate": 1.3060868602430316e-06, "loss": 0.0126, "num_input_tokens_seen": 1080825088, "step": 1055500 }, { "epoch": 9.743405209400171, "grad_norm": 1.8935927152633667, "learning_rate": 1.283020086546535e-06, "loss": 0.0138, "num_input_tokens_seen": 1081337088, "step": 1056000 }, { "epoch": 9.748018564139471, "grad_norm": 0.5364086031913757, "learning_rate": 1.2599533128500385e-06, "loss": 0.0136, "num_input_tokens_seen": 1081849088, "step": 1056500 }, { "epoch": 9.75263191887877, "grad_norm": 0.6562399864196777, "learning_rate": 1.2368865391535418e-06, "loss": 0.0128, "num_input_tokens_seen": 1082361088, "step": 1057000 }, { "epoch": 9.75724527361807, "grad_norm": 0.7584030628204346, "learning_rate": 1.213819765457045e-06, "loss": 0.013, "num_input_tokens_seen": 1082873088, "step": 1057500 }, { "epoch": 9.76185862835737, "grad_norm": 0.8746394515037537, "learning_rate": 1.1907529917605485e-06, "loss": 0.014, "num_input_tokens_seen": 1083385088, "step": 1058000 }, { "epoch": 9.766471983096668, "grad_norm": 1.1132066249847412, "learning_rate": 1.1676862180640518e-06, "loss": 0.0129, "num_input_tokens_seen": 1083897088, "step": 1058500 }, { "epoch": 9.771085337835968, "grad_norm": 0.7786855697631836, "learning_rate": 1.1446194443675554e-06, "loss": 0.0137, "num_input_tokens_seen": 1084409088, "step": 1059000 }, { "epoch": 9.775698692575267, "grad_norm": 0.5935215353965759, "learning_rate": 1.1215526706710587e-06, "loss": 0.0144, "num_input_tokens_seen": 1084921088, "step": 1059500 }, { "epoch": 9.780312047314567, "grad_norm": 1.0187913179397583, "learning_rate": 1.098485896974562e-06, "loss": 0.0145, "num_input_tokens_seen": 1085433088, "step": 1060000 }, { "epoch": 9.784925402053865, "grad_norm": 0.6144331693649292, "learning_rate": 1.0754191232780654e-06, "loss": 0.0123, "num_input_tokens_seen": 1085945088, "step": 1060500 }, { "epoch": 9.789538756793165, "grad_norm": 0.6357366442680359, "learning_rate": 1.0523523495815687e-06, "loss": 0.0124, "num_input_tokens_seen": 1086457088, "step": 1061000 }, { "epoch": 9.794152111532464, "grad_norm": 8.163220405578613, "learning_rate": 1.0292855758850723e-06, "loss": 0.0151, "num_input_tokens_seen": 1086969088, "step": 1061500 }, { "epoch": 9.798765466271764, "grad_norm": 1.1560457944869995, "learning_rate": 1.0062188021885756e-06, "loss": 0.0142, "num_input_tokens_seen": 1087481088, "step": 1062000 }, { "epoch": 9.803378821011062, "grad_norm": 1.6285614967346191, "learning_rate": 9.83152028492079e-07, "loss": 0.0124, "num_input_tokens_seen": 1087993088, "step": 1062500 }, { "epoch": 9.807992175750362, "grad_norm": 0.9213132858276367, "learning_rate": 9.600852547955823e-07, "loss": 0.0125, "num_input_tokens_seen": 1088505088, "step": 1063000 }, { "epoch": 9.81260553048966, "grad_norm": 0.7964446544647217, "learning_rate": 9.370184810990857e-07, "loss": 0.0139, "num_input_tokens_seen": 1089017088, "step": 1063500 }, { "epoch": 9.81721888522896, "grad_norm": 0.8223236799240112, "learning_rate": 9.139517074025889e-07, "loss": 0.0128, "num_input_tokens_seen": 1089529088, "step": 1064000 }, { "epoch": 9.821832239968261, "grad_norm": 0.9797717332839966, "learning_rate": 8.908849337060925e-07, "loss": 0.0126, "num_input_tokens_seen": 1090041088, "step": 1064500 }, { "epoch": 9.82644559470756, "grad_norm": 0.23104320466518402, "learning_rate": 8.678181600095958e-07, "loss": 0.0145, "num_input_tokens_seen": 1090553088, "step": 1065000 }, { "epoch": 9.83105894944686, "grad_norm": 0.5735734105110168, "learning_rate": 8.447513863130993e-07, "loss": 0.0146, "num_input_tokens_seen": 1091065088, "step": 1065500 }, { "epoch": 9.835672304186158, "grad_norm": 0.5744655132293701, "learning_rate": 8.216846126166026e-07, "loss": 0.0128, "num_input_tokens_seen": 1091577088, "step": 1066000 }, { "epoch": 9.840285658925458, "grad_norm": 4.304238319396973, "learning_rate": 7.986178389201059e-07, "loss": 0.0124, "num_input_tokens_seen": 1092089088, "step": 1066500 }, { "epoch": 9.844899013664756, "grad_norm": 0.7492998838424683, "learning_rate": 7.755510652236094e-07, "loss": 0.0137, "num_input_tokens_seen": 1092601088, "step": 1067000 }, { "epoch": 9.849512368404056, "grad_norm": 0.21370269358158112, "learning_rate": 7.524842915271127e-07, "loss": 0.0128, "num_input_tokens_seen": 1093113088, "step": 1067500 }, { "epoch": 9.854125723143355, "grad_norm": 1.3890074491500854, "learning_rate": 7.294175178306161e-07, "loss": 0.0133, "num_input_tokens_seen": 1093625088, "step": 1068000 }, { "epoch": 9.858739077882655, "grad_norm": 0.9255247116088867, "learning_rate": 7.063507441341195e-07, "loss": 0.0125, "num_input_tokens_seen": 1094137088, "step": 1068500 }, { "epoch": 9.863352432621953, "grad_norm": 0.617211639881134, "learning_rate": 6.832839704376229e-07, "loss": 0.0118, "num_input_tokens_seen": 1094649088, "step": 1069000 }, { "epoch": 9.867965787361253, "grad_norm": 0.7818981409072876, "learning_rate": 6.602171967411263e-07, "loss": 0.0128, "num_input_tokens_seen": 1095161088, "step": 1069500 }, { "epoch": 9.872579142100552, "grad_norm": 0.7910097241401672, "learning_rate": 6.371504230446296e-07, "loss": 0.0125, "num_input_tokens_seen": 1095673088, "step": 1070000 }, { "epoch": 9.877192496839852, "grad_norm": 0.9167271256446838, "learning_rate": 6.14083649348133e-07, "loss": 0.0145, "num_input_tokens_seen": 1096185088, "step": 1070500 }, { "epoch": 9.881805851579152, "grad_norm": 0.4515294134616852, "learning_rate": 5.910168756516364e-07, "loss": 0.0128, "num_input_tokens_seen": 1096697088, "step": 1071000 }, { "epoch": 9.88641920631845, "grad_norm": 1.4242569208145142, "learning_rate": 5.679501019551397e-07, "loss": 0.0123, "num_input_tokens_seen": 1097209088, "step": 1071500 }, { "epoch": 9.89103256105775, "grad_norm": 1.5031037330627441, "learning_rate": 5.448833282586431e-07, "loss": 0.0132, "num_input_tokens_seen": 1097721088, "step": 1072000 }, { "epoch": 9.895645915797049, "grad_norm": 0.5102546215057373, "learning_rate": 5.218165545621465e-07, "loss": 0.0134, "num_input_tokens_seen": 1098233088, "step": 1072500 }, { "epoch": 9.900259270536349, "grad_norm": 0.5648242831230164, "learning_rate": 4.987497808656499e-07, "loss": 0.0132, "num_input_tokens_seen": 1098745088, "step": 1073000 }, { "epoch": 9.904872625275647, "grad_norm": 1.368865728378296, "learning_rate": 4.756830071691533e-07, "loss": 0.0136, "num_input_tokens_seen": 1099257088, "step": 1073500 }, { "epoch": 9.909485980014948, "grad_norm": 0.372745156288147, "learning_rate": 4.5261623347265665e-07, "loss": 0.0149, "num_input_tokens_seen": 1099769088, "step": 1074000 }, { "epoch": 9.914099334754246, "grad_norm": 0.5571704506874084, "learning_rate": 4.2954945977616003e-07, "loss": 0.0132, "num_input_tokens_seen": 1100281088, "step": 1074500 }, { "epoch": 9.918712689493546, "grad_norm": 0.44755375385284424, "learning_rate": 4.064826860796634e-07, "loss": 0.0138, "num_input_tokens_seen": 1100793088, "step": 1075000 }, { "epoch": 9.923326044232844, "grad_norm": 0.467204749584198, "learning_rate": 3.834159123831668e-07, "loss": 0.0138, "num_input_tokens_seen": 1101305088, "step": 1075500 }, { "epoch": 9.927939398972145, "grad_norm": 1.1227315664291382, "learning_rate": 3.603491386866702e-07, "loss": 0.0131, "num_input_tokens_seen": 1101817088, "step": 1076000 }, { "epoch": 9.932552753711445, "grad_norm": 0.8583968877792358, "learning_rate": 3.3728236499017353e-07, "loss": 0.0143, "num_input_tokens_seen": 1102329088, "step": 1076500 }, { "epoch": 9.937166108450743, "grad_norm": 0.830702543258667, "learning_rate": 3.14215591293677e-07, "loss": 0.0139, "num_input_tokens_seen": 1102841088, "step": 1077000 }, { "epoch": 9.941779463190043, "grad_norm": 1.864600658416748, "learning_rate": 2.9114881759718036e-07, "loss": 0.013, "num_input_tokens_seen": 1103353088, "step": 1077500 }, { "epoch": 9.946392817929341, "grad_norm": 0.8975169658660889, "learning_rate": 2.680820439006837e-07, "loss": 0.0127, "num_input_tokens_seen": 1103865088, "step": 1078000 }, { "epoch": 9.951006172668642, "grad_norm": 0.7767340540885925, "learning_rate": 2.450152702041871e-07, "loss": 0.0132, "num_input_tokens_seen": 1104377088, "step": 1078500 }, { "epoch": 9.95561952740794, "grad_norm": 0.6193325519561768, "learning_rate": 2.2194849650769047e-07, "loss": 0.0144, "num_input_tokens_seen": 1104889088, "step": 1079000 }, { "epoch": 9.96023288214724, "grad_norm": 1.1023420095443726, "learning_rate": 1.9888172281119386e-07, "loss": 0.0141, "num_input_tokens_seen": 1105401088, "step": 1079500 }, { "epoch": 9.964846236886538, "grad_norm": 1.2743160724639893, "learning_rate": 1.7581494911469725e-07, "loss": 0.0119, "num_input_tokens_seen": 1105913088, "step": 1080000 }, { "epoch": 9.969459591625839, "grad_norm": 0.7009992599487305, "learning_rate": 1.527481754182006e-07, "loss": 0.0131, "num_input_tokens_seen": 1106425088, "step": 1080500 }, { "epoch": 9.974072946365137, "grad_norm": 0.5736069679260254, "learning_rate": 1.29681401721704e-07, "loss": 0.0174, "num_input_tokens_seen": 1106937088, "step": 1081000 }, { "epoch": 9.978686301104437, "grad_norm": 0.4789179861545563, "learning_rate": 1.0661462802520738e-07, "loss": 0.0129, "num_input_tokens_seen": 1107449088, "step": 1081500 }, { "epoch": 9.983299655843737, "grad_norm": 0.7064932584762573, "learning_rate": 8.354785432871076e-08, "loss": 0.0122, "num_input_tokens_seen": 1107961088, "step": 1082000 }, { "epoch": 9.987913010583036, "grad_norm": 1.0066189765930176, "learning_rate": 6.048108063221414e-08, "loss": 0.0127, "num_input_tokens_seen": 1108473088, "step": 1082500 }, { "epoch": 9.992526365322336, "grad_norm": 1.61360502243042, "learning_rate": 3.7414306935717514e-08, "loss": 0.0135, "num_input_tokens_seen": 1108985088, "step": 1083000 }, { "epoch": 9.997139720061634, "grad_norm": 0.37303218245506287, "learning_rate": 1.4347533239220898e-08, "loss": 0.0133, "num_input_tokens_seen": 1109497088, "step": 1083500 }, { "epoch": 10.0, "eval_combined_score": 0.06429717740844736, "eval_loss": 0.06429717689752579, "eval_mse": 0.06429717791936893, "eval_runtime": 46.2743, "eval_samples_per_second": 2081.892, "eval_steps_per_second": 260.253, "num_input_tokens_seen": 1109813760, "step": 1083810 }, { "epoch": 10.0, "num_input_tokens_seen": 1109813760, "step": 1083810, "total_flos": 1.4278349548463616e+17, "train_loss": 0.035630166295778455, "train_runtime": 37672.0963, "train_samples_per_second": 230.155, "train_steps_per_second": 28.77, "train_tokens_per_second": 29459.836 } ], "logging_steps": 500, "max_steps": 1083810, "num_input_tokens_seen": 1109813760, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4278349548463616e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }