{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.39378238341968913, "eval_steps": 500, "global_step": 266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014803849000740192, "grad_norm": 0.9312934875488281, "learning_rate": 0.0, "loss": 8.8924, "num_input_tokens_seen": 1572864, "step": 1 }, { "epoch": 0.0029607698001480384, "grad_norm": 1.1956263780593872, "learning_rate": 3.7037037037037036e-07, "loss": 9.6013, "num_input_tokens_seen": 3145728, "step": 2 }, { "epoch": 0.0044411547002220575, "grad_norm": 1.0787264108657837, "learning_rate": 7.407407407407407e-07, "loss": 8.428, "num_input_tokens_seen": 4718592, "step": 3 }, { "epoch": 0.005921539600296077, "grad_norm": 0.9807946681976318, "learning_rate": 1.111111111111111e-06, "loss": 9.1064, "num_input_tokens_seen": 6291456, "step": 4 }, { "epoch": 0.007401924500370096, "grad_norm": 0.946605920791626, "learning_rate": 1.4814814814814815e-06, "loss": 9.3099, "num_input_tokens_seen": 7864320, "step": 5 }, { "epoch": 0.008882309400444115, "grad_norm": 0.9959967136383057, "learning_rate": 1.8518518518518519e-06, "loss": 9.8204, "num_input_tokens_seen": 9437184, "step": 6 }, { "epoch": 0.010362694300518135, "grad_norm": 1.0211068391799927, "learning_rate": 2.222222222222222e-06, "loss": 8.9413, "num_input_tokens_seen": 11010048, "step": 7 }, { "epoch": 0.011843079200592153, "grad_norm": 1.1169291734695435, "learning_rate": 2.5925925925925925e-06, "loss": 9.1279, "num_input_tokens_seen": 12582912, "step": 8 }, { "epoch": 0.013323464100666173, "grad_norm": 0.8922852277755737, "learning_rate": 2.962962962962963e-06, "loss": 9.6572, "num_input_tokens_seen": 14155776, "step": 9 }, { "epoch": 0.014803849000740192, "grad_norm": 0.936945915222168, "learning_rate": 3.3333333333333333e-06, "loss": 7.9991, "num_input_tokens_seen": 15728640, "step": 10 }, { "epoch": 0.01628423390081421, "grad_norm": 0.948231041431427, "learning_rate": 3.7037037037037037e-06, "loss": 9.4423, "num_input_tokens_seen": 17301504, "step": 11 }, { "epoch": 0.01776461880088823, "grad_norm": 1.1012229919433594, "learning_rate": 4.074074074074074e-06, "loss": 7.9854, "num_input_tokens_seen": 18874368, "step": 12 }, { "epoch": 0.01924500370096225, "grad_norm": 1.2395195960998535, "learning_rate": 4.444444444444444e-06, "loss": 8.9368, "num_input_tokens_seen": 20447232, "step": 13 }, { "epoch": 0.02072538860103627, "grad_norm": 0.930361807346344, "learning_rate": 4.814814814814815e-06, "loss": 9.5469, "num_input_tokens_seen": 22020096, "step": 14 }, { "epoch": 0.02220577350111029, "grad_norm": 0.8275301456451416, "learning_rate": 5.185185185185185e-06, "loss": 10.3952, "num_input_tokens_seen": 23592960, "step": 15 }, { "epoch": 0.023686158401184307, "grad_norm": 0.8683385848999023, "learning_rate": 5.555555555555557e-06, "loss": 8.2971, "num_input_tokens_seen": 25165824, "step": 16 }, { "epoch": 0.025166543301258327, "grad_norm": 1.0179831981658936, "learning_rate": 5.925925925925926e-06, "loss": 9.1234, "num_input_tokens_seen": 26738688, "step": 17 }, { "epoch": 0.026646928201332347, "grad_norm": 1.0104173421859741, "learning_rate": 6.296296296296297e-06, "loss": 9.5261, "num_input_tokens_seen": 28311552, "step": 18 }, { "epoch": 0.028127313101406367, "grad_norm": 1.060304880142212, "learning_rate": 6.666666666666667e-06, "loss": 7.6262, "num_input_tokens_seen": 29884416, "step": 19 }, { "epoch": 0.029607698001480384, "grad_norm": 0.9215478301048279, "learning_rate": 7.0370370370370375e-06, "loss": 8.4184, "num_input_tokens_seen": 31457280, "step": 20 }, { "epoch": 0.031088082901554404, "grad_norm": 1.0252139568328857, "learning_rate": 7.4074074074074075e-06, "loss": 8.2821, "num_input_tokens_seen": 33030144, "step": 21 }, { "epoch": 0.03256846780162842, "grad_norm": 0.9496989846229553, "learning_rate": 7.77777777777778e-06, "loss": 9.4994, "num_input_tokens_seen": 34603008, "step": 22 }, { "epoch": 0.034048852701702444, "grad_norm": 0.9764742255210876, "learning_rate": 8.148148148148148e-06, "loss": 9.4438, "num_input_tokens_seen": 36175872, "step": 23 }, { "epoch": 0.03552923760177646, "grad_norm": 1.3688772916793823, "learning_rate": 8.518518518518519e-06, "loss": 8.3824, "num_input_tokens_seen": 37748736, "step": 24 }, { "epoch": 0.037009622501850484, "grad_norm": 1.0360182523727417, "learning_rate": 8.888888888888888e-06, "loss": 9.5644, "num_input_tokens_seen": 39321600, "step": 25 }, { "epoch": 0.0384900074019245, "grad_norm": 0.9460437297821045, "learning_rate": 9.25925925925926e-06, "loss": 9.7146, "num_input_tokens_seen": 40894464, "step": 26 }, { "epoch": 0.03997039230199852, "grad_norm": 1.10233473777771, "learning_rate": 9.62962962962963e-06, "loss": 9.3393, "num_input_tokens_seen": 42467328, "step": 27 }, { "epoch": 0.04145077720207254, "grad_norm": 1.0540364980697632, "learning_rate": 1e-05, "loss": 9.1959, "num_input_tokens_seen": 44040192, "step": 28 }, { "epoch": 0.04293116210214656, "grad_norm": 1.0485292673110962, "learning_rate": 9.999568045802216e-06, "loss": 9.1612, "num_input_tokens_seen": 45613056, "step": 29 }, { "epoch": 0.04441154700222058, "grad_norm": 1.0755236148834229, "learning_rate": 9.99827225784264e-06, "loss": 10.1287, "num_input_tokens_seen": 47185920, "step": 30 }, { "epoch": 0.0458919319022946, "grad_norm": 1.0970845222473145, "learning_rate": 9.996112860009689e-06, "loss": 10.1364, "num_input_tokens_seen": 48758784, "step": 31 }, { "epoch": 0.047372316802368614, "grad_norm": 1.005739688873291, "learning_rate": 9.993090225407743e-06, "loss": 9.0447, "num_input_tokens_seen": 50331648, "step": 32 }, { "epoch": 0.04885270170244264, "grad_norm": 1.276772141456604, "learning_rate": 9.98920487629269e-06, "loss": 10.0261, "num_input_tokens_seen": 51904512, "step": 33 }, { "epoch": 0.050333086602516654, "grad_norm": 1.0036358833312988, "learning_rate": 9.98445748398167e-06, "loss": 9.3572, "num_input_tokens_seen": 53477376, "step": 34 }, { "epoch": 0.05181347150259067, "grad_norm": 0.9702248573303223, "learning_rate": 9.978848868737099e-06, "loss": 8.6036, "num_input_tokens_seen": 55050240, "step": 35 }, { "epoch": 0.053293856402664694, "grad_norm": 1.0070680379867554, "learning_rate": 9.972379999624935e-06, "loss": 8.265, "num_input_tokens_seen": 56623104, "step": 36 }, { "epoch": 0.05477424130273871, "grad_norm": 0.797947347164154, "learning_rate": 9.96505199434725e-06, "loss": 8.4143, "num_input_tokens_seen": 58195968, "step": 37 }, { "epoch": 0.056254626202812734, "grad_norm": 0.9980666041374207, "learning_rate": 9.956866119049095e-06, "loss": 9.0371, "num_input_tokens_seen": 59768832, "step": 38 }, { "epoch": 0.05773501110288675, "grad_norm": 1.1065810918807983, "learning_rate": 9.947823788099754e-06, "loss": 8.8108, "num_input_tokens_seen": 61341696, "step": 39 }, { "epoch": 0.05921539600296077, "grad_norm": 0.9878284931182861, "learning_rate": 9.937926563848345e-06, "loss": 9.6218, "num_input_tokens_seen": 62914560, "step": 40 }, { "epoch": 0.06069578090303479, "grad_norm": 6.665683746337891, "learning_rate": 9.9271761563539e-06, "loss": 8.1358, "num_input_tokens_seen": 64487424, "step": 41 }, { "epoch": 0.06217616580310881, "grad_norm": 1.0430138111114502, "learning_rate": 9.915574423089872e-06, "loss": 8.3405, "num_input_tokens_seen": 66060288, "step": 42 }, { "epoch": 0.06365655070318282, "grad_norm": 0.9139232039451599, "learning_rate": 9.903123368623216e-06, "loss": 9.0803, "num_input_tokens_seen": 67633152, "step": 43 }, { "epoch": 0.06513693560325684, "grad_norm": 1.0046182870864868, "learning_rate": 9.889825144268029e-06, "loss": 10.1074, "num_input_tokens_seen": 69206016, "step": 44 }, { "epoch": 0.06661732050333087, "grad_norm": 0.9864291548728943, "learning_rate": 9.875682047713847e-06, "loss": 9.0753, "num_input_tokens_seen": 70778880, "step": 45 }, { "epoch": 0.06809770540340489, "grad_norm": 1.1455698013305664, "learning_rate": 9.860696522628638e-06, "loss": 9.2234, "num_input_tokens_seen": 72351744, "step": 46 }, { "epoch": 0.0695780903034789, "grad_norm": 1.0296474695205688, "learning_rate": 9.84487115823659e-06, "loss": 8.7844, "num_input_tokens_seen": 73924608, "step": 47 }, { "epoch": 0.07105847520355292, "grad_norm": 0.9440281987190247, "learning_rate": 9.828208688870736e-06, "loss": 8.3587, "num_input_tokens_seen": 75497472, "step": 48 }, { "epoch": 0.07253886010362694, "grad_norm": 0.9836198687553406, "learning_rate": 9.810711993500506e-06, "loss": 8.9316, "num_input_tokens_seen": 77070336, "step": 49 }, { "epoch": 0.07401924500370097, "grad_norm": 1.0525554418563843, "learning_rate": 9.792384095234312e-06, "loss": 9.571, "num_input_tokens_seen": 78643200, "step": 50 }, { "epoch": 0.07549962990377498, "grad_norm": 0.873354971408844, "learning_rate": 9.773228160797187e-06, "loss": 8.9413, "num_input_tokens_seen": 80216064, "step": 51 }, { "epoch": 0.076980014803849, "grad_norm": 1.2775590419769287, "learning_rate": 9.753247499983649e-06, "loss": 9.3912, "num_input_tokens_seen": 81788928, "step": 52 }, { "epoch": 0.07846039970392302, "grad_norm": 0.9796115159988403, "learning_rate": 9.732445565085823e-06, "loss": 8.5451, "num_input_tokens_seen": 83361792, "step": 53 }, { "epoch": 0.07994078460399703, "grad_norm": 0.7836295962333679, "learning_rate": 9.71082595029695e-06, "loss": 8.7509, "num_input_tokens_seen": 84934656, "step": 54 }, { "epoch": 0.08142116950407106, "grad_norm": 1.104665756225586, "learning_rate": 9.688392391090374e-06, "loss": 8.8454, "num_input_tokens_seen": 86507520, "step": 55 }, { "epoch": 0.08290155440414508, "grad_norm": 0.9231814742088318, "learning_rate": 9.665148763574123e-06, "loss": 9.2759, "num_input_tokens_seen": 88080384, "step": 56 }, { "epoch": 0.0843819393042191, "grad_norm": 1.1079339981079102, "learning_rate": 9.64109908382119e-06, "loss": 10.4245, "num_input_tokens_seen": 89653248, "step": 57 }, { "epoch": 0.08586232420429311, "grad_norm": 0.9315667152404785, "learning_rate": 9.616247507175624e-06, "loss": 8.921, "num_input_tokens_seen": 91226112, "step": 58 }, { "epoch": 0.08734270910436713, "grad_norm": 1.1500376462936401, "learning_rate": 9.590598327534563e-06, "loss": 9.2388, "num_input_tokens_seen": 92798976, "step": 59 }, { "epoch": 0.08882309400444116, "grad_norm": 1.5817395448684692, "learning_rate": 9.56415597660634e-06, "loss": 9.022, "num_input_tokens_seen": 94371840, "step": 60 }, { "epoch": 0.09030347890451518, "grad_norm": 0.9491121768951416, "learning_rate": 9.536925023144742e-06, "loss": 7.9679, "num_input_tokens_seen": 95944704, "step": 61 }, { "epoch": 0.0917838638045892, "grad_norm": 2.5303561687469482, "learning_rate": 9.508910172159635e-06, "loss": 9.0134, "num_input_tokens_seen": 97517568, "step": 62 }, { "epoch": 0.09326424870466321, "grad_norm": 0.986416220664978, "learning_rate": 9.48011626410401e-06, "loss": 9.0806, "num_input_tokens_seen": 99090432, "step": 63 }, { "epoch": 0.09474463360473723, "grad_norm": 1.1620358228683472, "learning_rate": 9.450548274037652e-06, "loss": 10.1018, "num_input_tokens_seen": 100663296, "step": 64 }, { "epoch": 0.09622501850481126, "grad_norm": 1.0916987657546997, "learning_rate": 9.420211310767534e-06, "loss": 8.0095, "num_input_tokens_seen": 102236160, "step": 65 }, { "epoch": 0.09770540340488527, "grad_norm": 0.9612499475479126, "learning_rate": 9.389110615965102e-06, "loss": 9.4223, "num_input_tokens_seen": 103809024, "step": 66 }, { "epoch": 0.09918578830495929, "grad_norm": 1.1053907871246338, "learning_rate": 9.35725156326063e-06, "loss": 9.4943, "num_input_tokens_seen": 105381888, "step": 67 }, { "epoch": 0.10066617320503331, "grad_norm": 0.9855115413665771, "learning_rate": 9.324639657314742e-06, "loss": 9.3133, "num_input_tokens_seen": 106954752, "step": 68 }, { "epoch": 0.10214655810510732, "grad_norm": 0.9884914755821228, "learning_rate": 9.291280532867301e-06, "loss": 8.9539, "num_input_tokens_seen": 108527616, "step": 69 }, { "epoch": 0.10362694300518134, "grad_norm": 0.8383261561393738, "learning_rate": 9.257179953763846e-06, "loss": 8.0476, "num_input_tokens_seen": 110100480, "step": 70 }, { "epoch": 0.10510732790525537, "grad_norm": 0.8421911597251892, "learning_rate": 9.222343811959694e-06, "loss": 8.8411, "num_input_tokens_seen": 111673344, "step": 71 }, { "epoch": 0.10658771280532939, "grad_norm": 1.018642783164978, "learning_rate": 9.186778126501916e-06, "loss": 9.0418, "num_input_tokens_seen": 113246208, "step": 72 }, { "epoch": 0.1080680977054034, "grad_norm": 1.124177098274231, "learning_rate": 9.150489042489368e-06, "loss": 8.9343, "num_input_tokens_seen": 114819072, "step": 73 }, { "epoch": 0.10954848260547742, "grad_norm": 0.9522198438644409, "learning_rate": 9.113482830010918e-06, "loss": 9.0528, "num_input_tokens_seen": 116391936, "step": 74 }, { "epoch": 0.11102886750555144, "grad_norm": 0.9624304175376892, "learning_rate": 9.075765883062093e-06, "loss": 9.0861, "num_input_tokens_seen": 117964800, "step": 75 }, { "epoch": 0.11250925240562547, "grad_norm": 0.8929852843284607, "learning_rate": 9.037344718440321e-06, "loss": 9.2024, "num_input_tokens_seen": 119537664, "step": 76 }, { "epoch": 0.11398963730569948, "grad_norm": 1.0029925107955933, "learning_rate": 8.99822597461894e-06, "loss": 9.4743, "num_input_tokens_seen": 121110528, "step": 77 }, { "epoch": 0.1154700222057735, "grad_norm": 1.0162649154663086, "learning_rate": 8.958416410600188e-06, "loss": 8.5334, "num_input_tokens_seen": 122683392, "step": 78 }, { "epoch": 0.11695040710584752, "grad_norm": 0.9771702885627747, "learning_rate": 8.917922904747385e-06, "loss": 9.1915, "num_input_tokens_seen": 124256256, "step": 79 }, { "epoch": 0.11843079200592153, "grad_norm": 0.8723686337471008, "learning_rate": 8.876752453596462e-06, "loss": 9.0803, "num_input_tokens_seen": 125829120, "step": 80 }, { "epoch": 0.11991117690599556, "grad_norm": 0.9210541248321533, "learning_rate": 8.834912170647102e-06, "loss": 9.3622, "num_input_tokens_seen": 127401984, "step": 81 }, { "epoch": 0.12139156180606958, "grad_norm": 0.9311140179634094, "learning_rate": 8.792409285133644e-06, "loss": 7.9475, "num_input_tokens_seen": 128974848, "step": 82 }, { "epoch": 0.1228719467061436, "grad_norm": 0.8897220492362976, "learning_rate": 8.749251140776016e-06, "loss": 9.3202, "num_input_tokens_seen": 130547712, "step": 83 }, { "epoch": 0.12435233160621761, "grad_norm": 0.9463574290275574, "learning_rate": 8.705445194510868e-06, "loss": 10.1407, "num_input_tokens_seen": 132120576, "step": 84 }, { "epoch": 0.12583271650629163, "grad_norm": 1.3442161083221436, "learning_rate": 8.660999015203152e-06, "loss": 9.8572, "num_input_tokens_seen": 133693440, "step": 85 }, { "epoch": 0.12731310140636565, "grad_norm": 1.0177114009857178, "learning_rate": 8.615920282338355e-06, "loss": 9.7569, "num_input_tokens_seen": 135266304, "step": 86 }, { "epoch": 0.12879348630643966, "grad_norm": 0.9807493090629578, "learning_rate": 8.570216784695637e-06, "loss": 9.6619, "num_input_tokens_seen": 136839168, "step": 87 }, { "epoch": 0.13027387120651368, "grad_norm": 0.9313049912452698, "learning_rate": 8.52389641900206e-06, "loss": 8.2778, "num_input_tokens_seen": 138412032, "step": 88 }, { "epoch": 0.13175425610658772, "grad_norm": 1.1321674585342407, "learning_rate": 8.476967188568187e-06, "loss": 9.2182, "num_input_tokens_seen": 139984896, "step": 89 }, { "epoch": 0.13323464100666174, "grad_norm": 1.1267322301864624, "learning_rate": 8.429437201905254e-06, "loss": 9.0367, "num_input_tokens_seen": 141557760, "step": 90 }, { "epoch": 0.13471502590673576, "grad_norm": 1.0089036226272583, "learning_rate": 8.38131467132416e-06, "loss": 8.6384, "num_input_tokens_seen": 143130624, "step": 91 }, { "epoch": 0.13619541080680977, "grad_norm": 0.9808329939842224, "learning_rate": 8.332607911516545e-06, "loss": 9.3609, "num_input_tokens_seen": 144703488, "step": 92 }, { "epoch": 0.1376757957068838, "grad_norm": 0.982054591178894, "learning_rate": 8.283325338118154e-06, "loss": 9.0296, "num_input_tokens_seen": 146276352, "step": 93 }, { "epoch": 0.1391561806069578, "grad_norm": 0.9693288207054138, "learning_rate": 8.233475466254766e-06, "loss": 9.9902, "num_input_tokens_seen": 147849216, "step": 94 }, { "epoch": 0.14063656550703182, "grad_norm": 0.9457436800003052, "learning_rate": 8.183066909070946e-06, "loss": 9.9753, "num_input_tokens_seen": 149422080, "step": 95 }, { "epoch": 0.14211695040710584, "grad_norm": 1.1782922744750977, "learning_rate": 8.132108376241849e-06, "loss": 10.0226, "num_input_tokens_seen": 150994944, "step": 96 }, { "epoch": 0.14359733530717986, "grad_norm": 1.0504759550094604, "learning_rate": 8.08060867246834e-06, "loss": 8.143, "num_input_tokens_seen": 152567808, "step": 97 }, { "epoch": 0.14507772020725387, "grad_norm": 1.3626543283462524, "learning_rate": 8.028576695955711e-06, "loss": 9.8448, "num_input_tokens_seen": 154140672, "step": 98 }, { "epoch": 0.14655810510732792, "grad_norm": 1.204558253288269, "learning_rate": 7.976021436876232e-06, "loss": 8.2288, "num_input_tokens_seen": 155713536, "step": 99 }, { "epoch": 0.14803849000740193, "grad_norm": 0.9750257730484009, "learning_rate": 7.92295197581581e-06, "loss": 8.5604, "num_input_tokens_seen": 157286400, "step": 100 }, { "epoch": 0.14951887490747595, "grad_norm": 0.9481039643287659, "learning_rate": 7.869377482205042e-06, "loss": 9.6968, "num_input_tokens_seen": 158859264, "step": 101 }, { "epoch": 0.15099925980754997, "grad_norm": 0.9680772423744202, "learning_rate": 7.815307212734888e-06, "loss": 9.6446, "num_input_tokens_seen": 160432128, "step": 102 }, { "epoch": 0.15247964470762398, "grad_norm": 1.005219578742981, "learning_rate": 7.7607505097573e-06, "loss": 8.7518, "num_input_tokens_seen": 162004992, "step": 103 }, { "epoch": 0.153960029607698, "grad_norm": 1.04623281955719, "learning_rate": 7.705716799671019e-06, "loss": 7.4682, "num_input_tokens_seen": 163577856, "step": 104 }, { "epoch": 0.15544041450777202, "grad_norm": 1.0221362113952637, "learning_rate": 7.650215591292888e-06, "loss": 9.2972, "num_input_tokens_seen": 165150720, "step": 105 }, { "epoch": 0.15692079940784603, "grad_norm": 1.0508990287780762, "learning_rate": 7.594256474214883e-06, "loss": 9.3146, "num_input_tokens_seen": 166723584, "step": 106 }, { "epoch": 0.15840118430792005, "grad_norm": 1.265791416168213, "learning_rate": 7.537849117147212e-06, "loss": 9.1447, "num_input_tokens_seen": 168296448, "step": 107 }, { "epoch": 0.15988156920799407, "grad_norm": 1.1966804265975952, "learning_rate": 7.481003266247745e-06, "loss": 8.3043, "num_input_tokens_seen": 169869312, "step": 108 }, { "epoch": 0.16136195410806808, "grad_norm": 1.0808377265930176, "learning_rate": 7.4237287434380485e-06, "loss": 9.2868, "num_input_tokens_seen": 171442176, "step": 109 }, { "epoch": 0.16284233900814213, "grad_norm": 0.9915937781333923, "learning_rate": 7.366035444706346e-06, "loss": 9.0849, "num_input_tokens_seen": 173015040, "step": 110 }, { "epoch": 0.16432272390821615, "grad_norm": 1.0843515396118164, "learning_rate": 7.307933338397667e-06, "loss": 8.1961, "num_input_tokens_seen": 174587904, "step": 111 }, { "epoch": 0.16580310880829016, "grad_norm": 1.0236737728118896, "learning_rate": 7.249432463491498e-06, "loss": 8.7919, "num_input_tokens_seen": 176160768, "step": 112 }, { "epoch": 0.16728349370836418, "grad_norm": 0.9542803764343262, "learning_rate": 7.190542927867234e-06, "loss": 8.0601, "num_input_tokens_seen": 177733632, "step": 113 }, { "epoch": 0.1687638786084382, "grad_norm": 0.9664928317070007, "learning_rate": 7.131274906557725e-06, "loss": 8.7192, "num_input_tokens_seen": 179306496, "step": 114 }, { "epoch": 0.1702442635085122, "grad_norm": 1.4056304693222046, "learning_rate": 7.0716386399912075e-06, "loss": 9.0565, "num_input_tokens_seen": 180879360, "step": 115 }, { "epoch": 0.17172464840858623, "grad_norm": 1.0535268783569336, "learning_rate": 7.0116444322219575e-06, "loss": 8.2026, "num_input_tokens_seen": 182452224, "step": 116 }, { "epoch": 0.17320503330866024, "grad_norm": 0.9302631616592407, "learning_rate": 6.95130264914993e-06, "loss": 8.6117, "num_input_tokens_seen": 184025088, "step": 117 }, { "epoch": 0.17468541820873426, "grad_norm": 1.038080096244812, "learning_rate": 6.890623716729724e-06, "loss": 8.0289, "num_input_tokens_seen": 185597952, "step": 118 }, { "epoch": 0.17616580310880828, "grad_norm": 1.0219101905822754, "learning_rate": 6.829618119169169e-06, "loss": 8.4351, "num_input_tokens_seen": 187170816, "step": 119 }, { "epoch": 0.17764618800888232, "grad_norm": 0.9798210859298706, "learning_rate": 6.768296397117848e-06, "loss": 8.9511, "num_input_tokens_seen": 188743680, "step": 120 }, { "epoch": 0.17912657290895634, "grad_norm": 1.0009288787841797, "learning_rate": 6.706669145845863e-06, "loss": 9.2014, "num_input_tokens_seen": 190316544, "step": 121 }, { "epoch": 0.18060695780903036, "grad_norm": 0.9980687499046326, "learning_rate": 6.6447470134131685e-06, "loss": 9.7525, "num_input_tokens_seen": 191889408, "step": 122 }, { "epoch": 0.18208734270910437, "grad_norm": 0.9398377537727356, "learning_rate": 6.5825406988297815e-06, "loss": 8.8423, "num_input_tokens_seen": 193462272, "step": 123 }, { "epoch": 0.1835677276091784, "grad_norm": 0.8980095386505127, "learning_rate": 6.520060950207186e-06, "loss": 9.3148, "num_input_tokens_seen": 195035136, "step": 124 }, { "epoch": 0.1850481125092524, "grad_norm": 1.2639355659484863, "learning_rate": 6.457318562901257e-06, "loss": 9.2706, "num_input_tokens_seen": 196608000, "step": 125 }, { "epoch": 0.18652849740932642, "grad_norm": 0.9730526804924011, "learning_rate": 6.394324377647028e-06, "loss": 9.9944, "num_input_tokens_seen": 198180864, "step": 126 }, { "epoch": 0.18800888230940044, "grad_norm": 1.089982032775879, "learning_rate": 6.331089278685599e-06, "loss": 8.8171, "num_input_tokens_seen": 199753728, "step": 127 }, { "epoch": 0.18948926720947445, "grad_norm": 1.2901642322540283, "learning_rate": 6.267624191883551e-06, "loss": 9.3602, "num_input_tokens_seen": 201326592, "step": 128 }, { "epoch": 0.19096965210954847, "grad_norm": 1.0030603408813477, "learning_rate": 6.203940082845144e-06, "loss": 8.4607, "num_input_tokens_seen": 202899456, "step": 129 }, { "epoch": 0.19245003700962252, "grad_norm": 1.0402629375457764, "learning_rate": 6.140047955017672e-06, "loss": 9.4883, "num_input_tokens_seen": 204472320, "step": 130 }, { "epoch": 0.19393042190969653, "grad_norm": 1.1065270900726318, "learning_rate": 6.075958847790262e-06, "loss": 9.5924, "num_input_tokens_seen": 206045184, "step": 131 }, { "epoch": 0.19541080680977055, "grad_norm": 0.9005462527275085, "learning_rate": 6.011683834586474e-06, "loss": 7.8592, "num_input_tokens_seen": 207618048, "step": 132 }, { "epoch": 0.19689119170984457, "grad_norm": 0.9889174699783325, "learning_rate": 5.947234020951015e-06, "loss": 8.8831, "num_input_tokens_seen": 209190912, "step": 133 }, { "epoch": 0.19837157660991858, "grad_norm": 0.9471520781517029, "learning_rate": 5.882620542630901e-06, "loss": 8.2636, "num_input_tokens_seen": 210763776, "step": 134 }, { "epoch": 0.1998519615099926, "grad_norm": 1.062712550163269, "learning_rate": 5.817854563651415e-06, "loss": 8.6707, "num_input_tokens_seen": 212336640, "step": 135 }, { "epoch": 0.20133234641006661, "grad_norm": 1.0248647928237915, "learning_rate": 5.752947274387147e-06, "loss": 9.3591, "num_input_tokens_seen": 213909504, "step": 136 }, { "epoch": 0.20281273131014063, "grad_norm": 1.0230802297592163, "learning_rate": 5.687909889628529e-06, "loss": 10.19, "num_input_tokens_seen": 215482368, "step": 137 }, { "epoch": 0.20429311621021465, "grad_norm": 1.065903902053833, "learning_rate": 5.622753646644102e-06, "loss": 8.6389, "num_input_tokens_seen": 217055232, "step": 138 }, { "epoch": 0.20577350111028866, "grad_norm": 0.9532954692840576, "learning_rate": 5.557489803238934e-06, "loss": 9.201, "num_input_tokens_seen": 218628096, "step": 139 }, { "epoch": 0.20725388601036268, "grad_norm": 1.2282330989837646, "learning_rate": 5.492129635809473e-06, "loss": 8.9373, "num_input_tokens_seen": 220200960, "step": 140 }, { "epoch": 0.20873427091043673, "grad_norm": 1.0525203943252563, "learning_rate": 5.426684437395196e-06, "loss": 8.1673, "num_input_tokens_seen": 221773824, "step": 141 }, { "epoch": 0.21021465581051074, "grad_norm": 1.0882335901260376, "learning_rate": 5.361165515727374e-06, "loss": 9.2341, "num_input_tokens_seen": 223346688, "step": 142 }, { "epoch": 0.21169504071058476, "grad_norm": 2.01729416847229, "learning_rate": 5.295584191275308e-06, "loss": 9.7504, "num_input_tokens_seen": 224919552, "step": 143 }, { "epoch": 0.21317542561065878, "grad_norm": 0.8909555077552795, "learning_rate": 5.229951795290353e-06, "loss": 8.9742, "num_input_tokens_seen": 226492416, "step": 144 }, { "epoch": 0.2146558105107328, "grad_norm": 0.9122491478919983, "learning_rate": 5.164279667848094e-06, "loss": 8.6969, "num_input_tokens_seen": 228065280, "step": 145 }, { "epoch": 0.2161361954108068, "grad_norm": 1.1025797128677368, "learning_rate": 5.0985791558889785e-06, "loss": 9.6289, "num_input_tokens_seen": 229638144, "step": 146 }, { "epoch": 0.21761658031088082, "grad_norm": 1.1003743410110474, "learning_rate": 5.032861611257783e-06, "loss": 8.6957, "num_input_tokens_seen": 231211008, "step": 147 }, { "epoch": 0.21909696521095484, "grad_norm": 0.9547885656356812, "learning_rate": 4.967138388742218e-06, "loss": 8.9786, "num_input_tokens_seen": 232783872, "step": 148 }, { "epoch": 0.22057735011102886, "grad_norm": 0.8208208680152893, "learning_rate": 4.9014208441110215e-06, "loss": 8.2503, "num_input_tokens_seen": 234356736, "step": 149 }, { "epoch": 0.22205773501110287, "grad_norm": 0.9018646478652954, "learning_rate": 4.835720332151907e-06, "loss": 9.121, "num_input_tokens_seen": 235929600, "step": 150 }, { "epoch": 0.22353811991117692, "grad_norm": 1.0260975360870361, "learning_rate": 4.770048204709648e-06, "loss": 9.8406, "num_input_tokens_seen": 237502464, "step": 151 }, { "epoch": 0.22501850481125094, "grad_norm": 1.107323169708252, "learning_rate": 4.7044158087246926e-06, "loss": 9.724, "num_input_tokens_seen": 239075328, "step": 152 }, { "epoch": 0.22649888971132495, "grad_norm": 0.9756017327308655, "learning_rate": 4.6388344842726266e-06, "loss": 7.9496, "num_input_tokens_seen": 240648192, "step": 153 }, { "epoch": 0.22797927461139897, "grad_norm": 0.9942044615745544, "learning_rate": 4.573315562604804e-06, "loss": 10.6851, "num_input_tokens_seen": 242221056, "step": 154 }, { "epoch": 0.22945965951147299, "grad_norm": 0.9424007534980774, "learning_rate": 4.5078703641905275e-06, "loss": 9.0249, "num_input_tokens_seen": 243793920, "step": 155 }, { "epoch": 0.230940044411547, "grad_norm": 1.0353285074234009, "learning_rate": 4.442510196761068e-06, "loss": 8.9519, "num_input_tokens_seen": 245366784, "step": 156 }, { "epoch": 0.23242042931162102, "grad_norm": 0.8987874388694763, "learning_rate": 4.377246353355899e-06, "loss": 8.8955, "num_input_tokens_seen": 246939648, "step": 157 }, { "epoch": 0.23390081421169504, "grad_norm": 1.4252616167068481, "learning_rate": 4.312090110371473e-06, "loss": 9.1974, "num_input_tokens_seen": 248512512, "step": 158 }, { "epoch": 0.23538119911176905, "grad_norm": 0.9613856673240662, "learning_rate": 4.247052725612853e-06, "loss": 9.7788, "num_input_tokens_seen": 250085376, "step": 159 }, { "epoch": 0.23686158401184307, "grad_norm": 0.9787338376045227, "learning_rate": 4.182145436348587e-06, "loss": 9.5162, "num_input_tokens_seen": 251658240, "step": 160 }, { "epoch": 0.23834196891191708, "grad_norm": 0.9008899331092834, "learning_rate": 4.1173794573691e-06, "loss": 8.1727, "num_input_tokens_seen": 253231104, "step": 161 }, { "epoch": 0.23982235381199113, "grad_norm": 1.171798825263977, "learning_rate": 4.052765979048986e-06, "loss": 8.8755, "num_input_tokens_seen": 254803968, "step": 162 }, { "epoch": 0.24130273871206515, "grad_norm": 0.9477250576019287, "learning_rate": 3.988316165413528e-06, "loss": 9.1919, "num_input_tokens_seen": 256376832, "step": 163 }, { "epoch": 0.24278312361213916, "grad_norm": 0.9179975390434265, "learning_rate": 3.924041152209739e-06, "loss": 8.9077, "num_input_tokens_seen": 257949696, "step": 164 }, { "epoch": 0.24426350851221318, "grad_norm": 0.9550530910491943, "learning_rate": 3.859952044982329e-06, "loss": 8.0621, "num_input_tokens_seen": 259522560, "step": 165 }, { "epoch": 0.2457438934122872, "grad_norm": 1.0108799934387207, "learning_rate": 3.7960599171548572e-06, "loss": 9.8261, "num_input_tokens_seen": 261095424, "step": 166 }, { "epoch": 0.2472242783123612, "grad_norm": 1.032667636871338, "learning_rate": 3.732375808116451e-06, "loss": 9.353, "num_input_tokens_seen": 262668288, "step": 167 }, { "epoch": 0.24870466321243523, "grad_norm": 0.99181067943573, "learning_rate": 3.6689107213144025e-06, "loss": 8.4164, "num_input_tokens_seen": 264241152, "step": 168 }, { "epoch": 0.25018504811250925, "grad_norm": 1.082139492034912, "learning_rate": 3.6056756223529734e-06, "loss": 9.764, "num_input_tokens_seen": 265814016, "step": 169 }, { "epoch": 0.25166543301258326, "grad_norm": 0.905164361000061, "learning_rate": 3.542681437098745e-06, "loss": 8.5983, "num_input_tokens_seen": 267386880, "step": 170 }, { "epoch": 0.2531458179126573, "grad_norm": 0.9460884928703308, "learning_rate": 3.479939049792817e-06, "loss": 8.7677, "num_input_tokens_seen": 268959744, "step": 171 }, { "epoch": 0.2546262028127313, "grad_norm": 1.0467054843902588, "learning_rate": 3.4174593011702197e-06, "loss": 9.57, "num_input_tokens_seen": 270532608, "step": 172 }, { "epoch": 0.2561065877128053, "grad_norm": 1.0835543870925903, "learning_rate": 3.3552529865868323e-06, "loss": 8.5939, "num_input_tokens_seen": 272105472, "step": 173 }, { "epoch": 0.25758697261287933, "grad_norm": 0.9074414372444153, "learning_rate": 3.2933308541541365e-06, "loss": 8.4592, "num_input_tokens_seen": 273678336, "step": 174 }, { "epoch": 0.25906735751295334, "grad_norm": 0.9010567665100098, "learning_rate": 3.2317036028821523e-06, "loss": 8.5693, "num_input_tokens_seen": 275251200, "step": 175 }, { "epoch": 0.26054774241302736, "grad_norm": 0.8705753684043884, "learning_rate": 3.1703818808308327e-06, "loss": 8.4727, "num_input_tokens_seen": 276824064, "step": 176 }, { "epoch": 0.26202812731310143, "grad_norm": 0.9062320590019226, "learning_rate": 3.1093762832702775e-06, "loss": 7.9813, "num_input_tokens_seen": 278396928, "step": 177 }, { "epoch": 0.26350851221317545, "grad_norm": 1.0476340055465698, "learning_rate": 3.048697350850073e-06, "loss": 9.594, "num_input_tokens_seen": 279969792, "step": 178 }, { "epoch": 0.26498889711324947, "grad_norm": 0.9298098683357239, "learning_rate": 2.988355567778043e-06, "loss": 9.7273, "num_input_tokens_seen": 281542656, "step": 179 }, { "epoch": 0.2664692820133235, "grad_norm": 1.8974478244781494, "learning_rate": 2.9283613600087933e-06, "loss": 10.0173, "num_input_tokens_seen": 283115520, "step": 180 }, { "epoch": 0.2679496669133975, "grad_norm": 1.0068743228912354, "learning_rate": 2.8687250934422774e-06, "loss": 9.5477, "num_input_tokens_seen": 284688384, "step": 181 }, { "epoch": 0.2694300518134715, "grad_norm": 1.0006181001663208, "learning_rate": 2.809457072132766e-06, "loss": 9.0357, "num_input_tokens_seen": 286261248, "step": 182 }, { "epoch": 0.27091043671354553, "grad_norm": 0.9685924649238586, "learning_rate": 2.750567536508504e-06, "loss": 7.8482, "num_input_tokens_seen": 287834112, "step": 183 }, { "epoch": 0.27239082161361955, "grad_norm": 1.0447078943252563, "learning_rate": 2.692066661602333e-06, "loss": 8.1498, "num_input_tokens_seen": 289406976, "step": 184 }, { "epoch": 0.27387120651369357, "grad_norm": 0.9324178099632263, "learning_rate": 2.633964555293654e-06, "loss": 9.7588, "num_input_tokens_seen": 290979840, "step": 185 }, { "epoch": 0.2753515914137676, "grad_norm": 1.0867562294006348, "learning_rate": 2.576271256561953e-06, "loss": 8.6674, "num_input_tokens_seen": 292552704, "step": 186 }, { "epoch": 0.2768319763138416, "grad_norm": 1.0550150871276855, "learning_rate": 2.5189967337522574e-06, "loss": 9.759, "num_input_tokens_seen": 294125568, "step": 187 }, { "epoch": 0.2783123612139156, "grad_norm": 0.8793514370918274, "learning_rate": 2.46215088285279e-06, "loss": 8.3819, "num_input_tokens_seen": 295698432, "step": 188 }, { "epoch": 0.27979274611398963, "grad_norm": 1.1474155187606812, "learning_rate": 2.4057435257851173e-06, "loss": 8.0812, "num_input_tokens_seen": 297271296, "step": 189 }, { "epoch": 0.28127313101406365, "grad_norm": 1.0644192695617676, "learning_rate": 2.349784408707112e-06, "loss": 8.8425, "num_input_tokens_seen": 298844160, "step": 190 }, { "epoch": 0.28275351591413767, "grad_norm": 1.0398298501968384, "learning_rate": 2.2942832003289823e-06, "loss": 8.2475, "num_input_tokens_seen": 300417024, "step": 191 }, { "epoch": 0.2842339008142117, "grad_norm": 1.050809621810913, "learning_rate": 2.2392494902427027e-06, "loss": 8.5463, "num_input_tokens_seen": 301989888, "step": 192 }, { "epoch": 0.2857142857142857, "grad_norm": 0.9422303438186646, "learning_rate": 2.1846927872651135e-06, "loss": 9.5692, "num_input_tokens_seen": 303562752, "step": 193 }, { "epoch": 0.2871946706143597, "grad_norm": 0.986879289150238, "learning_rate": 2.1306225177949584e-06, "loss": 8.8887, "num_input_tokens_seen": 305135616, "step": 194 }, { "epoch": 0.28867505551443373, "grad_norm": 1.1267606019973755, "learning_rate": 2.07704802418419e-06, "loss": 9.7485, "num_input_tokens_seen": 306708480, "step": 195 }, { "epoch": 0.29015544041450775, "grad_norm": 0.9625312685966492, "learning_rate": 2.023978563123771e-06, "loss": 8.7099, "num_input_tokens_seen": 308281344, "step": 196 }, { "epoch": 0.29163582531458176, "grad_norm": 0.9570685625076294, "learning_rate": 1.9714233040442915e-06, "loss": 9.4608, "num_input_tokens_seen": 309854208, "step": 197 }, { "epoch": 0.29311621021465584, "grad_norm": 0.9656940698623657, "learning_rate": 1.919391327531663e-06, "loss": 9.6251, "num_input_tokens_seen": 311427072, "step": 198 }, { "epoch": 0.29459659511472985, "grad_norm": 1.0022757053375244, "learning_rate": 1.8678916237581524e-06, "loss": 8.4094, "num_input_tokens_seen": 312999936, "step": 199 }, { "epoch": 0.29607698001480387, "grad_norm": 0.9374834895133972, "learning_rate": 1.816933090929055e-06, "loss": 8.2778, "num_input_tokens_seen": 314572800, "step": 200 }, { "epoch": 0.2975573649148779, "grad_norm": 0.9973615407943726, "learning_rate": 1.7665245337452368e-06, "loss": 9.8626, "num_input_tokens_seen": 316145664, "step": 201 }, { "epoch": 0.2990377498149519, "grad_norm": 1.0838911533355713, "learning_rate": 1.716674661881848e-06, "loss": 9.0148, "num_input_tokens_seen": 317718528, "step": 202 }, { "epoch": 0.3005181347150259, "grad_norm": 0.9430079460144043, "learning_rate": 1.667392088483456e-06, "loss": 8.5152, "num_input_tokens_seen": 319291392, "step": 203 }, { "epoch": 0.30199851961509994, "grad_norm": 0.9246770739555359, "learning_rate": 1.6186853286758397e-06, "loss": 8.677, "num_input_tokens_seen": 320864256, "step": 204 }, { "epoch": 0.30347890451517395, "grad_norm": 0.9551609754562378, "learning_rate": 1.570562798094747e-06, "loss": 9.2058, "num_input_tokens_seen": 322437120, "step": 205 }, { "epoch": 0.30495928941524797, "grad_norm": 0.9086716175079346, "learning_rate": 1.5230328114318127e-06, "loss": 8.6598, "num_input_tokens_seen": 324009984, "step": 206 }, { "epoch": 0.306439674315322, "grad_norm": 1.0503205060958862, "learning_rate": 1.4761035809979395e-06, "loss": 9.6173, "num_input_tokens_seen": 325582848, "step": 207 }, { "epoch": 0.307920059215396, "grad_norm": 1.0393455028533936, "learning_rate": 1.4297832153043657e-06, "loss": 9.3826, "num_input_tokens_seen": 327155712, "step": 208 }, { "epoch": 0.30940044411547, "grad_norm": 0.867643415927887, "learning_rate": 1.3840797176616467e-06, "loss": 8.3576, "num_input_tokens_seen": 328728576, "step": 209 }, { "epoch": 0.31088082901554404, "grad_norm": 0.8910089135169983, "learning_rate": 1.3390009847968505e-06, "loss": 8.7599, "num_input_tokens_seen": 330301440, "step": 210 }, { "epoch": 0.31236121391561805, "grad_norm": 0.9597201943397522, "learning_rate": 1.2945548054891322e-06, "loss": 9.3277, "num_input_tokens_seen": 331874304, "step": 211 }, { "epoch": 0.31384159881569207, "grad_norm": 1.089836835861206, "learning_rate": 1.2507488592239848e-06, "loss": 8.9898, "num_input_tokens_seen": 333447168, "step": 212 }, { "epoch": 0.3153219837157661, "grad_norm": 0.9154278039932251, "learning_rate": 1.2075907148663579e-06, "loss": 8.9663, "num_input_tokens_seen": 335020032, "step": 213 }, { "epoch": 0.3168023686158401, "grad_norm": 1.0550861358642578, "learning_rate": 1.1650878293528994e-06, "loss": 9.9383, "num_input_tokens_seen": 336592896, "step": 214 }, { "epoch": 0.3182827535159141, "grad_norm": 1.0752592086791992, "learning_rate": 1.1232475464035386e-06, "loss": 9.0678, "num_input_tokens_seen": 338165760, "step": 215 }, { "epoch": 0.31976313841598814, "grad_norm": 0.9631142020225525, "learning_rate": 1.0820770952526155e-06, "loss": 9.0148, "num_input_tokens_seen": 339738624, "step": 216 }, { "epoch": 0.32124352331606215, "grad_norm": 1.012771487236023, "learning_rate": 1.0415835893998116e-06, "loss": 9.3557, "num_input_tokens_seen": 341311488, "step": 217 }, { "epoch": 0.32272390821613617, "grad_norm": 0.8069270253181458, "learning_rate": 1.0017740253810608e-06, "loss": 8.6944, "num_input_tokens_seen": 342884352, "step": 218 }, { "epoch": 0.32420429311621024, "grad_norm": 1.0018329620361328, "learning_rate": 9.62655281559679e-07, "loss": 9.864, "num_input_tokens_seen": 344457216, "step": 219 }, { "epoch": 0.32568467801628426, "grad_norm": 0.9892444610595703, "learning_rate": 9.242341169379077e-07, "loss": 9.3986, "num_input_tokens_seen": 346030080, "step": 220 }, { "epoch": 0.3271650629163583, "grad_norm": 0.9774924516677856, "learning_rate": 8.865171699890835e-07, "loss": 9.3258, "num_input_tokens_seen": 347602944, "step": 221 }, { "epoch": 0.3286454478164323, "grad_norm": 0.9630134701728821, "learning_rate": 8.495109575106331e-07, "loss": 9.5456, "num_input_tokens_seen": 349175808, "step": 222 }, { "epoch": 0.3301258327165063, "grad_norm": 0.9312668442726135, "learning_rate": 8.132218734980852e-07, "loss": 9.1459, "num_input_tokens_seen": 350748672, "step": 223 }, { "epoch": 0.3316062176165803, "grad_norm": 0.8973395824432373, "learning_rate": 7.776561880403072e-07, "loss": 8.6568, "num_input_tokens_seen": 352321536, "step": 224 }, { "epoch": 0.33308660251665434, "grad_norm": 0.9541833400726318, "learning_rate": 7.42820046236154e-07, "loss": 9.7172, "num_input_tokens_seen": 353894400, "step": 225 }, { "epoch": 0.33456698741672836, "grad_norm": 1.1497849225997925, "learning_rate": 7.087194671326986e-07, "loss": 8.6373, "num_input_tokens_seen": 355467264, "step": 226 }, { "epoch": 0.3360473723168024, "grad_norm": 1.0820022821426392, "learning_rate": 6.753603426852589e-07, "loss": 9.0473, "num_input_tokens_seen": 357040128, "step": 227 }, { "epoch": 0.3375277572168764, "grad_norm": 0.9475330114364624, "learning_rate": 6.427484367393699e-07, "loss": 8.4438, "num_input_tokens_seen": 358612992, "step": 228 }, { "epoch": 0.3390081421169504, "grad_norm": 1.0315179824829102, "learning_rate": 6.108893840348995e-07, "loss": 8.6633, "num_input_tokens_seen": 360185856, "step": 229 }, { "epoch": 0.3404885270170244, "grad_norm": 1.080465316772461, "learning_rate": 5.797886892324695e-07, "loss": 8.8234, "num_input_tokens_seen": 361758720, "step": 230 }, { "epoch": 0.34196891191709844, "grad_norm": 0.9258569478988647, "learning_rate": 5.494517259623478e-07, "loss": 9.1777, "num_input_tokens_seen": 363331584, "step": 231 }, { "epoch": 0.34344929681717246, "grad_norm": 1.034071445465088, "learning_rate": 5.198837358959901e-07, "loss": 8.5888, "num_input_tokens_seen": 364904448, "step": 232 }, { "epoch": 0.3449296817172465, "grad_norm": 0.8718693256378174, "learning_rate": 4.91089827840367e-07, "loss": 10.2311, "num_input_tokens_seen": 366477312, "step": 233 }, { "epoch": 0.3464100666173205, "grad_norm": 0.8354641795158386, "learning_rate": 4.6307497685525894e-07, "loss": 8.614, "num_input_tokens_seen": 368050176, "step": 234 }, { "epoch": 0.3478904515173945, "grad_norm": 1.3446056842803955, "learning_rate": 4.3584402339366174e-07, "loss": 9.0865, "num_input_tokens_seen": 369623040, "step": 235 }, { "epoch": 0.3493708364174685, "grad_norm": 0.9503663778305054, "learning_rate": 4.0940167246543595e-07, "loss": 9.8676, "num_input_tokens_seen": 371195904, "step": 236 }, { "epoch": 0.35085122131754254, "grad_norm": 1.1458405256271362, "learning_rate": 3.8375249282437743e-07, "loss": 9.0429, "num_input_tokens_seen": 372768768, "step": 237 }, { "epoch": 0.35233160621761656, "grad_norm": 0.9733213782310486, "learning_rate": 3.589009161788104e-07, "loss": 9.3573, "num_input_tokens_seen": 374341632, "step": 238 }, { "epoch": 0.35381199111769057, "grad_norm": 1.7733560800552368, "learning_rate": 3.3485123642587657e-07, "loss": 9.0824, "num_input_tokens_seen": 375914496, "step": 239 }, { "epoch": 0.35529237601776464, "grad_norm": 0.9710190892219543, "learning_rate": 3.116076089096265e-07, "loss": 9.1545, "num_input_tokens_seen": 377487360, "step": 240 }, { "epoch": 0.35677276091783866, "grad_norm": 0.9978755712509155, "learning_rate": 2.8917404970305096e-07, "loss": 9.4122, "num_input_tokens_seen": 379060224, "step": 241 }, { "epoch": 0.3582531458179127, "grad_norm": 1.022213339805603, "learning_rate": 2.6755443491417786e-07, "loss": 10.0346, "num_input_tokens_seen": 380633088, "step": 242 }, { "epoch": 0.3597335307179867, "grad_norm": 1.015006422996521, "learning_rate": 2.467525000163523e-07, "loss": 8.7515, "num_input_tokens_seen": 382205952, "step": 243 }, { "epoch": 0.3612139156180607, "grad_norm": 1.0302066802978516, "learning_rate": 2.2677183920281342e-07, "loss": 9.541, "num_input_tokens_seen": 383778816, "step": 244 }, { "epoch": 0.3626943005181347, "grad_norm": 0.9580565690994263, "learning_rate": 2.0761590476568893e-07, "loss": 10.0999, "num_input_tokens_seen": 385351680, "step": 245 }, { "epoch": 0.36417468541820874, "grad_norm": 0.9705215692520142, "learning_rate": 1.892880064994934e-07, "loss": 9.1858, "num_input_tokens_seen": 386924544, "step": 246 }, { "epoch": 0.36565507031828276, "grad_norm": 0.7981467247009277, "learning_rate": 1.7179131112926628e-07, "loss": 8.0595, "num_input_tokens_seen": 388497408, "step": 247 }, { "epoch": 0.3671354552183568, "grad_norm": 1.095644474029541, "learning_rate": 1.551288417634106e-07, "loss": 8.4068, "num_input_tokens_seen": 390070272, "step": 248 }, { "epoch": 0.3686158401184308, "grad_norm": 1.028245449066162, "learning_rate": 1.3930347737136195e-07, "loss": 9.8459, "num_input_tokens_seen": 391643136, "step": 249 }, { "epoch": 0.3700962250185048, "grad_norm": 0.981110155582428, "learning_rate": 1.2431795228615372e-07, "loss": 8.3485, "num_input_tokens_seen": 393216000, "step": 250 }, { "epoch": 0.3715766099185788, "grad_norm": 0.8525012135505676, "learning_rate": 1.1017485573197151e-07, "loss": 9.1968, "num_input_tokens_seen": 394788864, "step": 251 }, { "epoch": 0.37305699481865284, "grad_norm": 0.9560251832008362, "learning_rate": 9.687663137678605e-08, "loss": 10.2963, "num_input_tokens_seen": 396361728, "step": 252 }, { "epoch": 0.37453737971872686, "grad_norm": 0.8536869883537292, "learning_rate": 8.442557691013042e-08, "loss": 9.5709, "num_input_tokens_seen": 397934592, "step": 253 }, { "epoch": 0.3760177646188009, "grad_norm": 1.0305019617080688, "learning_rate": 7.282384364610207e-08, "loss": 8.4585, "num_input_tokens_seen": 399507456, "step": 254 }, { "epoch": 0.3774981495188749, "grad_norm": 0.9297599792480469, "learning_rate": 6.207343615165562e-08, "loss": 9.1767, "num_input_tokens_seen": 401080320, "step": 255 }, { "epoch": 0.3789785344189489, "grad_norm": 1.0432578325271606, "learning_rate": 5.21762119002478e-08, "loss": 8.9613, "num_input_tokens_seen": 402653184, "step": 256 }, { "epoch": 0.3804589193190229, "grad_norm": 0.9926044940948486, "learning_rate": 4.31338809509052e-08, "loss": 10.0942, "num_input_tokens_seen": 404226048, "step": 257 }, { "epoch": 0.38193930421909694, "grad_norm": 1.0333350896835327, "learning_rate": 3.494800565275125e-08, "loss": 8.8122, "num_input_tokens_seen": 405798912, "step": 258 }, { "epoch": 0.38341968911917096, "grad_norm": 1.0322990417480469, "learning_rate": 2.7620000375064848e-08, "loss": 9.7431, "num_input_tokens_seen": 407371776, "step": 259 }, { "epoch": 0.38490007401924503, "grad_norm": 0.9929525256156921, "learning_rate": 2.115113126290258e-08, "loss": 8.8897, "num_input_tokens_seen": 408944640, "step": 260 }, { "epoch": 0.38638045891931905, "grad_norm": 0.9430935978889465, "learning_rate": 1.554251601833201e-08, "loss": 8.9446, "num_input_tokens_seen": 410517504, "step": 261 }, { "epoch": 0.38786084381939306, "grad_norm": 1.02765691280365, "learning_rate": 1.0795123707312283e-08, "loss": 9.9964, "num_input_tokens_seen": 412090368, "step": 262 }, { "epoch": 0.3893412287194671, "grad_norm": 0.9639078974723816, "learning_rate": 6.9097745922580564e-09, "loss": 9.2349, "num_input_tokens_seen": 413663232, "step": 263 }, { "epoch": 0.3908216136195411, "grad_norm": 0.8407428860664368, "learning_rate": 3.887139990313427e-09, "loss": 10.3673, "num_input_tokens_seen": 415236096, "step": 264 }, { "epoch": 0.3923019985196151, "grad_norm": 1.0720157623291016, "learning_rate": 1.7277421573608234e-09, "loss": 9.3684, "num_input_tokens_seen": 416808960, "step": 265 }, { "epoch": 0.39378238341968913, "grad_norm": 0.9578903913497925, "learning_rate": 4.3195419778319095e-10, "loss": 8.8959, "num_input_tokens_seen": 418381824, "step": 266 } ], "logging_steps": 1.0, "max_steps": 266, "num_input_tokens_seen": 418381824, "num_train_epochs": 1, "save_steps": 133, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 773247121489920.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }