{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 165, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0060790273556231, "grad_norm": 1.133876085281372, "learning_rate": 0.0, "loss": 2.3054, "step": 1 }, { "epoch": 0.0121580547112462, "grad_norm": 1.1317460536956787, "learning_rate": 4e-05, "loss": 2.3129, "step": 2 }, { "epoch": 0.0182370820668693, "grad_norm": 1.030227780342102, "learning_rate": 8e-05, "loss": 2.2997, "step": 3 }, { "epoch": 0.0243161094224924, "grad_norm": 0.7727698087692261, "learning_rate": 0.00012, "loss": 2.0895, "step": 4 }, { "epoch": 0.030395136778115502, "grad_norm": 0.6157111525535583, "learning_rate": 0.00016, "loss": 1.9285, "step": 5 }, { "epoch": 0.0364741641337386, "grad_norm": 0.45515450835227966, "learning_rate": 0.0002, "loss": 1.6672, "step": 6 }, { "epoch": 0.0425531914893617, "grad_norm": 0.3927247226238251, "learning_rate": 0.0001999953280342959, "loss": 1.4969, "step": 7 }, { "epoch": 0.0486322188449848, "grad_norm": 0.36588212847709656, "learning_rate": 0.00019998131257372876, "loss": 1.3842, "step": 8 }, { "epoch": 0.0547112462006079, "grad_norm": 0.29039424657821655, "learning_rate": 0.0001999579549278937, "loss": 1.2499, "step": 9 }, { "epoch": 0.060790273556231005, "grad_norm": 0.279226690530777, "learning_rate": 0.00019992525727931303, "loss": 1.1685, "step": 10 }, { "epoch": 0.0668693009118541, "grad_norm": 0.272398978471756, "learning_rate": 0.00019988322268323268, "loss": 1.0638, "step": 11 }, { "epoch": 0.0729483282674772, "grad_norm": 0.24636079370975494, "learning_rate": 0.0001998318550673364, "loss": 0.9584, "step": 12 }, { "epoch": 0.0790273556231003, "grad_norm": 0.22141209244728088, "learning_rate": 0.00019977115923137912, "loss": 0.9138, "step": 13 }, { "epoch": 0.0851063829787234, "grad_norm": 0.17475497722625732, "learning_rate": 0.00019970114084673796, "loss": 0.8515, "step": 14 }, { "epoch": 0.0911854103343465, "grad_norm": 0.15117621421813965, "learning_rate": 0.0001996218064558829, "loss": 0.8233, "step": 15 }, { "epoch": 0.0972644376899696, "grad_norm": 0.11647526919841766, "learning_rate": 0.00019953316347176488, "loss": 0.8347, "step": 16 }, { "epoch": 0.1033434650455927, "grad_norm": 0.10608438402414322, "learning_rate": 0.00019943522017712358, "loss": 0.7888, "step": 17 }, { "epoch": 0.1094224924012158, "grad_norm": 0.10528700798749924, "learning_rate": 0.0001993279857237133, "loss": 0.7655, "step": 18 }, { "epoch": 0.11550151975683891, "grad_norm": 0.09133293479681015, "learning_rate": 0.0001992114701314478, "loss": 0.7585, "step": 19 }, { "epoch": 0.12158054711246201, "grad_norm": 0.09861589223146439, "learning_rate": 0.0001990856842874641, "loss": 0.7616, "step": 20 }, { "epoch": 0.1276595744680851, "grad_norm": 0.07328511029481888, "learning_rate": 0.0001989506399451051, "loss": 0.746, "step": 21 }, { "epoch": 0.1337386018237082, "grad_norm": 0.08320208638906479, "learning_rate": 0.00019880634972282166, "loss": 0.729, "step": 22 }, { "epoch": 0.1398176291793313, "grad_norm": 0.08053930848836899, "learning_rate": 0.0001986528271029931, "loss": 0.7052, "step": 23 }, { "epoch": 0.1458966565349544, "grad_norm": 0.08553291857242584, "learning_rate": 0.00019849008643066772, "loss": 0.7167, "step": 24 }, { "epoch": 0.1519756838905775, "grad_norm": 0.07856528460979462, "learning_rate": 0.00019831814291222232, "loss": 0.707, "step": 25 }, { "epoch": 0.1580547112462006, "grad_norm": 0.0697573572397232, "learning_rate": 0.00019813701261394136, "loss": 0.706, "step": 26 }, { "epoch": 0.1641337386018237, "grad_norm": 0.07752593606710434, "learning_rate": 0.0001979467124605156, "loss": 0.702, "step": 27 }, { "epoch": 0.1702127659574468, "grad_norm": 0.06536618620157242, "learning_rate": 0.0001977472602334609, "loss": 0.6612, "step": 28 }, { "epoch": 0.1762917933130699, "grad_norm": 0.061260443180799484, "learning_rate": 0.0001975386745694565, "loss": 0.6432, "step": 29 }, { "epoch": 0.182370820668693, "grad_norm": 0.04959681257605553, "learning_rate": 0.00019732097495860386, "loss": 0.6545, "step": 30 }, { "epoch": 0.1884498480243161, "grad_norm": 0.05118938162922859, "learning_rate": 0.0001970941817426052, "loss": 0.6564, "step": 31 }, { "epoch": 0.1945288753799392, "grad_norm": 0.05098890885710716, "learning_rate": 0.0001968583161128631, "loss": 0.6751, "step": 32 }, { "epoch": 0.2006079027355623, "grad_norm": 0.049598027020692825, "learning_rate": 0.00019661340010850026, "loss": 0.6582, "step": 33 }, { "epoch": 0.2066869300911854, "grad_norm": 0.052051279693841934, "learning_rate": 0.00019635945661430006, "loss": 0.6279, "step": 34 }, { "epoch": 0.2127659574468085, "grad_norm": 0.04154731333255768, "learning_rate": 0.00019609650935856844, "loss": 0.6344, "step": 35 }, { "epoch": 0.2188449848024316, "grad_norm": 0.045474398881196976, "learning_rate": 0.00019582458291091663, "loss": 0.6195, "step": 36 }, { "epoch": 0.22492401215805471, "grad_norm": 0.04392531141638756, "learning_rate": 0.00019554370267996538, "loss": 0.6394, "step": 37 }, { "epoch": 0.23100303951367782, "grad_norm": 0.04103963077068329, "learning_rate": 0.0001952538949109708, "loss": 0.6334, "step": 38 }, { "epoch": 0.23708206686930092, "grad_norm": 0.042054690420627594, "learning_rate": 0.00019495518668337201, "loss": 0.6239, "step": 39 }, { "epoch": 0.24316109422492402, "grad_norm": 0.0393654890358448, "learning_rate": 0.00019464760590826098, "loss": 0.6054, "step": 40 }, { "epoch": 0.24924012158054712, "grad_norm": 0.042832907289266586, "learning_rate": 0.0001943311813257743, "loss": 0.5769, "step": 41 }, { "epoch": 0.2553191489361702, "grad_norm": 0.03982216864824295, "learning_rate": 0.00019400594250240798, "loss": 0.5921, "step": 42 }, { "epoch": 0.2613981762917933, "grad_norm": 0.03954484313726425, "learning_rate": 0.0001936719198282545, "loss": 0.5976, "step": 43 }, { "epoch": 0.2674772036474164, "grad_norm": 0.042563363909721375, "learning_rate": 0.00019332914451416347, "loss": 0.6016, "step": 44 }, { "epoch": 0.2735562310030395, "grad_norm": 0.04000777006149292, "learning_rate": 0.00019297764858882514, "loss": 0.5822, "step": 45 }, { "epoch": 0.2796352583586626, "grad_norm": 0.04084771126508713, "learning_rate": 0.00019261746489577765, "loss": 0.6122, "step": 46 }, { "epoch": 0.2857142857142857, "grad_norm": 0.040883008390665054, "learning_rate": 0.00019224862709033824, "loss": 0.5898, "step": 47 }, { "epoch": 0.2917933130699088, "grad_norm": 0.04313352331519127, "learning_rate": 0.00019187116963645842, "loss": 0.5852, "step": 48 }, { "epoch": 0.2978723404255319, "grad_norm": 0.04391175135970116, "learning_rate": 0.00019148512780350384, "loss": 0.5901, "step": 49 }, { "epoch": 0.303951367781155, "grad_norm": 0.041302260011434555, "learning_rate": 0.0001910905376629585, "loss": 0.5725, "step": 50 }, { "epoch": 0.3100303951367781, "grad_norm": 0.044717274606227875, "learning_rate": 0.00019068743608505455, "loss": 0.5396, "step": 51 }, { "epoch": 0.3161094224924012, "grad_norm": 0.04290296137332916, "learning_rate": 0.0001902758607353269, "loss": 0.5888, "step": 52 }, { "epoch": 0.3221884498480243, "grad_norm": 0.04425125569105148, "learning_rate": 0.0001898558500710939, "loss": 0.5614, "step": 53 }, { "epoch": 0.3282674772036474, "grad_norm": 0.047949157655239105, "learning_rate": 0.00018942744333786397, "loss": 0.5434, "step": 54 }, { "epoch": 0.3343465045592705, "grad_norm": 0.0461716391146183, "learning_rate": 0.0001889906805656684, "loss": 0.5788, "step": 55 }, { "epoch": 0.3404255319148936, "grad_norm": 0.04746852442622185, "learning_rate": 0.000188545602565321, "loss": 0.5431, "step": 56 }, { "epoch": 0.3465045592705167, "grad_norm": 0.04825344309210777, "learning_rate": 0.00018809225092460488, "loss": 0.5372, "step": 57 }, { "epoch": 0.3525835866261398, "grad_norm": 0.046712443232536316, "learning_rate": 0.00018763066800438636, "loss": 0.5721, "step": 58 }, { "epoch": 0.3586626139817629, "grad_norm": 0.045053232461214066, "learning_rate": 0.00018716089693465696, "loss": 0.5632, "step": 59 }, { "epoch": 0.364741641337386, "grad_norm": 0.04859543964266777, "learning_rate": 0.00018668298161050309, "loss": 0.579, "step": 60 }, { "epoch": 0.3708206686930091, "grad_norm": 0.05270425230264664, "learning_rate": 0.00018619696668800492, "loss": 0.5625, "step": 61 }, { "epoch": 0.3768996960486322, "grad_norm": 0.05261973291635513, "learning_rate": 0.00018570289758006346, "loss": 0.5377, "step": 62 }, { "epoch": 0.3829787234042553, "grad_norm": 0.04979139566421509, "learning_rate": 0.0001852008204521572, "loss": 0.5403, "step": 63 }, { "epoch": 0.3890577507598784, "grad_norm": 0.05422956123948097, "learning_rate": 0.0001846907822180286, "loss": 0.5251, "step": 64 }, { "epoch": 0.3951367781155015, "grad_norm": 0.057373154908418655, "learning_rate": 0.00018417283053530044, "loss": 0.5237, "step": 65 }, { "epoch": 0.4012158054711246, "grad_norm": 0.054097920656204224, "learning_rate": 0.00018364701380102266, "loss": 0.5083, "step": 66 }, { "epoch": 0.4072948328267477, "grad_norm": 0.05596352368593216, "learning_rate": 0.0001831133811471503, "loss": 0.5185, "step": 67 }, { "epoch": 0.4133738601823708, "grad_norm": 0.054655492305755615, "learning_rate": 0.0001825719824359524, "loss": 0.5103, "step": 68 }, { "epoch": 0.4194528875379939, "grad_norm": 0.056628547608852386, "learning_rate": 0.0001820228682553533, "loss": 0.5402, "step": 69 }, { "epoch": 0.425531914893617, "grad_norm": 0.058673664927482605, "learning_rate": 0.00018146608991420534, "loss": 0.5156, "step": 70 }, { "epoch": 0.4316109422492401, "grad_norm": 0.0580279715359211, "learning_rate": 0.00018090169943749476, "loss": 0.5077, "step": 71 }, { "epoch": 0.4376899696048632, "grad_norm": 0.06008626148104668, "learning_rate": 0.00018032974956148063, "loss": 0.4982, "step": 72 }, { "epoch": 0.44376899696048633, "grad_norm": 0.06102442368865013, "learning_rate": 0.00017975029372876706, "loss": 0.5117, "step": 73 }, { "epoch": 0.44984802431610943, "grad_norm": 0.061185382306575775, "learning_rate": 0.0001791633860833096, "loss": 0.4933, "step": 74 }, { "epoch": 0.45592705167173253, "grad_norm": 0.06262445449829102, "learning_rate": 0.00017856908146535603, "loss": 0.4907, "step": 75 }, { "epoch": 0.46200607902735563, "grad_norm": 0.06194239482283592, "learning_rate": 0.00017796743540632223, "loss": 0.4722, "step": 76 }, { "epoch": 0.46808510638297873, "grad_norm": 0.06256049871444702, "learning_rate": 0.00017735850412360331, "loss": 0.4935, "step": 77 }, { "epoch": 0.47416413373860183, "grad_norm": 0.06734279543161392, "learning_rate": 0.00017674234451532065, "loss": 0.4767, "step": 78 }, { "epoch": 0.48024316109422494, "grad_norm": 0.06772830337285995, "learning_rate": 0.00017611901415500535, "loss": 0.4915, "step": 79 }, { "epoch": 0.48632218844984804, "grad_norm": 0.06995881348848343, "learning_rate": 0.00017548857128621875, "loss": 0.4723, "step": 80 }, { "epoch": 0.49240121580547114, "grad_norm": 0.06601176410913467, "learning_rate": 0.00017485107481711012, "loss": 0.4831, "step": 81 }, { "epoch": 0.49848024316109424, "grad_norm": 0.06836414337158203, "learning_rate": 0.00017420658431491223, "loss": 0.4585, "step": 82 }, { "epoch": 0.5045592705167173, "grad_norm": 0.06948156654834747, "learning_rate": 0.00017355516000037554, "loss": 0.4624, "step": 83 }, { "epoch": 0.5106382978723404, "grad_norm": 0.06856788694858551, "learning_rate": 0.00017289686274214118, "loss": 0.4497, "step": 84 }, { "epoch": 0.5167173252279635, "grad_norm": 0.07304105907678604, "learning_rate": 0.0001722317540510534, "loss": 0.4697, "step": 85 }, { "epoch": 0.5227963525835866, "grad_norm": 0.07297949492931366, "learning_rate": 0.00017155989607441213, "loss": 0.4376, "step": 86 }, { "epoch": 0.5288753799392097, "grad_norm": 0.07408228516578674, "learning_rate": 0.00017088135159016584, "loss": 0.4493, "step": 87 }, { "epoch": 0.5349544072948328, "grad_norm": 0.07207636535167694, "learning_rate": 0.00017019618400104572, "loss": 0.4612, "step": 88 }, { "epoch": 0.541033434650456, "grad_norm": 0.07454758137464523, "learning_rate": 0.00016950445732864127, "loss": 0.4123, "step": 89 }, { "epoch": 0.547112462006079, "grad_norm": 0.07566685974597931, "learning_rate": 0.00016880623620741842, "loss": 0.4632, "step": 90 }, { "epoch": 0.5531914893617021, "grad_norm": 0.07650725543498993, "learning_rate": 0.00016810158587867973, "loss": 0.4153, "step": 91 }, { "epoch": 0.5592705167173252, "grad_norm": 0.07634485512971878, "learning_rate": 0.0001673905721844686, "loss": 0.4402, "step": 92 }, { "epoch": 0.5653495440729484, "grad_norm": 0.07571671158075333, "learning_rate": 0.00016667326156141692, "loss": 0.4308, "step": 93 }, { "epoch": 0.5714285714285714, "grad_norm": 0.08016683161258698, "learning_rate": 0.00016594972103453726, "loss": 0.4213, "step": 94 }, { "epoch": 0.5775075987841946, "grad_norm": 0.07403143495321274, "learning_rate": 0.0001652200182109602, "loss": 0.4485, "step": 95 }, { "epoch": 0.5835866261398176, "grad_norm": 0.08598003536462784, "learning_rate": 0.00016448422127361706, "loss": 0.4044, "step": 96 }, { "epoch": 0.5896656534954408, "grad_norm": 0.0837491899728775, "learning_rate": 0.000163742398974869, "loss": 0.3965, "step": 97 }, { "epoch": 0.5957446808510638, "grad_norm": 0.08433262258768082, "learning_rate": 0.00016299462063008272, "loss": 0.4267, "step": 98 }, { "epoch": 0.601823708206687, "grad_norm": 0.07738591730594635, "learning_rate": 0.00016224095611115384, "loss": 0.4175, "step": 99 }, { "epoch": 0.60790273556231, "grad_norm": 0.08330941945314407, "learning_rate": 0.00016148147583997812, "loss": 0.4324, "step": 100 }, { "epoch": 0.6139817629179332, "grad_norm": 0.08629601448774338, "learning_rate": 0.00016071625078187114, "loss": 0.3949, "step": 101 }, { "epoch": 0.6200607902735562, "grad_norm": 0.08496759831905365, "learning_rate": 0.0001599453524389374, "loss": 0.4181, "step": 102 }, { "epoch": 0.6261398176291794, "grad_norm": 0.08593132346868515, "learning_rate": 0.00015916885284338937, "loss": 0.3979, "step": 103 }, { "epoch": 0.6322188449848024, "grad_norm": 0.08198531717061996, "learning_rate": 0.00015838682455081657, "loss": 0.396, "step": 104 }, { "epoch": 0.6382978723404256, "grad_norm": 0.08349744975566864, "learning_rate": 0.00015759934063340627, "loss": 0.3773, "step": 105 }, { "epoch": 0.6443768996960486, "grad_norm": 0.08445355296134949, "learning_rate": 0.00015680647467311557, "loss": 0.3946, "step": 106 }, { "epoch": 0.6504559270516718, "grad_norm": 0.0808950737118721, "learning_rate": 0.00015600830075479603, "loss": 0.3926, "step": 107 }, { "epoch": 0.6565349544072948, "grad_norm": 0.08728586137294769, "learning_rate": 0.00015520489345927096, "loss": 0.4248, "step": 108 }, { "epoch": 0.662613981762918, "grad_norm": 0.08645470440387726, "learning_rate": 0.00015439632785636706, "loss": 0.4051, "step": 109 }, { "epoch": 0.668693009118541, "grad_norm": 0.07910045236349106, "learning_rate": 0.00015358267949789966, "loss": 0.4167, "step": 110 }, { "epoch": 0.6747720364741642, "grad_norm": 0.08322255313396454, "learning_rate": 0.0001527640244106133, "loss": 0.3842, "step": 111 }, { "epoch": 0.6808510638297872, "grad_norm": 0.07981768995523453, "learning_rate": 0.00015194043908907775, "loss": 0.3942, "step": 112 }, { "epoch": 0.6869300911854104, "grad_norm": 0.08817645907402039, "learning_rate": 0.00015111200048854056, "loss": 0.3739, "step": 113 }, { "epoch": 0.6930091185410334, "grad_norm": 0.09770014137029648, "learning_rate": 0.00015027878601773633, "loss": 0.3781, "step": 114 }, { "epoch": 0.6990881458966566, "grad_norm": 0.09042941778898239, "learning_rate": 0.0001494408735316537, "loss": 0.3897, "step": 115 }, { "epoch": 0.7051671732522796, "grad_norm": 0.08232049643993378, "learning_rate": 0.0001485983413242606, "loss": 0.3566, "step": 116 }, { "epoch": 0.7112462006079028, "grad_norm": 0.08187402784824371, "learning_rate": 0.00014775126812118864, "loss": 0.3559, "step": 117 }, { "epoch": 0.7173252279635258, "grad_norm": 0.08564823865890503, "learning_rate": 0.00014689973307237687, "loss": 0.37, "step": 118 }, { "epoch": 0.723404255319149, "grad_norm": 0.09082309901714325, "learning_rate": 0.00014604381574467615, "loss": 0.3962, "step": 119 }, { "epoch": 0.729483282674772, "grad_norm": 0.09014427661895752, "learning_rate": 0.0001451835961144145, "loss": 0.3391, "step": 120 }, { "epoch": 0.7355623100303952, "grad_norm": 0.08382211625576019, "learning_rate": 0.00014431915455992414, "loss": 0.3547, "step": 121 }, { "epoch": 0.7416413373860182, "grad_norm": 0.08397499471902847, "learning_rate": 0.000143450571854031, "loss": 0.3479, "step": 122 }, { "epoch": 0.7477203647416414, "grad_norm": 0.08822325617074966, "learning_rate": 0.00014257792915650728, "loss": 0.3824, "step": 123 }, { "epoch": 0.7537993920972644, "grad_norm": 0.08629824221134186, "learning_rate": 0.00014170130800648814, "loss": 0.3548, "step": 124 }, { "epoch": 0.7598784194528876, "grad_norm": 0.09454140067100525, "learning_rate": 0.0001408207903148525, "loss": 0.3524, "step": 125 }, { "epoch": 0.7659574468085106, "grad_norm": 0.08818770945072174, "learning_rate": 0.00013993645835656953, "loss": 0.388, "step": 126 }, { "epoch": 0.7720364741641338, "grad_norm": 0.08908054232597351, "learning_rate": 0.0001390483947630109, "loss": 0.3548, "step": 127 }, { "epoch": 0.7781155015197568, "grad_norm": 0.08153887093067169, "learning_rate": 0.00013815668251422952, "loss": 0.3545, "step": 128 }, { "epoch": 0.78419452887538, "grad_norm": 0.08948613703250885, "learning_rate": 0.0001372614049312064, "loss": 0.3483, "step": 129 }, { "epoch": 0.790273556231003, "grad_norm": 0.08655694872140884, "learning_rate": 0.0001363626456680647, "loss": 0.321, "step": 130 }, { "epoch": 0.7963525835866262, "grad_norm": 0.09818850457668304, "learning_rate": 0.00013546048870425356, "loss": 0.3647, "step": 131 }, { "epoch": 0.8024316109422492, "grad_norm": 0.0866372138261795, "learning_rate": 0.00013455501833670088, "loss": 0.3604, "step": 132 }, { "epoch": 0.8085106382978723, "grad_norm": 0.09384460002183914, "learning_rate": 0.0001336463191719367, "loss": 0.357, "step": 133 }, { "epoch": 0.8145896656534954, "grad_norm": 0.09135902673006058, "learning_rate": 0.00013273447611818767, "loss": 0.3406, "step": 134 }, { "epoch": 0.8206686930091185, "grad_norm": 0.08802594989538193, "learning_rate": 0.00013181957437744332, "loss": 0.3548, "step": 135 }, { "epoch": 0.8267477203647416, "grad_norm": 0.09025990217924118, "learning_rate": 0.00013090169943749476, "loss": 0.3505, "step": 136 }, { "epoch": 0.8328267477203647, "grad_norm": 0.09130747616291046, "learning_rate": 0.00012998093706394675, "loss": 0.3159, "step": 137 }, { "epoch": 0.8389057750759878, "grad_norm": 0.09548977762460709, "learning_rate": 0.00012905737329220392, "loss": 0.3473, "step": 138 }, { "epoch": 0.8449848024316109, "grad_norm": 0.08869072049856186, "learning_rate": 0.00012813109441943166, "loss": 0.3379, "step": 139 }, { "epoch": 0.851063829787234, "grad_norm": 0.09292670339345932, "learning_rate": 0.00012720218699649243, "loss": 0.3252, "step": 140 }, { "epoch": 0.8571428571428571, "grad_norm": 0.08339939266443253, "learning_rate": 0.0001262707378198587, "loss": 0.2908, "step": 141 }, { "epoch": 0.8632218844984803, "grad_norm": 0.0901230052113533, "learning_rate": 0.00012533683392350263, "loss": 0.3346, "step": 142 }, { "epoch": 0.8693009118541033, "grad_norm": 0.09229591488838196, "learning_rate": 0.00012440056257076375, "loss": 0.3523, "step": 143 }, { "epoch": 0.8753799392097265, "grad_norm": 0.09824731945991516, "learning_rate": 0.00012346201124619502, "loss": 0.3416, "step": 144 }, { "epoch": 0.8814589665653495, "grad_norm": 0.09328845143318176, "learning_rate": 0.00012252126764738844, "loss": 0.3211, "step": 145 }, { "epoch": 0.8875379939209727, "grad_norm": 0.09492561221122742, "learning_rate": 0.00012157841967678063, "loss": 0.3404, "step": 146 }, { "epoch": 0.8936170212765957, "grad_norm": 0.09995546191930771, "learning_rate": 0.00012063355543343924, "loss": 0.3188, "step": 147 }, { "epoch": 0.8996960486322189, "grad_norm": 0.09694822877645493, "learning_rate": 0.00011968676320483103, "loss": 0.3269, "step": 148 }, { "epoch": 0.9057750759878419, "grad_norm": 0.11103739589452744, "learning_rate": 0.00011873813145857249, "loss": 0.3228, "step": 149 }, { "epoch": 0.9118541033434651, "grad_norm": 0.10173939168453217, "learning_rate": 0.00011778774883416323, "loss": 0.3033, "step": 150 }, { "epoch": 0.9179331306990881, "grad_norm": 0.10053914040327072, "learning_rate": 0.00011683570413470383, "loss": 0.3357, "step": 151 }, { "epoch": 0.9240121580547113, "grad_norm": 0.09117776155471802, "learning_rate": 0.00011588208631859807, "loss": 0.3288, "step": 152 }, { "epoch": 0.9300911854103343, "grad_norm": 0.08972764015197754, "learning_rate": 0.00011492698449124042, "loss": 0.3134, "step": 153 }, { "epoch": 0.9361702127659575, "grad_norm": 0.09842713177204132, "learning_rate": 0.0001139704878966906, "loss": 0.3258, "step": 154 }, { "epoch": 0.9422492401215805, "grad_norm": 0.08585759252309799, "learning_rate": 0.00011301268590933434, "loss": 0.3121, "step": 155 }, { "epoch": 0.9483282674772037, "grad_norm": 0.10005568712949753, "learning_rate": 0.0001120536680255323, "loss": 0.3199, "step": 156 }, { "epoch": 0.9544072948328267, "grad_norm": 0.10690546780824661, "learning_rate": 0.00011109352385525783, "loss": 0.3008, "step": 157 }, { "epoch": 0.9604863221884499, "grad_norm": 0.0924290269613266, "learning_rate": 0.00011013234311372353, "loss": 0.339, "step": 158 }, { "epoch": 0.9665653495440729, "grad_norm": 0.09407492727041245, "learning_rate": 0.00010917021561299863, "loss": 0.3282, "step": 159 }, { "epoch": 0.9726443768996961, "grad_norm": 0.09656916558742523, "learning_rate": 0.00010820723125361684, "loss": 0.285, "step": 160 }, { "epoch": 0.9787234042553191, "grad_norm": 0.0891689881682396, "learning_rate": 0.00010724348001617625, "loss": 0.3134, "step": 161 }, { "epoch": 0.9848024316109423, "grad_norm": 0.09076245874166489, "learning_rate": 0.00010627905195293135, "loss": 0.3014, "step": 162 }, { "epoch": 0.9908814589665653, "grad_norm": 0.09239046275615692, "learning_rate": 0.00010531403717937887, "loss": 0.2828, "step": 163 }, { "epoch": 0.9969604863221885, "grad_norm": 0.10273317247629166, "learning_rate": 0.00010434852586583736, "loss": 0.3159, "step": 164 }, { "epoch": 1.0, "grad_norm": 0.1356026530265808, "learning_rate": 0.00010338260822902167, "loss": 0.2756, "step": 165 } ], "logging_steps": 1, "max_steps": 330, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 15, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.452241334344024e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }