{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9996985954548194, "eval_steps": 500, "global_step": 8294, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012056181807221652, "grad_norm": 1.3480521440505981, "learning_rate": 4.9999971305306124e-05, "loss": 1.4433, "num_input_tokens_seen": 216464, "step": 5 }, { "epoch": 0.0024112363614443304, "grad_norm": 0.6002678871154785, "learning_rate": 4.999985473322515e-05, "loss": 1.2127, "num_input_tokens_seen": 429576, "step": 10 }, { "epoch": 0.003616854542166496, "grad_norm": 0.8366473913192749, "learning_rate": 4.999964849075651e-05, "loss": 1.175, "num_input_tokens_seen": 637232, "step": 15 }, { "epoch": 0.004822472722888661, "grad_norm": 0.5535498261451721, "learning_rate": 4.9999352578639956e-05, "loss": 1.1726, "num_input_tokens_seen": 849144, "step": 20 }, { "epoch": 0.006028090903610827, "grad_norm": 1.325501799583435, "learning_rate": 4.999896699793689e-05, "loss": 1.1, "num_input_tokens_seen": 1066096, "step": 25 }, { "epoch": 0.007233709084332992, "grad_norm": 1.2245291471481323, "learning_rate": 4.9998491750030315e-05, "loss": 1.1467, "num_input_tokens_seen": 1279184, "step": 30 }, { "epoch": 0.008439327265055157, "grad_norm": 0.5779328346252441, "learning_rate": 4.999792683662487e-05, "loss": 1.0863, "num_input_tokens_seen": 1494200, "step": 35 }, { "epoch": 0.009644945445777322, "grad_norm": 0.6143853664398193, "learning_rate": 4.999727225974683e-05, "loss": 1.1508, "num_input_tokens_seen": 1713136, "step": 40 }, { "epoch": 0.010850563626499488, "grad_norm": 0.761713445186615, "learning_rate": 4.999652802174402e-05, "loss": 1.0892, "num_input_tokens_seen": 1914408, "step": 45 }, { "epoch": 0.012056181807221653, "grad_norm": 0.6331069469451904, "learning_rate": 4.999569412528591e-05, "loss": 1.0297, "num_input_tokens_seen": 2119464, "step": 50 }, { "epoch": 0.013261799987943818, "grad_norm": 0.7920764088630676, "learning_rate": 4.999477057336356e-05, "loss": 1.0973, "num_input_tokens_seen": 2324248, "step": 55 }, { "epoch": 0.014467418168665983, "grad_norm": 0.8538907170295715, "learning_rate": 4.99937573692896e-05, "loss": 1.097, "num_input_tokens_seen": 2538568, "step": 60 }, { "epoch": 0.015673036349388148, "grad_norm": 0.7392667531967163, "learning_rate": 4.999265451669821e-05, "loss": 1.0879, "num_input_tokens_seen": 2757816, "step": 65 }, { "epoch": 0.016878654530110313, "grad_norm": 0.7396961450576782, "learning_rate": 4.9991462019545157e-05, "loss": 1.0768, "num_input_tokens_seen": 2964904, "step": 70 }, { "epoch": 0.018084272710832478, "grad_norm": 0.6768509149551392, "learning_rate": 4.999017988210773e-05, "loss": 0.9968, "num_input_tokens_seen": 3182536, "step": 75 }, { "epoch": 0.019289890891554643, "grad_norm": 0.6711410880088806, "learning_rate": 4.9988808108984755e-05, "loss": 1.0879, "num_input_tokens_seen": 3393528, "step": 80 }, { "epoch": 0.02049550907227681, "grad_norm": 0.6745995283126831, "learning_rate": 4.998734670509653e-05, "loss": 1.0051, "num_input_tokens_seen": 3605696, "step": 85 }, { "epoch": 0.021701127252998977, "grad_norm": 0.7406673431396484, "learning_rate": 4.99857956756849e-05, "loss": 1.0619, "num_input_tokens_seen": 3821496, "step": 90 }, { "epoch": 0.02290674543372114, "grad_norm": 0.7060136198997498, "learning_rate": 4.998415502631315e-05, "loss": 0.9891, "num_input_tokens_seen": 4038456, "step": 95 }, { "epoch": 0.024112363614443307, "grad_norm": 0.6734737157821655, "learning_rate": 4.998242476286601e-05, "loss": 0.9937, "num_input_tokens_seen": 4256024, "step": 100 }, { "epoch": 0.02531798179516547, "grad_norm": 0.9583795666694641, "learning_rate": 4.9980604891549656e-05, "loss": 1.0275, "num_input_tokens_seen": 4469056, "step": 105 }, { "epoch": 0.026523599975887636, "grad_norm": 0.8260840177536011, "learning_rate": 4.997869541889166e-05, "loss": 1.1295, "num_input_tokens_seen": 4677952, "step": 110 }, { "epoch": 0.0277292181566098, "grad_norm": 0.7908892631530762, "learning_rate": 4.9976696351741003e-05, "loss": 1.0288, "num_input_tokens_seen": 4903144, "step": 115 }, { "epoch": 0.028934836337331966, "grad_norm": 0.9096012711524963, "learning_rate": 4.997460769726801e-05, "loss": 1.0157, "num_input_tokens_seen": 5117096, "step": 120 }, { "epoch": 0.03014045451805413, "grad_norm": 0.824535608291626, "learning_rate": 4.997242946296433e-05, "loss": 1.1174, "num_input_tokens_seen": 5336464, "step": 125 }, { "epoch": 0.031346072698776296, "grad_norm": 0.946870744228363, "learning_rate": 4.9970161656642944e-05, "loss": 1.0805, "num_input_tokens_seen": 5544960, "step": 130 }, { "epoch": 0.03255169087949846, "grad_norm": 0.8047392964363098, "learning_rate": 4.99678042864381e-05, "loss": 1.037, "num_input_tokens_seen": 5759256, "step": 135 }, { "epoch": 0.033757309060220626, "grad_norm": 0.816834568977356, "learning_rate": 4.99653573608053e-05, "loss": 1.067, "num_input_tokens_seen": 5965984, "step": 140 }, { "epoch": 0.03496292724094279, "grad_norm": 0.8405272364616394, "learning_rate": 4.996282088852127e-05, "loss": 1.036, "num_input_tokens_seen": 6178504, "step": 145 }, { "epoch": 0.036168545421664956, "grad_norm": 0.7949711084365845, "learning_rate": 4.9960194878683906e-05, "loss": 1.0026, "num_input_tokens_seen": 6388696, "step": 150 }, { "epoch": 0.03737416360238712, "grad_norm": 0.9096952080726624, "learning_rate": 4.995747934071229e-05, "loss": 1.0313, "num_input_tokens_seen": 6587896, "step": 155 }, { "epoch": 0.038579781783109286, "grad_norm": 0.8610963821411133, "learning_rate": 4.99546742843466e-05, "loss": 0.9866, "num_input_tokens_seen": 6789216, "step": 160 }, { "epoch": 0.03978539996383145, "grad_norm": 0.8836936950683594, "learning_rate": 4.995177971964813e-05, "loss": 1.0583, "num_input_tokens_seen": 7003232, "step": 165 }, { "epoch": 0.04099101814455362, "grad_norm": 0.9231252670288086, "learning_rate": 4.99487956569992e-05, "loss": 1.093, "num_input_tokens_seen": 7222232, "step": 170 }, { "epoch": 0.04219663632527579, "grad_norm": 0.8607410788536072, "learning_rate": 4.994572210710315e-05, "loss": 0.9495, "num_input_tokens_seen": 7432096, "step": 175 }, { "epoch": 0.04340225450599795, "grad_norm": 0.808646023273468, "learning_rate": 4.9942559080984305e-05, "loss": 1.0041, "num_input_tokens_seen": 7653416, "step": 180 }, { "epoch": 0.04460787268672012, "grad_norm": 0.8063214421272278, "learning_rate": 4.993930658998793e-05, "loss": 0.9832, "num_input_tokens_seen": 7877480, "step": 185 }, { "epoch": 0.04581349086744228, "grad_norm": 0.8844137787818909, "learning_rate": 4.993596464578016e-05, "loss": 0.9747, "num_input_tokens_seen": 8091432, "step": 190 }, { "epoch": 0.04701910904816445, "grad_norm": 0.8603246212005615, "learning_rate": 4.9932533260348025e-05, "loss": 1.0345, "num_input_tokens_seen": 8313192, "step": 195 }, { "epoch": 0.04822472722888661, "grad_norm": 0.826956033706665, "learning_rate": 4.992901244599933e-05, "loss": 1.0012, "num_input_tokens_seen": 8514536, "step": 200 }, { "epoch": 0.04943034540960878, "grad_norm": 0.8988208174705505, "learning_rate": 4.992540221536266e-05, "loss": 1.0407, "num_input_tokens_seen": 8730704, "step": 205 }, { "epoch": 0.05063596359033094, "grad_norm": 0.7894056439399719, "learning_rate": 4.992170258138732e-05, "loss": 1.0707, "num_input_tokens_seen": 8941248, "step": 210 }, { "epoch": 0.05184158177105311, "grad_norm": 0.8063021302223206, "learning_rate": 4.991791355734329e-05, "loss": 0.9563, "num_input_tokens_seen": 9150720, "step": 215 }, { "epoch": 0.05304719995177527, "grad_norm": 0.7905607223510742, "learning_rate": 4.991403515682119e-05, "loss": 0.965, "num_input_tokens_seen": 9366560, "step": 220 }, { "epoch": 0.05425281813249744, "grad_norm": 0.8522237539291382, "learning_rate": 4.99100673937322e-05, "loss": 0.8477, "num_input_tokens_seen": 9570712, "step": 225 }, { "epoch": 0.0554584363132196, "grad_norm": 0.7920671701431274, "learning_rate": 4.990601028230803e-05, "loss": 0.9686, "num_input_tokens_seen": 9775432, "step": 230 }, { "epoch": 0.05666405449394177, "grad_norm": 0.8169652819633484, "learning_rate": 4.9901863837100886e-05, "loss": 0.9811, "num_input_tokens_seen": 9989280, "step": 235 }, { "epoch": 0.05786967267466393, "grad_norm": 0.8672053217887878, "learning_rate": 4.9897628072983386e-05, "loss": 0.9473, "num_input_tokens_seen": 10199256, "step": 240 }, { "epoch": 0.0590752908553861, "grad_norm": 0.8624667525291443, "learning_rate": 4.989330300514853e-05, "loss": 0.9536, "num_input_tokens_seen": 10405776, "step": 245 }, { "epoch": 0.06028090903610826, "grad_norm": 0.8855419158935547, "learning_rate": 4.988888864910961e-05, "loss": 1.015, "num_input_tokens_seen": 10619584, "step": 250 }, { "epoch": 0.06148652721683043, "grad_norm": 0.939984917640686, "learning_rate": 4.988438502070021e-05, "loss": 0.9815, "num_input_tokens_seen": 10823672, "step": 255 }, { "epoch": 0.06269214539755259, "grad_norm": 0.8530163764953613, "learning_rate": 4.987979213607411e-05, "loss": 0.9956, "num_input_tokens_seen": 11038560, "step": 260 }, { "epoch": 0.06389776357827476, "grad_norm": 0.926736056804657, "learning_rate": 4.987511001170523e-05, "loss": 0.964, "num_input_tokens_seen": 11246864, "step": 265 }, { "epoch": 0.06510338175899692, "grad_norm": 0.9214469194412231, "learning_rate": 4.987033866438759e-05, "loss": 1.0111, "num_input_tokens_seen": 11447160, "step": 270 }, { "epoch": 0.06630899993971909, "grad_norm": 0.8270502090454102, "learning_rate": 4.9865478111235225e-05, "loss": 0.9253, "num_input_tokens_seen": 11663704, "step": 275 }, { "epoch": 0.06751461812044125, "grad_norm": 0.9323253035545349, "learning_rate": 4.986052836968215e-05, "loss": 1.0015, "num_input_tokens_seen": 11886704, "step": 280 }, { "epoch": 0.06872023630116342, "grad_norm": 0.9024085998535156, "learning_rate": 4.985548945748227e-05, "loss": 0.9993, "num_input_tokens_seen": 12089648, "step": 285 }, { "epoch": 0.06992585448188558, "grad_norm": 0.8758073449134827, "learning_rate": 4.985036139270935e-05, "loss": 0.9787, "num_input_tokens_seen": 12305880, "step": 290 }, { "epoch": 0.07113147266260775, "grad_norm": 1.1533262729644775, "learning_rate": 4.984514419375691e-05, "loss": 1.0673, "num_input_tokens_seen": 12513336, "step": 295 }, { "epoch": 0.07233709084332991, "grad_norm": 0.8299407362937927, "learning_rate": 4.983983787933819e-05, "loss": 1.0273, "num_input_tokens_seen": 12722784, "step": 300 }, { "epoch": 0.07354270902405208, "grad_norm": 0.994638204574585, "learning_rate": 4.9834442468486076e-05, "loss": 0.966, "num_input_tokens_seen": 12943328, "step": 305 }, { "epoch": 0.07474832720477424, "grad_norm": 0.8646377921104431, "learning_rate": 4.9828957980553017e-05, "loss": 0.9219, "num_input_tokens_seen": 13163352, "step": 310 }, { "epoch": 0.07595394538549641, "grad_norm": 0.9515541195869446, "learning_rate": 4.9823384435210974e-05, "loss": 0.9529, "num_input_tokens_seen": 13383672, "step": 315 }, { "epoch": 0.07715956356621857, "grad_norm": 0.9333364367485046, "learning_rate": 4.981772185245135e-05, "loss": 0.9715, "num_input_tokens_seen": 13597752, "step": 320 }, { "epoch": 0.07836518174694074, "grad_norm": 0.8728859424591064, "learning_rate": 4.98119702525849e-05, "loss": 0.9285, "num_input_tokens_seen": 13804584, "step": 325 }, { "epoch": 0.0795707999276629, "grad_norm": 0.9496778845787048, "learning_rate": 4.9806129656241666e-05, "loss": 0.9672, "num_input_tokens_seen": 14031640, "step": 330 }, { "epoch": 0.08077641810838508, "grad_norm": 1.0503513813018799, "learning_rate": 4.980020008437092e-05, "loss": 0.9857, "num_input_tokens_seen": 14234872, "step": 335 }, { "epoch": 0.08198203628910725, "grad_norm": 1.4025806188583374, "learning_rate": 4.9794181558241055e-05, "loss": 1.018, "num_input_tokens_seen": 14445536, "step": 340 }, { "epoch": 0.08318765446982941, "grad_norm": 1.0076154470443726, "learning_rate": 4.978807409943955e-05, "loss": 0.9519, "num_input_tokens_seen": 14648480, "step": 345 }, { "epoch": 0.08439327265055158, "grad_norm": 0.9718018174171448, "learning_rate": 4.978187772987285e-05, "loss": 0.9668, "num_input_tokens_seen": 14866840, "step": 350 }, { "epoch": 0.08559889083127374, "grad_norm": 2.5567164421081543, "learning_rate": 4.977559247176632e-05, "loss": 0.9664, "num_input_tokens_seen": 15070552, "step": 355 }, { "epoch": 0.0868045090119959, "grad_norm": 1.6297798156738281, "learning_rate": 4.9769218347664165e-05, "loss": 1.0054, "num_input_tokens_seen": 15283880, "step": 360 }, { "epoch": 0.08801012719271807, "grad_norm": 0.9169830679893494, "learning_rate": 4.976275538042932e-05, "loss": 1.0012, "num_input_tokens_seen": 15500416, "step": 365 }, { "epoch": 0.08921574537344024, "grad_norm": 1.0098609924316406, "learning_rate": 4.975620359324337e-05, "loss": 0.9831, "num_input_tokens_seen": 15709896, "step": 370 }, { "epoch": 0.0904213635541624, "grad_norm": 0.9704644680023193, "learning_rate": 4.9749563009606534e-05, "loss": 0.898, "num_input_tokens_seen": 15926112, "step": 375 }, { "epoch": 0.09162698173488457, "grad_norm": 1.1817699670791626, "learning_rate": 4.9742833653337476e-05, "loss": 0.981, "num_input_tokens_seen": 16131624, "step": 380 }, { "epoch": 0.09283259991560673, "grad_norm": 0.915503203868866, "learning_rate": 4.973601554857331e-05, "loss": 0.9948, "num_input_tokens_seen": 16340416, "step": 385 }, { "epoch": 0.0940382180963289, "grad_norm": 1.1324381828308105, "learning_rate": 4.972910871976946e-05, "loss": 0.9539, "num_input_tokens_seen": 16543840, "step": 390 }, { "epoch": 0.09524383627705106, "grad_norm": 0.9060422778129578, "learning_rate": 4.972211319169958e-05, "loss": 0.9453, "num_input_tokens_seen": 16750928, "step": 395 }, { "epoch": 0.09644945445777323, "grad_norm": 1.046651840209961, "learning_rate": 4.971502898945549e-05, "loss": 0.9393, "num_input_tokens_seen": 16975920, "step": 400 }, { "epoch": 0.09765507263849539, "grad_norm": 0.8987438678741455, "learning_rate": 4.970785613844707e-05, "loss": 0.9435, "num_input_tokens_seen": 17193600, "step": 405 }, { "epoch": 0.09886069081921756, "grad_norm": 0.9140846729278564, "learning_rate": 4.970059466440216e-05, "loss": 0.9344, "num_input_tokens_seen": 17408720, "step": 410 }, { "epoch": 0.10006630899993972, "grad_norm": 0.9514924883842468, "learning_rate": 4.9693244593366476e-05, "loss": 1.0136, "num_input_tokens_seen": 17622952, "step": 415 }, { "epoch": 0.10127192718066189, "grad_norm": 0.8930341005325317, "learning_rate": 4.9685805951703524e-05, "loss": 0.9324, "num_input_tokens_seen": 17837320, "step": 420 }, { "epoch": 0.10247754536138405, "grad_norm": 0.8707810044288635, "learning_rate": 4.9678278766094485e-05, "loss": 0.9671, "num_input_tokens_seen": 18053776, "step": 425 }, { "epoch": 0.10368316354210622, "grad_norm": 0.9207585453987122, "learning_rate": 4.967066306353816e-05, "loss": 0.97, "num_input_tokens_seen": 18267688, "step": 430 }, { "epoch": 0.10488878172282838, "grad_norm": 0.9838485717773438, "learning_rate": 4.966295887135081e-05, "loss": 1.0207, "num_input_tokens_seen": 18470376, "step": 435 }, { "epoch": 0.10609439990355055, "grad_norm": 1.4611468315124512, "learning_rate": 4.9655166217166114e-05, "loss": 1.0002, "num_input_tokens_seen": 18668616, "step": 440 }, { "epoch": 0.10730001808427271, "grad_norm": 1.0122599601745605, "learning_rate": 4.964728512893505e-05, "loss": 1.0065, "num_input_tokens_seen": 18875000, "step": 445 }, { "epoch": 0.10850563626499488, "grad_norm": 1.0062628984451294, "learning_rate": 4.96393156349258e-05, "loss": 0.9533, "num_input_tokens_seen": 19096440, "step": 450 }, { "epoch": 0.10971125444571704, "grad_norm": 0.9244326949119568, "learning_rate": 4.963125776372363e-05, "loss": 0.8919, "num_input_tokens_seen": 19305128, "step": 455 }, { "epoch": 0.1109168726264392, "grad_norm": 1.1496950387954712, "learning_rate": 4.962311154423079e-05, "loss": 0.9416, "num_input_tokens_seen": 19510200, "step": 460 }, { "epoch": 0.11212249080716137, "grad_norm": 0.8429615497589111, "learning_rate": 4.961487700566646e-05, "loss": 0.9282, "num_input_tokens_seen": 19719336, "step": 465 }, { "epoch": 0.11332810898788354, "grad_norm": 0.8722050786018372, "learning_rate": 4.9606554177566574e-05, "loss": 0.9297, "num_input_tokens_seen": 19931672, "step": 470 }, { "epoch": 0.1145337271686057, "grad_norm": 0.9270762205123901, "learning_rate": 4.9598143089783744e-05, "loss": 0.9261, "num_input_tokens_seen": 20138536, "step": 475 }, { "epoch": 0.11573934534932787, "grad_norm": 1.0691932439804077, "learning_rate": 4.9589643772487185e-05, "loss": 0.9395, "num_input_tokens_seen": 20354976, "step": 480 }, { "epoch": 0.11694496353005003, "grad_norm": 1.044533371925354, "learning_rate": 4.958105625616253e-05, "loss": 0.9014, "num_input_tokens_seen": 20559944, "step": 485 }, { "epoch": 0.1181505817107722, "grad_norm": 0.9564684629440308, "learning_rate": 4.957238057161182e-05, "loss": 0.9315, "num_input_tokens_seen": 20777888, "step": 490 }, { "epoch": 0.11935619989149436, "grad_norm": 0.9704129099845886, "learning_rate": 4.95636167499533e-05, "loss": 0.9291, "num_input_tokens_seen": 20987616, "step": 495 }, { "epoch": 0.12056181807221653, "grad_norm": 1.0293527841567993, "learning_rate": 4.955476482262137e-05, "loss": 0.9253, "num_input_tokens_seen": 21200744, "step": 500 }, { "epoch": 0.12176743625293869, "grad_norm": 1.0511715412139893, "learning_rate": 4.954582482136645e-05, "loss": 0.9688, "num_input_tokens_seen": 21406568, "step": 505 }, { "epoch": 0.12297305443366086, "grad_norm": 0.9141925573348999, "learning_rate": 4.953679677825486e-05, "loss": 0.9288, "num_input_tokens_seen": 21622016, "step": 510 }, { "epoch": 0.12417867261438302, "grad_norm": 1.2180051803588867, "learning_rate": 4.95276807256687e-05, "loss": 0.972, "num_input_tokens_seen": 21834776, "step": 515 }, { "epoch": 0.12538429079510519, "grad_norm": 0.9707120060920715, "learning_rate": 4.951847669630577e-05, "loss": 1.0223, "num_input_tokens_seen": 22036808, "step": 520 }, { "epoch": 0.12658990897582736, "grad_norm": 1.0374388694763184, "learning_rate": 4.950918472317942e-05, "loss": 0.9273, "num_input_tokens_seen": 22241744, "step": 525 }, { "epoch": 0.12779552715654952, "grad_norm": 1.0348138809204102, "learning_rate": 4.949980483961842e-05, "loss": 0.9054, "num_input_tokens_seen": 22462984, "step": 530 }, { "epoch": 0.1290011453372717, "grad_norm": 0.9179016351699829, "learning_rate": 4.94903370792669e-05, "loss": 0.9227, "num_input_tokens_seen": 22679864, "step": 535 }, { "epoch": 0.13020676351799385, "grad_norm": 1.1562983989715576, "learning_rate": 4.948078147608416e-05, "loss": 0.962, "num_input_tokens_seen": 22890456, "step": 540 }, { "epoch": 0.13141238169871602, "grad_norm": 0.9354329705238342, "learning_rate": 4.947113806434457e-05, "loss": 0.9359, "num_input_tokens_seen": 23093400, "step": 545 }, { "epoch": 0.13261799987943818, "grad_norm": 1.031166911125183, "learning_rate": 4.94614068786375e-05, "loss": 0.918, "num_input_tokens_seen": 23313240, "step": 550 }, { "epoch": 0.13382361806016035, "grad_norm": 1.021323800086975, "learning_rate": 4.945158795386708e-05, "loss": 0.9414, "num_input_tokens_seen": 23518360, "step": 555 }, { "epoch": 0.1350292362408825, "grad_norm": 1.062201976776123, "learning_rate": 4.9441681325252215e-05, "loss": 0.9317, "num_input_tokens_seen": 23724304, "step": 560 }, { "epoch": 0.13623485442160468, "grad_norm": 1.0418514013290405, "learning_rate": 4.9431687028326355e-05, "loss": 0.8828, "num_input_tokens_seen": 23936032, "step": 565 }, { "epoch": 0.13744047260232684, "grad_norm": 1.0468416213989258, "learning_rate": 4.942160509893741e-05, "loss": 0.9625, "num_input_tokens_seen": 24148072, "step": 570 }, { "epoch": 0.13864609078304901, "grad_norm": 1.0978264808654785, "learning_rate": 4.94114355732476e-05, "loss": 0.942, "num_input_tokens_seen": 24347848, "step": 575 }, { "epoch": 0.13985170896377117, "grad_norm": 1.019508719444275, "learning_rate": 4.940117848773336e-05, "loss": 0.8932, "num_input_tokens_seen": 24568256, "step": 580 }, { "epoch": 0.14105732714449334, "grad_norm": 1.0238145589828491, "learning_rate": 4.939083387918517e-05, "loss": 0.9388, "num_input_tokens_seen": 24783216, "step": 585 }, { "epoch": 0.1422629453252155, "grad_norm": 0.9687267541885376, "learning_rate": 4.938040178470745e-05, "loss": 0.941, "num_input_tokens_seen": 25006144, "step": 590 }, { "epoch": 0.14346856350593767, "grad_norm": 1.11016047000885, "learning_rate": 4.936988224171842e-05, "loss": 0.9039, "num_input_tokens_seen": 25215944, "step": 595 }, { "epoch": 0.14467418168665983, "grad_norm": 0.9590184092521667, "learning_rate": 4.935927528794996e-05, "loss": 0.9321, "num_input_tokens_seen": 25431352, "step": 600 }, { "epoch": 0.145879799867382, "grad_norm": 1.3537558317184448, "learning_rate": 4.934858096144746e-05, "loss": 0.9539, "num_input_tokens_seen": 25644120, "step": 605 }, { "epoch": 0.14708541804810416, "grad_norm": 1.100325584411621, "learning_rate": 4.933779930056975e-05, "loss": 0.9641, "num_input_tokens_seen": 25852216, "step": 610 }, { "epoch": 0.14829103622882633, "grad_norm": 0.9006015062332153, "learning_rate": 4.9326930343988854e-05, "loss": 0.9463, "num_input_tokens_seen": 26070336, "step": 615 }, { "epoch": 0.14949665440954849, "grad_norm": 1.0074900388717651, "learning_rate": 4.9315974130689956e-05, "loss": 0.904, "num_input_tokens_seen": 26293152, "step": 620 }, { "epoch": 0.15070227259027066, "grad_norm": 1.0894089937210083, "learning_rate": 4.9304930699971194e-05, "loss": 1.0082, "num_input_tokens_seen": 26494248, "step": 625 }, { "epoch": 0.15190789077099282, "grad_norm": 0.9495737552642822, "learning_rate": 4.9293800091443555e-05, "loss": 0.8695, "num_input_tokens_seen": 26711736, "step": 630 }, { "epoch": 0.153113508951715, "grad_norm": 1.0039435625076294, "learning_rate": 4.92825823450307e-05, "loss": 0.911, "num_input_tokens_seen": 26932672, "step": 635 }, { "epoch": 0.15431912713243714, "grad_norm": 1.1550997495651245, "learning_rate": 4.927127750096885e-05, "loss": 0.8383, "num_input_tokens_seen": 27141128, "step": 640 }, { "epoch": 0.15552474531315932, "grad_norm": 0.9979497194290161, "learning_rate": 4.925988559980664e-05, "loss": 0.9033, "num_input_tokens_seen": 27328336, "step": 645 }, { "epoch": 0.15673036349388147, "grad_norm": 0.9910388588905334, "learning_rate": 4.924840668240495e-05, "loss": 0.9291, "num_input_tokens_seen": 27555864, "step": 650 }, { "epoch": 0.15793598167460365, "grad_norm": 1.068831205368042, "learning_rate": 4.9236840789936766e-05, "loss": 0.951, "num_input_tokens_seen": 27769600, "step": 655 }, { "epoch": 0.1591415998553258, "grad_norm": 0.9822483062744141, "learning_rate": 4.9225187963887074e-05, "loss": 0.9021, "num_input_tokens_seen": 27994640, "step": 660 }, { "epoch": 0.16034721803604798, "grad_norm": 0.8445255756378174, "learning_rate": 4.921344824605264e-05, "loss": 0.8633, "num_input_tokens_seen": 28206440, "step": 665 }, { "epoch": 0.16155283621677016, "grad_norm": 0.9046006798744202, "learning_rate": 4.920162167854192e-05, "loss": 0.9267, "num_input_tokens_seen": 28423792, "step": 670 }, { "epoch": 0.1627584543974923, "grad_norm": 0.9862354397773743, "learning_rate": 4.9189708303774864e-05, "loss": 0.9391, "num_input_tokens_seen": 28632528, "step": 675 }, { "epoch": 0.1639640725782145, "grad_norm": 1.0912857055664062, "learning_rate": 4.9177708164482804e-05, "loss": 0.9586, "num_input_tokens_seen": 28838480, "step": 680 }, { "epoch": 0.16516969075893664, "grad_norm": 0.9752511382102966, "learning_rate": 4.916562130370828e-05, "loss": 0.9241, "num_input_tokens_seen": 29057248, "step": 685 }, { "epoch": 0.16637530893965882, "grad_norm": 1.2874888181686401, "learning_rate": 4.915344776480487e-05, "loss": 0.9805, "num_input_tokens_seen": 29274288, "step": 690 }, { "epoch": 0.16758092712038097, "grad_norm": 0.9697347283363342, "learning_rate": 4.914118759143709e-05, "loss": 0.9218, "num_input_tokens_seen": 29482328, "step": 695 }, { "epoch": 0.16878654530110315, "grad_norm": 1.3662371635437012, "learning_rate": 4.912884082758015e-05, "loss": 0.902, "num_input_tokens_seen": 29703632, "step": 700 }, { "epoch": 0.1699921634818253, "grad_norm": 1.2370350360870361, "learning_rate": 4.911640751751988e-05, "loss": 0.9134, "num_input_tokens_seen": 29909760, "step": 705 }, { "epoch": 0.17119778166254748, "grad_norm": 1.0421106815338135, "learning_rate": 4.910388770585253e-05, "loss": 0.9872, "num_input_tokens_seen": 30132808, "step": 710 }, { "epoch": 0.17240339984326963, "grad_norm": 1.21604585647583, "learning_rate": 4.909128143748462e-05, "loss": 0.9017, "num_input_tokens_seen": 30342360, "step": 715 }, { "epoch": 0.1736090180239918, "grad_norm": 0.9547712802886963, "learning_rate": 4.907858875763276e-05, "loss": 0.8986, "num_input_tokens_seen": 30551784, "step": 720 }, { "epoch": 0.17481463620471396, "grad_norm": 3.815072774887085, "learning_rate": 4.9065809711823536e-05, "loss": 0.9095, "num_input_tokens_seen": 30764544, "step": 725 }, { "epoch": 0.17602025438543614, "grad_norm": 0.9306098222732544, "learning_rate": 4.905294434589327e-05, "loss": 0.8692, "num_input_tokens_seen": 30970232, "step": 730 }, { "epoch": 0.1772258725661583, "grad_norm": 0.9312036037445068, "learning_rate": 4.9039992705987944e-05, "loss": 0.9457, "num_input_tokens_seen": 31191040, "step": 735 }, { "epoch": 0.17843149074688047, "grad_norm": 1.0130152702331543, "learning_rate": 4.9026954838562954e-05, "loss": 0.8632, "num_input_tokens_seen": 31399904, "step": 740 }, { "epoch": 0.17963710892760262, "grad_norm": 0.9852222204208374, "learning_rate": 4.901383079038301e-05, "loss": 0.9577, "num_input_tokens_seen": 31612432, "step": 745 }, { "epoch": 0.1808427271083248, "grad_norm": 1.1059669256210327, "learning_rate": 4.900062060852192e-05, "loss": 0.9528, "num_input_tokens_seen": 31811152, "step": 750 }, { "epoch": 0.18204834528904695, "grad_norm": 0.9931268692016602, "learning_rate": 4.898732434036244e-05, "loss": 0.9548, "num_input_tokens_seen": 32028376, "step": 755 }, { "epoch": 0.18325396346976913, "grad_norm": 1.0170475244522095, "learning_rate": 4.897394203359611e-05, "loss": 0.8433, "num_input_tokens_seen": 32243568, "step": 760 }, { "epoch": 0.18445958165049128, "grad_norm": 1.1025986671447754, "learning_rate": 4.896047373622308e-05, "loss": 0.8852, "num_input_tokens_seen": 32461152, "step": 765 }, { "epoch": 0.18566519983121346, "grad_norm": 1.1451163291931152, "learning_rate": 4.8946919496551926e-05, "loss": 0.9665, "num_input_tokens_seen": 32679152, "step": 770 }, { "epoch": 0.1868708180119356, "grad_norm": 0.9855079054832458, "learning_rate": 4.8933279363199485e-05, "loss": 0.9094, "num_input_tokens_seen": 32900800, "step": 775 }, { "epoch": 0.1880764361926578, "grad_norm": 1.005905032157898, "learning_rate": 4.8919553385090685e-05, "loss": 0.9068, "num_input_tokens_seen": 33115544, "step": 780 }, { "epoch": 0.18928205437337994, "grad_norm": 1.0492976903915405, "learning_rate": 4.890574161145836e-05, "loss": 0.905, "num_input_tokens_seen": 33335944, "step": 785 }, { "epoch": 0.19048767255410212, "grad_norm": 0.9678381681442261, "learning_rate": 4.889184409184308e-05, "loss": 1.0185, "num_input_tokens_seen": 33554968, "step": 790 }, { "epoch": 0.19169329073482427, "grad_norm": 1.0396451950073242, "learning_rate": 4.887786087609299e-05, "loss": 0.9317, "num_input_tokens_seen": 33758688, "step": 795 }, { "epoch": 0.19289890891554645, "grad_norm": 0.999522864818573, "learning_rate": 4.886379201436359e-05, "loss": 0.9444, "num_input_tokens_seen": 33979680, "step": 800 }, { "epoch": 0.1941045270962686, "grad_norm": 1.0043264627456665, "learning_rate": 4.8849637557117566e-05, "loss": 1.0236, "num_input_tokens_seen": 34198160, "step": 805 }, { "epoch": 0.19531014527699078, "grad_norm": 1.1024693250656128, "learning_rate": 4.883539755512465e-05, "loss": 1.0002, "num_input_tokens_seen": 34414376, "step": 810 }, { "epoch": 0.19651576345771293, "grad_norm": 1.0454233884811401, "learning_rate": 4.882107205946142e-05, "loss": 0.8226, "num_input_tokens_seen": 34618624, "step": 815 }, { "epoch": 0.1977213816384351, "grad_norm": 1.0113712549209595, "learning_rate": 4.8806661121511045e-05, "loss": 0.938, "num_input_tokens_seen": 34823112, "step": 820 }, { "epoch": 0.19892699981915726, "grad_norm": 0.9987353682518005, "learning_rate": 4.879216479296323e-05, "loss": 0.9653, "num_input_tokens_seen": 35045032, "step": 825 }, { "epoch": 0.20013261799987944, "grad_norm": 1.0113697052001953, "learning_rate": 4.877758312581393e-05, "loss": 0.9034, "num_input_tokens_seen": 35247120, "step": 830 }, { "epoch": 0.2013382361806016, "grad_norm": 1.2987358570098877, "learning_rate": 4.876291617236519e-05, "loss": 0.9182, "num_input_tokens_seen": 35469376, "step": 835 }, { "epoch": 0.20254385436132377, "grad_norm": 0.9110861420631409, "learning_rate": 4.874816398522497e-05, "loss": 0.8956, "num_input_tokens_seen": 35693440, "step": 840 }, { "epoch": 0.20374947254204592, "grad_norm": 0.9845291376113892, "learning_rate": 4.873332661730697e-05, "loss": 0.8667, "num_input_tokens_seen": 35897800, "step": 845 }, { "epoch": 0.2049550907227681, "grad_norm": 0.9247505068778992, "learning_rate": 4.871840412183038e-05, "loss": 0.9647, "num_input_tokens_seen": 36112104, "step": 850 }, { "epoch": 0.20616070890349025, "grad_norm": 1.0141013860702515, "learning_rate": 4.870339655231976e-05, "loss": 0.9428, "num_input_tokens_seen": 36322008, "step": 855 }, { "epoch": 0.20736632708421243, "grad_norm": 1.057724118232727, "learning_rate": 4.86883039626048e-05, "loss": 0.929, "num_input_tokens_seen": 36528384, "step": 860 }, { "epoch": 0.20857194526493458, "grad_norm": 1.122794270515442, "learning_rate": 4.8673126406820144e-05, "loss": 0.95, "num_input_tokens_seen": 36733616, "step": 865 }, { "epoch": 0.20977756344565676, "grad_norm": 1.0327709913253784, "learning_rate": 4.865786393940522e-05, "loss": 0.9183, "num_input_tokens_seen": 36948776, "step": 870 }, { "epoch": 0.2109831816263789, "grad_norm": 1.0998144149780273, "learning_rate": 4.8642516615103994e-05, "loss": 0.9446, "num_input_tokens_seen": 37151640, "step": 875 }, { "epoch": 0.2121887998071011, "grad_norm": 1.018345594406128, "learning_rate": 4.862708448896479e-05, "loss": 0.917, "num_input_tokens_seen": 37356232, "step": 880 }, { "epoch": 0.21339441798782324, "grad_norm": 1.0875458717346191, "learning_rate": 4.861156761634014e-05, "loss": 0.948, "num_input_tokens_seen": 37563992, "step": 885 }, { "epoch": 0.21460003616854542, "grad_norm": 1.0635877847671509, "learning_rate": 4.859596605288651e-05, "loss": 0.8774, "num_input_tokens_seen": 37778448, "step": 890 }, { "epoch": 0.2158056543492676, "grad_norm": 1.0215502977371216, "learning_rate": 4.858027985456417e-05, "loss": 0.9006, "num_input_tokens_seen": 37995056, "step": 895 }, { "epoch": 0.21701127252998975, "grad_norm": 0.9464943408966064, "learning_rate": 4.856450907763693e-05, "loss": 0.9334, "num_input_tokens_seen": 38213544, "step": 900 }, { "epoch": 0.21821689071071193, "grad_norm": 0.8972001671791077, "learning_rate": 4.854865377867201e-05, "loss": 0.9073, "num_input_tokens_seen": 38425816, "step": 905 }, { "epoch": 0.21942250889143408, "grad_norm": 1.2254059314727783, "learning_rate": 4.853271401453975e-05, "loss": 0.871, "num_input_tokens_seen": 38628560, "step": 910 }, { "epoch": 0.22062812707215626, "grad_norm": 1.0607829093933105, "learning_rate": 4.851668984241348e-05, "loss": 0.8851, "num_input_tokens_seen": 38851792, "step": 915 }, { "epoch": 0.2218337452528784, "grad_norm": 1.0934605598449707, "learning_rate": 4.850058131976929e-05, "loss": 0.9203, "num_input_tokens_seen": 39066040, "step": 920 }, { "epoch": 0.2230393634336006, "grad_norm": 1.149463415145874, "learning_rate": 4.8484388504385806e-05, "loss": 0.8901, "num_input_tokens_seen": 39283064, "step": 925 }, { "epoch": 0.22424498161432274, "grad_norm": 1.3603874444961548, "learning_rate": 4.8468111454344015e-05, "loss": 0.9842, "num_input_tokens_seen": 39486384, "step": 930 }, { "epoch": 0.22545059979504492, "grad_norm": 1.067646861076355, "learning_rate": 4.8451750228027035e-05, "loss": 0.8329, "num_input_tokens_seen": 39692136, "step": 935 }, { "epoch": 0.22665621797576707, "grad_norm": 1.2321659326553345, "learning_rate": 4.843530488411989e-05, "loss": 0.9315, "num_input_tokens_seen": 39912184, "step": 940 }, { "epoch": 0.22786183615648925, "grad_norm": 0.9800248742103577, "learning_rate": 4.841877548160935e-05, "loss": 1.0093, "num_input_tokens_seen": 40129952, "step": 945 }, { "epoch": 0.2290674543372114, "grad_norm": 1.492313265800476, "learning_rate": 4.840216207978368e-05, "loss": 0.9011, "num_input_tokens_seen": 40342632, "step": 950 }, { "epoch": 0.23027307251793358, "grad_norm": 1.0844993591308594, "learning_rate": 4.838546473823242e-05, "loss": 0.9797, "num_input_tokens_seen": 40552888, "step": 955 }, { "epoch": 0.23147869069865573, "grad_norm": 1.0432848930358887, "learning_rate": 4.836868351684621e-05, "loss": 0.9473, "num_input_tokens_seen": 40765960, "step": 960 }, { "epoch": 0.2326843088793779, "grad_norm": 1.079478144645691, "learning_rate": 4.835181847581656e-05, "loss": 0.903, "num_input_tokens_seen": 40976584, "step": 965 }, { "epoch": 0.23388992706010006, "grad_norm": 1.077728033065796, "learning_rate": 4.833486967563558e-05, "loss": 0.8839, "num_input_tokens_seen": 41188312, "step": 970 }, { "epoch": 0.23509554524082224, "grad_norm": 0.9432997703552246, "learning_rate": 4.831783717709587e-05, "loss": 0.9285, "num_input_tokens_seen": 41396264, "step": 975 }, { "epoch": 0.2363011634215444, "grad_norm": 1.0686081647872925, "learning_rate": 4.8300721041290194e-05, "loss": 0.8688, "num_input_tokens_seen": 41611608, "step": 980 }, { "epoch": 0.23750678160226657, "grad_norm": 0.9642893075942993, "learning_rate": 4.828352132961134e-05, "loss": 0.9275, "num_input_tokens_seen": 41828928, "step": 985 }, { "epoch": 0.23871239978298872, "grad_norm": 1.0179120302200317, "learning_rate": 4.826623810375187e-05, "loss": 0.904, "num_input_tokens_seen": 42050304, "step": 990 }, { "epoch": 0.2399180179637109, "grad_norm": 0.9713773131370544, "learning_rate": 4.824887142570387e-05, "loss": 0.8724, "num_input_tokens_seen": 42266648, "step": 995 }, { "epoch": 0.24112363614443305, "grad_norm": 1.0628626346588135, "learning_rate": 4.8231421357758776e-05, "loss": 0.8972, "num_input_tokens_seen": 42473696, "step": 1000 }, { "epoch": 0.24232925432515523, "grad_norm": 1.4627134799957275, "learning_rate": 4.821388796250713e-05, "loss": 0.9439, "num_input_tokens_seen": 42686816, "step": 1005 }, { "epoch": 0.24353487250587738, "grad_norm": 0.986068844795227, "learning_rate": 4.8196271302838347e-05, "loss": 0.9158, "num_input_tokens_seen": 42895120, "step": 1010 }, { "epoch": 0.24474049068659956, "grad_norm": 0.9546301960945129, "learning_rate": 4.817857144194051e-05, "loss": 0.8, "num_input_tokens_seen": 43102280, "step": 1015 }, { "epoch": 0.2459461088673217, "grad_norm": 0.9769660234451294, "learning_rate": 4.8160788443300116e-05, "loss": 0.9081, "num_input_tokens_seen": 43299872, "step": 1020 }, { "epoch": 0.2471517270480439, "grad_norm": 1.0551965236663818, "learning_rate": 4.814292237070187e-05, "loss": 0.914, "num_input_tokens_seen": 43508888, "step": 1025 }, { "epoch": 0.24835734522876604, "grad_norm": 0.9773661494255066, "learning_rate": 4.8124973288228456e-05, "loss": 0.9816, "num_input_tokens_seen": 43730472, "step": 1030 }, { "epoch": 0.24956296340948822, "grad_norm": 0.9589386582374573, "learning_rate": 4.8106941260260296e-05, "loss": 0.9918, "num_input_tokens_seen": 43942848, "step": 1035 }, { "epoch": 0.25076858159021037, "grad_norm": 1.0969736576080322, "learning_rate": 4.8088826351475316e-05, "loss": 0.9444, "num_input_tokens_seen": 44158576, "step": 1040 }, { "epoch": 0.25197419977093255, "grad_norm": 1.4454617500305176, "learning_rate": 4.8070628626848735e-05, "loss": 0.9241, "num_input_tokens_seen": 44382128, "step": 1045 }, { "epoch": 0.25317981795165473, "grad_norm": 1.0844621658325195, "learning_rate": 4.805234815165282e-05, "loss": 0.8833, "num_input_tokens_seen": 44604504, "step": 1050 }, { "epoch": 0.25438543613237685, "grad_norm": 0.8991053104400635, "learning_rate": 4.803398499145662e-05, "loss": 0.8834, "num_input_tokens_seen": 44818184, "step": 1055 }, { "epoch": 0.25559105431309903, "grad_norm": 0.9751043319702148, "learning_rate": 4.80155392121258e-05, "loss": 0.8116, "num_input_tokens_seen": 45028688, "step": 1060 }, { "epoch": 0.2567966724938212, "grad_norm": 1.056107997894287, "learning_rate": 4.799701087982237e-05, "loss": 0.9205, "num_input_tokens_seen": 45233728, "step": 1065 }, { "epoch": 0.2580022906745434, "grad_norm": 0.9977249503135681, "learning_rate": 4.7978400061004405e-05, "loss": 0.8936, "num_input_tokens_seen": 45447176, "step": 1070 }, { "epoch": 0.2592079088552655, "grad_norm": 0.9438022375106812, "learning_rate": 4.7959706822425865e-05, "loss": 0.8281, "num_input_tokens_seen": 45662704, "step": 1075 }, { "epoch": 0.2604135270359877, "grad_norm": 1.0474023818969727, "learning_rate": 4.794093123113634e-05, "loss": 0.8912, "num_input_tokens_seen": 45867832, "step": 1080 }, { "epoch": 0.26161914521670987, "grad_norm": 1.7578970193862915, "learning_rate": 4.7922073354480814e-05, "loss": 0.8736, "num_input_tokens_seen": 46079184, "step": 1085 }, { "epoch": 0.26282476339743205, "grad_norm": 1.148949146270752, "learning_rate": 4.7903133260099385e-05, "loss": 0.8985, "num_input_tokens_seen": 46294208, "step": 1090 }, { "epoch": 0.26403038157815417, "grad_norm": 0.9310439229011536, "learning_rate": 4.7884111015927094e-05, "loss": 0.9163, "num_input_tokens_seen": 46504040, "step": 1095 }, { "epoch": 0.26523599975887635, "grad_norm": 1.5707510709762573, "learning_rate": 4.78650066901936e-05, "loss": 0.8818, "num_input_tokens_seen": 46726312, "step": 1100 }, { "epoch": 0.26644161793959853, "grad_norm": 0.9448241591453552, "learning_rate": 4.784582035142301e-05, "loss": 0.9756, "num_input_tokens_seen": 46942344, "step": 1105 }, { "epoch": 0.2676472361203207, "grad_norm": 1.0408532619476318, "learning_rate": 4.782655206843356e-05, "loss": 0.8977, "num_input_tokens_seen": 47155720, "step": 1110 }, { "epoch": 0.2688528543010429, "grad_norm": 0.9437874555587769, "learning_rate": 4.7807201910337453e-05, "loss": 0.9501, "num_input_tokens_seen": 47367480, "step": 1115 }, { "epoch": 0.270058472481765, "grad_norm": 0.9614304900169373, "learning_rate": 4.778776994654052e-05, "loss": 0.854, "num_input_tokens_seen": 47597088, "step": 1120 }, { "epoch": 0.2712640906624872, "grad_norm": 1.0169709920883179, "learning_rate": 4.776825624674204e-05, "loss": 0.9289, "num_input_tokens_seen": 47800984, "step": 1125 }, { "epoch": 0.27246970884320937, "grad_norm": 1.3136272430419922, "learning_rate": 4.7748660880934455e-05, "loss": 0.9131, "num_input_tokens_seen": 48004040, "step": 1130 }, { "epoch": 0.27367532702393155, "grad_norm": 1.011932134628296, "learning_rate": 4.772898391940315e-05, "loss": 0.8801, "num_input_tokens_seen": 48220880, "step": 1135 }, { "epoch": 0.27488094520465367, "grad_norm": 1.1574703454971313, "learning_rate": 4.770922543272615e-05, "loss": 0.8396, "num_input_tokens_seen": 48430672, "step": 1140 }, { "epoch": 0.27608656338537585, "grad_norm": 1.1394647359848022, "learning_rate": 4.768938549177393e-05, "loss": 0.8784, "num_input_tokens_seen": 48650144, "step": 1145 }, { "epoch": 0.27729218156609803, "grad_norm": 0.974724531173706, "learning_rate": 4.76694641677091e-05, "loss": 0.942, "num_input_tokens_seen": 48864392, "step": 1150 }, { "epoch": 0.2784977997468202, "grad_norm": 1.0835553407669067, "learning_rate": 4.764946153198618e-05, "loss": 0.8527, "num_input_tokens_seen": 49073288, "step": 1155 }, { "epoch": 0.27970341792754233, "grad_norm": 0.9113886952400208, "learning_rate": 4.762937765635138e-05, "loss": 0.8451, "num_input_tokens_seen": 49286344, "step": 1160 }, { "epoch": 0.2809090361082645, "grad_norm": 0.9594201445579529, "learning_rate": 4.760921261284225e-05, "loss": 0.8636, "num_input_tokens_seen": 49502400, "step": 1165 }, { "epoch": 0.2821146542889867, "grad_norm": 1.075929880142212, "learning_rate": 4.758896647378751e-05, "loss": 0.9392, "num_input_tokens_seen": 49718376, "step": 1170 }, { "epoch": 0.28332027246970887, "grad_norm": 1.0661914348602295, "learning_rate": 4.7568639311806744e-05, "loss": 0.8808, "num_input_tokens_seen": 49934936, "step": 1175 }, { "epoch": 0.284525890650431, "grad_norm": 0.9960696697235107, "learning_rate": 4.7548231199810164e-05, "loss": 0.9131, "num_input_tokens_seen": 50147688, "step": 1180 }, { "epoch": 0.28573150883115317, "grad_norm": 0.9663341641426086, "learning_rate": 4.752774221099832e-05, "loss": 0.9627, "num_input_tokens_seen": 50354136, "step": 1185 }, { "epoch": 0.28693712701187535, "grad_norm": 0.8862481117248535, "learning_rate": 4.750717241886185e-05, "loss": 0.8667, "num_input_tokens_seen": 50563000, "step": 1190 }, { "epoch": 0.2881427451925975, "grad_norm": 0.9842221140861511, "learning_rate": 4.748652189718126e-05, "loss": 0.9373, "num_input_tokens_seen": 50777664, "step": 1195 }, { "epoch": 0.28934836337331965, "grad_norm": 1.2426091432571411, "learning_rate": 4.746579072002657e-05, "loss": 0.9189, "num_input_tokens_seen": 50990968, "step": 1200 }, { "epoch": 0.29055398155404183, "grad_norm": 1.1096442937850952, "learning_rate": 4.744497896175713e-05, "loss": 0.8711, "num_input_tokens_seen": 51200792, "step": 1205 }, { "epoch": 0.291759599734764, "grad_norm": 1.0573973655700684, "learning_rate": 4.7424086697021305e-05, "loss": 0.9891, "num_input_tokens_seen": 51410784, "step": 1210 }, { "epoch": 0.2929652179154862, "grad_norm": 1.0691542625427246, "learning_rate": 4.7403114000756236e-05, "loss": 0.9769, "num_input_tokens_seen": 51615400, "step": 1215 }, { "epoch": 0.2941708360962083, "grad_norm": 1.036690592765808, "learning_rate": 4.7382060948187545e-05, "loss": 0.9596, "num_input_tokens_seen": 51838488, "step": 1220 }, { "epoch": 0.2953764542769305, "grad_norm": 1.0758652687072754, "learning_rate": 4.73609276148291e-05, "loss": 0.8496, "num_input_tokens_seen": 52052464, "step": 1225 }, { "epoch": 0.29658207245765267, "grad_norm": 1.0326350927352905, "learning_rate": 4.7339714076482686e-05, "loss": 0.9151, "num_input_tokens_seen": 52260072, "step": 1230 }, { "epoch": 0.29778769063837485, "grad_norm": 1.029476523399353, "learning_rate": 4.731842040923782e-05, "loss": 0.8266, "num_input_tokens_seen": 52477416, "step": 1235 }, { "epoch": 0.29899330881909697, "grad_norm": 0.8776472806930542, "learning_rate": 4.729704668947138e-05, "loss": 0.9262, "num_input_tokens_seen": 52676160, "step": 1240 }, { "epoch": 0.30019892699981915, "grad_norm": 1.0397179126739502, "learning_rate": 4.727559299384741e-05, "loss": 0.8936, "num_input_tokens_seen": 52884120, "step": 1245 }, { "epoch": 0.30140454518054133, "grad_norm": 0.8663790225982666, "learning_rate": 4.725405939931681e-05, "loss": 0.897, "num_input_tokens_seen": 53104120, "step": 1250 }, { "epoch": 0.3026101633612635, "grad_norm": 1.0946685075759888, "learning_rate": 4.7232445983117045e-05, "loss": 0.9132, "num_input_tokens_seen": 53316968, "step": 1255 }, { "epoch": 0.30381578154198563, "grad_norm": 1.168857455253601, "learning_rate": 4.721075282277191e-05, "loss": 0.9693, "num_input_tokens_seen": 53524272, "step": 1260 }, { "epoch": 0.3050213997227078, "grad_norm": 1.4118732213974, "learning_rate": 4.718897999609123e-05, "loss": 0.9601, "num_input_tokens_seen": 53733032, "step": 1265 }, { "epoch": 0.30622701790343, "grad_norm": 1.0661664009094238, "learning_rate": 4.716712758117057e-05, "loss": 0.9492, "num_input_tokens_seen": 53941112, "step": 1270 }, { "epoch": 0.30743263608415217, "grad_norm": 0.9035789370536804, "learning_rate": 4.714519565639095e-05, "loss": 0.9203, "num_input_tokens_seen": 54156144, "step": 1275 }, { "epoch": 0.3086382542648743, "grad_norm": 1.0114617347717285, "learning_rate": 4.71231843004186e-05, "loss": 0.87, "num_input_tokens_seen": 54361344, "step": 1280 }, { "epoch": 0.30984387244559647, "grad_norm": 1.1288913488388062, "learning_rate": 4.710109359220466e-05, "loss": 0.9348, "num_input_tokens_seen": 54571568, "step": 1285 }, { "epoch": 0.31104949062631865, "grad_norm": 0.9035195708274841, "learning_rate": 4.707892361098488e-05, "loss": 0.9714, "num_input_tokens_seen": 54785376, "step": 1290 }, { "epoch": 0.3122551088070408, "grad_norm": 0.984122097492218, "learning_rate": 4.705667443627936e-05, "loss": 0.8942, "num_input_tokens_seen": 55008504, "step": 1295 }, { "epoch": 0.31346072698776295, "grad_norm": 0.9119220972061157, "learning_rate": 4.703434614789224e-05, "loss": 0.8839, "num_input_tokens_seen": 55218128, "step": 1300 }, { "epoch": 0.31466634516848513, "grad_norm": 0.9661288857460022, "learning_rate": 4.701193882591145e-05, "loss": 0.9343, "num_input_tokens_seen": 55430280, "step": 1305 }, { "epoch": 0.3158719633492073, "grad_norm": 1.3090124130249023, "learning_rate": 4.698945255070837e-05, "loss": 0.9074, "num_input_tokens_seen": 55639160, "step": 1310 }, { "epoch": 0.3170775815299295, "grad_norm": 0.9003798365592957, "learning_rate": 4.6966887402937595e-05, "loss": 0.9246, "num_input_tokens_seen": 55861816, "step": 1315 }, { "epoch": 0.3182831997106516, "grad_norm": 0.9468660950660706, "learning_rate": 4.694424346353663e-05, "loss": 0.8341, "num_input_tokens_seen": 56076560, "step": 1320 }, { "epoch": 0.3194888178913738, "grad_norm": 1.0333856344223022, "learning_rate": 4.6921520813725554e-05, "loss": 0.9267, "num_input_tokens_seen": 56299232, "step": 1325 }, { "epoch": 0.32069443607209597, "grad_norm": 1.1444239616394043, "learning_rate": 4.689871953500682e-05, "loss": 0.8774, "num_input_tokens_seen": 56509568, "step": 1330 }, { "epoch": 0.32190005425281815, "grad_norm": 1.0806092023849487, "learning_rate": 4.687583970916487e-05, "loss": 0.8857, "num_input_tokens_seen": 56736888, "step": 1335 }, { "epoch": 0.3231056724335403, "grad_norm": 1.1541969776153564, "learning_rate": 4.685288141826589e-05, "loss": 0.8966, "num_input_tokens_seen": 56941984, "step": 1340 }, { "epoch": 0.32431129061426245, "grad_norm": 0.9262281656265259, "learning_rate": 4.682984474465752e-05, "loss": 0.8905, "num_input_tokens_seen": 57164624, "step": 1345 }, { "epoch": 0.3255169087949846, "grad_norm": 1.0836807489395142, "learning_rate": 4.680672977096854e-05, "loss": 0.8811, "num_input_tokens_seen": 57382896, "step": 1350 }, { "epoch": 0.3267225269757068, "grad_norm": 1.02498459815979, "learning_rate": 4.6783536580108575e-05, "loss": 0.9804, "num_input_tokens_seen": 57610848, "step": 1355 }, { "epoch": 0.327928145156429, "grad_norm": 1.3286705017089844, "learning_rate": 4.6760265255267796e-05, "loss": 0.9082, "num_input_tokens_seen": 57827648, "step": 1360 }, { "epoch": 0.3291337633371511, "grad_norm": 0.9943946599960327, "learning_rate": 4.673691587991667e-05, "loss": 0.8726, "num_input_tokens_seen": 58049664, "step": 1365 }, { "epoch": 0.3303393815178733, "grad_norm": 1.0178776979446411, "learning_rate": 4.671348853780554e-05, "loss": 0.8346, "num_input_tokens_seen": 58272152, "step": 1370 }, { "epoch": 0.33154499969859547, "grad_norm": 1.1020039319992065, "learning_rate": 4.6689983312964466e-05, "loss": 0.8957, "num_input_tokens_seen": 58480432, "step": 1375 }, { "epoch": 0.33275061787931765, "grad_norm": 1.0991055965423584, "learning_rate": 4.6666400289702864e-05, "loss": 0.9134, "num_input_tokens_seen": 58695024, "step": 1380 }, { "epoch": 0.33395623606003977, "grad_norm": 1.1934117078781128, "learning_rate": 4.6642739552609146e-05, "loss": 0.9706, "num_input_tokens_seen": 58902752, "step": 1385 }, { "epoch": 0.33516185424076195, "grad_norm": 0.9491044878959656, "learning_rate": 4.661900118655053e-05, "loss": 0.8953, "num_input_tokens_seen": 59115528, "step": 1390 }, { "epoch": 0.3363674724214841, "grad_norm": 1.1253283023834229, "learning_rate": 4.659518527667263e-05, "loss": 0.8634, "num_input_tokens_seen": 59317040, "step": 1395 }, { "epoch": 0.3375730906022063, "grad_norm": 1.2757381200790405, "learning_rate": 4.6571291908399225e-05, "loss": 0.8966, "num_input_tokens_seen": 59536200, "step": 1400 }, { "epoch": 0.33877870878292843, "grad_norm": 1.1735286712646484, "learning_rate": 4.654732116743194e-05, "loss": 0.8822, "num_input_tokens_seen": 59749896, "step": 1405 }, { "epoch": 0.3399843269636506, "grad_norm": 1.016580581665039, "learning_rate": 4.6523273139749854e-05, "loss": 0.8689, "num_input_tokens_seen": 59970640, "step": 1410 }, { "epoch": 0.3411899451443728, "grad_norm": 0.985004186630249, "learning_rate": 4.6499147911609335e-05, "loss": 0.913, "num_input_tokens_seen": 60185160, "step": 1415 }, { "epoch": 0.34239556332509496, "grad_norm": 0.9623458385467529, "learning_rate": 4.647494556954363e-05, "loss": 0.8814, "num_input_tokens_seen": 60398240, "step": 1420 }, { "epoch": 0.3436011815058171, "grad_norm": 1.2659087181091309, "learning_rate": 4.6450666200362567e-05, "loss": 0.7971, "num_input_tokens_seen": 60609168, "step": 1425 }, { "epoch": 0.34480679968653927, "grad_norm": 2.0166943073272705, "learning_rate": 4.642630989115229e-05, "loss": 0.9658, "num_input_tokens_seen": 60822280, "step": 1430 }, { "epoch": 0.34601241786726145, "grad_norm": 1.0546671152114868, "learning_rate": 4.640187672927487e-05, "loss": 0.9029, "num_input_tokens_seen": 61043600, "step": 1435 }, { "epoch": 0.3472180360479836, "grad_norm": 0.8899186849594116, "learning_rate": 4.6377366802368074e-05, "loss": 0.8833, "num_input_tokens_seen": 61254472, "step": 1440 }, { "epoch": 0.34842365422870575, "grad_norm": 0.9994027614593506, "learning_rate": 4.635278019834499e-05, "loss": 0.8951, "num_input_tokens_seen": 61468680, "step": 1445 }, { "epoch": 0.3496292724094279, "grad_norm": 1.0637496709823608, "learning_rate": 4.632811700539376e-05, "loss": 0.9191, "num_input_tokens_seen": 61675368, "step": 1450 }, { "epoch": 0.3508348905901501, "grad_norm": 1.0631794929504395, "learning_rate": 4.6303377311977195e-05, "loss": 0.8986, "num_input_tokens_seen": 61892352, "step": 1455 }, { "epoch": 0.3520405087708723, "grad_norm": 1.2513526678085327, "learning_rate": 4.627856120683255e-05, "loss": 0.8942, "num_input_tokens_seen": 62101400, "step": 1460 }, { "epoch": 0.3532461269515944, "grad_norm": 0.9860830903053284, "learning_rate": 4.625366877897112e-05, "loss": 0.8542, "num_input_tokens_seen": 62322448, "step": 1465 }, { "epoch": 0.3544517451323166, "grad_norm": 1.0612962245941162, "learning_rate": 4.6228700117677976e-05, "loss": 0.8862, "num_input_tokens_seen": 62551024, "step": 1470 }, { "epoch": 0.35565736331303877, "grad_norm": 1.1695340871810913, "learning_rate": 4.6203655312511616e-05, "loss": 0.9591, "num_input_tokens_seen": 62746312, "step": 1475 }, { "epoch": 0.35686298149376094, "grad_norm": 1.0335479974746704, "learning_rate": 4.6178534453303666e-05, "loss": 0.9288, "num_input_tokens_seen": 62964440, "step": 1480 }, { "epoch": 0.35806859967448307, "grad_norm": 0.862133800983429, "learning_rate": 4.615333763015852e-05, "loss": 0.9228, "num_input_tokens_seen": 63180552, "step": 1485 }, { "epoch": 0.35927421785520525, "grad_norm": 1.5365206003189087, "learning_rate": 4.612806493345308e-05, "loss": 0.8683, "num_input_tokens_seen": 63387552, "step": 1490 }, { "epoch": 0.3604798360359274, "grad_norm": 1.238351821899414, "learning_rate": 4.6102716453836355e-05, "loss": 0.9421, "num_input_tokens_seen": 63596640, "step": 1495 }, { "epoch": 0.3616854542166496, "grad_norm": 1.0521694421768188, "learning_rate": 4.607729228222921e-05, "loss": 0.9079, "num_input_tokens_seen": 63813984, "step": 1500 }, { "epoch": 0.36289107239737173, "grad_norm": 1.0266447067260742, "learning_rate": 4.6051792509823964e-05, "loss": 0.9036, "num_input_tokens_seen": 64027432, "step": 1505 }, { "epoch": 0.3640966905780939, "grad_norm": 0.9994391798973083, "learning_rate": 4.6026217228084156e-05, "loss": 0.9072, "num_input_tokens_seen": 64247632, "step": 1510 }, { "epoch": 0.3653023087588161, "grad_norm": 1.150465726852417, "learning_rate": 4.600056652874412e-05, "loss": 0.9164, "num_input_tokens_seen": 64468480, "step": 1515 }, { "epoch": 0.36650792693953826, "grad_norm": 1.1295479536056519, "learning_rate": 4.5974840503808714e-05, "loss": 0.8745, "num_input_tokens_seen": 64689864, "step": 1520 }, { "epoch": 0.3677135451202604, "grad_norm": 1.1001403331756592, "learning_rate": 4.5949039245552985e-05, "loss": 0.8906, "num_input_tokens_seen": 64900856, "step": 1525 }, { "epoch": 0.36891916330098257, "grad_norm": 1.585651159286499, "learning_rate": 4.5923162846521824e-05, "loss": 0.9252, "num_input_tokens_seen": 65108032, "step": 1530 }, { "epoch": 0.37012478148170475, "grad_norm": 0.9815211296081543, "learning_rate": 4.589721139952964e-05, "loss": 0.879, "num_input_tokens_seen": 65324272, "step": 1535 }, { "epoch": 0.3713303996624269, "grad_norm": 1.4441436529159546, "learning_rate": 4.587118499766002e-05, "loss": 0.8306, "num_input_tokens_seen": 65543456, "step": 1540 }, { "epoch": 0.37253601784314905, "grad_norm": 1.1803338527679443, "learning_rate": 4.584508373426542e-05, "loss": 0.9109, "num_input_tokens_seen": 65766352, "step": 1545 }, { "epoch": 0.3737416360238712, "grad_norm": 1.0547415018081665, "learning_rate": 4.581890770296679e-05, "loss": 0.8945, "num_input_tokens_seen": 65985336, "step": 1550 }, { "epoch": 0.3749472542045934, "grad_norm": 1.0030617713928223, "learning_rate": 4.579265699765328e-05, "loss": 0.8614, "num_input_tokens_seen": 66203256, "step": 1555 }, { "epoch": 0.3761528723853156, "grad_norm": 0.9463897943496704, "learning_rate": 4.5766331712481866e-05, "loss": 0.9066, "num_input_tokens_seen": 66422704, "step": 1560 }, { "epoch": 0.37735849056603776, "grad_norm": 1.0580909252166748, "learning_rate": 4.573993194187703e-05, "loss": 0.8728, "num_input_tokens_seen": 66637240, "step": 1565 }, { "epoch": 0.3785641087467599, "grad_norm": 1.052911639213562, "learning_rate": 4.571345778053046e-05, "loss": 0.9351, "num_input_tokens_seen": 66854048, "step": 1570 }, { "epoch": 0.37976972692748207, "grad_norm": 0.9918159246444702, "learning_rate": 4.56869093234006e-05, "loss": 0.9182, "num_input_tokens_seen": 67071600, "step": 1575 }, { "epoch": 0.38097534510820424, "grad_norm": 0.9827377796173096, "learning_rate": 4.566028666571245e-05, "loss": 0.9354, "num_input_tokens_seen": 67275344, "step": 1580 }, { "epoch": 0.3821809632889264, "grad_norm": 1.2000900506973267, "learning_rate": 4.563358990295711e-05, "loss": 0.8829, "num_input_tokens_seen": 67484320, "step": 1585 }, { "epoch": 0.38338658146964855, "grad_norm": 1.1439824104309082, "learning_rate": 4.560681913089151e-05, "loss": 0.8754, "num_input_tokens_seen": 67701400, "step": 1590 }, { "epoch": 0.3845921996503707, "grad_norm": 0.9922711849212646, "learning_rate": 4.557997444553802e-05, "loss": 0.8946, "num_input_tokens_seen": 67916832, "step": 1595 }, { "epoch": 0.3857978178310929, "grad_norm": 1.013388991355896, "learning_rate": 4.555305594318414e-05, "loss": 0.9173, "num_input_tokens_seen": 68122184, "step": 1600 }, { "epoch": 0.3870034360118151, "grad_norm": 1.0183783769607544, "learning_rate": 4.552606372038213e-05, "loss": 0.8855, "num_input_tokens_seen": 68329440, "step": 1605 }, { "epoch": 0.3882090541925372, "grad_norm": 1.0178639888763428, "learning_rate": 4.549899787394867e-05, "loss": 0.8779, "num_input_tokens_seen": 68542128, "step": 1610 }, { "epoch": 0.3894146723732594, "grad_norm": 1.0089093446731567, "learning_rate": 4.547185850096454e-05, "loss": 0.9038, "num_input_tokens_seen": 68760320, "step": 1615 }, { "epoch": 0.39062029055398156, "grad_norm": 0.8575237989425659, "learning_rate": 4.54446456987742e-05, "loss": 0.8478, "num_input_tokens_seen": 68976016, "step": 1620 }, { "epoch": 0.39182590873470374, "grad_norm": 1.0033999681472778, "learning_rate": 4.541735956498554e-05, "loss": 0.8928, "num_input_tokens_seen": 69180408, "step": 1625 }, { "epoch": 0.39303152691542587, "grad_norm": 0.9211997985839844, "learning_rate": 4.539000019746946e-05, "loss": 0.8489, "num_input_tokens_seen": 69398208, "step": 1630 }, { "epoch": 0.39423714509614805, "grad_norm": 1.0493967533111572, "learning_rate": 4.536256769435953e-05, "loss": 0.8844, "num_input_tokens_seen": 69603144, "step": 1635 }, { "epoch": 0.3954427632768702, "grad_norm": 0.8278038501739502, "learning_rate": 4.5335062154051625e-05, "loss": 0.9223, "num_input_tokens_seen": 69810992, "step": 1640 }, { "epoch": 0.3966483814575924, "grad_norm": 0.9260925054550171, "learning_rate": 4.530748367520365e-05, "loss": 0.8448, "num_input_tokens_seen": 70016896, "step": 1645 }, { "epoch": 0.3978539996383145, "grad_norm": 0.9692575335502625, "learning_rate": 4.5279832356735075e-05, "loss": 0.8835, "num_input_tokens_seen": 70235888, "step": 1650 }, { "epoch": 0.3990596178190367, "grad_norm": 0.9719855189323425, "learning_rate": 4.5252108297826666e-05, "loss": 0.8569, "num_input_tokens_seen": 70451320, "step": 1655 }, { "epoch": 0.4002652359997589, "grad_norm": 0.9605731964111328, "learning_rate": 4.522431159792008e-05, "loss": 0.8572, "num_input_tokens_seen": 70664984, "step": 1660 }, { "epoch": 0.40147085418048106, "grad_norm": 1.00998854637146, "learning_rate": 4.519644235671752e-05, "loss": 0.8652, "num_input_tokens_seen": 70898520, "step": 1665 }, { "epoch": 0.4026764723612032, "grad_norm": 1.0643452405929565, "learning_rate": 4.5168500674181413e-05, "loss": 0.9286, "num_input_tokens_seen": 71117784, "step": 1670 }, { "epoch": 0.40388209054192536, "grad_norm": 0.9840123057365417, "learning_rate": 4.5140486650533996e-05, "loss": 0.834, "num_input_tokens_seen": 71330816, "step": 1675 }, { "epoch": 0.40508770872264754, "grad_norm": 1.0877817869186401, "learning_rate": 4.5112400386256984e-05, "loss": 0.8691, "num_input_tokens_seen": 71541048, "step": 1680 }, { "epoch": 0.4062933269033697, "grad_norm": 0.9681184887886047, "learning_rate": 4.50842419820912e-05, "loss": 0.9006, "num_input_tokens_seen": 71754744, "step": 1685 }, { "epoch": 0.40749894508409185, "grad_norm": 0.965379536151886, "learning_rate": 4.5056011539036244e-05, "loss": 0.8828, "num_input_tokens_seen": 71958864, "step": 1690 }, { "epoch": 0.408704563264814, "grad_norm": 0.9823299646377563, "learning_rate": 4.50277091583501e-05, "loss": 0.8908, "num_input_tokens_seen": 72160688, "step": 1695 }, { "epoch": 0.4099101814455362, "grad_norm": 1.0665885210037231, "learning_rate": 4.499933494154877e-05, "loss": 0.916, "num_input_tokens_seen": 72378704, "step": 1700 }, { "epoch": 0.4111157996262584, "grad_norm": 1.0019546747207642, "learning_rate": 4.497088899040592e-05, "loss": 0.8801, "num_input_tokens_seen": 72600752, "step": 1705 }, { "epoch": 0.4123214178069805, "grad_norm": 1.1004503965377808, "learning_rate": 4.494237140695253e-05, "loss": 0.8778, "num_input_tokens_seen": 72804216, "step": 1710 }, { "epoch": 0.4135270359877027, "grad_norm": 1.7972251176834106, "learning_rate": 4.49137822934765e-05, "loss": 0.8205, "num_input_tokens_seen": 73015032, "step": 1715 }, { "epoch": 0.41473265416842486, "grad_norm": 1.1007394790649414, "learning_rate": 4.4885121752522304e-05, "loss": 0.8911, "num_input_tokens_seen": 73224032, "step": 1720 }, { "epoch": 0.41593827234914704, "grad_norm": 0.9819033741950989, "learning_rate": 4.485638988689061e-05, "loss": 0.832, "num_input_tokens_seen": 73427032, "step": 1725 }, { "epoch": 0.41714389052986917, "grad_norm": 0.9891818761825562, "learning_rate": 4.4827586799637916e-05, "loss": 0.9171, "num_input_tokens_seen": 73635464, "step": 1730 }, { "epoch": 0.41834950871059134, "grad_norm": 1.0625104904174805, "learning_rate": 4.479871259407619e-05, "loss": 0.8925, "num_input_tokens_seen": 73845664, "step": 1735 }, { "epoch": 0.4195551268913135, "grad_norm": 1.5029146671295166, "learning_rate": 4.4769767373772474e-05, "loss": 0.9052, "num_input_tokens_seen": 74054672, "step": 1740 }, { "epoch": 0.4207607450720357, "grad_norm": 0.9299123287200928, "learning_rate": 4.4740751242548545e-05, "loss": 0.869, "num_input_tokens_seen": 74266264, "step": 1745 }, { "epoch": 0.4219663632527578, "grad_norm": 0.951828122138977, "learning_rate": 4.47116643044805e-05, "loss": 0.9022, "num_input_tokens_seen": 74476904, "step": 1750 }, { "epoch": 0.42317198143348, "grad_norm": 1.0803813934326172, "learning_rate": 4.468250666389846e-05, "loss": 0.8934, "num_input_tokens_seen": 74695360, "step": 1755 }, { "epoch": 0.4243775996142022, "grad_norm": 0.8496319055557251, "learning_rate": 4.4653278425386066e-05, "loss": 0.757, "num_input_tokens_seen": 74915336, "step": 1760 }, { "epoch": 0.42558321779492436, "grad_norm": 1.1456303596496582, "learning_rate": 4.462397969378026e-05, "loss": 0.9025, "num_input_tokens_seen": 75130504, "step": 1765 }, { "epoch": 0.4267888359756465, "grad_norm": 1.5711309909820557, "learning_rate": 4.459461057417078e-05, "loss": 0.8719, "num_input_tokens_seen": 75348376, "step": 1770 }, { "epoch": 0.42799445415636866, "grad_norm": 1.0066384077072144, "learning_rate": 4.456517117189987e-05, "loss": 0.8712, "num_input_tokens_seen": 75559976, "step": 1775 }, { "epoch": 0.42920007233709084, "grad_norm": 1.0628305673599243, "learning_rate": 4.4535661592561815e-05, "loss": 0.8701, "num_input_tokens_seen": 75779528, "step": 1780 }, { "epoch": 0.430405690517813, "grad_norm": 1.0169196128845215, "learning_rate": 4.4506081942002687e-05, "loss": 0.922, "num_input_tokens_seen": 75995104, "step": 1785 }, { "epoch": 0.4316113086985352, "grad_norm": 1.8851474523544312, "learning_rate": 4.4476432326319816e-05, "loss": 0.8929, "num_input_tokens_seen": 76204832, "step": 1790 }, { "epoch": 0.4328169268792573, "grad_norm": 0.9910247921943665, "learning_rate": 4.444671285186155e-05, "loss": 0.8565, "num_input_tokens_seen": 76421112, "step": 1795 }, { "epoch": 0.4340225450599795, "grad_norm": 1.0163054466247559, "learning_rate": 4.441692362522676e-05, "loss": 0.8321, "num_input_tokens_seen": 76636048, "step": 1800 }, { "epoch": 0.4352281632407017, "grad_norm": 2.8645615577697754, "learning_rate": 4.438706475326453e-05, "loss": 0.8878, "num_input_tokens_seen": 76850320, "step": 1805 }, { "epoch": 0.43643378142142386, "grad_norm": 1.0695887804031372, "learning_rate": 4.4357136343073756e-05, "loss": 0.9484, "num_input_tokens_seen": 77058224, "step": 1810 }, { "epoch": 0.437639399602146, "grad_norm": 1.030790090560913, "learning_rate": 4.432713850200275e-05, "loss": 0.8818, "num_input_tokens_seen": 77259216, "step": 1815 }, { "epoch": 0.43884501778286816, "grad_norm": 1.0077475309371948, "learning_rate": 4.429707133764885e-05, "loss": 0.914, "num_input_tokens_seen": 77463144, "step": 1820 }, { "epoch": 0.44005063596359034, "grad_norm": 0.969842255115509, "learning_rate": 4.4266934957858055e-05, "loss": 0.8839, "num_input_tokens_seen": 77673976, "step": 1825 }, { "epoch": 0.4412562541443125, "grad_norm": 0.9910542368888855, "learning_rate": 4.4236729470724624e-05, "loss": 0.9145, "num_input_tokens_seen": 77893288, "step": 1830 }, { "epoch": 0.44246187232503464, "grad_norm": 1.0562111139297485, "learning_rate": 4.42064549845907e-05, "loss": 0.9622, "num_input_tokens_seen": 78114832, "step": 1835 }, { "epoch": 0.4436674905057568, "grad_norm": 1.0072940587997437, "learning_rate": 4.417611160804591e-05, "loss": 0.9224, "num_input_tokens_seen": 78323128, "step": 1840 }, { "epoch": 0.444873108686479, "grad_norm": 0.968207061290741, "learning_rate": 4.414569944992699e-05, "loss": 0.8422, "num_input_tokens_seen": 78549648, "step": 1845 }, { "epoch": 0.4460787268672012, "grad_norm": 1.0113698244094849, "learning_rate": 4.411521861931736e-05, "loss": 0.8595, "num_input_tokens_seen": 78759408, "step": 1850 }, { "epoch": 0.4472843450479233, "grad_norm": 1.0367155075073242, "learning_rate": 4.408466922554678e-05, "loss": 0.8703, "num_input_tokens_seen": 78975328, "step": 1855 }, { "epoch": 0.4484899632286455, "grad_norm": 1.0319128036499023, "learning_rate": 4.4054051378190915e-05, "loss": 0.8338, "num_input_tokens_seen": 79203968, "step": 1860 }, { "epoch": 0.44969558140936766, "grad_norm": 1.287933349609375, "learning_rate": 4.402336518707099e-05, "loss": 0.877, "num_input_tokens_seen": 79431976, "step": 1865 }, { "epoch": 0.45090119959008984, "grad_norm": 0.9600107669830322, "learning_rate": 4.399261076225333e-05, "loss": 0.9126, "num_input_tokens_seen": 79633440, "step": 1870 }, { "epoch": 0.45210681777081196, "grad_norm": 0.9947012662887573, "learning_rate": 4.396178821404904e-05, "loss": 0.8829, "num_input_tokens_seen": 79835256, "step": 1875 }, { "epoch": 0.45331243595153414, "grad_norm": 0.9866642355918884, "learning_rate": 4.393089765301354e-05, "loss": 0.8875, "num_input_tokens_seen": 80056552, "step": 1880 }, { "epoch": 0.4545180541322563, "grad_norm": 0.9471333026885986, "learning_rate": 4.389993918994624e-05, "loss": 0.8274, "num_input_tokens_seen": 80280552, "step": 1885 }, { "epoch": 0.4557236723129785, "grad_norm": 1.0181150436401367, "learning_rate": 4.386891293589007e-05, "loss": 0.9256, "num_input_tokens_seen": 80485144, "step": 1890 }, { "epoch": 0.4569292904937006, "grad_norm": 1.093173623085022, "learning_rate": 4.383781900213112e-05, "loss": 0.8879, "num_input_tokens_seen": 80694472, "step": 1895 }, { "epoch": 0.4581349086744228, "grad_norm": 0.9689379930496216, "learning_rate": 4.380665750019824e-05, "loss": 0.9113, "num_input_tokens_seen": 80893296, "step": 1900 }, { "epoch": 0.459340526855145, "grad_norm": 1.8322420120239258, "learning_rate": 4.3775428541862654e-05, "loss": 0.895, "num_input_tokens_seen": 81108520, "step": 1905 }, { "epoch": 0.46054614503586716, "grad_norm": 0.9644396901130676, "learning_rate": 4.3744132239137514e-05, "loss": 0.9255, "num_input_tokens_seen": 81316560, "step": 1910 }, { "epoch": 0.4617517632165893, "grad_norm": 0.9988536238670349, "learning_rate": 4.371276870427753e-05, "loss": 0.8893, "num_input_tokens_seen": 81521808, "step": 1915 }, { "epoch": 0.46295738139731146, "grad_norm": 0.9600037336349487, "learning_rate": 4.368133804977858e-05, "loss": 0.9003, "num_input_tokens_seen": 81740472, "step": 1920 }, { "epoch": 0.46416299957803364, "grad_norm": 1.0125172138214111, "learning_rate": 4.364984038837727e-05, "loss": 0.8675, "num_input_tokens_seen": 81952648, "step": 1925 }, { "epoch": 0.4653686177587558, "grad_norm": 0.9779416918754578, "learning_rate": 4.361827583305057e-05, "loss": 0.8456, "num_input_tokens_seen": 82165632, "step": 1930 }, { "epoch": 0.46657423593947794, "grad_norm": 1.188330054283142, "learning_rate": 4.358664449701536e-05, "loss": 0.9547, "num_input_tokens_seen": 82384576, "step": 1935 }, { "epoch": 0.4677798541202001, "grad_norm": 1.2116247415542603, "learning_rate": 4.355494649372807e-05, "loss": 0.8647, "num_input_tokens_seen": 82602392, "step": 1940 }, { "epoch": 0.4689854723009223, "grad_norm": 1.0101758241653442, "learning_rate": 4.352318193688425e-05, "loss": 0.9327, "num_input_tokens_seen": 82814336, "step": 1945 }, { "epoch": 0.4701910904816445, "grad_norm": 1.002609372138977, "learning_rate": 4.349135094041817e-05, "loss": 0.8563, "num_input_tokens_seen": 83027968, "step": 1950 }, { "epoch": 0.4713967086623666, "grad_norm": 0.9704388976097107, "learning_rate": 4.3459453618502405e-05, "loss": 0.8628, "num_input_tokens_seen": 83238576, "step": 1955 }, { "epoch": 0.4726023268430888, "grad_norm": 1.0719282627105713, "learning_rate": 4.342749008554743e-05, "loss": 0.8802, "num_input_tokens_seen": 83449768, "step": 1960 }, { "epoch": 0.47380794502381096, "grad_norm": 0.9311699271202087, "learning_rate": 4.33954604562012e-05, "loss": 0.8766, "num_input_tokens_seen": 83657016, "step": 1965 }, { "epoch": 0.47501356320453314, "grad_norm": 1.0356444120407104, "learning_rate": 4.3363364845348755e-05, "loss": 0.8767, "num_input_tokens_seen": 83862304, "step": 1970 }, { "epoch": 0.47621918138525526, "grad_norm": 0.9390013217926025, "learning_rate": 4.333120336811181e-05, "loss": 0.8689, "num_input_tokens_seen": 84075576, "step": 1975 }, { "epoch": 0.47742479956597744, "grad_norm": 1.0459247827529907, "learning_rate": 4.3298976139848305e-05, "loss": 0.8786, "num_input_tokens_seen": 84289416, "step": 1980 }, { "epoch": 0.4786304177466996, "grad_norm": 0.9590256810188293, "learning_rate": 4.3266683276152043e-05, "loss": 0.8692, "num_input_tokens_seen": 84499344, "step": 1985 }, { "epoch": 0.4798360359274218, "grad_norm": 1.050489902496338, "learning_rate": 4.323432489285223e-05, "loss": 0.9151, "num_input_tokens_seen": 84712688, "step": 1990 }, { "epoch": 0.4810416541081439, "grad_norm": 0.945336103439331, "learning_rate": 4.320190110601311e-05, "loss": 0.8794, "num_input_tokens_seen": 84919856, "step": 1995 }, { "epoch": 0.4822472722888661, "grad_norm": 0.9381400942802429, "learning_rate": 4.316941203193349e-05, "loss": 0.9303, "num_input_tokens_seen": 85127552, "step": 2000 }, { "epoch": 0.4834528904695883, "grad_norm": 1.0190868377685547, "learning_rate": 4.313685778714636e-05, "loss": 0.8515, "num_input_tokens_seen": 85344848, "step": 2005 }, { "epoch": 0.48465850865031046, "grad_norm": 1.012560248374939, "learning_rate": 4.310423848841847e-05, "loss": 0.8259, "num_input_tokens_seen": 85569632, "step": 2010 }, { "epoch": 0.48586412683103264, "grad_norm": 0.9798316955566406, "learning_rate": 4.307155425274991e-05, "loss": 0.8347, "num_input_tokens_seen": 85793424, "step": 2015 }, { "epoch": 0.48706974501175476, "grad_norm": 1.0481104850769043, "learning_rate": 4.3038805197373685e-05, "loss": 0.8385, "num_input_tokens_seen": 86005296, "step": 2020 }, { "epoch": 0.48827536319247694, "grad_norm": 0.9876573085784912, "learning_rate": 4.300599143975529e-05, "loss": 0.8785, "num_input_tokens_seen": 86221856, "step": 2025 }, { "epoch": 0.4894809813731991, "grad_norm": 1.0380154848098755, "learning_rate": 4.297311309759231e-05, "loss": 0.8624, "num_input_tokens_seen": 86429952, "step": 2030 }, { "epoch": 0.4906865995539213, "grad_norm": 1.0706663131713867, "learning_rate": 4.294017028881397e-05, "loss": 0.8845, "num_input_tokens_seen": 86638136, "step": 2035 }, { "epoch": 0.4918922177346434, "grad_norm": 2.4976260662078857, "learning_rate": 4.2907163131580755e-05, "loss": 0.9028, "num_input_tokens_seen": 86846008, "step": 2040 }, { "epoch": 0.4930978359153656, "grad_norm": 0.8978025913238525, "learning_rate": 4.2874091744283906e-05, "loss": 0.8885, "num_input_tokens_seen": 87063200, "step": 2045 }, { "epoch": 0.4943034540960878, "grad_norm": 0.9562691450119019, "learning_rate": 4.284095624554509e-05, "loss": 0.877, "num_input_tokens_seen": 87280888, "step": 2050 }, { "epoch": 0.49550907227680996, "grad_norm": 1.3563538789749146, "learning_rate": 4.280775675421593e-05, "loss": 0.8765, "num_input_tokens_seen": 87488016, "step": 2055 }, { "epoch": 0.4967146904575321, "grad_norm": 1.0287649631500244, "learning_rate": 4.2774493389377545e-05, "loss": 0.8648, "num_input_tokens_seen": 87703176, "step": 2060 }, { "epoch": 0.49792030863825426, "grad_norm": 1.2325608730316162, "learning_rate": 4.2741166270340205e-05, "loss": 0.8641, "num_input_tokens_seen": 87922832, "step": 2065 }, { "epoch": 0.49912592681897644, "grad_norm": 1.273120403289795, "learning_rate": 4.270777551664282e-05, "loss": 0.8921, "num_input_tokens_seen": 88137296, "step": 2070 }, { "epoch": 0.5003315449996986, "grad_norm": 0.912801206111908, "learning_rate": 4.267432124805256e-05, "loss": 0.87, "num_input_tokens_seen": 88341720, "step": 2075 }, { "epoch": 0.5015371631804207, "grad_norm": 1.0284984111785889, "learning_rate": 4.264080358456441e-05, "loss": 0.8901, "num_input_tokens_seen": 88563616, "step": 2080 }, { "epoch": 0.5027427813611429, "grad_norm": 1.0365489721298218, "learning_rate": 4.260722264640075e-05, "loss": 0.8375, "num_input_tokens_seen": 88771104, "step": 2085 }, { "epoch": 0.5039483995418651, "grad_norm": 1.0729199647903442, "learning_rate": 4.257357855401089e-05, "loss": 0.8879, "num_input_tokens_seen": 88977696, "step": 2090 }, { "epoch": 0.5051540177225873, "grad_norm": 0.937209963798523, "learning_rate": 4.253987142807072e-05, "loss": 0.8793, "num_input_tokens_seen": 89190920, "step": 2095 }, { "epoch": 0.5063596359033095, "grad_norm": 1.8470715284347534, "learning_rate": 4.250610138948215e-05, "loss": 0.904, "num_input_tokens_seen": 89405656, "step": 2100 }, { "epoch": 0.5075652540840316, "grad_norm": 0.9279716610908508, "learning_rate": 4.2472268559372795e-05, "loss": 0.857, "num_input_tokens_seen": 89628096, "step": 2105 }, { "epoch": 0.5087708722647537, "grad_norm": 1.032220482826233, "learning_rate": 4.2438373059095486e-05, "loss": 0.8435, "num_input_tokens_seen": 89836488, "step": 2110 }, { "epoch": 0.5099764904454759, "grad_norm": 0.9627379775047302, "learning_rate": 4.240441501022783e-05, "loss": 0.8274, "num_input_tokens_seen": 90045008, "step": 2115 }, { "epoch": 0.5111821086261981, "grad_norm": 0.9730749726295471, "learning_rate": 4.237039453457179e-05, "loss": 0.8539, "num_input_tokens_seen": 90267664, "step": 2120 }, { "epoch": 0.5123877268069202, "grad_norm": 0.9753642082214355, "learning_rate": 4.2336311754153243e-05, "loss": 0.8844, "num_input_tokens_seen": 90486552, "step": 2125 }, { "epoch": 0.5135933449876424, "grad_norm": 1.1769671440124512, "learning_rate": 4.230216679122154e-05, "loss": 0.8186, "num_input_tokens_seen": 90696456, "step": 2130 }, { "epoch": 0.5147989631683646, "grad_norm": 1.0043829679489136, "learning_rate": 4.22679597682491e-05, "loss": 0.9042, "num_input_tokens_seen": 90915056, "step": 2135 }, { "epoch": 0.5160045813490868, "grad_norm": 0.9277008771896362, "learning_rate": 4.223369080793088e-05, "loss": 0.8264, "num_input_tokens_seen": 91124080, "step": 2140 }, { "epoch": 0.517210199529809, "grad_norm": 1.0587533712387085, "learning_rate": 4.219936003318405e-05, "loss": 0.8643, "num_input_tokens_seen": 91335016, "step": 2145 }, { "epoch": 0.518415817710531, "grad_norm": 0.9981306791305542, "learning_rate": 4.2164967567147495e-05, "loss": 0.9289, "num_input_tokens_seen": 91555856, "step": 2150 }, { "epoch": 0.5196214358912532, "grad_norm": 1.0546942949295044, "learning_rate": 4.213051353318135e-05, "loss": 0.968, "num_input_tokens_seen": 91768304, "step": 2155 }, { "epoch": 0.5208270540719754, "grad_norm": 1.0108380317687988, "learning_rate": 4.209599805486658e-05, "loss": 0.8957, "num_input_tokens_seen": 91969648, "step": 2160 }, { "epoch": 0.5220326722526976, "grad_norm": 1.0711324214935303, "learning_rate": 4.206142125600458e-05, "loss": 0.8896, "num_input_tokens_seen": 92179808, "step": 2165 }, { "epoch": 0.5232382904334197, "grad_norm": 1.0854318141937256, "learning_rate": 4.202678326061667e-05, "loss": 0.9457, "num_input_tokens_seen": 92400240, "step": 2170 }, { "epoch": 0.5244439086141419, "grad_norm": 1.0108792781829834, "learning_rate": 4.199208419294365e-05, "loss": 0.8512, "num_input_tokens_seen": 92609928, "step": 2175 }, { "epoch": 0.5256495267948641, "grad_norm": 0.9412888288497925, "learning_rate": 4.195732417744542e-05, "loss": 0.8473, "num_input_tokens_seen": 92816744, "step": 2180 }, { "epoch": 0.5268551449755863, "grad_norm": 0.9911028146743774, "learning_rate": 4.192250333880045e-05, "loss": 0.9032, "num_input_tokens_seen": 93027576, "step": 2185 }, { "epoch": 0.5280607631563083, "grad_norm": 0.9824265837669373, "learning_rate": 4.1887621801905396e-05, "loss": 0.8966, "num_input_tokens_seen": 93235568, "step": 2190 }, { "epoch": 0.5292663813370305, "grad_norm": 1.1147829294204712, "learning_rate": 4.185267969187463e-05, "loss": 0.8848, "num_input_tokens_seen": 93449072, "step": 2195 }, { "epoch": 0.5304719995177527, "grad_norm": 1.0127527713775635, "learning_rate": 4.181767713403976e-05, "loss": 0.864, "num_input_tokens_seen": 93654104, "step": 2200 }, { "epoch": 0.5316776176984749, "grad_norm": 0.9450674653053284, "learning_rate": 4.178261425394926e-05, "loss": 0.9139, "num_input_tokens_seen": 93856176, "step": 2205 }, { "epoch": 0.5328832358791971, "grad_norm": 0.9451528191566467, "learning_rate": 4.174749117736793e-05, "loss": 0.8963, "num_input_tokens_seen": 94075664, "step": 2210 }, { "epoch": 0.5340888540599192, "grad_norm": 1.063743233680725, "learning_rate": 4.1712308030276494e-05, "loss": 0.8938, "num_input_tokens_seen": 94286888, "step": 2215 }, { "epoch": 0.5352944722406414, "grad_norm": 1.0185256004333496, "learning_rate": 4.167706493887115e-05, "loss": 0.8586, "num_input_tokens_seen": 94501120, "step": 2220 }, { "epoch": 0.5365000904213636, "grad_norm": 0.9913762807846069, "learning_rate": 4.164176202956309e-05, "loss": 0.8591, "num_input_tokens_seen": 94716872, "step": 2225 }, { "epoch": 0.5377057086020858, "grad_norm": 0.971479594707489, "learning_rate": 4.160639942897808e-05, "loss": 0.8747, "num_input_tokens_seen": 94936000, "step": 2230 }, { "epoch": 0.5389113267828078, "grad_norm": 1.2606984376907349, "learning_rate": 4.157097726395599e-05, "loss": 0.8803, "num_input_tokens_seen": 95156248, "step": 2235 }, { "epoch": 0.54011694496353, "grad_norm": 0.9564799666404724, "learning_rate": 4.153549566155032e-05, "loss": 0.9039, "num_input_tokens_seen": 95362832, "step": 2240 }, { "epoch": 0.5413225631442522, "grad_norm": 1.0540192127227783, "learning_rate": 4.149995474902776e-05, "loss": 0.8726, "num_input_tokens_seen": 95573528, "step": 2245 }, { "epoch": 0.5425281813249744, "grad_norm": 0.958853006362915, "learning_rate": 4.146435465386775e-05, "loss": 0.8692, "num_input_tokens_seen": 95782784, "step": 2250 }, { "epoch": 0.5437337995056966, "grad_norm": 1.0907020568847656, "learning_rate": 4.1428695503762035e-05, "loss": 0.8315, "num_input_tokens_seen": 96005984, "step": 2255 }, { "epoch": 0.5449394176864187, "grad_norm": 1.0126972198486328, "learning_rate": 4.139297742661411e-05, "loss": 0.8594, "num_input_tokens_seen": 96208336, "step": 2260 }, { "epoch": 0.5461450358671409, "grad_norm": 1.1242884397506714, "learning_rate": 4.13572005505389e-05, "loss": 0.8891, "num_input_tokens_seen": 96415976, "step": 2265 }, { "epoch": 0.5473506540478631, "grad_norm": 1.3613101243972778, "learning_rate": 4.13213650038622e-05, "loss": 0.8291, "num_input_tokens_seen": 96635312, "step": 2270 }, { "epoch": 0.5485562722285852, "grad_norm": 0.9981886744499207, "learning_rate": 4.128547091512023e-05, "loss": 0.8314, "num_input_tokens_seen": 96843120, "step": 2275 }, { "epoch": 0.5497618904093073, "grad_norm": 0.8886982798576355, "learning_rate": 4.124951841305924e-05, "loss": 0.8697, "num_input_tokens_seen": 97047688, "step": 2280 }, { "epoch": 0.5509675085900295, "grad_norm": 1.0462706089019775, "learning_rate": 4.121350762663496e-05, "loss": 0.8562, "num_input_tokens_seen": 97254264, "step": 2285 }, { "epoch": 0.5521731267707517, "grad_norm": 0.9085835218429565, "learning_rate": 4.117743868501218e-05, "loss": 0.8475, "num_input_tokens_seen": 97464224, "step": 2290 }, { "epoch": 0.5533787449514739, "grad_norm": 1.6278166770935059, "learning_rate": 4.11413117175643e-05, "loss": 0.8646, "num_input_tokens_seen": 97677928, "step": 2295 }, { "epoch": 0.5545843631321961, "grad_norm": 1.0514858961105347, "learning_rate": 4.1105126853872845e-05, "loss": 0.94, "num_input_tokens_seen": 97892760, "step": 2300 }, { "epoch": 0.5557899813129182, "grad_norm": 1.1050893068313599, "learning_rate": 4.1068884223726994e-05, "loss": 0.9246, "num_input_tokens_seen": 98096592, "step": 2305 }, { "epoch": 0.5569955994936404, "grad_norm": 0.9537079930305481, "learning_rate": 4.1032583957123125e-05, "loss": 0.8721, "num_input_tokens_seen": 98306936, "step": 2310 }, { "epoch": 0.5582012176743625, "grad_norm": 1.1096903085708618, "learning_rate": 4.099622618426436e-05, "loss": 0.8303, "num_input_tokens_seen": 98520240, "step": 2315 }, { "epoch": 0.5594068358550847, "grad_norm": 0.977304220199585, "learning_rate": 4.0959811035560067e-05, "loss": 0.8754, "num_input_tokens_seen": 98733680, "step": 2320 }, { "epoch": 0.5606124540358068, "grad_norm": 0.9519372582435608, "learning_rate": 4.092333864162545e-05, "loss": 0.8761, "num_input_tokens_seen": 98955144, "step": 2325 }, { "epoch": 0.561818072216529, "grad_norm": 1.0054187774658203, "learning_rate": 4.0886809133281e-05, "loss": 0.9038, "num_input_tokens_seen": 99167176, "step": 2330 }, { "epoch": 0.5630236903972512, "grad_norm": 0.9660202264785767, "learning_rate": 4.085022264155208e-05, "loss": 0.8947, "num_input_tokens_seen": 99389856, "step": 2335 }, { "epoch": 0.5642293085779734, "grad_norm": 1.0499833822250366, "learning_rate": 4.0813579297668466e-05, "loss": 0.8452, "num_input_tokens_seen": 99601416, "step": 2340 }, { "epoch": 0.5654349267586956, "grad_norm": 0.8520085215568542, "learning_rate": 4.077687923306383e-05, "loss": 0.8375, "num_input_tokens_seen": 99825624, "step": 2345 }, { "epoch": 0.5666405449394177, "grad_norm": 0.934125542640686, "learning_rate": 4.0740122579375286e-05, "loss": 0.8371, "num_input_tokens_seen": 100045976, "step": 2350 }, { "epoch": 0.5678461631201398, "grad_norm": 0.9949510097503662, "learning_rate": 4.070330946844295e-05, "loss": 0.8465, "num_input_tokens_seen": 100252088, "step": 2355 }, { "epoch": 0.569051781300862, "grad_norm": 0.9440064430236816, "learning_rate": 4.066644003230942e-05, "loss": 0.8836, "num_input_tokens_seen": 100458752, "step": 2360 }, { "epoch": 0.5702573994815842, "grad_norm": 1.0115270614624023, "learning_rate": 4.062951440321933e-05, "loss": 0.8456, "num_input_tokens_seen": 100680672, "step": 2365 }, { "epoch": 0.5714630176623063, "grad_norm": 1.177482008934021, "learning_rate": 4.059253271361886e-05, "loss": 0.9517, "num_input_tokens_seen": 100890424, "step": 2370 }, { "epoch": 0.5726686358430285, "grad_norm": 1.1401214599609375, "learning_rate": 4.05554950961553e-05, "loss": 0.8511, "num_input_tokens_seen": 101098408, "step": 2375 }, { "epoch": 0.5738742540237507, "grad_norm": 1.1411595344543457, "learning_rate": 4.05184016836765e-05, "loss": 0.8487, "num_input_tokens_seen": 101309800, "step": 2380 }, { "epoch": 0.5750798722044729, "grad_norm": 1.142316222190857, "learning_rate": 4.048125260923047e-05, "loss": 0.8574, "num_input_tokens_seen": 101519128, "step": 2385 }, { "epoch": 0.576285490385195, "grad_norm": 1.0340522527694702, "learning_rate": 4.044404800606486e-05, "loss": 0.8721, "num_input_tokens_seen": 101735288, "step": 2390 }, { "epoch": 0.5774911085659171, "grad_norm": 0.9926255941390991, "learning_rate": 4.04067880076265e-05, "loss": 0.9365, "num_input_tokens_seen": 101953272, "step": 2395 }, { "epoch": 0.5786967267466393, "grad_norm": 1.0671162605285645, "learning_rate": 4.03694727475609e-05, "loss": 0.8686, "num_input_tokens_seen": 102155672, "step": 2400 }, { "epoch": 0.5799023449273615, "grad_norm": 2.0279886722564697, "learning_rate": 4.033210235971179e-05, "loss": 0.9497, "num_input_tokens_seen": 102371776, "step": 2405 }, { "epoch": 0.5811079631080837, "grad_norm": 1.6203309297561646, "learning_rate": 4.029467697812064e-05, "loss": 0.9144, "num_input_tokens_seen": 102565872, "step": 2410 }, { "epoch": 0.5823135812888058, "grad_norm": 0.9298354387283325, "learning_rate": 4.0257196737026156e-05, "loss": 0.8636, "num_input_tokens_seen": 102775960, "step": 2415 }, { "epoch": 0.583519199469528, "grad_norm": 0.9989774823188782, "learning_rate": 4.0219661770863845e-05, "loss": 0.8666, "num_input_tokens_seen": 102990696, "step": 2420 }, { "epoch": 0.5847248176502502, "grad_norm": 1.0378210544586182, "learning_rate": 4.018207221426548e-05, "loss": 0.9192, "num_input_tokens_seen": 103197184, "step": 2425 }, { "epoch": 0.5859304358309724, "grad_norm": 0.9344012141227722, "learning_rate": 4.014442820205865e-05, "loss": 0.8582, "num_input_tokens_seen": 103411256, "step": 2430 }, { "epoch": 0.5871360540116944, "grad_norm": 1.0504661798477173, "learning_rate": 4.010672986926627e-05, "loss": 0.8943, "num_input_tokens_seen": 103621336, "step": 2435 }, { "epoch": 0.5883416721924166, "grad_norm": 1.012403964996338, "learning_rate": 4.006897735110608e-05, "loss": 0.8763, "num_input_tokens_seen": 103848608, "step": 2440 }, { "epoch": 0.5895472903731388, "grad_norm": 1.0709983110427856, "learning_rate": 4.003117078299021e-05, "loss": 0.8367, "num_input_tokens_seen": 104061656, "step": 2445 }, { "epoch": 0.590752908553861, "grad_norm": 1.049465298652649, "learning_rate": 3.99933103005246e-05, "loss": 0.777, "num_input_tokens_seen": 104280408, "step": 2450 }, { "epoch": 0.5919585267345832, "grad_norm": 1.1090967655181885, "learning_rate": 3.995539603950862e-05, "loss": 0.8922, "num_input_tokens_seen": 104500440, "step": 2455 }, { "epoch": 0.5931641449153053, "grad_norm": 1.0203666687011719, "learning_rate": 3.991742813593453e-05, "loss": 0.8727, "num_input_tokens_seen": 104713288, "step": 2460 }, { "epoch": 0.5943697630960275, "grad_norm": 0.8837824463844299, "learning_rate": 3.9879406725986965e-05, "loss": 0.8746, "num_input_tokens_seen": 104936152, "step": 2465 }, { "epoch": 0.5955753812767497, "grad_norm": 0.9739139676094055, "learning_rate": 3.984133194604251e-05, "loss": 0.8599, "num_input_tokens_seen": 105143240, "step": 2470 }, { "epoch": 0.5967809994574719, "grad_norm": 1.058484673500061, "learning_rate": 3.980320393266918e-05, "loss": 0.8757, "num_input_tokens_seen": 105364864, "step": 2475 }, { "epoch": 0.5979866176381939, "grad_norm": 1.253820776939392, "learning_rate": 3.97650228226259e-05, "loss": 0.8016, "num_input_tokens_seen": 105579872, "step": 2480 }, { "epoch": 0.5991922358189161, "grad_norm": 1.009536623954773, "learning_rate": 3.9726788752862075e-05, "loss": 0.938, "num_input_tokens_seen": 105787680, "step": 2485 }, { "epoch": 0.6003978539996383, "grad_norm": 1.0429348945617676, "learning_rate": 3.9688501860517055e-05, "loss": 0.8591, "num_input_tokens_seen": 106003264, "step": 2490 }, { "epoch": 0.6016034721803605, "grad_norm": 1.1083070039749146, "learning_rate": 3.9650162282919655e-05, "loss": 0.8671, "num_input_tokens_seen": 106215224, "step": 2495 }, { "epoch": 0.6028090903610827, "grad_norm": 1.0113500356674194, "learning_rate": 3.961177015758767e-05, "loss": 0.9668, "num_input_tokens_seen": 106421936, "step": 2500 }, { "epoch": 0.6040147085418048, "grad_norm": 0.9991928339004517, "learning_rate": 3.9573325622227365e-05, "loss": 0.863, "num_input_tokens_seen": 106626560, "step": 2505 }, { "epoch": 0.605220326722527, "grad_norm": 1.0193400382995605, "learning_rate": 3.9534828814733e-05, "loss": 0.8393, "num_input_tokens_seen": 106844288, "step": 2510 }, { "epoch": 0.6064259449032492, "grad_norm": 1.889656662940979, "learning_rate": 3.949627987318633e-05, "loss": 0.8691, "num_input_tokens_seen": 107059776, "step": 2515 }, { "epoch": 0.6076315630839713, "grad_norm": 0.9568857550621033, "learning_rate": 3.945767893585608e-05, "loss": 0.9128, "num_input_tokens_seen": 107260704, "step": 2520 }, { "epoch": 0.6088371812646934, "grad_norm": 0.9454445242881775, "learning_rate": 3.9419026141197513e-05, "loss": 0.8414, "num_input_tokens_seen": 107468880, "step": 2525 }, { "epoch": 0.6100427994454156, "grad_norm": 1.0334218740463257, "learning_rate": 3.9380321627851866e-05, "loss": 0.8117, "num_input_tokens_seen": 107680736, "step": 2530 }, { "epoch": 0.6112484176261378, "grad_norm": 1.1677589416503906, "learning_rate": 3.934156553464591e-05, "loss": 0.9207, "num_input_tokens_seen": 107901568, "step": 2535 }, { "epoch": 0.61245403580686, "grad_norm": 1.1337138414382935, "learning_rate": 3.93027580005914e-05, "loss": 0.917, "num_input_tokens_seen": 108106672, "step": 2540 }, { "epoch": 0.6136596539875822, "grad_norm": 1.007638931274414, "learning_rate": 3.92638991648846e-05, "loss": 0.8499, "num_input_tokens_seen": 108311624, "step": 2545 }, { "epoch": 0.6148652721683043, "grad_norm": 1.3441106081008911, "learning_rate": 3.922498916690581e-05, "loss": 0.9152, "num_input_tokens_seen": 108523488, "step": 2550 }, { "epoch": 0.6160708903490265, "grad_norm": 0.9456208944320679, "learning_rate": 3.918602814621882e-05, "loss": 0.8448, "num_input_tokens_seen": 108742704, "step": 2555 }, { "epoch": 0.6172765085297486, "grad_norm": 1.0522723197937012, "learning_rate": 3.914701624257043e-05, "loss": 0.9138, "num_input_tokens_seen": 108958768, "step": 2560 }, { "epoch": 0.6184821267104708, "grad_norm": 1.0404298305511475, "learning_rate": 3.910795359588996e-05, "loss": 0.8847, "num_input_tokens_seen": 109168312, "step": 2565 }, { "epoch": 0.6196877448911929, "grad_norm": 1.0700573921203613, "learning_rate": 3.906884034628873e-05, "loss": 0.904, "num_input_tokens_seen": 109379640, "step": 2570 }, { "epoch": 0.6208933630719151, "grad_norm": 1.06218683719635, "learning_rate": 3.902967663405956e-05, "loss": 0.8737, "num_input_tokens_seen": 109594264, "step": 2575 }, { "epoch": 0.6220989812526373, "grad_norm": 1.0209461450576782, "learning_rate": 3.899046259967628e-05, "loss": 0.8403, "num_input_tokens_seen": 109810368, "step": 2580 }, { "epoch": 0.6233045994333595, "grad_norm": 3.143138885498047, "learning_rate": 3.895119838379323e-05, "loss": 0.822, "num_input_tokens_seen": 110021712, "step": 2585 }, { "epoch": 0.6245102176140817, "grad_norm": 1.0705281496047974, "learning_rate": 3.891188412724469e-05, "loss": 0.8813, "num_input_tokens_seen": 110236248, "step": 2590 }, { "epoch": 0.6257158357948038, "grad_norm": 0.9724747538566589, "learning_rate": 3.8872519971044496e-05, "loss": 0.8861, "num_input_tokens_seen": 110448448, "step": 2595 }, { "epoch": 0.6269214539755259, "grad_norm": 1.1382485628128052, "learning_rate": 3.883310605638543e-05, "loss": 0.8961, "num_input_tokens_seen": 110661096, "step": 2600 }, { "epoch": 0.6281270721562481, "grad_norm": 1.1390583515167236, "learning_rate": 3.879364252463874e-05, "loss": 0.8701, "num_input_tokens_seen": 110873256, "step": 2605 }, { "epoch": 0.6293326903369703, "grad_norm": 0.9645174145698547, "learning_rate": 3.8754129517353665e-05, "loss": 0.8238, "num_input_tokens_seen": 111076504, "step": 2610 }, { "epoch": 0.6305383085176924, "grad_norm": 1.0927597284317017, "learning_rate": 3.87145671762569e-05, "loss": 0.8579, "num_input_tokens_seen": 111289144, "step": 2615 }, { "epoch": 0.6317439266984146, "grad_norm": 1.014082908630371, "learning_rate": 3.8674955643252075e-05, "loss": 0.8473, "num_input_tokens_seen": 111513824, "step": 2620 }, { "epoch": 0.6329495448791368, "grad_norm": 1.0572882890701294, "learning_rate": 3.863529506041929e-05, "loss": 0.8393, "num_input_tokens_seen": 111727344, "step": 2625 }, { "epoch": 0.634155163059859, "grad_norm": 0.8850515484809875, "learning_rate": 3.859558557001456e-05, "loss": 0.8208, "num_input_tokens_seen": 111950776, "step": 2630 }, { "epoch": 0.6353607812405812, "grad_norm": 0.8921672701835632, "learning_rate": 3.855582731446933e-05, "loss": 0.8761, "num_input_tokens_seen": 112168472, "step": 2635 }, { "epoch": 0.6365663994213032, "grad_norm": 1.0492134094238281, "learning_rate": 3.851602043638994e-05, "loss": 0.8797, "num_input_tokens_seen": 112383184, "step": 2640 }, { "epoch": 0.6377720176020254, "grad_norm": 1.0101476907730103, "learning_rate": 3.8476165078557176e-05, "loss": 0.876, "num_input_tokens_seen": 112592328, "step": 2645 }, { "epoch": 0.6389776357827476, "grad_norm": 1.0663129091262817, "learning_rate": 3.843626138392566e-05, "loss": 0.8721, "num_input_tokens_seen": 112800264, "step": 2650 }, { "epoch": 0.6401832539634698, "grad_norm": 0.8912302851676941, "learning_rate": 3.839630949562343e-05, "loss": 0.8377, "num_input_tokens_seen": 113007136, "step": 2655 }, { "epoch": 0.6413888721441919, "grad_norm": 0.9328052997589111, "learning_rate": 3.8356309556951365e-05, "loss": 0.8705, "num_input_tokens_seen": 113218104, "step": 2660 }, { "epoch": 0.6425944903249141, "grad_norm": 1.2713571786880493, "learning_rate": 3.831626171138269e-05, "loss": 0.9125, "num_input_tokens_seen": 113420392, "step": 2665 }, { "epoch": 0.6438001085056363, "grad_norm": 0.8158183693885803, "learning_rate": 3.8276166102562495e-05, "loss": 0.9149, "num_input_tokens_seen": 113651752, "step": 2670 }, { "epoch": 0.6450057266863585, "grad_norm": 0.977465033531189, "learning_rate": 3.823602287430715e-05, "loss": 0.8782, "num_input_tokens_seen": 113857816, "step": 2675 }, { "epoch": 0.6462113448670807, "grad_norm": 0.9960053563117981, "learning_rate": 3.819583217060384e-05, "loss": 0.8915, "num_input_tokens_seen": 114064208, "step": 2680 }, { "epoch": 0.6474169630478027, "grad_norm": 1.1053845882415771, "learning_rate": 3.8155594135610064e-05, "loss": 0.8631, "num_input_tokens_seen": 114272240, "step": 2685 }, { "epoch": 0.6486225812285249, "grad_norm": 1.1738579273223877, "learning_rate": 3.811530891365305e-05, "loss": 0.8184, "num_input_tokens_seen": 114482440, "step": 2690 }, { "epoch": 0.6498281994092471, "grad_norm": 1.7948781251907349, "learning_rate": 3.8074976649229305e-05, "loss": 0.9034, "num_input_tokens_seen": 114704512, "step": 2695 }, { "epoch": 0.6510338175899693, "grad_norm": 1.101294755935669, "learning_rate": 3.8034597487004055e-05, "loss": 0.8684, "num_input_tokens_seen": 114925128, "step": 2700 }, { "epoch": 0.6522394357706914, "grad_norm": 1.029611587524414, "learning_rate": 3.799417157181076e-05, "loss": 0.8369, "num_input_tokens_seen": 115140848, "step": 2705 }, { "epoch": 0.6534450539514136, "grad_norm": 0.9372984766960144, "learning_rate": 3.7953699048650536e-05, "loss": 0.8195, "num_input_tokens_seen": 115353976, "step": 2710 }, { "epoch": 0.6546506721321358, "grad_norm": 1.0723334550857544, "learning_rate": 3.791318006269173e-05, "loss": 0.8571, "num_input_tokens_seen": 115570512, "step": 2715 }, { "epoch": 0.655856290312858, "grad_norm": 1.0645432472229004, "learning_rate": 3.7872614759269294e-05, "loss": 0.8624, "num_input_tokens_seen": 115779288, "step": 2720 }, { "epoch": 0.65706190849358, "grad_norm": 1.0881311893463135, "learning_rate": 3.783200328388434e-05, "loss": 0.8666, "num_input_tokens_seen": 115992536, "step": 2725 }, { "epoch": 0.6582675266743022, "grad_norm": 1.1028244495391846, "learning_rate": 3.779134578220358e-05, "loss": 0.8608, "num_input_tokens_seen": 116209080, "step": 2730 }, { "epoch": 0.6594731448550244, "grad_norm": 1.167299747467041, "learning_rate": 3.7750642400058823e-05, "loss": 0.891, "num_input_tokens_seen": 116422120, "step": 2735 }, { "epoch": 0.6606787630357466, "grad_norm": 0.9750291109085083, "learning_rate": 3.770989328344645e-05, "loss": 0.8567, "num_input_tokens_seen": 116648536, "step": 2740 }, { "epoch": 0.6618843812164688, "grad_norm": 1.1971921920776367, "learning_rate": 3.7669098578526846e-05, "loss": 0.8853, "num_input_tokens_seen": 116871696, "step": 2745 }, { "epoch": 0.6630899993971909, "grad_norm": 1.0117955207824707, "learning_rate": 3.762825843162397e-05, "loss": 0.9127, "num_input_tokens_seen": 117083288, "step": 2750 }, { "epoch": 0.6642956175779131, "grad_norm": 0.93205726146698, "learning_rate": 3.758737298922472e-05, "loss": 0.8967, "num_input_tokens_seen": 117296040, "step": 2755 }, { "epoch": 0.6655012357586353, "grad_norm": 1.0302330255508423, "learning_rate": 3.75464423979785e-05, "loss": 0.7956, "num_input_tokens_seen": 117518480, "step": 2760 }, { "epoch": 0.6667068539393574, "grad_norm": 0.8864453434944153, "learning_rate": 3.750546680469666e-05, "loss": 0.9054, "num_input_tokens_seen": 117737696, "step": 2765 }, { "epoch": 0.6679124721200795, "grad_norm": 0.9106459617614746, "learning_rate": 3.74644463563519e-05, "loss": 0.758, "num_input_tokens_seen": 117944488, "step": 2770 }, { "epoch": 0.6691180903008017, "grad_norm": 0.9680925011634827, "learning_rate": 3.742338120007791e-05, "loss": 0.8393, "num_input_tokens_seen": 118152520, "step": 2775 }, { "epoch": 0.6703237084815239, "grad_norm": 1.0835579633712769, "learning_rate": 3.738227148316865e-05, "loss": 0.7656, "num_input_tokens_seen": 118372168, "step": 2780 }, { "epoch": 0.6715293266622461, "grad_norm": 0.9953375458717346, "learning_rate": 3.7341117353077966e-05, "loss": 0.8416, "num_input_tokens_seen": 118586424, "step": 2785 }, { "epoch": 0.6727349448429683, "grad_norm": 0.9300422072410583, "learning_rate": 3.7299918957418966e-05, "loss": 0.9425, "num_input_tokens_seen": 118806232, "step": 2790 }, { "epoch": 0.6739405630236904, "grad_norm": 0.9841498732566833, "learning_rate": 3.725867644396358e-05, "loss": 0.8685, "num_input_tokens_seen": 119013832, "step": 2795 }, { "epoch": 0.6751461812044126, "grad_norm": 1.0520286560058594, "learning_rate": 3.721738996064193e-05, "loss": 0.8638, "num_input_tokens_seen": 119226528, "step": 2800 }, { "epoch": 0.6763517993851347, "grad_norm": 0.9749354124069214, "learning_rate": 3.7176059655541884e-05, "loss": 0.8441, "num_input_tokens_seen": 119437552, "step": 2805 }, { "epoch": 0.6775574175658569, "grad_norm": 1.0180864334106445, "learning_rate": 3.713468567690849e-05, "loss": 0.881, "num_input_tokens_seen": 119652040, "step": 2810 }, { "epoch": 0.678763035746579, "grad_norm": 1.0900230407714844, "learning_rate": 3.7093268173143426e-05, "loss": 0.7892, "num_input_tokens_seen": 119866552, "step": 2815 }, { "epoch": 0.6799686539273012, "grad_norm": 0.9280741810798645, "learning_rate": 3.7051807292804506e-05, "loss": 0.9135, "num_input_tokens_seen": 120073360, "step": 2820 }, { "epoch": 0.6811742721080234, "grad_norm": 0.9373129606246948, "learning_rate": 3.701030318460513e-05, "loss": 0.8808, "num_input_tokens_seen": 120282720, "step": 2825 }, { "epoch": 0.6823798902887456, "grad_norm": 1.0410915613174438, "learning_rate": 3.696875599741373e-05, "loss": 0.9337, "num_input_tokens_seen": 120491656, "step": 2830 }, { "epoch": 0.6835855084694678, "grad_norm": 1.1757481098175049, "learning_rate": 3.6927165880253266e-05, "loss": 0.8992, "num_input_tokens_seen": 120710640, "step": 2835 }, { "epoch": 0.6847911266501899, "grad_norm": 0.899632453918457, "learning_rate": 3.688553298230069e-05, "loss": 0.8161, "num_input_tokens_seen": 120923864, "step": 2840 }, { "epoch": 0.685996744830912, "grad_norm": 1.3205392360687256, "learning_rate": 3.6843857452886396e-05, "loss": 0.8961, "num_input_tokens_seen": 121138784, "step": 2845 }, { "epoch": 0.6872023630116342, "grad_norm": 1.4027127027511597, "learning_rate": 3.680213944149368e-05, "loss": 0.835, "num_input_tokens_seen": 121345408, "step": 2850 }, { "epoch": 0.6884079811923564, "grad_norm": 1.0866031646728516, "learning_rate": 3.676037909775823e-05, "loss": 0.8689, "num_input_tokens_seen": 121554016, "step": 2855 }, { "epoch": 0.6896135993730785, "grad_norm": 1.0311752557754517, "learning_rate": 3.6718576571467555e-05, "loss": 0.9147, "num_input_tokens_seen": 121760160, "step": 2860 }, { "epoch": 0.6908192175538007, "grad_norm": 0.982082188129425, "learning_rate": 3.6676732012560484e-05, "loss": 0.8857, "num_input_tokens_seen": 121965672, "step": 2865 }, { "epoch": 0.6920248357345229, "grad_norm": 1.4018497467041016, "learning_rate": 3.6634845571126595e-05, "loss": 0.8703, "num_input_tokens_seen": 122177728, "step": 2870 }, { "epoch": 0.6932304539152451, "grad_norm": 0.9450177550315857, "learning_rate": 3.659291739740571e-05, "loss": 0.9109, "num_input_tokens_seen": 122380312, "step": 2875 }, { "epoch": 0.6944360720959672, "grad_norm": 1.0262683629989624, "learning_rate": 3.655094764178731e-05, "loss": 0.8257, "num_input_tokens_seen": 122584960, "step": 2880 }, { "epoch": 0.6956416902766893, "grad_norm": 1.0550391674041748, "learning_rate": 3.650893645481005e-05, "loss": 0.8431, "num_input_tokens_seen": 122790336, "step": 2885 }, { "epoch": 0.6968473084574115, "grad_norm": 0.916770339012146, "learning_rate": 3.6466883987161174e-05, "loss": 0.8691, "num_input_tokens_seen": 123011280, "step": 2890 }, { "epoch": 0.6980529266381337, "grad_norm": 1.042137861251831, "learning_rate": 3.642479038967602e-05, "loss": 0.8567, "num_input_tokens_seen": 123226000, "step": 2895 }, { "epoch": 0.6992585448188559, "grad_norm": 1.078047752380371, "learning_rate": 3.638265581333742e-05, "loss": 0.8628, "num_input_tokens_seen": 123433568, "step": 2900 }, { "epoch": 0.700464162999578, "grad_norm": 1.1284925937652588, "learning_rate": 3.63404804092752e-05, "loss": 0.8657, "num_input_tokens_seen": 123640000, "step": 2905 }, { "epoch": 0.7016697811803002, "grad_norm": 0.9820643663406372, "learning_rate": 3.629826432876564e-05, "loss": 0.838, "num_input_tokens_seen": 123849072, "step": 2910 }, { "epoch": 0.7028753993610224, "grad_norm": 1.2253223657608032, "learning_rate": 3.6256007723230916e-05, "loss": 0.8778, "num_input_tokens_seen": 124060920, "step": 2915 }, { "epoch": 0.7040810175417446, "grad_norm": 1.0384769439697266, "learning_rate": 3.621371074423855e-05, "loss": 0.8273, "num_input_tokens_seen": 124264656, "step": 2920 }, { "epoch": 0.7052866357224667, "grad_norm": 1.40061616897583, "learning_rate": 3.6171373543500876e-05, "loss": 0.7833, "num_input_tokens_seen": 124483792, "step": 2925 }, { "epoch": 0.7064922539031888, "grad_norm": 4.481344699859619, "learning_rate": 3.612899627287452e-05, "loss": 0.8556, "num_input_tokens_seen": 124689728, "step": 2930 }, { "epoch": 0.707697872083911, "grad_norm": 0.9528825879096985, "learning_rate": 3.608657908435981e-05, "loss": 0.8339, "num_input_tokens_seen": 124904528, "step": 2935 }, { "epoch": 0.7089034902646332, "grad_norm": 0.9773399233818054, "learning_rate": 3.604412213010026e-05, "loss": 0.8755, "num_input_tokens_seen": 125118888, "step": 2940 }, { "epoch": 0.7101091084453554, "grad_norm": 1.0076602697372437, "learning_rate": 3.6001625562382016e-05, "loss": 0.7842, "num_input_tokens_seen": 125334240, "step": 2945 }, { "epoch": 0.7113147266260775, "grad_norm": 0.8572450280189514, "learning_rate": 3.5959089533633314e-05, "loss": 0.8052, "num_input_tokens_seen": 125551408, "step": 2950 }, { "epoch": 0.7125203448067997, "grad_norm": 1.092774748802185, "learning_rate": 3.5916514196423924e-05, "loss": 0.8878, "num_input_tokens_seen": 125781304, "step": 2955 }, { "epoch": 0.7137259629875219, "grad_norm": 1.0529296398162842, "learning_rate": 3.587389970346461e-05, "loss": 0.88, "num_input_tokens_seen": 125996840, "step": 2960 }, { "epoch": 0.7149315811682441, "grad_norm": 0.8798431158065796, "learning_rate": 3.583124620760659e-05, "loss": 0.8566, "num_input_tokens_seen": 126207008, "step": 2965 }, { "epoch": 0.7161371993489661, "grad_norm": 1.0709937810897827, "learning_rate": 3.578855386184098e-05, "loss": 0.8942, "num_input_tokens_seen": 126422648, "step": 2970 }, { "epoch": 0.7173428175296883, "grad_norm": 0.90622478723526, "learning_rate": 3.574582281929822e-05, "loss": 0.903, "num_input_tokens_seen": 126647464, "step": 2975 }, { "epoch": 0.7185484357104105, "grad_norm": 0.890801191329956, "learning_rate": 3.570305323324758e-05, "loss": 0.8569, "num_input_tokens_seen": 126870328, "step": 2980 }, { "epoch": 0.7197540538911327, "grad_norm": 1.3304017782211304, "learning_rate": 3.566024525709657e-05, "loss": 0.875, "num_input_tokens_seen": 127076264, "step": 2985 }, { "epoch": 0.7209596720718549, "grad_norm": 0.9527329206466675, "learning_rate": 3.561739904439038e-05, "loss": 0.8892, "num_input_tokens_seen": 127291336, "step": 2990 }, { "epoch": 0.722165290252577, "grad_norm": 1.0986108779907227, "learning_rate": 3.5574514748811374e-05, "loss": 0.9024, "num_input_tokens_seen": 127505392, "step": 2995 }, { "epoch": 0.7233709084332992, "grad_norm": 1.4437150955200195, "learning_rate": 3.55315925241785e-05, "loss": 0.8522, "num_input_tokens_seen": 127711008, "step": 3000 }, { "epoch": 0.7245765266140214, "grad_norm": 0.8954431414604187, "learning_rate": 3.548863252444675e-05, "loss": 0.8517, "num_input_tokens_seen": 127939120, "step": 3005 }, { "epoch": 0.7257821447947435, "grad_norm": 1.0404318571090698, "learning_rate": 3.544563490370661e-05, "loss": 0.8535, "num_input_tokens_seen": 128128216, "step": 3010 }, { "epoch": 0.7269877629754656, "grad_norm": 0.9409679770469666, "learning_rate": 3.540259981618353e-05, "loss": 0.8967, "num_input_tokens_seen": 128344232, "step": 3015 }, { "epoch": 0.7281933811561878, "grad_norm": 1.0856478214263916, "learning_rate": 3.5359527416237296e-05, "loss": 0.8694, "num_input_tokens_seen": 128561608, "step": 3020 }, { "epoch": 0.72939899933691, "grad_norm": 0.9041839838027954, "learning_rate": 3.5316417858361585e-05, "loss": 0.8326, "num_input_tokens_seen": 128775768, "step": 3025 }, { "epoch": 0.7306046175176322, "grad_norm": 1.1169114112854004, "learning_rate": 3.52732712971833e-05, "loss": 0.834, "num_input_tokens_seen": 128989608, "step": 3030 }, { "epoch": 0.7318102356983543, "grad_norm": 0.9338088035583496, "learning_rate": 3.523008788746211e-05, "loss": 0.8432, "num_input_tokens_seen": 129195528, "step": 3035 }, { "epoch": 0.7330158538790765, "grad_norm": 1.127341866493225, "learning_rate": 3.5186867784089844e-05, "loss": 0.8904, "num_input_tokens_seen": 129400000, "step": 3040 }, { "epoch": 0.7342214720597987, "grad_norm": 1.2860651016235352, "learning_rate": 3.514361114208993e-05, "loss": 0.8216, "num_input_tokens_seen": 129617592, "step": 3045 }, { "epoch": 0.7354270902405208, "grad_norm": 0.8987249135971069, "learning_rate": 3.5100318116616856e-05, "loss": 0.8647, "num_input_tokens_seen": 129835432, "step": 3050 }, { "epoch": 0.736632708421243, "grad_norm": 0.9867064952850342, "learning_rate": 3.505698886295564e-05, "loss": 0.8298, "num_input_tokens_seen": 130048632, "step": 3055 }, { "epoch": 0.7378383266019651, "grad_norm": 1.1246784925460815, "learning_rate": 3.501362353652121e-05, "loss": 0.8623, "num_input_tokens_seen": 130253528, "step": 3060 }, { "epoch": 0.7390439447826873, "grad_norm": 0.9166871309280396, "learning_rate": 3.4970222292857894e-05, "loss": 0.8455, "num_input_tokens_seen": 130471400, "step": 3065 }, { "epoch": 0.7402495629634095, "grad_norm": 0.9950452446937561, "learning_rate": 3.492678528763887e-05, "loss": 0.8752, "num_input_tokens_seen": 130687976, "step": 3070 }, { "epoch": 0.7414551811441317, "grad_norm": 1.0307538509368896, "learning_rate": 3.4883312676665536e-05, "loss": 0.8087, "num_input_tokens_seen": 130907552, "step": 3075 }, { "epoch": 0.7426607993248538, "grad_norm": 0.9104265570640564, "learning_rate": 3.4839804615867053e-05, "loss": 0.8354, "num_input_tokens_seen": 131123896, "step": 3080 }, { "epoch": 0.743866417505576, "grad_norm": 1.007165551185608, "learning_rate": 3.479626126129972e-05, "loss": 0.7727, "num_input_tokens_seen": 131345888, "step": 3085 }, { "epoch": 0.7450720356862981, "grad_norm": 1.0339388847351074, "learning_rate": 3.475268276914641e-05, "loss": 0.8884, "num_input_tokens_seen": 131559520, "step": 3090 }, { "epoch": 0.7462776538670203, "grad_norm": 1.0359889268875122, "learning_rate": 3.470906929571605e-05, "loss": 0.8345, "num_input_tokens_seen": 131784488, "step": 3095 }, { "epoch": 0.7474832720477425, "grad_norm": 1.022627353668213, "learning_rate": 3.4665420997443024e-05, "loss": 0.9158, "num_input_tokens_seen": 131994600, "step": 3100 }, { "epoch": 0.7486888902284646, "grad_norm": 0.953904926776886, "learning_rate": 3.4621738030886655e-05, "loss": 0.9023, "num_input_tokens_seen": 132197304, "step": 3105 }, { "epoch": 0.7498945084091868, "grad_norm": 0.9814931154251099, "learning_rate": 3.457802055273056e-05, "loss": 0.8548, "num_input_tokens_seen": 132407632, "step": 3110 }, { "epoch": 0.751100126589909, "grad_norm": 0.9165716171264648, "learning_rate": 3.453426871978222e-05, "loss": 0.9018, "num_input_tokens_seen": 132616880, "step": 3115 }, { "epoch": 0.7523057447706312, "grad_norm": 0.9459496140480042, "learning_rate": 3.4490482688972264e-05, "loss": 0.819, "num_input_tokens_seen": 132823800, "step": 3120 }, { "epoch": 0.7535113629513533, "grad_norm": 0.9716479778289795, "learning_rate": 3.4446662617354034e-05, "loss": 0.8212, "num_input_tokens_seen": 133024808, "step": 3125 }, { "epoch": 0.7547169811320755, "grad_norm": 0.9451080560684204, "learning_rate": 3.440280866210296e-05, "loss": 0.8407, "num_input_tokens_seen": 133238496, "step": 3130 }, { "epoch": 0.7559225993127976, "grad_norm": 1.0247324705123901, "learning_rate": 3.4358920980516e-05, "loss": 0.8418, "num_input_tokens_seen": 133448336, "step": 3135 }, { "epoch": 0.7571282174935198, "grad_norm": 0.913414478302002, "learning_rate": 3.4314999730011087e-05, "loss": 0.8276, "num_input_tokens_seen": 133656704, "step": 3140 }, { "epoch": 0.758333835674242, "grad_norm": 0.9673576354980469, "learning_rate": 3.427104506812655e-05, "loss": 0.8765, "num_input_tokens_seen": 133882464, "step": 3145 }, { "epoch": 0.7595394538549641, "grad_norm": 1.0487371683120728, "learning_rate": 3.422705715252057e-05, "loss": 0.866, "num_input_tokens_seen": 134106720, "step": 3150 }, { "epoch": 0.7607450720356863, "grad_norm": 1.1488994359970093, "learning_rate": 3.418303614097062e-05, "loss": 0.9026, "num_input_tokens_seen": 134324080, "step": 3155 }, { "epoch": 0.7619506902164085, "grad_norm": 1.0286263227462769, "learning_rate": 3.4138982191372834e-05, "loss": 0.8341, "num_input_tokens_seen": 134539904, "step": 3160 }, { "epoch": 0.7631563083971307, "grad_norm": 0.9409207105636597, "learning_rate": 3.409489546174154e-05, "loss": 0.8364, "num_input_tokens_seen": 134760176, "step": 3165 }, { "epoch": 0.7643619265778528, "grad_norm": 1.5405821800231934, "learning_rate": 3.4050776110208626e-05, "loss": 0.8492, "num_input_tokens_seen": 134968984, "step": 3170 }, { "epoch": 0.7655675447585749, "grad_norm": 0.9386600255966187, "learning_rate": 3.400662429502298e-05, "loss": 0.7724, "num_input_tokens_seen": 135189656, "step": 3175 }, { "epoch": 0.7667731629392971, "grad_norm": 0.9039964079856873, "learning_rate": 3.3962440174549924e-05, "loss": 0.8301, "num_input_tokens_seen": 135406936, "step": 3180 }, { "epoch": 0.7679787811200193, "grad_norm": 1.0080549716949463, "learning_rate": 3.391822390727068e-05, "loss": 0.8176, "num_input_tokens_seen": 135615032, "step": 3185 }, { "epoch": 0.7691843993007415, "grad_norm": 1.2047741413116455, "learning_rate": 3.387397565178176e-05, "loss": 0.8565, "num_input_tokens_seen": 135832096, "step": 3190 }, { "epoch": 0.7703900174814636, "grad_norm": 0.9866753220558167, "learning_rate": 3.382969556679442e-05, "loss": 0.8913, "num_input_tokens_seen": 136044728, "step": 3195 }, { "epoch": 0.7715956356621858, "grad_norm": 0.9877943396568298, "learning_rate": 3.3785383811134075e-05, "loss": 0.7834, "num_input_tokens_seen": 136265192, "step": 3200 }, { "epoch": 0.772801253842908, "grad_norm": 1.0064071416854858, "learning_rate": 3.374104054373973e-05, "loss": 0.89, "num_input_tokens_seen": 136469168, "step": 3205 }, { "epoch": 0.7740068720236302, "grad_norm": 1.0065826177597046, "learning_rate": 3.369666592366346e-05, "loss": 0.8477, "num_input_tokens_seen": 136678688, "step": 3210 }, { "epoch": 0.7752124902043522, "grad_norm": 1.220650553703308, "learning_rate": 3.365226011006972e-05, "loss": 0.8801, "num_input_tokens_seen": 136889272, "step": 3215 }, { "epoch": 0.7764181083850744, "grad_norm": 1.0406436920166016, "learning_rate": 3.360782326223493e-05, "loss": 0.855, "num_input_tokens_seen": 137109376, "step": 3220 }, { "epoch": 0.7776237265657966, "grad_norm": 0.9763860106468201, "learning_rate": 3.3563355539546795e-05, "loss": 0.832, "num_input_tokens_seen": 137331424, "step": 3225 }, { "epoch": 0.7788293447465188, "grad_norm": 1.0234390497207642, "learning_rate": 3.351885710150373e-05, "loss": 0.8917, "num_input_tokens_seen": 137547976, "step": 3230 }, { "epoch": 0.780034962927241, "grad_norm": 1.0497173070907593, "learning_rate": 3.347432810771436e-05, "loss": 0.8741, "num_input_tokens_seen": 137770664, "step": 3235 }, { "epoch": 0.7812405811079631, "grad_norm": 0.9197514057159424, "learning_rate": 3.342976871789692e-05, "loss": 0.8633, "num_input_tokens_seen": 137980912, "step": 3240 }, { "epoch": 0.7824461992886853, "grad_norm": 1.053252935409546, "learning_rate": 3.338517909187863e-05, "loss": 0.861, "num_input_tokens_seen": 138179288, "step": 3245 }, { "epoch": 0.7836518174694075, "grad_norm": 0.9343007206916809, "learning_rate": 3.3340559389595174e-05, "loss": 0.8563, "num_input_tokens_seen": 138395240, "step": 3250 }, { "epoch": 0.7848574356501296, "grad_norm": 0.8722341060638428, "learning_rate": 3.329590977109014e-05, "loss": 0.8152, "num_input_tokens_seen": 138615144, "step": 3255 }, { "epoch": 0.7860630538308517, "grad_norm": 0.908607006072998, "learning_rate": 3.32512303965144e-05, "loss": 0.834, "num_input_tokens_seen": 138840072, "step": 3260 }, { "epoch": 0.7872686720115739, "grad_norm": 0.9660098552703857, "learning_rate": 3.320652142612555e-05, "loss": 0.892, "num_input_tokens_seen": 139051296, "step": 3265 }, { "epoch": 0.7884742901922961, "grad_norm": 0.9980911612510681, "learning_rate": 3.316178302028734e-05, "loss": 0.8429, "num_input_tokens_seen": 139257976, "step": 3270 }, { "epoch": 0.7896799083730183, "grad_norm": 1.0296955108642578, "learning_rate": 3.311701533946914e-05, "loss": 0.8773, "num_input_tokens_seen": 139467504, "step": 3275 }, { "epoch": 0.7908855265537404, "grad_norm": 1.0136126279830933, "learning_rate": 3.307221854424527e-05, "loss": 0.7918, "num_input_tokens_seen": 139684472, "step": 3280 }, { "epoch": 0.7920911447344626, "grad_norm": 1.0575262308120728, "learning_rate": 3.302739279529452e-05, "loss": 0.8924, "num_input_tokens_seen": 139899176, "step": 3285 }, { "epoch": 0.7932967629151848, "grad_norm": 0.975049614906311, "learning_rate": 3.29825382533995e-05, "loss": 0.8328, "num_input_tokens_seen": 140108896, "step": 3290 }, { "epoch": 0.7945023810959069, "grad_norm": 1.9772605895996094, "learning_rate": 3.2937655079446125e-05, "loss": 0.8562, "num_input_tokens_seen": 140321096, "step": 3295 }, { "epoch": 0.795707999276629, "grad_norm": 0.9851700663566589, "learning_rate": 3.289274343442299e-05, "loss": 0.8671, "num_input_tokens_seen": 140536616, "step": 3300 }, { "epoch": 0.7969136174573512, "grad_norm": 0.9599255323410034, "learning_rate": 3.2847803479420806e-05, "loss": 0.892, "num_input_tokens_seen": 140754480, "step": 3305 }, { "epoch": 0.7981192356380734, "grad_norm": 1.0963901281356812, "learning_rate": 3.280283537563185e-05, "loss": 0.853, "num_input_tokens_seen": 140967704, "step": 3310 }, { "epoch": 0.7993248538187956, "grad_norm": 0.9567300081253052, "learning_rate": 3.2757839284349353e-05, "loss": 0.8773, "num_input_tokens_seen": 141170952, "step": 3315 }, { "epoch": 0.8005304719995178, "grad_norm": 1.083093523979187, "learning_rate": 3.271281536696692e-05, "loss": 0.8219, "num_input_tokens_seen": 141396720, "step": 3320 }, { "epoch": 0.80173609018024, "grad_norm": 1.0143555402755737, "learning_rate": 3.266776378497798e-05, "loss": 0.8499, "num_input_tokens_seen": 141614392, "step": 3325 }, { "epoch": 0.8029417083609621, "grad_norm": 1.0233615636825562, "learning_rate": 3.262268469997519e-05, "loss": 0.8352, "num_input_tokens_seen": 141821384, "step": 3330 }, { "epoch": 0.8041473265416842, "grad_norm": 1.0595847368240356, "learning_rate": 3.257757827364984e-05, "loss": 0.9133, "num_input_tokens_seen": 142039728, "step": 3335 }, { "epoch": 0.8053529447224064, "grad_norm": 0.9510430097579956, "learning_rate": 3.25324446677913e-05, "loss": 0.8928, "num_input_tokens_seen": 142256496, "step": 3340 }, { "epoch": 0.8065585629031286, "grad_norm": 0.9281471371650696, "learning_rate": 3.248728404428643e-05, "loss": 0.8364, "num_input_tokens_seen": 142481256, "step": 3345 }, { "epoch": 0.8077641810838507, "grad_norm": 1.0045799016952515, "learning_rate": 3.244209656511901e-05, "loss": 0.8487, "num_input_tokens_seen": 142674416, "step": 3350 }, { "epoch": 0.8089697992645729, "grad_norm": 1.2197424173355103, "learning_rate": 3.239688239236911e-05, "loss": 0.909, "num_input_tokens_seen": 142886440, "step": 3355 }, { "epoch": 0.8101754174452951, "grad_norm": 1.00840163230896, "learning_rate": 3.2351641688212585e-05, "loss": 0.8841, "num_input_tokens_seen": 143104104, "step": 3360 }, { "epoch": 0.8113810356260173, "grad_norm": 0.9274449944496155, "learning_rate": 3.230637461492043e-05, "loss": 0.8517, "num_input_tokens_seen": 143318088, "step": 3365 }, { "epoch": 0.8125866538067394, "grad_norm": 0.9366058111190796, "learning_rate": 3.2261081334858236e-05, "loss": 0.8554, "num_input_tokens_seen": 143536464, "step": 3370 }, { "epoch": 0.8137922719874616, "grad_norm": 1.076001763343811, "learning_rate": 3.221576201048557e-05, "loss": 0.9055, "num_input_tokens_seen": 143752400, "step": 3375 }, { "epoch": 0.8149978901681837, "grad_norm": 0.9790358543395996, "learning_rate": 3.2170416804355446e-05, "loss": 0.85, "num_input_tokens_seen": 143987776, "step": 3380 }, { "epoch": 0.8162035083489059, "grad_norm": 0.9812015295028687, "learning_rate": 3.21250458791137e-05, "loss": 0.8335, "num_input_tokens_seen": 144195432, "step": 3385 }, { "epoch": 0.817409126529628, "grad_norm": 1.4020839929580688, "learning_rate": 3.2079649397498404e-05, "loss": 0.8528, "num_input_tokens_seen": 144408632, "step": 3390 }, { "epoch": 0.8186147447103502, "grad_norm": 1.0115829706192017, "learning_rate": 3.203422752233932e-05, "loss": 0.8822, "num_input_tokens_seen": 144631432, "step": 3395 }, { "epoch": 0.8198203628910724, "grad_norm": 1.00081467628479, "learning_rate": 3.198878041655727e-05, "loss": 0.8343, "num_input_tokens_seen": 144836224, "step": 3400 }, { "epoch": 0.8210259810717946, "grad_norm": 0.9316551685333252, "learning_rate": 3.1943308243163594e-05, "loss": 0.8344, "num_input_tokens_seen": 145051280, "step": 3405 }, { "epoch": 0.8222315992525168, "grad_norm": 1.031455636024475, "learning_rate": 3.189781116525953e-05, "loss": 0.8643, "num_input_tokens_seen": 145268616, "step": 3410 }, { "epoch": 0.8234372174332389, "grad_norm": 0.8396311402320862, "learning_rate": 3.185228934603565e-05, "loss": 0.8183, "num_input_tokens_seen": 145485912, "step": 3415 }, { "epoch": 0.824642835613961, "grad_norm": 0.9971233010292053, "learning_rate": 3.180674294877127e-05, "loss": 0.8695, "num_input_tokens_seen": 145698704, "step": 3420 }, { "epoch": 0.8258484537946832, "grad_norm": 1.1169159412384033, "learning_rate": 3.176117213683387e-05, "loss": 0.8432, "num_input_tokens_seen": 145910960, "step": 3425 }, { "epoch": 0.8270540719754054, "grad_norm": 1.049144983291626, "learning_rate": 3.171557707367849e-05, "loss": 0.8485, "num_input_tokens_seen": 146130472, "step": 3430 }, { "epoch": 0.8282596901561275, "grad_norm": 0.9563018083572388, "learning_rate": 3.166995792284717e-05, "loss": 0.8665, "num_input_tokens_seen": 146337072, "step": 3435 }, { "epoch": 0.8294653083368497, "grad_norm": 1.2562298774719238, "learning_rate": 3.162431484796832e-05, "loss": 0.8332, "num_input_tokens_seen": 146559760, "step": 3440 }, { "epoch": 0.8306709265175719, "grad_norm": 0.9444025754928589, "learning_rate": 3.1578648012756195e-05, "loss": 0.866, "num_input_tokens_seen": 146760856, "step": 3445 }, { "epoch": 0.8318765446982941, "grad_norm": 0.9594274759292603, "learning_rate": 3.153295758101025e-05, "loss": 0.8445, "num_input_tokens_seen": 146974448, "step": 3450 }, { "epoch": 0.8330821628790163, "grad_norm": 1.3247997760772705, "learning_rate": 3.148724371661459e-05, "loss": 0.9256, "num_input_tokens_seen": 147187760, "step": 3455 }, { "epoch": 0.8342877810597383, "grad_norm": 0.8789427280426025, "learning_rate": 3.144150658353736e-05, "loss": 0.8828, "num_input_tokens_seen": 147406744, "step": 3460 }, { "epoch": 0.8354933992404605, "grad_norm": 0.9635398983955383, "learning_rate": 3.1395746345830184e-05, "loss": 0.8306, "num_input_tokens_seen": 147618240, "step": 3465 }, { "epoch": 0.8366990174211827, "grad_norm": 0.8610533475875854, "learning_rate": 3.1349963167627544e-05, "loss": 0.8825, "num_input_tokens_seen": 147832360, "step": 3470 }, { "epoch": 0.8379046356019049, "grad_norm": 1.0449845790863037, "learning_rate": 3.13041572131462e-05, "loss": 0.883, "num_input_tokens_seen": 148029464, "step": 3475 }, { "epoch": 0.839110253782627, "grad_norm": 1.0115342140197754, "learning_rate": 3.125832864668461e-05, "loss": 0.8778, "num_input_tokens_seen": 148237928, "step": 3480 }, { "epoch": 0.8403158719633492, "grad_norm": 0.9260898232460022, "learning_rate": 3.121247763262235e-05, "loss": 0.8283, "num_input_tokens_seen": 148451264, "step": 3485 }, { "epoch": 0.8415214901440714, "grad_norm": 0.9899423122406006, "learning_rate": 3.116660433541951e-05, "loss": 0.8293, "num_input_tokens_seen": 148669504, "step": 3490 }, { "epoch": 0.8427271083247936, "grad_norm": 0.9353440403938293, "learning_rate": 3.1120708919616085e-05, "loss": 0.8461, "num_input_tokens_seen": 148880544, "step": 3495 }, { "epoch": 0.8439327265055157, "grad_norm": 0.8515079617500305, "learning_rate": 3.1074791549831425e-05, "loss": 0.8268, "num_input_tokens_seen": 149105824, "step": 3500 }, { "epoch": 0.8451383446862378, "grad_norm": 1.050998568534851, "learning_rate": 3.102885239076364e-05, "loss": 0.8169, "num_input_tokens_seen": 149318448, "step": 3505 }, { "epoch": 0.84634396286696, "grad_norm": 0.9005813002586365, "learning_rate": 3.098289160718895e-05, "loss": 0.904, "num_input_tokens_seen": 149534024, "step": 3510 }, { "epoch": 0.8475495810476822, "grad_norm": 1.208495020866394, "learning_rate": 3.093690936396117e-05, "loss": 0.8416, "num_input_tokens_seen": 149749544, "step": 3515 }, { "epoch": 0.8487551992284044, "grad_norm": 0.9106577634811401, "learning_rate": 3.0890905826011093e-05, "loss": 0.8137, "num_input_tokens_seen": 149961376, "step": 3520 }, { "epoch": 0.8499608174091265, "grad_norm": 1.0562121868133545, "learning_rate": 3.0844881158345865e-05, "loss": 0.8052, "num_input_tokens_seen": 150174648, "step": 3525 }, { "epoch": 0.8511664355898487, "grad_norm": 1.0259089469909668, "learning_rate": 3.079883552604845e-05, "loss": 0.8085, "num_input_tokens_seen": 150388584, "step": 3530 }, { "epoch": 0.8523720537705709, "grad_norm": 1.0294978618621826, "learning_rate": 3.075276909427699e-05, "loss": 0.8246, "num_input_tokens_seen": 150605344, "step": 3535 }, { "epoch": 0.853577671951293, "grad_norm": 1.0734068155288696, "learning_rate": 3.070668202826424e-05, "loss": 0.8581, "num_input_tokens_seen": 150819352, "step": 3540 }, { "epoch": 0.8547832901320151, "grad_norm": 0.9585369825363159, "learning_rate": 3.066057449331697e-05, "loss": 0.7763, "num_input_tokens_seen": 151035056, "step": 3545 }, { "epoch": 0.8559889083127373, "grad_norm": 0.9017661213874817, "learning_rate": 3.061444665481534e-05, "loss": 0.8733, "num_input_tokens_seen": 151258664, "step": 3550 }, { "epoch": 0.8571945264934595, "grad_norm": 1.0550650358200073, "learning_rate": 3.056829867821238e-05, "loss": 0.8272, "num_input_tokens_seen": 151462776, "step": 3555 }, { "epoch": 0.8584001446741817, "grad_norm": 1.0138078927993774, "learning_rate": 3.052213072903332e-05, "loss": 0.8415, "num_input_tokens_seen": 151682168, "step": 3560 }, { "epoch": 0.8596057628549039, "grad_norm": 0.9518343806266785, "learning_rate": 3.047594297287504e-05, "loss": 0.8097, "num_input_tokens_seen": 151893648, "step": 3565 }, { "epoch": 0.860811381035626, "grad_norm": 1.0403308868408203, "learning_rate": 3.042973557540546e-05, "loss": 0.8631, "num_input_tokens_seen": 152111256, "step": 3570 }, { "epoch": 0.8620169992163482, "grad_norm": 1.5790611505508423, "learning_rate": 3.0383508702362955e-05, "loss": 0.9091, "num_input_tokens_seen": 152332416, "step": 3575 }, { "epoch": 0.8632226173970704, "grad_norm": 1.025078535079956, "learning_rate": 3.0337262519555754e-05, "loss": 0.7534, "num_input_tokens_seen": 152558424, "step": 3580 }, { "epoch": 0.8644282355777925, "grad_norm": 0.9668149948120117, "learning_rate": 3.029099719286135e-05, "loss": 0.8914, "num_input_tokens_seen": 152774776, "step": 3585 }, { "epoch": 0.8656338537585146, "grad_norm": 0.9929841160774231, "learning_rate": 3.024471288822589e-05, "loss": 0.8113, "num_input_tokens_seen": 152987896, "step": 3590 }, { "epoch": 0.8668394719392368, "grad_norm": 1.0230714082717896, "learning_rate": 3.0198409771663603e-05, "loss": 0.8435, "num_input_tokens_seen": 153208576, "step": 3595 }, { "epoch": 0.868045090119959, "grad_norm": 1.0586979389190674, "learning_rate": 3.0152088009256196e-05, "loss": 0.7859, "num_input_tokens_seen": 153415112, "step": 3600 }, { "epoch": 0.8692507083006812, "grad_norm": 3.4727652072906494, "learning_rate": 3.010574776715225e-05, "loss": 0.9253, "num_input_tokens_seen": 153622016, "step": 3605 }, { "epoch": 0.8704563264814034, "grad_norm": 0.9654550552368164, "learning_rate": 3.0059389211566623e-05, "loss": 0.8341, "num_input_tokens_seen": 153849728, "step": 3610 }, { "epoch": 0.8716619446621255, "grad_norm": 0.9048064351081848, "learning_rate": 3.001301250877987e-05, "loss": 0.8136, "num_input_tokens_seen": 154062736, "step": 3615 }, { "epoch": 0.8728675628428477, "grad_norm": 1.0449655055999756, "learning_rate": 2.996661782513764e-05, "loss": 0.872, "num_input_tokens_seen": 154263344, "step": 3620 }, { "epoch": 0.8740731810235698, "grad_norm": 1.063016414642334, "learning_rate": 2.9920205327050055e-05, "loss": 0.8901, "num_input_tokens_seen": 154475048, "step": 3625 }, { "epoch": 0.875278799204292, "grad_norm": 0.9423686265945435, "learning_rate": 2.987377518099117e-05, "loss": 0.8136, "num_input_tokens_seen": 154685912, "step": 3630 }, { "epoch": 0.8764844173850141, "grad_norm": 0.8531419038772583, "learning_rate": 2.9827327553498306e-05, "loss": 0.8419, "num_input_tokens_seen": 154897424, "step": 3635 }, { "epoch": 0.8776900355657363, "grad_norm": 1.0113821029663086, "learning_rate": 2.978086261117151e-05, "loss": 0.8204, "num_input_tokens_seen": 155122880, "step": 3640 }, { "epoch": 0.8788956537464585, "grad_norm": 1.0493823289871216, "learning_rate": 2.973438052067292e-05, "loss": 0.7685, "num_input_tokens_seen": 155336616, "step": 3645 }, { "epoch": 0.8801012719271807, "grad_norm": 1.0281301736831665, "learning_rate": 2.9687881448726196e-05, "loss": 0.8152, "num_input_tokens_seen": 155553672, "step": 3650 }, { "epoch": 0.8813068901079029, "grad_norm": 0.9440913200378418, "learning_rate": 2.9641365562115887e-05, "loss": 0.8272, "num_input_tokens_seen": 155767200, "step": 3655 }, { "epoch": 0.882512508288625, "grad_norm": 0.9846473932266235, "learning_rate": 2.959483302768688e-05, "loss": 0.835, "num_input_tokens_seen": 155984720, "step": 3660 }, { "epoch": 0.8837181264693471, "grad_norm": 1.0410343408584595, "learning_rate": 2.9548284012343746e-05, "loss": 0.8014, "num_input_tokens_seen": 156189320, "step": 3665 }, { "epoch": 0.8849237446500693, "grad_norm": 1.0477067232131958, "learning_rate": 2.95017186830502e-05, "loss": 0.8577, "num_input_tokens_seen": 156405528, "step": 3670 }, { "epoch": 0.8861293628307915, "grad_norm": 1.0016175508499146, "learning_rate": 2.9455137206828444e-05, "loss": 0.8153, "num_input_tokens_seen": 156631696, "step": 3675 }, { "epoch": 0.8873349810115136, "grad_norm": 1.4559437036514282, "learning_rate": 2.9408539750758625e-05, "loss": 0.9012, "num_input_tokens_seen": 156836120, "step": 3680 }, { "epoch": 0.8885405991922358, "grad_norm": 0.9088696837425232, "learning_rate": 2.936192648197818e-05, "loss": 0.7921, "num_input_tokens_seen": 157050344, "step": 3685 }, { "epoch": 0.889746217372958, "grad_norm": 1.2262157201766968, "learning_rate": 2.9315297567681277e-05, "loss": 0.8433, "num_input_tokens_seen": 157258664, "step": 3690 }, { "epoch": 0.8909518355536802, "grad_norm": 1.0281695127487183, "learning_rate": 2.9268653175118212e-05, "loss": 0.8301, "num_input_tokens_seen": 157461600, "step": 3695 }, { "epoch": 0.8921574537344024, "grad_norm": 0.9885977506637573, "learning_rate": 2.9221993471594772e-05, "loss": 0.8517, "num_input_tokens_seen": 157680432, "step": 3700 }, { "epoch": 0.8933630719151244, "grad_norm": 0.921521782875061, "learning_rate": 2.9175318624471687e-05, "loss": 0.778, "num_input_tokens_seen": 157902608, "step": 3705 }, { "epoch": 0.8945686900958466, "grad_norm": 1.064586877822876, "learning_rate": 2.9128628801163993e-05, "loss": 0.8789, "num_input_tokens_seen": 158111448, "step": 3710 }, { "epoch": 0.8957743082765688, "grad_norm": 0.9471330642700195, "learning_rate": 2.908192416914045e-05, "loss": 0.8353, "num_input_tokens_seen": 158332120, "step": 3715 }, { "epoch": 0.896979926457291, "grad_norm": 1.292681097984314, "learning_rate": 2.9035204895922917e-05, "loss": 0.8626, "num_input_tokens_seen": 158547624, "step": 3720 }, { "epoch": 0.8981855446380131, "grad_norm": 0.8782669305801392, "learning_rate": 2.8988471149085794e-05, "loss": 0.8343, "num_input_tokens_seen": 158747952, "step": 3725 }, { "epoch": 0.8993911628187353, "grad_norm": 0.968102216720581, "learning_rate": 2.894172309625538e-05, "loss": 0.8122, "num_input_tokens_seen": 158966200, "step": 3730 }, { "epoch": 0.9005967809994575, "grad_norm": 1.263906478881836, "learning_rate": 2.889496090510928e-05, "loss": 0.8704, "num_input_tokens_seen": 159175032, "step": 3735 }, { "epoch": 0.9018023991801797, "grad_norm": 0.9207846522331238, "learning_rate": 2.8848184743375832e-05, "loss": 0.8214, "num_input_tokens_seen": 159384816, "step": 3740 }, { "epoch": 0.9030080173609017, "grad_norm": 1.0213477611541748, "learning_rate": 2.8801394778833475e-05, "loss": 0.8439, "num_input_tokens_seen": 159601848, "step": 3745 }, { "epoch": 0.9042136355416239, "grad_norm": 0.9729105830192566, "learning_rate": 2.8754591179310152e-05, "loss": 0.7889, "num_input_tokens_seen": 159814184, "step": 3750 }, { "epoch": 0.9054192537223461, "grad_norm": 1.0304421186447144, "learning_rate": 2.8707774112682713e-05, "loss": 0.8485, "num_input_tokens_seen": 160037712, "step": 3755 }, { "epoch": 0.9066248719030683, "grad_norm": 1.0181950330734253, "learning_rate": 2.8660943746876305e-05, "loss": 0.8915, "num_input_tokens_seen": 160245248, "step": 3760 }, { "epoch": 0.9078304900837905, "grad_norm": 1.1236683130264282, "learning_rate": 2.8614100249863807e-05, "loss": 0.882, "num_input_tokens_seen": 160462496, "step": 3765 }, { "epoch": 0.9090361082645126, "grad_norm": 1.0356091260910034, "learning_rate": 2.856724378966516e-05, "loss": 0.858, "num_input_tokens_seen": 160665976, "step": 3770 }, { "epoch": 0.9102417264452348, "grad_norm": 0.8878853917121887, "learning_rate": 2.8520374534346826e-05, "loss": 0.8665, "num_input_tokens_seen": 160868240, "step": 3775 }, { "epoch": 0.911447344625957, "grad_norm": 0.9015271067619324, "learning_rate": 2.847349265202115e-05, "loss": 0.7952, "num_input_tokens_seen": 161089536, "step": 3780 }, { "epoch": 0.9126529628066792, "grad_norm": 1.0017156600952148, "learning_rate": 2.8426598310845782e-05, "loss": 0.8167, "num_input_tokens_seen": 161300480, "step": 3785 }, { "epoch": 0.9138585809874012, "grad_norm": 1.0327990055084229, "learning_rate": 2.8379691679023028e-05, "loss": 0.7672, "num_input_tokens_seen": 161515640, "step": 3790 }, { "epoch": 0.9150641991681234, "grad_norm": 0.9854059815406799, "learning_rate": 2.8332772924799318e-05, "loss": 0.8023, "num_input_tokens_seen": 161726288, "step": 3795 }, { "epoch": 0.9162698173488456, "grad_norm": 0.9627297520637512, "learning_rate": 2.8285842216464543e-05, "loss": 0.8666, "num_input_tokens_seen": 161949624, "step": 3800 }, { "epoch": 0.9174754355295678, "grad_norm": 1.8853353261947632, "learning_rate": 2.8238899722351465e-05, "loss": 0.7404, "num_input_tokens_seen": 162172904, "step": 3805 }, { "epoch": 0.91868105371029, "grad_norm": 1.0521774291992188, "learning_rate": 2.8191945610835137e-05, "loss": 0.8311, "num_input_tokens_seen": 162387208, "step": 3810 }, { "epoch": 0.9198866718910121, "grad_norm": 1.0755788087844849, "learning_rate": 2.8144980050332264e-05, "loss": 0.8479, "num_input_tokens_seen": 162599520, "step": 3815 }, { "epoch": 0.9210922900717343, "grad_norm": 0.9405021667480469, "learning_rate": 2.8098003209300638e-05, "loss": 0.9, "num_input_tokens_seen": 162817808, "step": 3820 }, { "epoch": 0.9222979082524565, "grad_norm": 0.9687652587890625, "learning_rate": 2.805101525623849e-05, "loss": 0.7824, "num_input_tokens_seen": 163036400, "step": 3825 }, { "epoch": 0.9235035264331786, "grad_norm": 0.9555730223655701, "learning_rate": 2.800401635968392e-05, "loss": 0.9033, "num_input_tokens_seen": 163242496, "step": 3830 }, { "epoch": 0.9247091446139007, "grad_norm": 0.9723814725875854, "learning_rate": 2.7957006688214286e-05, "loss": 0.858, "num_input_tokens_seen": 163459888, "step": 3835 }, { "epoch": 0.9259147627946229, "grad_norm": 1.0214556455612183, "learning_rate": 2.790998641044557e-05, "loss": 0.8266, "num_input_tokens_seen": 163687224, "step": 3840 }, { "epoch": 0.9271203809753451, "grad_norm": 0.8990281820297241, "learning_rate": 2.786295569503183e-05, "loss": 0.8175, "num_input_tokens_seen": 163904472, "step": 3845 }, { "epoch": 0.9283259991560673, "grad_norm": 0.9869774580001831, "learning_rate": 2.7815914710664543e-05, "loss": 0.8904, "num_input_tokens_seen": 164109720, "step": 3850 }, { "epoch": 0.9295316173367895, "grad_norm": 1.0521568059921265, "learning_rate": 2.7768863626072006e-05, "loss": 0.8923, "num_input_tokens_seen": 164326200, "step": 3855 }, { "epoch": 0.9307372355175116, "grad_norm": 0.994682788848877, "learning_rate": 2.772180261001878e-05, "loss": 0.7803, "num_input_tokens_seen": 164539008, "step": 3860 }, { "epoch": 0.9319428536982338, "grad_norm": 1.2990710735321045, "learning_rate": 2.767473183130502e-05, "loss": 0.8664, "num_input_tokens_seen": 164751264, "step": 3865 }, { "epoch": 0.9331484718789559, "grad_norm": 1.0573091506958008, "learning_rate": 2.76276514587659e-05, "loss": 0.8732, "num_input_tokens_seen": 164978960, "step": 3870 }, { "epoch": 0.9343540900596781, "grad_norm": 1.0687792301177979, "learning_rate": 2.7580561661271014e-05, "loss": 0.8685, "num_input_tokens_seen": 165187024, "step": 3875 }, { "epoch": 0.9355597082404002, "grad_norm": 1.1159818172454834, "learning_rate": 2.7533462607723764e-05, "loss": 0.8507, "num_input_tokens_seen": 165394160, "step": 3880 }, { "epoch": 0.9367653264211224, "grad_norm": 0.9253574013710022, "learning_rate": 2.7486354467060742e-05, "loss": 0.784, "num_input_tokens_seen": 165614360, "step": 3885 }, { "epoch": 0.9379709446018446, "grad_norm": 1.1766799688339233, "learning_rate": 2.7439237408251152e-05, "loss": 0.7817, "num_input_tokens_seen": 165819432, "step": 3890 }, { "epoch": 0.9391765627825668, "grad_norm": 1.399375081062317, "learning_rate": 2.7392111600296154e-05, "loss": 0.8801, "num_input_tokens_seen": 166034656, "step": 3895 }, { "epoch": 0.940382180963289, "grad_norm": 0.9970574378967285, "learning_rate": 2.7344977212228317e-05, "loss": 0.8514, "num_input_tokens_seen": 166242848, "step": 3900 }, { "epoch": 0.9415877991440111, "grad_norm": 0.9224359393119812, "learning_rate": 2.729783441311099e-05, "loss": 0.77, "num_input_tokens_seen": 166454952, "step": 3905 }, { "epoch": 0.9427934173247332, "grad_norm": 0.973322868347168, "learning_rate": 2.725068337203766e-05, "loss": 0.8268, "num_input_tokens_seen": 166658120, "step": 3910 }, { "epoch": 0.9439990355054554, "grad_norm": 0.982284665107727, "learning_rate": 2.7203524258131397e-05, "loss": 0.8417, "num_input_tokens_seen": 166866792, "step": 3915 }, { "epoch": 0.9452046536861776, "grad_norm": 0.9855775237083435, "learning_rate": 2.715635724054424e-05, "loss": 0.808, "num_input_tokens_seen": 167086280, "step": 3920 }, { "epoch": 0.9464102718668997, "grad_norm": 0.9698231220245361, "learning_rate": 2.7109182488456547e-05, "loss": 0.8742, "num_input_tokens_seen": 167292248, "step": 3925 }, { "epoch": 0.9476158900476219, "grad_norm": 1.029618740081787, "learning_rate": 2.7062000171076447e-05, "loss": 0.8206, "num_input_tokens_seen": 167495056, "step": 3930 }, { "epoch": 0.9488215082283441, "grad_norm": 0.8304679989814758, "learning_rate": 2.7014810457639174e-05, "loss": 0.7271, "num_input_tokens_seen": 167720704, "step": 3935 }, { "epoch": 0.9500271264090663, "grad_norm": 0.9710103273391724, "learning_rate": 2.6967613517406516e-05, "loss": 0.9016, "num_input_tokens_seen": 167933552, "step": 3940 }, { "epoch": 0.9512327445897885, "grad_norm": 1.1160974502563477, "learning_rate": 2.6920409519666174e-05, "loss": 0.8357, "num_input_tokens_seen": 168144968, "step": 3945 }, { "epoch": 0.9524383627705105, "grad_norm": 0.9108563661575317, "learning_rate": 2.6873198633731162e-05, "loss": 0.8745, "num_input_tokens_seen": 168365592, "step": 3950 }, { "epoch": 0.9536439809512327, "grad_norm": 0.9929471611976624, "learning_rate": 2.6825981028939208e-05, "loss": 0.8201, "num_input_tokens_seen": 168580088, "step": 3955 }, { "epoch": 0.9548495991319549, "grad_norm": 0.9432164430618286, "learning_rate": 2.677875687465212e-05, "loss": 0.8494, "num_input_tokens_seen": 168785832, "step": 3960 }, { "epoch": 0.9560552173126771, "grad_norm": 1.0776469707489014, "learning_rate": 2.6731526340255226e-05, "loss": 0.8412, "num_input_tokens_seen": 169001936, "step": 3965 }, { "epoch": 0.9572608354933992, "grad_norm": 0.9921455979347229, "learning_rate": 2.6684289595156707e-05, "loss": 0.9039, "num_input_tokens_seen": 169221624, "step": 3970 }, { "epoch": 0.9584664536741214, "grad_norm": 0.8535934686660767, "learning_rate": 2.6637046808787057e-05, "loss": 0.7992, "num_input_tokens_seen": 169436904, "step": 3975 }, { "epoch": 0.9596720718548436, "grad_norm": 1.0255541801452637, "learning_rate": 2.6589798150598404e-05, "loss": 0.9224, "num_input_tokens_seen": 169641128, "step": 3980 }, { "epoch": 0.9608776900355658, "grad_norm": 1.113061547279358, "learning_rate": 2.654254379006396e-05, "loss": 0.8072, "num_input_tokens_seen": 169851456, "step": 3985 }, { "epoch": 0.9620833082162878, "grad_norm": 0.9994587302207947, "learning_rate": 2.649528389667738e-05, "loss": 0.852, "num_input_tokens_seen": 170059376, "step": 3990 }, { "epoch": 0.96328892639701, "grad_norm": 0.9935539960861206, "learning_rate": 2.6448018639952166e-05, "loss": 0.7993, "num_input_tokens_seen": 170272000, "step": 3995 }, { "epoch": 0.9644945445777322, "grad_norm": 0.9309984445571899, "learning_rate": 2.6400748189421064e-05, "loss": 0.8926, "num_input_tokens_seen": 170486240, "step": 4000 }, { "epoch": 0.9657001627584544, "grad_norm": 1.0418404340744019, "learning_rate": 2.635347271463544e-05, "loss": 0.8683, "num_input_tokens_seen": 170702896, "step": 4005 }, { "epoch": 0.9669057809391766, "grad_norm": 1.0213428735733032, "learning_rate": 2.630619238516469e-05, "loss": 0.8047, "num_input_tokens_seen": 170911480, "step": 4010 }, { "epoch": 0.9681113991198987, "grad_norm": 0.9273322820663452, "learning_rate": 2.625890737059561e-05, "loss": 0.8805, "num_input_tokens_seen": 171113352, "step": 4015 }, { "epoch": 0.9693170173006209, "grad_norm": 0.8624117970466614, "learning_rate": 2.621161784053181e-05, "loss": 0.8744, "num_input_tokens_seen": 171328152, "step": 4020 }, { "epoch": 0.9705226354813431, "grad_norm": 0.979182779788971, "learning_rate": 2.6164323964593106e-05, "loss": 0.8366, "num_input_tokens_seen": 171549416, "step": 4025 }, { "epoch": 0.9717282536620653, "grad_norm": 1.1739531755447388, "learning_rate": 2.6117025912414884e-05, "loss": 0.8288, "num_input_tokens_seen": 171753392, "step": 4030 }, { "epoch": 0.9729338718427873, "grad_norm": 1.071925163269043, "learning_rate": 2.606972385364751e-05, "loss": 0.8541, "num_input_tokens_seen": 171953584, "step": 4035 }, { "epoch": 0.9741394900235095, "grad_norm": 0.9868650436401367, "learning_rate": 2.6022417957955737e-05, "loss": 0.8501, "num_input_tokens_seen": 172164520, "step": 4040 }, { "epoch": 0.9753451082042317, "grad_norm": 1.0950911045074463, "learning_rate": 2.5975108395018077e-05, "loss": 0.8488, "num_input_tokens_seen": 172377920, "step": 4045 }, { "epoch": 0.9765507263849539, "grad_norm": 0.9057542085647583, "learning_rate": 2.592779533452617e-05, "loss": 0.8962, "num_input_tokens_seen": 172591200, "step": 4050 }, { "epoch": 0.9777563445656761, "grad_norm": 1.1215919256210327, "learning_rate": 2.5880478946184246e-05, "loss": 0.8867, "num_input_tokens_seen": 172811696, "step": 4055 }, { "epoch": 0.9789619627463982, "grad_norm": 1.0954582691192627, "learning_rate": 2.5833159399708427e-05, "loss": 0.8822, "num_input_tokens_seen": 173017520, "step": 4060 }, { "epoch": 0.9801675809271204, "grad_norm": 1.0027035474777222, "learning_rate": 2.5785836864826184e-05, "loss": 0.8391, "num_input_tokens_seen": 173225912, "step": 4065 }, { "epoch": 0.9813731991078426, "grad_norm": 1.063950777053833, "learning_rate": 2.5738511511275715e-05, "loss": 0.8991, "num_input_tokens_seen": 173425360, "step": 4070 }, { "epoch": 0.9825788172885647, "grad_norm": 1.0029170513153076, "learning_rate": 2.569118350880532e-05, "loss": 0.8588, "num_input_tokens_seen": 173654280, "step": 4075 }, { "epoch": 0.9837844354692868, "grad_norm": 1.0298494100570679, "learning_rate": 2.564385302717278e-05, "loss": 0.8927, "num_input_tokens_seen": 173863264, "step": 4080 }, { "epoch": 0.984990053650009, "grad_norm": 0.973395824432373, "learning_rate": 2.559652023614481e-05, "loss": 0.8211, "num_input_tokens_seen": 174080448, "step": 4085 }, { "epoch": 0.9861956718307312, "grad_norm": 1.092687726020813, "learning_rate": 2.554918530549637e-05, "loss": 0.849, "num_input_tokens_seen": 174292256, "step": 4090 }, { "epoch": 0.9874012900114534, "grad_norm": 1.084289789199829, "learning_rate": 2.550184840501012e-05, "loss": 0.8438, "num_input_tokens_seen": 174520240, "step": 4095 }, { "epoch": 0.9886069081921756, "grad_norm": 1.1111483573913574, "learning_rate": 2.545450970447576e-05, "loss": 0.8474, "num_input_tokens_seen": 174732184, "step": 4100 }, { "epoch": 0.9898125263728977, "grad_norm": 0.9668362736701965, "learning_rate": 2.540716937368947e-05, "loss": 0.8732, "num_input_tokens_seen": 174942520, "step": 4105 }, { "epoch": 0.9910181445536199, "grad_norm": 1.1227084398269653, "learning_rate": 2.5359827582453276e-05, "loss": 0.8773, "num_input_tokens_seen": 175152960, "step": 4110 }, { "epoch": 0.992223762734342, "grad_norm": 1.0829321146011353, "learning_rate": 2.5312484500574434e-05, "loss": 0.8359, "num_input_tokens_seen": 175365720, "step": 4115 }, { "epoch": 0.9934293809150642, "grad_norm": 0.9443960785865784, "learning_rate": 2.526514029786482e-05, "loss": 0.8166, "num_input_tokens_seen": 175592912, "step": 4120 }, { "epoch": 0.9946349990957863, "grad_norm": 0.9705995321273804, "learning_rate": 2.521779514414035e-05, "loss": 0.8741, "num_input_tokens_seen": 175798832, "step": 4125 }, { "epoch": 0.9958406172765085, "grad_norm": 1.0715521574020386, "learning_rate": 2.5170449209220343e-05, "loss": 0.8195, "num_input_tokens_seen": 176012936, "step": 4130 }, { "epoch": 0.9970462354572307, "grad_norm": 0.9477148652076721, "learning_rate": 2.5123102662926913e-05, "loss": 0.8927, "num_input_tokens_seen": 176226744, "step": 4135 }, { "epoch": 0.9982518536379529, "grad_norm": 0.9794323444366455, "learning_rate": 2.5075755675084373e-05, "loss": 0.8198, "num_input_tokens_seen": 176445160, "step": 4140 }, { "epoch": 0.9994574718186751, "grad_norm": 0.9729568958282471, "learning_rate": 2.5028408415518627e-05, "loss": 0.8491, "num_input_tokens_seen": 176656064, "step": 4145 }, { "epoch": 1.000482247272289, "grad_norm": 1.0267333984375, "learning_rate": 2.4981061054056552e-05, "loss": 0.7271, "num_input_tokens_seen": 176836432, "step": 4150 }, { "epoch": 1.001687865453011, "grad_norm": 0.9398284554481506, "learning_rate": 2.4933713760525368e-05, "loss": 0.754, "num_input_tokens_seen": 177058616, "step": 4155 }, { "epoch": 1.002893483633733, "grad_norm": 1.1341493129730225, "learning_rate": 2.4886366704752083e-05, "loss": 0.8002, "num_input_tokens_seen": 177272776, "step": 4160 }, { "epoch": 1.0040991018144554, "grad_norm": 1.017069697380066, "learning_rate": 2.483902005656283e-05, "loss": 0.8103, "num_input_tokens_seen": 177488248, "step": 4165 }, { "epoch": 1.0053047199951775, "grad_norm": 1.0224635601043701, "learning_rate": 2.4791673985782285e-05, "loss": 0.8096, "num_input_tokens_seen": 177691088, "step": 4170 }, { "epoch": 1.0065103381758997, "grad_norm": 1.393256425857544, "learning_rate": 2.474432866223307e-05, "loss": 0.7804, "num_input_tokens_seen": 177894168, "step": 4175 }, { "epoch": 1.0077159563566218, "grad_norm": 1.1690846681594849, "learning_rate": 2.4696984255735094e-05, "loss": 0.7772, "num_input_tokens_seen": 178118568, "step": 4180 }, { "epoch": 1.008921574537344, "grad_norm": 1.0589479207992554, "learning_rate": 2.4649640936105006e-05, "loss": 0.7532, "num_input_tokens_seen": 178340640, "step": 4185 }, { "epoch": 1.0101271927180662, "grad_norm": 0.9789804220199585, "learning_rate": 2.460229887315554e-05, "loss": 0.8195, "num_input_tokens_seen": 178547360, "step": 4190 }, { "epoch": 1.0113328108987885, "grad_norm": 1.0155714750289917, "learning_rate": 2.4554958236694934e-05, "loss": 0.7415, "num_input_tokens_seen": 178763688, "step": 4195 }, { "epoch": 1.0125384290795105, "grad_norm": 1.1076892614364624, "learning_rate": 2.45076191965263e-05, "loss": 0.7783, "num_input_tokens_seen": 178976352, "step": 4200 }, { "epoch": 1.0137440472602326, "grad_norm": 1.1809134483337402, "learning_rate": 2.446028192244703e-05, "loss": 0.8568, "num_input_tokens_seen": 179189712, "step": 4205 }, { "epoch": 1.0149496654409549, "grad_norm": 1.2123162746429443, "learning_rate": 2.4412946584248186e-05, "loss": 0.7516, "num_input_tokens_seen": 179404760, "step": 4210 }, { "epoch": 1.016155283621677, "grad_norm": 1.0995322465896606, "learning_rate": 2.4365613351713865e-05, "loss": 0.7638, "num_input_tokens_seen": 179624880, "step": 4215 }, { "epoch": 1.0173609018023992, "grad_norm": 1.0913623571395874, "learning_rate": 2.431828239462063e-05, "loss": 0.8187, "num_input_tokens_seen": 179833528, "step": 4220 }, { "epoch": 1.0185665199831213, "grad_norm": 1.1424404382705688, "learning_rate": 2.4270953882736887e-05, "loss": 0.7416, "num_input_tokens_seen": 180055736, "step": 4225 }, { "epoch": 1.0197721381638436, "grad_norm": 1.1197949647903442, "learning_rate": 2.4223627985822257e-05, "loss": 0.7271, "num_input_tokens_seen": 180267768, "step": 4230 }, { "epoch": 1.0209777563445657, "grad_norm": 1.018120527267456, "learning_rate": 2.4176304873626985e-05, "loss": 0.7829, "num_input_tokens_seen": 180478184, "step": 4235 }, { "epoch": 1.0221833745252877, "grad_norm": 1.0164093971252441, "learning_rate": 2.412898471589133e-05, "loss": 0.847, "num_input_tokens_seen": 180692992, "step": 4240 }, { "epoch": 1.02338899270601, "grad_norm": 1.0555306673049927, "learning_rate": 2.4081667682344968e-05, "loss": 0.7708, "num_input_tokens_seen": 180895696, "step": 4245 }, { "epoch": 1.024594610886732, "grad_norm": 1.0555599927902222, "learning_rate": 2.4034353942706335e-05, "loss": 0.7994, "num_input_tokens_seen": 181121360, "step": 4250 }, { "epoch": 1.0258002290674544, "grad_norm": 1.2403364181518555, "learning_rate": 2.3987043666682076e-05, "loss": 0.8023, "num_input_tokens_seen": 181349000, "step": 4255 }, { "epoch": 1.0270058472481765, "grad_norm": 1.1050076484680176, "learning_rate": 2.3939737023966416e-05, "loss": 0.819, "num_input_tokens_seen": 181558040, "step": 4260 }, { "epoch": 1.0282114654288987, "grad_norm": 1.1275123357772827, "learning_rate": 2.3892434184240533e-05, "loss": 0.8294, "num_input_tokens_seen": 181767384, "step": 4265 }, { "epoch": 1.0294170836096208, "grad_norm": 1.1636650562286377, "learning_rate": 2.384513531717198e-05, "loss": 0.8647, "num_input_tokens_seen": 181967584, "step": 4270 }, { "epoch": 1.030622701790343, "grad_norm": 1.05282723903656, "learning_rate": 2.379784059241405e-05, "loss": 0.8348, "num_input_tokens_seen": 182179736, "step": 4275 }, { "epoch": 1.0318283199710652, "grad_norm": 1.2866379022598267, "learning_rate": 2.3750550179605187e-05, "loss": 0.8345, "num_input_tokens_seen": 182385176, "step": 4280 }, { "epoch": 1.0330339381517872, "grad_norm": 0.9946674108505249, "learning_rate": 2.3703264248368348e-05, "loss": 0.8051, "num_input_tokens_seen": 182593344, "step": 4285 }, { "epoch": 1.0342395563325095, "grad_norm": 1.1821004152297974, "learning_rate": 2.3655982968310436e-05, "loss": 0.7814, "num_input_tokens_seen": 182796944, "step": 4290 }, { "epoch": 1.0354451745132316, "grad_norm": 1.2033579349517822, "learning_rate": 2.3608706509021668e-05, "loss": 0.7791, "num_input_tokens_seen": 183012112, "step": 4295 }, { "epoch": 1.0366507926939539, "grad_norm": 1.4577603340148926, "learning_rate": 2.3561435040074962e-05, "loss": 0.7863, "num_input_tokens_seen": 183222688, "step": 4300 }, { "epoch": 1.037856410874676, "grad_norm": 1.0904340744018555, "learning_rate": 2.351416873102535e-05, "loss": 0.7482, "num_input_tokens_seen": 183433480, "step": 4305 }, { "epoch": 1.0390620290553982, "grad_norm": 1.1385043859481812, "learning_rate": 2.3466907751409338e-05, "loss": 0.7602, "num_input_tokens_seen": 183646872, "step": 4310 }, { "epoch": 1.0402676472361203, "grad_norm": 1.1816341876983643, "learning_rate": 2.341965227074433e-05, "loss": 0.7761, "num_input_tokens_seen": 183859920, "step": 4315 }, { "epoch": 1.0414732654168426, "grad_norm": 1.0133589506149292, "learning_rate": 2.3372402458528003e-05, "loss": 0.7483, "num_input_tokens_seen": 184077304, "step": 4320 }, { "epoch": 1.0426788835975647, "grad_norm": 1.1632879972457886, "learning_rate": 2.3325158484237694e-05, "loss": 0.8288, "num_input_tokens_seen": 184289968, "step": 4325 }, { "epoch": 1.0438845017782867, "grad_norm": 1.0572251081466675, "learning_rate": 2.3277920517329812e-05, "loss": 0.7807, "num_input_tokens_seen": 184504288, "step": 4330 }, { "epoch": 1.045090119959009, "grad_norm": 1.0822540521621704, "learning_rate": 2.3230688727239214e-05, "loss": 0.811, "num_input_tokens_seen": 184718824, "step": 4335 }, { "epoch": 1.046295738139731, "grad_norm": 1.6243950128555298, "learning_rate": 2.3183463283378597e-05, "loss": 0.8137, "num_input_tokens_seen": 184928128, "step": 4340 }, { "epoch": 1.0475013563204534, "grad_norm": 1.0052193403244019, "learning_rate": 2.313624435513791e-05, "loss": 0.6862, "num_input_tokens_seen": 185146168, "step": 4345 }, { "epoch": 1.0487069745011754, "grad_norm": 1.0571144819259644, "learning_rate": 2.308903211188372e-05, "loss": 0.8278, "num_input_tokens_seen": 185348360, "step": 4350 }, { "epoch": 1.0499125926818977, "grad_norm": 1.119408369064331, "learning_rate": 2.3041826722958604e-05, "loss": 0.8422, "num_input_tokens_seen": 185552848, "step": 4355 }, { "epoch": 1.0511182108626198, "grad_norm": 0.9051406383514404, "learning_rate": 2.2994628357680572e-05, "loss": 0.7202, "num_input_tokens_seen": 185773400, "step": 4360 }, { "epoch": 1.0523238290433419, "grad_norm": 1.1888370513916016, "learning_rate": 2.294743718534244e-05, "loss": 0.7435, "num_input_tokens_seen": 185982048, "step": 4365 }, { "epoch": 1.0535294472240642, "grad_norm": 1.6544849872589111, "learning_rate": 2.290025337521122e-05, "loss": 0.8469, "num_input_tokens_seen": 186187432, "step": 4370 }, { "epoch": 1.0547350654047862, "grad_norm": 1.2136576175689697, "learning_rate": 2.2853077096527526e-05, "loss": 0.7702, "num_input_tokens_seen": 186406536, "step": 4375 }, { "epoch": 1.0559406835855085, "grad_norm": 0.9883787035942078, "learning_rate": 2.280590851850493e-05, "loss": 0.7798, "num_input_tokens_seen": 186626600, "step": 4380 }, { "epoch": 1.0571463017662306, "grad_norm": 1.0503785610198975, "learning_rate": 2.275874781032942e-05, "loss": 0.74, "num_input_tokens_seen": 186847520, "step": 4385 }, { "epoch": 1.0583519199469529, "grad_norm": 1.0976778268814087, "learning_rate": 2.2711595141158716e-05, "loss": 0.7672, "num_input_tokens_seen": 187064824, "step": 4390 }, { "epoch": 1.059557538127675, "grad_norm": 1.119023323059082, "learning_rate": 2.2664450680121754e-05, "loss": 0.7952, "num_input_tokens_seen": 187274128, "step": 4395 }, { "epoch": 1.0607631563083972, "grad_norm": 1.1395831108093262, "learning_rate": 2.2617314596317977e-05, "loss": 0.8115, "num_input_tokens_seen": 187496672, "step": 4400 }, { "epoch": 1.0619687744891193, "grad_norm": 1.2253223657608032, "learning_rate": 2.2570187058816817e-05, "loss": 0.7914, "num_input_tokens_seen": 187707608, "step": 4405 }, { "epoch": 1.0631743926698414, "grad_norm": 1.0687687397003174, "learning_rate": 2.252306823665703e-05, "loss": 0.8282, "num_input_tokens_seen": 187932064, "step": 4410 }, { "epoch": 1.0643800108505637, "grad_norm": 1.064121961593628, "learning_rate": 2.2475958298846127e-05, "loss": 0.7881, "num_input_tokens_seen": 188147632, "step": 4415 }, { "epoch": 1.0655856290312857, "grad_norm": 1.1222933530807495, "learning_rate": 2.2428857414359753e-05, "loss": 0.8059, "num_input_tokens_seen": 188362616, "step": 4420 }, { "epoch": 1.066791247212008, "grad_norm": 1.1556960344314575, "learning_rate": 2.238176575214105e-05, "loss": 0.7609, "num_input_tokens_seen": 188586704, "step": 4425 }, { "epoch": 1.06799686539273, "grad_norm": 1.7908968925476074, "learning_rate": 2.2334683481100123e-05, "loss": 0.7882, "num_input_tokens_seen": 188804704, "step": 4430 }, { "epoch": 1.0692024835734524, "grad_norm": 1.2464038133621216, "learning_rate": 2.228761077011336e-05, "loss": 0.6923, "num_input_tokens_seen": 189018624, "step": 4435 }, { "epoch": 1.0704081017541744, "grad_norm": 1.1270554065704346, "learning_rate": 2.224054778802288e-05, "loss": 0.7448, "num_input_tokens_seen": 189238312, "step": 4440 }, { "epoch": 1.0716137199348967, "grad_norm": 1.0896384716033936, "learning_rate": 2.2193494703635894e-05, "loss": 0.7454, "num_input_tokens_seen": 189452824, "step": 4445 }, { "epoch": 1.0728193381156188, "grad_norm": 1.0339279174804688, "learning_rate": 2.2146451685724123e-05, "loss": 0.7471, "num_input_tokens_seen": 189663936, "step": 4450 }, { "epoch": 1.0740249562963409, "grad_norm": 1.1276991367340088, "learning_rate": 2.209941890302317e-05, "loss": 0.7849, "num_input_tokens_seen": 189882168, "step": 4455 }, { "epoch": 1.0752305744770632, "grad_norm": 0.9719217419624329, "learning_rate": 2.205239652423192e-05, "loss": 0.7451, "num_input_tokens_seen": 190104808, "step": 4460 }, { "epoch": 1.0764361926577852, "grad_norm": 1.1028916835784912, "learning_rate": 2.2005384718011964e-05, "loss": 0.8003, "num_input_tokens_seen": 190305824, "step": 4465 }, { "epoch": 1.0776418108385075, "grad_norm": 1.2030248641967773, "learning_rate": 2.195838365298695e-05, "loss": 0.784, "num_input_tokens_seen": 190520840, "step": 4470 }, { "epoch": 1.0788474290192296, "grad_norm": 1.1567641496658325, "learning_rate": 2.1911393497742004e-05, "loss": 0.8099, "num_input_tokens_seen": 190737576, "step": 4475 }, { "epoch": 1.0800530471999519, "grad_norm": 0.9854995608329773, "learning_rate": 2.1864414420823128e-05, "loss": 0.7876, "num_input_tokens_seen": 190961840, "step": 4480 }, { "epoch": 1.081258665380674, "grad_norm": 1.606543779373169, "learning_rate": 2.1817446590736578e-05, "loss": 0.756, "num_input_tokens_seen": 191180360, "step": 4485 }, { "epoch": 1.082464283561396, "grad_norm": 0.9927762150764465, "learning_rate": 2.177049017594829e-05, "loss": 0.7693, "num_input_tokens_seen": 191386872, "step": 4490 }, { "epoch": 1.0836699017421183, "grad_norm": 1.0909963846206665, "learning_rate": 2.1723545344883216e-05, "loss": 0.8183, "num_input_tokens_seen": 191596960, "step": 4495 }, { "epoch": 1.0848755199228404, "grad_norm": 1.057252287864685, "learning_rate": 2.1676612265924796e-05, "loss": 0.7664, "num_input_tokens_seen": 191813072, "step": 4500 }, { "epoch": 1.0860811381035627, "grad_norm": 1.0547430515289307, "learning_rate": 2.1629691107414297e-05, "loss": 0.8316, "num_input_tokens_seen": 192026008, "step": 4505 }, { "epoch": 1.0872867562842847, "grad_norm": 1.237196922302246, "learning_rate": 2.158278203765024e-05, "loss": 0.8219, "num_input_tokens_seen": 192224104, "step": 4510 }, { "epoch": 1.088492374465007, "grad_norm": 2.082291603088379, "learning_rate": 2.1535885224887783e-05, "loss": 0.7989, "num_input_tokens_seen": 192446984, "step": 4515 }, { "epoch": 1.089697992645729, "grad_norm": 1.257799506187439, "learning_rate": 2.148900083733812e-05, "loss": 0.8926, "num_input_tokens_seen": 192657960, "step": 4520 }, { "epoch": 1.0909036108264512, "grad_norm": 1.2047878503799438, "learning_rate": 2.1442129043167874e-05, "loss": 0.8669, "num_input_tokens_seen": 192876096, "step": 4525 }, { "epoch": 1.0921092290071734, "grad_norm": 1.2093663215637207, "learning_rate": 2.13952700104985e-05, "loss": 0.7625, "num_input_tokens_seen": 193085992, "step": 4530 }, { "epoch": 1.0933148471878955, "grad_norm": 1.2460426092147827, "learning_rate": 2.134842390740567e-05, "loss": 0.7914, "num_input_tokens_seen": 193291720, "step": 4535 }, { "epoch": 1.0945204653686178, "grad_norm": 1.3767956495285034, "learning_rate": 2.130159090191871e-05, "loss": 0.7688, "num_input_tokens_seen": 193510360, "step": 4540 }, { "epoch": 1.0957260835493399, "grad_norm": 1.1775809526443481, "learning_rate": 2.1254771162019926e-05, "loss": 0.7436, "num_input_tokens_seen": 193716288, "step": 4545 }, { "epoch": 1.0969317017300622, "grad_norm": 1.1931169033050537, "learning_rate": 2.1207964855644073e-05, "loss": 0.7694, "num_input_tokens_seen": 193920568, "step": 4550 }, { "epoch": 1.0981373199107842, "grad_norm": 1.1090948581695557, "learning_rate": 2.116117215067771e-05, "loss": 0.8351, "num_input_tokens_seen": 194135560, "step": 4555 }, { "epoch": 1.0993429380915065, "grad_norm": 2.6524770259857178, "learning_rate": 2.1114393214958622e-05, "loss": 0.7484, "num_input_tokens_seen": 194356824, "step": 4560 }, { "epoch": 1.1005485562722286, "grad_norm": 1.2231776714324951, "learning_rate": 2.106762821627518e-05, "loss": 0.7739, "num_input_tokens_seen": 194565288, "step": 4565 }, { "epoch": 1.1017541744529507, "grad_norm": 1.186087965965271, "learning_rate": 2.1020877322365786e-05, "loss": 0.7673, "num_input_tokens_seen": 194780456, "step": 4570 }, { "epoch": 1.102959792633673, "grad_norm": 1.019050121307373, "learning_rate": 2.0974140700918245e-05, "loss": 0.7481, "num_input_tokens_seen": 194992080, "step": 4575 }, { "epoch": 1.104165410814395, "grad_norm": 1.0645135641098022, "learning_rate": 2.092741851956917e-05, "loss": 0.8107, "num_input_tokens_seen": 195204944, "step": 4580 }, { "epoch": 1.1053710289951173, "grad_norm": 1.2379586696624756, "learning_rate": 2.0880710945903385e-05, "loss": 0.7972, "num_input_tokens_seen": 195416464, "step": 4585 }, { "epoch": 1.1065766471758394, "grad_norm": 1.17396879196167, "learning_rate": 2.083401814745332e-05, "loss": 0.8187, "num_input_tokens_seen": 195610776, "step": 4590 }, { "epoch": 1.1077822653565617, "grad_norm": 1.1373525857925415, "learning_rate": 2.078734029169838e-05, "loss": 0.7647, "num_input_tokens_seen": 195832544, "step": 4595 }, { "epoch": 1.1089878835372837, "grad_norm": 1.0416021347045898, "learning_rate": 2.074067754606441e-05, "loss": 0.7781, "num_input_tokens_seen": 196054032, "step": 4600 }, { "epoch": 1.110193501718006, "grad_norm": 1.407917857170105, "learning_rate": 2.0694030077923034e-05, "loss": 0.8229, "num_input_tokens_seen": 196262920, "step": 4605 }, { "epoch": 1.111399119898728, "grad_norm": 1.1164956092834473, "learning_rate": 2.0647398054591093e-05, "loss": 0.8247, "num_input_tokens_seen": 196475696, "step": 4610 }, { "epoch": 1.1126047380794502, "grad_norm": 1.1198545694351196, "learning_rate": 2.060078164333002e-05, "loss": 0.7405, "num_input_tokens_seen": 196685560, "step": 4615 }, { "epoch": 1.1138103562601724, "grad_norm": 1.0778928995132446, "learning_rate": 2.0554181011345253e-05, "loss": 0.8221, "num_input_tokens_seen": 196894176, "step": 4620 }, { "epoch": 1.1150159744408945, "grad_norm": 1.1877487897872925, "learning_rate": 2.0507596325785635e-05, "loss": 0.8686, "num_input_tokens_seen": 197108800, "step": 4625 }, { "epoch": 1.1162215926216168, "grad_norm": 1.2312906980514526, "learning_rate": 2.0461027753742812e-05, "loss": 0.8081, "num_input_tokens_seen": 197308904, "step": 4630 }, { "epoch": 1.1174272108023389, "grad_norm": 1.1836910247802734, "learning_rate": 2.0414475462250617e-05, "loss": 0.7543, "num_input_tokens_seen": 197529952, "step": 4635 }, { "epoch": 1.1186328289830612, "grad_norm": 1.119510531425476, "learning_rate": 2.0367939618284516e-05, "loss": 0.8103, "num_input_tokens_seen": 197750248, "step": 4640 }, { "epoch": 1.1198384471637832, "grad_norm": 1.2175461053848267, "learning_rate": 2.032142038876096e-05, "loss": 0.8197, "num_input_tokens_seen": 197960288, "step": 4645 }, { "epoch": 1.1210440653445053, "grad_norm": 1.2096143960952759, "learning_rate": 2.0274917940536807e-05, "loss": 0.6763, "num_input_tokens_seen": 198170488, "step": 4650 }, { "epoch": 1.1222496835252276, "grad_norm": 1.1178091764450073, "learning_rate": 2.022843244040874e-05, "loss": 0.8243, "num_input_tokens_seen": 198377344, "step": 4655 }, { "epoch": 1.1234553017059496, "grad_norm": 1.7666913270950317, "learning_rate": 2.0181964055112645e-05, "loss": 0.7445, "num_input_tokens_seen": 198584688, "step": 4660 }, { "epoch": 1.124660919886672, "grad_norm": 1.1990071535110474, "learning_rate": 2.0135512951322995e-05, "loss": 0.8033, "num_input_tokens_seen": 198803240, "step": 4665 }, { "epoch": 1.125866538067394, "grad_norm": 1.1227753162384033, "learning_rate": 2.0089079295652306e-05, "loss": 0.7719, "num_input_tokens_seen": 199001544, "step": 4670 }, { "epoch": 1.1270721562481163, "grad_norm": 1.171852707862854, "learning_rate": 2.004266325465051e-05, "loss": 0.8199, "num_input_tokens_seen": 199212856, "step": 4675 }, { "epoch": 1.1282777744288384, "grad_norm": 1.1146214008331299, "learning_rate": 1.999626499480434e-05, "loss": 0.7804, "num_input_tokens_seen": 199419024, "step": 4680 }, { "epoch": 1.1294833926095604, "grad_norm": 1.155569314956665, "learning_rate": 1.994988468253677e-05, "loss": 0.799, "num_input_tokens_seen": 199653192, "step": 4685 }, { "epoch": 1.1306890107902827, "grad_norm": 1.3172316551208496, "learning_rate": 1.990352248420638e-05, "loss": 0.8105, "num_input_tokens_seen": 199862256, "step": 4690 }, { "epoch": 1.1318946289710048, "grad_norm": 1.5764230489730835, "learning_rate": 1.98571785661068e-05, "loss": 0.7881, "num_input_tokens_seen": 200076880, "step": 4695 }, { "epoch": 1.133100247151727, "grad_norm": 1.1662232875823975, "learning_rate": 1.9810853094466073e-05, "loss": 0.8538, "num_input_tokens_seen": 200295224, "step": 4700 }, { "epoch": 1.1343058653324491, "grad_norm": 1.1323875188827515, "learning_rate": 1.976454623544608e-05, "loss": 0.78, "num_input_tokens_seen": 200498360, "step": 4705 }, { "epoch": 1.1355114835131714, "grad_norm": 1.1858762502670288, "learning_rate": 1.9718258155141944e-05, "loss": 0.7693, "num_input_tokens_seen": 200720744, "step": 4710 }, { "epoch": 1.1367171016938935, "grad_norm": 1.0989617109298706, "learning_rate": 1.9671989019581447e-05, "loss": 0.7757, "num_input_tokens_seen": 200934352, "step": 4715 }, { "epoch": 1.1379227198746158, "grad_norm": 1.0078232288360596, "learning_rate": 1.96257389947244e-05, "loss": 0.8151, "num_input_tokens_seen": 201130504, "step": 4720 }, { "epoch": 1.1391283380553379, "grad_norm": 3.9124374389648438, "learning_rate": 1.957950824646207e-05, "loss": 0.7524, "num_input_tokens_seen": 201337832, "step": 4725 }, { "epoch": 1.1403339562360602, "grad_norm": 1.0569738149642944, "learning_rate": 1.9533296940616604e-05, "loss": 0.7487, "num_input_tokens_seen": 201547672, "step": 4730 }, { "epoch": 1.1415395744167822, "grad_norm": 1.3725334405899048, "learning_rate": 1.948710524294036e-05, "loss": 0.7477, "num_input_tokens_seen": 201777856, "step": 4735 }, { "epoch": 1.1427451925975043, "grad_norm": 1.1365457773208618, "learning_rate": 1.9440933319115423e-05, "loss": 0.7085, "num_input_tokens_seen": 201986880, "step": 4740 }, { "epoch": 1.1439508107782266, "grad_norm": 1.068569540977478, "learning_rate": 1.9394781334752917e-05, "loss": 0.7754, "num_input_tokens_seen": 202187288, "step": 4745 }, { "epoch": 1.1451564289589486, "grad_norm": 1.0936781167984009, "learning_rate": 1.9348649455392462e-05, "loss": 0.7387, "num_input_tokens_seen": 202403976, "step": 4750 }, { "epoch": 1.146362047139671, "grad_norm": 1.257634162902832, "learning_rate": 1.9302537846501558e-05, "loss": 0.7931, "num_input_tokens_seen": 202604464, "step": 4755 }, { "epoch": 1.147567665320393, "grad_norm": 1.2617489099502563, "learning_rate": 1.925644667347501e-05, "loss": 0.8372, "num_input_tokens_seen": 202811240, "step": 4760 }, { "epoch": 1.1487732835011153, "grad_norm": 1.489582896232605, "learning_rate": 1.9210376101634297e-05, "loss": 0.773, "num_input_tokens_seen": 203035864, "step": 4765 }, { "epoch": 1.1499789016818374, "grad_norm": 1.0744582414627075, "learning_rate": 1.9164326296227025e-05, "loss": 0.7411, "num_input_tokens_seen": 203251400, "step": 4770 }, { "epoch": 1.1511845198625594, "grad_norm": 1.210374355316162, "learning_rate": 1.9118297422426316e-05, "loss": 0.796, "num_input_tokens_seen": 203470872, "step": 4775 }, { "epoch": 1.1523901380432817, "grad_norm": 1.0916566848754883, "learning_rate": 1.9072289645330198e-05, "loss": 0.7086, "num_input_tokens_seen": 203688480, "step": 4780 }, { "epoch": 1.1535957562240038, "grad_norm": 1.0224199295043945, "learning_rate": 1.9026303129961047e-05, "loss": 0.7635, "num_input_tokens_seen": 203906568, "step": 4785 }, { "epoch": 1.154801374404726, "grad_norm": 1.234421968460083, "learning_rate": 1.898033804126496e-05, "loss": 0.7828, "num_input_tokens_seen": 204124000, "step": 4790 }, { "epoch": 1.1560069925854481, "grad_norm": 1.1589895486831665, "learning_rate": 1.8934394544111197e-05, "loss": 0.7949, "num_input_tokens_seen": 204332136, "step": 4795 }, { "epoch": 1.1572126107661704, "grad_norm": 1.1610705852508545, "learning_rate": 1.8888472803291567e-05, "loss": 0.7998, "num_input_tokens_seen": 204541984, "step": 4800 }, { "epoch": 1.1584182289468925, "grad_norm": 1.2197575569152832, "learning_rate": 1.884257298351982e-05, "loss": 0.7923, "num_input_tokens_seen": 204758432, "step": 4805 }, { "epoch": 1.1596238471276146, "grad_norm": 1.0949653387069702, "learning_rate": 1.8796695249431113e-05, "loss": 0.8099, "num_input_tokens_seen": 204976344, "step": 4810 }, { "epoch": 1.1608294653083369, "grad_norm": 1.1807087659835815, "learning_rate": 1.875083976558136e-05, "loss": 0.7263, "num_input_tokens_seen": 205194840, "step": 4815 }, { "epoch": 1.162035083489059, "grad_norm": 1.2181516885757446, "learning_rate": 1.8705006696446686e-05, "loss": 0.8199, "num_input_tokens_seen": 205403000, "step": 4820 }, { "epoch": 1.1632407016697812, "grad_norm": 1.1358294486999512, "learning_rate": 1.865919620642281e-05, "loss": 0.8315, "num_input_tokens_seen": 205613032, "step": 4825 }, { "epoch": 1.1644463198505033, "grad_norm": 1.1632124185562134, "learning_rate": 1.8613408459824462e-05, "loss": 0.7971, "num_input_tokens_seen": 205826992, "step": 4830 }, { "epoch": 1.1656519380312256, "grad_norm": 1.262551188468933, "learning_rate": 1.8567643620884796e-05, "loss": 0.8236, "num_input_tokens_seen": 206040376, "step": 4835 }, { "epoch": 1.1668575562119476, "grad_norm": 1.2578322887420654, "learning_rate": 1.852190185375479e-05, "loss": 0.8446, "num_input_tokens_seen": 206255272, "step": 4840 }, { "epoch": 1.16806317439267, "grad_norm": 1.1082228422164917, "learning_rate": 1.8476183322502687e-05, "loss": 0.7589, "num_input_tokens_seen": 206479640, "step": 4845 }, { "epoch": 1.169268792573392, "grad_norm": 1.0959522724151611, "learning_rate": 1.8430488191113374e-05, "loss": 0.7876, "num_input_tokens_seen": 206695608, "step": 4850 }, { "epoch": 1.1704744107541143, "grad_norm": 1.1998076438903809, "learning_rate": 1.8384816623487804e-05, "loss": 0.7906, "num_input_tokens_seen": 206904968, "step": 4855 }, { "epoch": 1.1716800289348364, "grad_norm": 1.1138511896133423, "learning_rate": 1.8339168783442424e-05, "loss": 0.7898, "num_input_tokens_seen": 207126648, "step": 4860 }, { "epoch": 1.1728856471155584, "grad_norm": 1.2177280187606812, "learning_rate": 1.8293544834708564e-05, "loss": 0.7978, "num_input_tokens_seen": 207338336, "step": 4865 }, { "epoch": 1.1740912652962807, "grad_norm": 1.1709709167480469, "learning_rate": 1.824794494093187e-05, "loss": 0.7883, "num_input_tokens_seen": 207544920, "step": 4870 }, { "epoch": 1.1752968834770028, "grad_norm": 1.2269909381866455, "learning_rate": 1.820236926567168e-05, "loss": 0.7775, "num_input_tokens_seen": 207751288, "step": 4875 }, { "epoch": 1.176502501657725, "grad_norm": 1.1870896816253662, "learning_rate": 1.815681797240049e-05, "loss": 0.7265, "num_input_tokens_seen": 207967776, "step": 4880 }, { "epoch": 1.1777081198384471, "grad_norm": 1.2596670389175415, "learning_rate": 1.8111291224503334e-05, "loss": 0.7544, "num_input_tokens_seen": 208182840, "step": 4885 }, { "epoch": 1.1789137380191694, "grad_norm": 1.1292271614074707, "learning_rate": 1.806578918527721e-05, "loss": 0.7728, "num_input_tokens_seen": 208385176, "step": 4890 }, { "epoch": 1.1801193561998915, "grad_norm": 1.2037241458892822, "learning_rate": 1.802031201793049e-05, "loss": 0.843, "num_input_tokens_seen": 208595344, "step": 4895 }, { "epoch": 1.1813249743806136, "grad_norm": 1.1358380317687988, "learning_rate": 1.797485988558232e-05, "loss": 0.7822, "num_input_tokens_seen": 208822440, "step": 4900 }, { "epoch": 1.1825305925613359, "grad_norm": 0.9900984764099121, "learning_rate": 1.7929432951262075e-05, "loss": 0.7708, "num_input_tokens_seen": 209040752, "step": 4905 }, { "epoch": 1.183736210742058, "grad_norm": 1.310274362564087, "learning_rate": 1.7884031377908718e-05, "loss": 0.7556, "num_input_tokens_seen": 209252664, "step": 4910 }, { "epoch": 1.1849418289227802, "grad_norm": 1.1843197345733643, "learning_rate": 1.783865532837027e-05, "loss": 0.7695, "num_input_tokens_seen": 209465280, "step": 4915 }, { "epoch": 1.1861474471035023, "grad_norm": 1.16232168674469, "learning_rate": 1.7793304965403195e-05, "loss": 0.7723, "num_input_tokens_seen": 209686608, "step": 4920 }, { "epoch": 1.1873530652842246, "grad_norm": 1.1505098342895508, "learning_rate": 1.7747980451671824e-05, "loss": 0.7709, "num_input_tokens_seen": 209890824, "step": 4925 }, { "epoch": 1.1885586834649466, "grad_norm": 1.359110951423645, "learning_rate": 1.770268194974776e-05, "loss": 0.7471, "num_input_tokens_seen": 210110424, "step": 4930 }, { "epoch": 1.1897643016456687, "grad_norm": 1.1967742443084717, "learning_rate": 1.7657409622109338e-05, "loss": 0.8596, "num_input_tokens_seen": 210321896, "step": 4935 }, { "epoch": 1.190969919826391, "grad_norm": 1.134162425994873, "learning_rate": 1.7612163631140976e-05, "loss": 0.7891, "num_input_tokens_seen": 210540024, "step": 4940 }, { "epoch": 1.192175538007113, "grad_norm": 1.2749091386795044, "learning_rate": 1.7566944139132636e-05, "loss": 0.7281, "num_input_tokens_seen": 210755992, "step": 4945 }, { "epoch": 1.1933811561878354, "grad_norm": 1.2467238903045654, "learning_rate": 1.752175130827923e-05, "loss": 0.8129, "num_input_tokens_seen": 210957920, "step": 4950 }, { "epoch": 1.1945867743685574, "grad_norm": 1.1985461711883545, "learning_rate": 1.747658530068006e-05, "loss": 0.8101, "num_input_tokens_seen": 211165616, "step": 4955 }, { "epoch": 1.1957923925492797, "grad_norm": 1.115654706954956, "learning_rate": 1.7431446278338197e-05, "loss": 0.7538, "num_input_tokens_seen": 211377064, "step": 4960 }, { "epoch": 1.1969980107300018, "grad_norm": 1.3392772674560547, "learning_rate": 1.738633440315993e-05, "loss": 0.7595, "num_input_tokens_seen": 211591760, "step": 4965 }, { "epoch": 1.1982036289107239, "grad_norm": 1.2307289838790894, "learning_rate": 1.734124983695417e-05, "loss": 0.8143, "num_input_tokens_seen": 211787384, "step": 4970 }, { "epoch": 1.1994092470914461, "grad_norm": 1.178209900856018, "learning_rate": 1.7296192741431893e-05, "loss": 0.7867, "num_input_tokens_seen": 212006928, "step": 4975 }, { "epoch": 1.2006148652721682, "grad_norm": 1.130049467086792, "learning_rate": 1.7251163278205502e-05, "loss": 0.8367, "num_input_tokens_seen": 212220992, "step": 4980 }, { "epoch": 1.2018204834528905, "grad_norm": 1.007415771484375, "learning_rate": 1.7206161608788317e-05, "loss": 0.7871, "num_input_tokens_seen": 212424600, "step": 4985 }, { "epoch": 1.2030261016336126, "grad_norm": 1.1432521343231201, "learning_rate": 1.7161187894593975e-05, "loss": 0.8095, "num_input_tokens_seen": 212633832, "step": 4990 }, { "epoch": 1.2042317198143349, "grad_norm": 1.1130658388137817, "learning_rate": 1.7116242296935825e-05, "loss": 0.7608, "num_input_tokens_seen": 212849296, "step": 4995 }, { "epoch": 1.205437337995057, "grad_norm": 8.688248634338379, "learning_rate": 1.707132497702636e-05, "loss": 0.7822, "num_input_tokens_seen": 213056632, "step": 5000 }, { "epoch": 1.2066429561757792, "grad_norm": 1.2927147150039673, "learning_rate": 1.7026436095976668e-05, "loss": 0.7849, "num_input_tokens_seen": 213268872, "step": 5005 }, { "epoch": 1.2078485743565013, "grad_norm": 1.0952832698822021, "learning_rate": 1.698157581479582e-05, "loss": 0.7545, "num_input_tokens_seen": 213481968, "step": 5010 }, { "epoch": 1.2090541925372236, "grad_norm": 1.1615872383117676, "learning_rate": 1.693674429439029e-05, "loss": 0.7722, "num_input_tokens_seen": 213699440, "step": 5015 }, { "epoch": 1.2102598107179456, "grad_norm": 1.1104487180709839, "learning_rate": 1.689194169556341e-05, "loss": 0.7801, "num_input_tokens_seen": 213922480, "step": 5020 }, { "epoch": 1.2114654288986677, "grad_norm": 1.0576695203781128, "learning_rate": 1.684716817901477e-05, "loss": 0.7086, "num_input_tokens_seen": 214140480, "step": 5025 }, { "epoch": 1.21267104707939, "grad_norm": 2.8195509910583496, "learning_rate": 1.6802423905339653e-05, "loss": 0.7382, "num_input_tokens_seen": 214353728, "step": 5030 }, { "epoch": 1.213876665260112, "grad_norm": 1.0559022426605225, "learning_rate": 1.6757709035028446e-05, "loss": 0.7499, "num_input_tokens_seen": 214561120, "step": 5035 }, { "epoch": 1.2150822834408344, "grad_norm": 1.2666276693344116, "learning_rate": 1.6713023728466058e-05, "loss": 0.8266, "num_input_tokens_seen": 214775776, "step": 5040 }, { "epoch": 1.2162879016215564, "grad_norm": 1.102250576019287, "learning_rate": 1.6668368145931397e-05, "loss": 0.7847, "num_input_tokens_seen": 214986568, "step": 5045 }, { "epoch": 1.2174935198022787, "grad_norm": 1.2046215534210205, "learning_rate": 1.6623742447596702e-05, "loss": 0.7523, "num_input_tokens_seen": 215205024, "step": 5050 }, { "epoch": 1.2186991379830008, "grad_norm": 1.1446651220321655, "learning_rate": 1.657914679352706e-05, "loss": 0.7713, "num_input_tokens_seen": 215404008, "step": 5055 }, { "epoch": 1.2199047561637228, "grad_norm": 2.2933807373046875, "learning_rate": 1.6534581343679784e-05, "loss": 0.7681, "num_input_tokens_seen": 215619472, "step": 5060 }, { "epoch": 1.2211103743444451, "grad_norm": 5.962594509124756, "learning_rate": 1.6490046257903852e-05, "loss": 0.752, "num_input_tokens_seen": 215828184, "step": 5065 }, { "epoch": 1.2223159925251672, "grad_norm": 1.224616527557373, "learning_rate": 1.6445541695939324e-05, "loss": 0.7766, "num_input_tokens_seen": 216040872, "step": 5070 }, { "epoch": 1.2235216107058895, "grad_norm": 1.1699986457824707, "learning_rate": 1.6401067817416783e-05, "loss": 0.7797, "num_input_tokens_seen": 216253392, "step": 5075 }, { "epoch": 1.2247272288866116, "grad_norm": 1.1951082944869995, "learning_rate": 1.635662478185676e-05, "loss": 0.713, "num_input_tokens_seen": 216476320, "step": 5080 }, { "epoch": 1.2259328470673339, "grad_norm": 1.1653621196746826, "learning_rate": 1.631221274866914e-05, "loss": 0.756, "num_input_tokens_seen": 216689560, "step": 5085 }, { "epoch": 1.227138465248056, "grad_norm": 1.282157063484192, "learning_rate": 1.626783187715262e-05, "loss": 0.7968, "num_input_tokens_seen": 216908664, "step": 5090 }, { "epoch": 1.228344083428778, "grad_norm": 1.3534682989120483, "learning_rate": 1.622348232649412e-05, "loss": 0.7876, "num_input_tokens_seen": 217127368, "step": 5095 }, { "epoch": 1.2295497016095003, "grad_norm": 1.211969017982483, "learning_rate": 1.617916425576823e-05, "loss": 0.7581, "num_input_tokens_seen": 217342952, "step": 5100 }, { "epoch": 1.2307553197902223, "grad_norm": 1.1870031356811523, "learning_rate": 1.613487782393661e-05, "loss": 0.7796, "num_input_tokens_seen": 217544832, "step": 5105 }, { "epoch": 1.2319609379709446, "grad_norm": 1.1773090362548828, "learning_rate": 1.6090623189847444e-05, "loss": 0.7998, "num_input_tokens_seen": 217751752, "step": 5110 }, { "epoch": 1.2331665561516667, "grad_norm": 1.2375316619873047, "learning_rate": 1.604640051223487e-05, "loss": 0.7776, "num_input_tokens_seen": 217970728, "step": 5115 }, { "epoch": 1.234372174332389, "grad_norm": 1.1594902276992798, "learning_rate": 1.6002209949718384e-05, "loss": 0.8278, "num_input_tokens_seen": 218186344, "step": 5120 }, { "epoch": 1.235577792513111, "grad_norm": 1.2290233373641968, "learning_rate": 1.5958051660802306e-05, "loss": 0.8062, "num_input_tokens_seen": 218407552, "step": 5125 }, { "epoch": 1.2367834106938334, "grad_norm": 1.1465390920639038, "learning_rate": 1.5913925803875194e-05, "loss": 0.7351, "num_input_tokens_seen": 218625784, "step": 5130 }, { "epoch": 1.2379890288745554, "grad_norm": 1.1368411779403687, "learning_rate": 1.586983253720927e-05, "loss": 0.7901, "num_input_tokens_seen": 218838048, "step": 5135 }, { "epoch": 1.2391946470552777, "grad_norm": 1.2145025730133057, "learning_rate": 1.5825772018959877e-05, "loss": 0.7368, "num_input_tokens_seen": 219050080, "step": 5140 }, { "epoch": 1.2404002652359998, "grad_norm": 1.3726462125778198, "learning_rate": 1.5781744407164874e-05, "loss": 0.8202, "num_input_tokens_seen": 219253352, "step": 5145 }, { "epoch": 1.2416058834167218, "grad_norm": 1.1592856645584106, "learning_rate": 1.57377498597441e-05, "loss": 0.7934, "num_input_tokens_seen": 219463088, "step": 5150 }, { "epoch": 1.2428115015974441, "grad_norm": 1.1335358619689941, "learning_rate": 1.5693788534498802e-05, "loss": 0.8524, "num_input_tokens_seen": 219668824, "step": 5155 }, { "epoch": 1.2440171197781662, "grad_norm": 1.2924506664276123, "learning_rate": 1.5649860589111047e-05, "loss": 0.8639, "num_input_tokens_seen": 219878096, "step": 5160 }, { "epoch": 1.2452227379588885, "grad_norm": 1.2613240480422974, "learning_rate": 1.56059661811432e-05, "loss": 0.78, "num_input_tokens_seen": 220082328, "step": 5165 }, { "epoch": 1.2464283561396106, "grad_norm": 1.083790898323059, "learning_rate": 1.5562105468037313e-05, "loss": 0.7677, "num_input_tokens_seen": 220291720, "step": 5170 }, { "epoch": 1.2476339743203329, "grad_norm": 1.156510353088379, "learning_rate": 1.5518278607114585e-05, "loss": 0.7733, "num_input_tokens_seen": 220502992, "step": 5175 }, { "epoch": 1.248839592501055, "grad_norm": 0.9745100140571594, "learning_rate": 1.5474485755574793e-05, "loss": 0.7825, "num_input_tokens_seen": 220730688, "step": 5180 }, { "epoch": 1.250045210681777, "grad_norm": 1.3736803531646729, "learning_rate": 1.5430727070495745e-05, "loss": 0.7572, "num_input_tokens_seen": 220929224, "step": 5185 }, { "epoch": 1.2512508288624993, "grad_norm": 1.2251843214035034, "learning_rate": 1.538700270883266e-05, "loss": 0.8232, "num_input_tokens_seen": 221149384, "step": 5190 }, { "epoch": 1.2524564470432213, "grad_norm": 1.2398916482925415, "learning_rate": 1.534331282741768e-05, "loss": 0.8018, "num_input_tokens_seen": 221361784, "step": 5195 }, { "epoch": 1.2536620652239436, "grad_norm": 1.0857963562011719, "learning_rate": 1.529965758295925e-05, "loss": 0.7664, "num_input_tokens_seen": 221576344, "step": 5200 }, { "epoch": 1.2548676834046657, "grad_norm": 1.11154305934906, "learning_rate": 1.5256037132041595e-05, "loss": 0.7843, "num_input_tokens_seen": 221789560, "step": 5205 }, { "epoch": 1.256073301585388, "grad_norm": 1.2462410926818848, "learning_rate": 1.5212451631124142e-05, "loss": 0.7878, "num_input_tokens_seen": 222002904, "step": 5210 }, { "epoch": 1.25727891976611, "grad_norm": 1.151702880859375, "learning_rate": 1.5168901236540936e-05, "loss": 0.7761, "num_input_tokens_seen": 222217040, "step": 5215 }, { "epoch": 1.2584845379468321, "grad_norm": 1.082759141921997, "learning_rate": 1.5125386104500124e-05, "loss": 0.7709, "num_input_tokens_seen": 222435616, "step": 5220 }, { "epoch": 1.2596901561275544, "grad_norm": 1.1599650382995605, "learning_rate": 1.5081906391083355e-05, "loss": 0.7376, "num_input_tokens_seen": 222644904, "step": 5225 }, { "epoch": 1.2608957743082767, "grad_norm": 1.0293262004852295, "learning_rate": 1.503846225224526e-05, "loss": 0.7613, "num_input_tokens_seen": 222857104, "step": 5230 }, { "epoch": 1.2621013924889988, "grad_norm": 1.14164137840271, "learning_rate": 1.4995053843812838e-05, "loss": 0.7185, "num_input_tokens_seen": 223079720, "step": 5235 }, { "epoch": 1.2633070106697208, "grad_norm": 1.0623764991760254, "learning_rate": 1.495168132148495e-05, "loss": 0.7692, "num_input_tokens_seen": 223309136, "step": 5240 }, { "epoch": 1.2645126288504431, "grad_norm": 1.1020876169204712, "learning_rate": 1.4908344840831746e-05, "loss": 0.7469, "num_input_tokens_seen": 223527176, "step": 5245 }, { "epoch": 1.2657182470311652, "grad_norm": 1.2364848852157593, "learning_rate": 1.486504455729408e-05, "loss": 0.7915, "num_input_tokens_seen": 223742776, "step": 5250 }, { "epoch": 1.2669238652118873, "grad_norm": 1.2406792640686035, "learning_rate": 1.4821780626182999e-05, "loss": 0.7965, "num_input_tokens_seen": 223955232, "step": 5255 }, { "epoch": 1.2681294833926096, "grad_norm": 1.2916890382766724, "learning_rate": 1.4778553202679119e-05, "loss": 0.8393, "num_input_tokens_seen": 224163520, "step": 5260 }, { "epoch": 1.2693351015733318, "grad_norm": 1.058854579925537, "learning_rate": 1.4735362441832149e-05, "loss": 0.7533, "num_input_tokens_seen": 224372120, "step": 5265 }, { "epoch": 1.270540719754054, "grad_norm": 1.0896540880203247, "learning_rate": 1.4692208498560276e-05, "loss": 0.7137, "num_input_tokens_seen": 224585576, "step": 5270 }, { "epoch": 1.271746337934776, "grad_norm": 1.1920496225357056, "learning_rate": 1.4649091527649634e-05, "loss": 0.7544, "num_input_tokens_seen": 224798176, "step": 5275 }, { "epoch": 1.2729519561154983, "grad_norm": 1.1693156957626343, "learning_rate": 1.4606011683753737e-05, "loss": 0.7721, "num_input_tokens_seen": 225012336, "step": 5280 }, { "epoch": 1.2741575742962203, "grad_norm": 1.2815747261047363, "learning_rate": 1.4562969121392941e-05, "loss": 0.7143, "num_input_tokens_seen": 225234304, "step": 5285 }, { "epoch": 1.2753631924769426, "grad_norm": 1.2615185976028442, "learning_rate": 1.4519963994953889e-05, "loss": 0.7844, "num_input_tokens_seen": 225445416, "step": 5290 }, { "epoch": 1.2765688106576647, "grad_norm": 1.120100498199463, "learning_rate": 1.4476996458688891e-05, "loss": 0.7386, "num_input_tokens_seen": 225655560, "step": 5295 }, { "epoch": 1.277774428838387, "grad_norm": 1.2106016874313354, "learning_rate": 1.4434066666715501e-05, "loss": 0.7616, "num_input_tokens_seen": 225865840, "step": 5300 }, { "epoch": 1.278980047019109, "grad_norm": 1.1748045682907104, "learning_rate": 1.4391174773015836e-05, "loss": 0.7885, "num_input_tokens_seen": 226070840, "step": 5305 }, { "epoch": 1.2801856651998311, "grad_norm": 1.0970255136489868, "learning_rate": 1.434832093143612e-05, "loss": 0.7502, "num_input_tokens_seen": 226279168, "step": 5310 }, { "epoch": 1.2813912833805534, "grad_norm": 1.1312586069107056, "learning_rate": 1.4305505295686053e-05, "loss": 0.7853, "num_input_tokens_seen": 226483592, "step": 5315 }, { "epoch": 1.2825969015612755, "grad_norm": 1.2077428102493286, "learning_rate": 1.4262728019338328e-05, "loss": 0.8098, "num_input_tokens_seen": 226681936, "step": 5320 }, { "epoch": 1.2838025197419978, "grad_norm": 1.1393928527832031, "learning_rate": 1.4219989255828029e-05, "loss": 0.7793, "num_input_tokens_seen": 226895032, "step": 5325 }, { "epoch": 1.2850081379227198, "grad_norm": 1.2322579622268677, "learning_rate": 1.4177289158452103e-05, "loss": 0.7661, "num_input_tokens_seen": 227113680, "step": 5330 }, { "epoch": 1.2862137561034421, "grad_norm": 1.0955026149749756, "learning_rate": 1.4134627880368805e-05, "loss": 0.763, "num_input_tokens_seen": 227330560, "step": 5335 }, { "epoch": 1.2874193742841642, "grad_norm": 1.0076054334640503, "learning_rate": 1.4092005574597172e-05, "loss": 0.7603, "num_input_tokens_seen": 227544024, "step": 5340 }, { "epoch": 1.2886249924648863, "grad_norm": 1.2629588842391968, "learning_rate": 1.4049422394016437e-05, "loss": 0.7889, "num_input_tokens_seen": 227754176, "step": 5345 }, { "epoch": 1.2898306106456086, "grad_norm": 1.2239567041397095, "learning_rate": 1.4006878491365488e-05, "loss": 0.7787, "num_input_tokens_seen": 227968464, "step": 5350 }, { "epoch": 1.2910362288263306, "grad_norm": 1.1745800971984863, "learning_rate": 1.3964374019242358e-05, "loss": 0.824, "num_input_tokens_seen": 228181240, "step": 5355 }, { "epoch": 1.292241847007053, "grad_norm": 1.2027627229690552, "learning_rate": 1.3921909130103625e-05, "loss": 0.8061, "num_input_tokens_seen": 228398104, "step": 5360 }, { "epoch": 1.293447465187775, "grad_norm": 1.4895256757736206, "learning_rate": 1.3879483976263897e-05, "loss": 0.7669, "num_input_tokens_seen": 228609128, "step": 5365 }, { "epoch": 1.2946530833684973, "grad_norm": 1.1386936902999878, "learning_rate": 1.3837098709895246e-05, "loss": 0.7588, "num_input_tokens_seen": 228820480, "step": 5370 }, { "epoch": 1.2958587015492193, "grad_norm": 1.1970571279525757, "learning_rate": 1.3794753483026706e-05, "loss": 0.7454, "num_input_tokens_seen": 229036680, "step": 5375 }, { "epoch": 1.2970643197299414, "grad_norm": 1.1394007205963135, "learning_rate": 1.375244844754366e-05, "loss": 0.8129, "num_input_tokens_seen": 229264072, "step": 5380 }, { "epoch": 1.2982699379106637, "grad_norm": 1.8836475610733032, "learning_rate": 1.3710183755187362e-05, "loss": 0.7092, "num_input_tokens_seen": 229487448, "step": 5385 }, { "epoch": 1.299475556091386, "grad_norm": 1.314429521560669, "learning_rate": 1.366795955755433e-05, "loss": 0.787, "num_input_tokens_seen": 229700216, "step": 5390 }, { "epoch": 1.300681174272108, "grad_norm": 1.1800388097763062, "learning_rate": 1.3625776006095881e-05, "loss": 0.7527, "num_input_tokens_seen": 229917496, "step": 5395 }, { "epoch": 1.3018867924528301, "grad_norm": 1.436825156211853, "learning_rate": 1.3583633252117466e-05, "loss": 0.7003, "num_input_tokens_seen": 230115248, "step": 5400 }, { "epoch": 1.3030924106335524, "grad_norm": 1.2827026844024658, "learning_rate": 1.3541531446778286e-05, "loss": 0.7977, "num_input_tokens_seen": 230321456, "step": 5405 }, { "epoch": 1.3042980288142745, "grad_norm": 1.1346313953399658, "learning_rate": 1.3499470741090608e-05, "loss": 0.7959, "num_input_tokens_seen": 230536656, "step": 5410 }, { "epoch": 1.3055036469949965, "grad_norm": 1.2701466083526611, "learning_rate": 1.3457451285919292e-05, "loss": 0.8524, "num_input_tokens_seen": 230754232, "step": 5415 }, { "epoch": 1.3067092651757188, "grad_norm": 1.182754635810852, "learning_rate": 1.3415473231981274e-05, "loss": 0.8091, "num_input_tokens_seen": 230957976, "step": 5420 }, { "epoch": 1.3079148833564411, "grad_norm": 1.123382806777954, "learning_rate": 1.3373536729844943e-05, "loss": 0.8141, "num_input_tokens_seen": 231160512, "step": 5425 }, { "epoch": 1.3091205015371632, "grad_norm": 1.1939311027526855, "learning_rate": 1.3331641929929673e-05, "loss": 0.7751, "num_input_tokens_seen": 231375664, "step": 5430 }, { "epoch": 1.3103261197178853, "grad_norm": 1.2396235466003418, "learning_rate": 1.328978898250525e-05, "loss": 0.7877, "num_input_tokens_seen": 231587488, "step": 5435 }, { "epoch": 1.3115317378986076, "grad_norm": 1.5525935888290405, "learning_rate": 1.3247978037691361e-05, "loss": 0.7714, "num_input_tokens_seen": 231793728, "step": 5440 }, { "epoch": 1.3127373560793296, "grad_norm": 1.514390468597412, "learning_rate": 1.3206209245457008e-05, "loss": 0.7318, "num_input_tokens_seen": 232002184, "step": 5445 }, { "epoch": 1.313942974260052, "grad_norm": 1.0112282037734985, "learning_rate": 1.3164482755620028e-05, "loss": 0.7703, "num_input_tokens_seen": 232225416, "step": 5450 }, { "epoch": 1.315148592440774, "grad_norm": 1.3021211624145508, "learning_rate": 1.3122798717846493e-05, "loss": 0.7804, "num_input_tokens_seen": 232436464, "step": 5455 }, { "epoch": 1.3163542106214963, "grad_norm": 1.101696252822876, "learning_rate": 1.3081157281650258e-05, "loss": 0.7033, "num_input_tokens_seen": 232658488, "step": 5460 }, { "epoch": 1.3175598288022183, "grad_norm": 1.2393107414245605, "learning_rate": 1.3039558596392296e-05, "loss": 0.8022, "num_input_tokens_seen": 232883480, "step": 5465 }, { "epoch": 1.3187654469829404, "grad_norm": 1.2774510383605957, "learning_rate": 1.2998002811280314e-05, "loss": 0.7869, "num_input_tokens_seen": 233114824, "step": 5470 }, { "epoch": 1.3199710651636627, "grad_norm": 1.084691047668457, "learning_rate": 1.2956490075368093e-05, "loss": 0.768, "num_input_tokens_seen": 233322488, "step": 5475 }, { "epoch": 1.3211766833443848, "grad_norm": 1.2052024602890015, "learning_rate": 1.2915020537555047e-05, "loss": 0.7609, "num_input_tokens_seen": 233533592, "step": 5480 }, { "epoch": 1.322382301525107, "grad_norm": 2.182495355606079, "learning_rate": 1.2873594346585604e-05, "loss": 0.7861, "num_input_tokens_seen": 233737336, "step": 5485 }, { "epoch": 1.3235879197058291, "grad_norm": 1.1540703773498535, "learning_rate": 1.2832211651048731e-05, "loss": 0.7716, "num_input_tokens_seen": 233944064, "step": 5490 }, { "epoch": 1.3247935378865514, "grad_norm": 1.096030592918396, "learning_rate": 1.2790872599377396e-05, "loss": 0.8122, "num_input_tokens_seen": 234161944, "step": 5495 }, { "epoch": 1.3259991560672735, "grad_norm": 1.2095472812652588, "learning_rate": 1.2749577339848007e-05, "loss": 0.7553, "num_input_tokens_seen": 234369856, "step": 5500 }, { "epoch": 1.3272047742479955, "grad_norm": 1.1381498575210571, "learning_rate": 1.2708326020579896e-05, "loss": 0.7759, "num_input_tokens_seen": 234574704, "step": 5505 }, { "epoch": 1.3284103924287178, "grad_norm": 1.1884804964065552, "learning_rate": 1.2667118789534793e-05, "loss": 0.7558, "num_input_tokens_seen": 234791976, "step": 5510 }, { "epoch": 1.3296160106094401, "grad_norm": 1.4575458765029907, "learning_rate": 1.2625955794516302e-05, "loss": 0.7727, "num_input_tokens_seen": 235006232, "step": 5515 }, { "epoch": 1.3308216287901622, "grad_norm": 1.1291191577911377, "learning_rate": 1.2584837183169343e-05, "loss": 0.8068, "num_input_tokens_seen": 235225888, "step": 5520 }, { "epoch": 1.3320272469708843, "grad_norm": 1.1552376747131348, "learning_rate": 1.2543763102979656e-05, "loss": 0.8452, "num_input_tokens_seen": 235436944, "step": 5525 }, { "epoch": 1.3332328651516065, "grad_norm": 1.1026359796524048, "learning_rate": 1.2502733701273234e-05, "loss": 0.7681, "num_input_tokens_seen": 235638536, "step": 5530 }, { "epoch": 1.3344384833323286, "grad_norm": 1.183136224746704, "learning_rate": 1.2461749125215832e-05, "loss": 0.7865, "num_input_tokens_seen": 235854480, "step": 5535 }, { "epoch": 1.3356441015130507, "grad_norm": 1.245698094367981, "learning_rate": 1.2420809521812404e-05, "loss": 0.7872, "num_input_tokens_seen": 236076304, "step": 5540 }, { "epoch": 1.336849719693773, "grad_norm": 1.3329147100448608, "learning_rate": 1.2379915037906628e-05, "loss": 0.8076, "num_input_tokens_seen": 236293720, "step": 5545 }, { "epoch": 1.3380553378744953, "grad_norm": 1.089760184288025, "learning_rate": 1.2339065820180306e-05, "loss": 0.7592, "num_input_tokens_seen": 236498488, "step": 5550 }, { "epoch": 1.3392609560552173, "grad_norm": 1.1718480587005615, "learning_rate": 1.2298262015152918e-05, "loss": 0.7424, "num_input_tokens_seen": 236721208, "step": 5555 }, { "epoch": 1.3404665742359394, "grad_norm": 1.1498292684555054, "learning_rate": 1.2257503769181023e-05, "loss": 0.781, "num_input_tokens_seen": 236925080, "step": 5560 }, { "epoch": 1.3416721924166617, "grad_norm": 1.18236243724823, "learning_rate": 1.2216791228457778e-05, "loss": 0.7498, "num_input_tokens_seen": 237140880, "step": 5565 }, { "epoch": 1.3428778105973838, "grad_norm": 1.4807162284851074, "learning_rate": 1.2176124539012395e-05, "loss": 0.8092, "num_input_tokens_seen": 237349928, "step": 5570 }, { "epoch": 1.344083428778106, "grad_norm": 1.2179722785949707, "learning_rate": 1.2135503846709656e-05, "loss": 0.8496, "num_input_tokens_seen": 237564928, "step": 5575 }, { "epoch": 1.3452890469588281, "grad_norm": 1.2029340267181396, "learning_rate": 1.2094929297249324e-05, "loss": 0.8328, "num_input_tokens_seen": 237791248, "step": 5580 }, { "epoch": 1.3464946651395504, "grad_norm": 1.296565055847168, "learning_rate": 1.2054401036165661e-05, "loss": 0.7809, "num_input_tokens_seen": 238008056, "step": 5585 }, { "epoch": 1.3477002833202725, "grad_norm": 1.3349584341049194, "learning_rate": 1.2013919208826924e-05, "loss": 0.7716, "num_input_tokens_seen": 238217976, "step": 5590 }, { "epoch": 1.3489059015009945, "grad_norm": 1.1008596420288086, "learning_rate": 1.1973483960434784e-05, "loss": 0.7783, "num_input_tokens_seen": 238413016, "step": 5595 }, { "epoch": 1.3501115196817168, "grad_norm": 1.3848090171813965, "learning_rate": 1.1933095436023886e-05, "loss": 0.8138, "num_input_tokens_seen": 238627632, "step": 5600 }, { "epoch": 1.351317137862439, "grad_norm": 1.1326169967651367, "learning_rate": 1.1892753780461225e-05, "loss": 0.7072, "num_input_tokens_seen": 238840312, "step": 5605 }, { "epoch": 1.3525227560431612, "grad_norm": 1.1592400074005127, "learning_rate": 1.1852459138445743e-05, "loss": 0.7778, "num_input_tokens_seen": 239042464, "step": 5610 }, { "epoch": 1.3537283742238833, "grad_norm": 1.2238402366638184, "learning_rate": 1.1812211654507705e-05, "loss": 0.7758, "num_input_tokens_seen": 239253416, "step": 5615 }, { "epoch": 1.3549339924046055, "grad_norm": 1.3015955686569214, "learning_rate": 1.177201147300827e-05, "loss": 0.7556, "num_input_tokens_seen": 239465528, "step": 5620 }, { "epoch": 1.3561396105853276, "grad_norm": 1.272373914718628, "learning_rate": 1.1731858738138895e-05, "loss": 0.8255, "num_input_tokens_seen": 239665832, "step": 5625 }, { "epoch": 1.3573452287660497, "grad_norm": 1.0304120779037476, "learning_rate": 1.1691753593920885e-05, "loss": 0.8212, "num_input_tokens_seen": 239874584, "step": 5630 }, { "epoch": 1.358550846946772, "grad_norm": 1.206579327583313, "learning_rate": 1.1651696184204819e-05, "loss": 0.8201, "num_input_tokens_seen": 240090080, "step": 5635 }, { "epoch": 1.359756465127494, "grad_norm": 1.0642815828323364, "learning_rate": 1.161168665267007e-05, "loss": 0.7444, "num_input_tokens_seen": 240307816, "step": 5640 }, { "epoch": 1.3609620833082163, "grad_norm": 1.1648123264312744, "learning_rate": 1.157172514282428e-05, "loss": 0.7988, "num_input_tokens_seen": 240527776, "step": 5645 }, { "epoch": 1.3621677014889384, "grad_norm": 1.1461207866668701, "learning_rate": 1.1531811798002837e-05, "loss": 0.753, "num_input_tokens_seen": 240748512, "step": 5650 }, { "epoch": 1.3633733196696607, "grad_norm": 1.2051174640655518, "learning_rate": 1.1491946761368397e-05, "loss": 0.7673, "num_input_tokens_seen": 240958160, "step": 5655 }, { "epoch": 1.3645789378503828, "grad_norm": 1.102009892463684, "learning_rate": 1.14521301759103e-05, "loss": 0.774, "num_input_tokens_seen": 241177632, "step": 5660 }, { "epoch": 1.3657845560311048, "grad_norm": 1.248652458190918, "learning_rate": 1.1412362184444142e-05, "loss": 0.7129, "num_input_tokens_seen": 241403864, "step": 5665 }, { "epoch": 1.3669901742118271, "grad_norm": 1.0028680562973022, "learning_rate": 1.137264292961119e-05, "loss": 0.7341, "num_input_tokens_seen": 241621320, "step": 5670 }, { "epoch": 1.3681957923925494, "grad_norm": 1.2857555150985718, "learning_rate": 1.1332972553877915e-05, "loss": 0.7802, "num_input_tokens_seen": 241837240, "step": 5675 }, { "epoch": 1.3694014105732715, "grad_norm": 1.3461661338806152, "learning_rate": 1.1293351199535452e-05, "loss": 0.7412, "num_input_tokens_seen": 242046280, "step": 5680 }, { "epoch": 1.3706070287539935, "grad_norm": 8.73119068145752, "learning_rate": 1.1253779008699131e-05, "loss": 0.8004, "num_input_tokens_seen": 242263640, "step": 5685 }, { "epoch": 1.3718126469347158, "grad_norm": 1.2424880266189575, "learning_rate": 1.1214256123307905e-05, "loss": 0.7795, "num_input_tokens_seen": 242476952, "step": 5690 }, { "epoch": 1.373018265115438, "grad_norm": 1.172890067100525, "learning_rate": 1.1174782685123918e-05, "loss": 0.7871, "num_input_tokens_seen": 242690920, "step": 5695 }, { "epoch": 1.3742238832961602, "grad_norm": 1.2698285579681396, "learning_rate": 1.1135358835731924e-05, "loss": 0.7749, "num_input_tokens_seen": 242904472, "step": 5700 }, { "epoch": 1.3754295014768823, "grad_norm": 1.1469109058380127, "learning_rate": 1.1095984716538816e-05, "loss": 0.8111, "num_input_tokens_seen": 243130864, "step": 5705 }, { "epoch": 1.3766351196576045, "grad_norm": 1.1048592329025269, "learning_rate": 1.1056660468773108e-05, "loss": 0.7708, "num_input_tokens_seen": 243340512, "step": 5710 }, { "epoch": 1.3778407378383266, "grad_norm": 1.1180328130722046, "learning_rate": 1.1017386233484458e-05, "loss": 0.739, "num_input_tokens_seen": 243567000, "step": 5715 }, { "epoch": 1.3790463560190487, "grad_norm": 1.2143408060073853, "learning_rate": 1.0978162151543117e-05, "loss": 0.8003, "num_input_tokens_seen": 243780224, "step": 5720 }, { "epoch": 1.380251974199771, "grad_norm": 1.1927489042282104, "learning_rate": 1.0938988363639432e-05, "loss": 0.7944, "num_input_tokens_seen": 244006048, "step": 5725 }, { "epoch": 1.381457592380493, "grad_norm": 1.1540040969848633, "learning_rate": 1.0899865010283391e-05, "loss": 0.6951, "num_input_tokens_seen": 244224536, "step": 5730 }, { "epoch": 1.3826632105612153, "grad_norm": 1.289109468460083, "learning_rate": 1.086079223180404e-05, "loss": 0.7822, "num_input_tokens_seen": 244436912, "step": 5735 }, { "epoch": 1.3838688287419374, "grad_norm": 1.5851844549179077, "learning_rate": 1.0821770168349072e-05, "loss": 0.7213, "num_input_tokens_seen": 244654064, "step": 5740 }, { "epoch": 1.3850744469226597, "grad_norm": 1.2661908864974976, "learning_rate": 1.0782798959884203e-05, "loss": 0.8082, "num_input_tokens_seen": 244864280, "step": 5745 }, { "epoch": 1.3862800651033818, "grad_norm": 1.1718969345092773, "learning_rate": 1.0743878746192806e-05, "loss": 0.8124, "num_input_tokens_seen": 245069848, "step": 5750 }, { "epoch": 1.3874856832841038, "grad_norm": 2.273829460144043, "learning_rate": 1.0705009666875304e-05, "loss": 0.8163, "num_input_tokens_seen": 245291848, "step": 5755 }, { "epoch": 1.3886913014648261, "grad_norm": 1.2271844148635864, "learning_rate": 1.066619186134874e-05, "loss": 0.7338, "num_input_tokens_seen": 245507224, "step": 5760 }, { "epoch": 1.3898969196455482, "grad_norm": 1.2843536138534546, "learning_rate": 1.062742546884621e-05, "loss": 0.7899, "num_input_tokens_seen": 245725296, "step": 5765 }, { "epoch": 1.3911025378262705, "grad_norm": 1.2141307592391968, "learning_rate": 1.0588710628416442e-05, "loss": 0.7312, "num_input_tokens_seen": 245946824, "step": 5770 }, { "epoch": 1.3923081560069925, "grad_norm": 1.136815071105957, "learning_rate": 1.0550047478923219e-05, "loss": 0.78, "num_input_tokens_seen": 246153256, "step": 5775 }, { "epoch": 1.3935137741877148, "grad_norm": 1.3246034383773804, "learning_rate": 1.0511436159044935e-05, "loss": 0.8079, "num_input_tokens_seen": 246358248, "step": 5780 }, { "epoch": 1.394719392368437, "grad_norm": 1.292661428451538, "learning_rate": 1.0472876807274063e-05, "loss": 0.733, "num_input_tokens_seen": 246574680, "step": 5785 }, { "epoch": 1.395925010549159, "grad_norm": 1.2007768154144287, "learning_rate": 1.0434369561916707e-05, "loss": 0.8187, "num_input_tokens_seen": 246801072, "step": 5790 }, { "epoch": 1.3971306287298813, "grad_norm": 1.1482620239257812, "learning_rate": 1.0395914561092046e-05, "loss": 0.7082, "num_input_tokens_seen": 247015088, "step": 5795 }, { "epoch": 1.3983362469106035, "grad_norm": 1.3149832487106323, "learning_rate": 1.0357511942731865e-05, "loss": 0.7503, "num_input_tokens_seen": 247222976, "step": 5800 }, { "epoch": 1.3995418650913256, "grad_norm": 1.1376652717590332, "learning_rate": 1.0319161844580092e-05, "loss": 0.7399, "num_input_tokens_seen": 247432264, "step": 5805 }, { "epoch": 1.4007474832720477, "grad_norm": 1.293440818786621, "learning_rate": 1.0280864404192239e-05, "loss": 0.7881, "num_input_tokens_seen": 247643584, "step": 5810 }, { "epoch": 1.40195310145277, "grad_norm": 1.2135790586471558, "learning_rate": 1.0242619758934959e-05, "loss": 0.757, "num_input_tokens_seen": 247851960, "step": 5815 }, { "epoch": 1.403158719633492, "grad_norm": 1.3574644327163696, "learning_rate": 1.020442804598553e-05, "loss": 0.7447, "num_input_tokens_seen": 248059848, "step": 5820 }, { "epoch": 1.404364337814214, "grad_norm": 1.4904676675796509, "learning_rate": 1.0166289402331391e-05, "loss": 0.8392, "num_input_tokens_seen": 248260088, "step": 5825 }, { "epoch": 1.4055699559949364, "grad_norm": 1.2711467742919922, "learning_rate": 1.0128203964769601e-05, "loss": 0.8031, "num_input_tokens_seen": 248475704, "step": 5830 }, { "epoch": 1.4067755741756587, "grad_norm": 1.202745795249939, "learning_rate": 1.0090171869906404e-05, "loss": 0.7708, "num_input_tokens_seen": 248683760, "step": 5835 }, { "epoch": 1.4079811923563808, "grad_norm": 1.2807767391204834, "learning_rate": 1.0052193254156684e-05, "loss": 0.8484, "num_input_tokens_seen": 248882560, "step": 5840 }, { "epoch": 1.4091868105371028, "grad_norm": 1.1888561248779297, "learning_rate": 1.001426825374355e-05, "loss": 0.7869, "num_input_tokens_seen": 249093664, "step": 5845 }, { "epoch": 1.410392428717825, "grad_norm": 1.1173053979873657, "learning_rate": 9.976397004697738e-06, "loss": 0.7367, "num_input_tokens_seen": 249300448, "step": 5850 }, { "epoch": 1.4115980468985472, "grad_norm": 1.0321965217590332, "learning_rate": 9.93857964285724e-06, "loss": 0.8159, "num_input_tokens_seen": 249514424, "step": 5855 }, { "epoch": 1.4128036650792695, "grad_norm": 1.142570972442627, "learning_rate": 9.900816303866733e-06, "loss": 0.8005, "num_input_tokens_seen": 249725616, "step": 5860 }, { "epoch": 1.4140092832599915, "grad_norm": 1.119364857673645, "learning_rate": 9.863107123177149e-06, "loss": 0.7459, "num_input_tokens_seen": 249939208, "step": 5865 }, { "epoch": 1.4152149014407138, "grad_norm": 1.3895007371902466, "learning_rate": 9.825452236045138e-06, "loss": 0.8078, "num_input_tokens_seen": 250155672, "step": 5870 }, { "epoch": 1.416420519621436, "grad_norm": 1.3005709648132324, "learning_rate": 9.78785177753261e-06, "loss": 0.7079, "num_input_tokens_seen": 250367760, "step": 5875 }, { "epoch": 1.417626137802158, "grad_norm": 1.2307443618774414, "learning_rate": 9.750305882506286e-06, "loss": 0.7793, "num_input_tokens_seen": 250589840, "step": 5880 }, { "epoch": 1.4188317559828802, "grad_norm": 1.2361856698989868, "learning_rate": 9.712814685637105e-06, "loss": 0.7682, "num_input_tokens_seen": 250801136, "step": 5885 }, { "epoch": 1.4200373741636023, "grad_norm": 1.2237071990966797, "learning_rate": 9.67537832139989e-06, "loss": 0.8375, "num_input_tokens_seen": 251017088, "step": 5890 }, { "epoch": 1.4212429923443246, "grad_norm": 1.2322887182235718, "learning_rate": 9.637996924072729e-06, "loss": 0.7218, "num_input_tokens_seen": 251233648, "step": 5895 }, { "epoch": 1.4224486105250467, "grad_norm": 1.3259872198104858, "learning_rate": 9.600670627736594e-06, "loss": 0.7508, "num_input_tokens_seen": 251445800, "step": 5900 }, { "epoch": 1.423654228705769, "grad_norm": 1.1735033988952637, "learning_rate": 9.563399566274786e-06, "loss": 0.7796, "num_input_tokens_seen": 251656936, "step": 5905 }, { "epoch": 1.424859846886491, "grad_norm": 1.5024365186691284, "learning_rate": 9.526183873372521e-06, "loss": 0.7923, "num_input_tokens_seen": 251862520, "step": 5910 }, { "epoch": 1.426065465067213, "grad_norm": 1.1441612243652344, "learning_rate": 9.489023682516387e-06, "loss": 0.7909, "num_input_tokens_seen": 252081976, "step": 5915 }, { "epoch": 1.4272710832479354, "grad_norm": 1.2995668649673462, "learning_rate": 9.451919126993914e-06, "loss": 0.7039, "num_input_tokens_seen": 252294576, "step": 5920 }, { "epoch": 1.4284767014286577, "grad_norm": 1.5635764598846436, "learning_rate": 9.414870339893053e-06, "loss": 0.7554, "num_input_tokens_seen": 252510960, "step": 5925 }, { "epoch": 1.4296823196093797, "grad_norm": 1.3692034482955933, "learning_rate": 9.377877454101764e-06, "loss": 0.7882, "num_input_tokens_seen": 252734216, "step": 5930 }, { "epoch": 1.4308879377901018, "grad_norm": 1.2846990823745728, "learning_rate": 9.340940602307455e-06, "loss": 0.7906, "num_input_tokens_seen": 252939520, "step": 5935 }, { "epoch": 1.432093555970824, "grad_norm": 1.0842149257659912, "learning_rate": 9.304059916996585e-06, "loss": 0.6777, "num_input_tokens_seen": 253152880, "step": 5940 }, { "epoch": 1.4332991741515462, "grad_norm": 1.1369374990463257, "learning_rate": 9.267235530454133e-06, "loss": 0.7494, "num_input_tokens_seen": 253362560, "step": 5945 }, { "epoch": 1.4345047923322682, "grad_norm": 1.1539865732192993, "learning_rate": 9.230467574763138e-06, "loss": 0.737, "num_input_tokens_seen": 253575200, "step": 5950 }, { "epoch": 1.4357104105129905, "grad_norm": 1.2195959091186523, "learning_rate": 9.193756181804248e-06, "loss": 0.8001, "num_input_tokens_seen": 253792472, "step": 5955 }, { "epoch": 1.4369160286937128, "grad_norm": 1.1975902318954468, "learning_rate": 9.157101483255209e-06, "loss": 0.7599, "num_input_tokens_seen": 254014568, "step": 5960 }, { "epoch": 1.438121646874435, "grad_norm": 1.2792272567749023, "learning_rate": 9.120503610590448e-06, "loss": 0.7998, "num_input_tokens_seen": 254227032, "step": 5965 }, { "epoch": 1.439327265055157, "grad_norm": 1.1961296796798706, "learning_rate": 9.08396269508052e-06, "loss": 0.8066, "num_input_tokens_seen": 254446400, "step": 5970 }, { "epoch": 1.4405328832358792, "grad_norm": 1.3403884172439575, "learning_rate": 9.047478867791732e-06, "loss": 0.7666, "num_input_tokens_seen": 254645928, "step": 5975 }, { "epoch": 1.4417385014166013, "grad_norm": 1.4832690954208374, "learning_rate": 9.01105225958558e-06, "loss": 0.8388, "num_input_tokens_seen": 254853024, "step": 5980 }, { "epoch": 1.4429441195973236, "grad_norm": 1.2486157417297363, "learning_rate": 8.97468300111838e-06, "loss": 0.7425, "num_input_tokens_seen": 255070864, "step": 5985 }, { "epoch": 1.4441497377780457, "grad_norm": 1.2758798599243164, "learning_rate": 8.93837122284067e-06, "loss": 0.7659, "num_input_tokens_seen": 255273840, "step": 5990 }, { "epoch": 1.445355355958768, "grad_norm": 1.1793161630630493, "learning_rate": 8.90211705499688e-06, "loss": 0.7659, "num_input_tokens_seen": 255491888, "step": 5995 }, { "epoch": 1.44656097413949, "grad_norm": 1.2306231260299683, "learning_rate": 8.865920627624765e-06, "loss": 0.7933, "num_input_tokens_seen": 255692616, "step": 6000 }, { "epoch": 1.447766592320212, "grad_norm": 1.2573320865631104, "learning_rate": 8.829782070555004e-06, "loss": 0.8002, "num_input_tokens_seen": 255906448, "step": 6005 }, { "epoch": 1.4489722105009344, "grad_norm": 1.1104896068572998, "learning_rate": 8.793701513410674e-06, "loss": 0.7431, "num_input_tokens_seen": 256122544, "step": 6010 }, { "epoch": 1.4501778286816565, "grad_norm": 1.5015738010406494, "learning_rate": 8.757679085606821e-06, "loss": 0.799, "num_input_tokens_seen": 256325624, "step": 6015 }, { "epoch": 1.4513834468623787, "grad_norm": 1.0320836305618286, "learning_rate": 8.721714916350019e-06, "loss": 0.713, "num_input_tokens_seen": 256540232, "step": 6020 }, { "epoch": 1.4525890650431008, "grad_norm": 1.2305946350097656, "learning_rate": 8.685809134637842e-06, "loss": 0.7963, "num_input_tokens_seen": 256753120, "step": 6025 }, { "epoch": 1.453794683223823, "grad_norm": 1.3092156648635864, "learning_rate": 8.649961869258463e-06, "loss": 0.7618, "num_input_tokens_seen": 256968768, "step": 6030 }, { "epoch": 1.4550003014045452, "grad_norm": 1.174912691116333, "learning_rate": 8.614173248790139e-06, "loss": 0.7452, "num_input_tokens_seen": 257182160, "step": 6035 }, { "epoch": 1.4562059195852672, "grad_norm": 1.242514729499817, "learning_rate": 8.57844340160082e-06, "loss": 0.7626, "num_input_tokens_seen": 257401216, "step": 6040 }, { "epoch": 1.4574115377659895, "grad_norm": 1.5287408828735352, "learning_rate": 8.542772455847595e-06, "loss": 0.7826, "num_input_tokens_seen": 257620208, "step": 6045 }, { "epoch": 1.4586171559467116, "grad_norm": 1.208906650543213, "learning_rate": 8.50716053947633e-06, "loss": 0.7706, "num_input_tokens_seen": 257838312, "step": 6050 }, { "epoch": 1.4598227741274339, "grad_norm": 1.294435739517212, "learning_rate": 8.471607780221133e-06, "loss": 0.8116, "num_input_tokens_seen": 258051216, "step": 6055 }, { "epoch": 1.461028392308156, "grad_norm": 1.1349109411239624, "learning_rate": 8.436114305603931e-06, "loss": 0.781, "num_input_tokens_seen": 258269560, "step": 6060 }, { "epoch": 1.4622340104888782, "grad_norm": 1.134616494178772, "learning_rate": 8.400680242934005e-06, "loss": 0.7567, "num_input_tokens_seen": 258475808, "step": 6065 }, { "epoch": 1.4634396286696003, "grad_norm": 1.2754604816436768, "learning_rate": 8.365305719307553e-06, "loss": 0.7945, "num_input_tokens_seen": 258691784, "step": 6070 }, { "epoch": 1.4646452468503224, "grad_norm": 1.320137619972229, "learning_rate": 8.329990861607185e-06, "loss": 0.8099, "num_input_tokens_seen": 258899720, "step": 6075 }, { "epoch": 1.4658508650310447, "grad_norm": 1.3317651748657227, "learning_rate": 8.294735796501533e-06, "loss": 0.7901, "num_input_tokens_seen": 259104144, "step": 6080 }, { "epoch": 1.467056483211767, "grad_norm": 1.1528047323226929, "learning_rate": 8.259540650444736e-06, "loss": 0.7745, "num_input_tokens_seen": 259326464, "step": 6085 }, { "epoch": 1.468262101392489, "grad_norm": 1.3166098594665527, "learning_rate": 8.224405549676026e-06, "loss": 0.7921, "num_input_tokens_seen": 259537760, "step": 6090 }, { "epoch": 1.469467719573211, "grad_norm": 1.1402934789657593, "learning_rate": 8.189330620219249e-06, "loss": 0.7478, "num_input_tokens_seen": 259740680, "step": 6095 }, { "epoch": 1.4706733377539334, "grad_norm": 1.5318069458007812, "learning_rate": 8.154315987882458e-06, "loss": 0.7823, "num_input_tokens_seen": 259955704, "step": 6100 }, { "epoch": 1.4718789559346555, "grad_norm": 1.292269229888916, "learning_rate": 8.119361778257394e-06, "loss": 0.7587, "num_input_tokens_seen": 260163336, "step": 6105 }, { "epoch": 1.4730845741153777, "grad_norm": 1.2149937152862549, "learning_rate": 8.084468116719085e-06, "loss": 0.759, "num_input_tokens_seen": 260372936, "step": 6110 }, { "epoch": 1.4742901922960998, "grad_norm": 1.080544114112854, "learning_rate": 8.049635128425395e-06, "loss": 0.7184, "num_input_tokens_seen": 260600488, "step": 6115 }, { "epoch": 1.475495810476822, "grad_norm": 1.1710779666900635, "learning_rate": 8.014862938316542e-06, "loss": 0.7942, "num_input_tokens_seen": 260812448, "step": 6120 }, { "epoch": 1.4767014286575442, "grad_norm": 1.2740488052368164, "learning_rate": 7.9801516711147e-06, "loss": 0.7407, "num_input_tokens_seen": 261019912, "step": 6125 }, { "epoch": 1.4779070468382662, "grad_norm": 1.3471128940582275, "learning_rate": 7.945501451323476e-06, "loss": 0.8275, "num_input_tokens_seen": 261223264, "step": 6130 }, { "epoch": 1.4791126650189885, "grad_norm": 1.1933817863464355, "learning_rate": 7.91091240322756e-06, "loss": 0.7592, "num_input_tokens_seen": 261443704, "step": 6135 }, { "epoch": 1.4803182831997106, "grad_norm": 1.317888617515564, "learning_rate": 7.876384650892191e-06, "loss": 0.7475, "num_input_tokens_seen": 261656072, "step": 6140 }, { "epoch": 1.4815239013804329, "grad_norm": 1.3462644815444946, "learning_rate": 7.841918318162783e-06, "loss": 0.7469, "num_input_tokens_seen": 261869968, "step": 6145 }, { "epoch": 1.482729519561155, "grad_norm": 1.1389620304107666, "learning_rate": 7.807513528664414e-06, "loss": 0.7945, "num_input_tokens_seen": 262089776, "step": 6150 }, { "epoch": 1.4839351377418772, "grad_norm": 1.3131595849990845, "learning_rate": 7.77317040580145e-06, "loss": 0.7713, "num_input_tokens_seen": 262306488, "step": 6155 }, { "epoch": 1.4851407559225993, "grad_norm": 1.2933955192565918, "learning_rate": 7.738889072757043e-06, "loss": 0.7978, "num_input_tokens_seen": 262521480, "step": 6160 }, { "epoch": 1.4863463741033214, "grad_norm": 1.0636521577835083, "learning_rate": 7.704669652492726e-06, "loss": 0.7649, "num_input_tokens_seen": 262738624, "step": 6165 }, { "epoch": 1.4875519922840437, "grad_norm": 1.201112151145935, "learning_rate": 7.670512267747953e-06, "loss": 0.7581, "num_input_tokens_seen": 262944512, "step": 6170 }, { "epoch": 1.4887576104647657, "grad_norm": 1.1374708414077759, "learning_rate": 7.636417041039687e-06, "loss": 0.7952, "num_input_tokens_seen": 263163480, "step": 6175 }, { "epoch": 1.489963228645488, "grad_norm": 1.1835105419158936, "learning_rate": 7.6023840946619185e-06, "loss": 0.7271, "num_input_tokens_seen": 263373256, "step": 6180 }, { "epoch": 1.49116884682621, "grad_norm": 1.2155743837356567, "learning_rate": 7.568413550685249e-06, "loss": 0.772, "num_input_tokens_seen": 263584992, "step": 6185 }, { "epoch": 1.4923744650069324, "grad_norm": 1.1627577543258667, "learning_rate": 7.534505530956479e-06, "loss": 0.8066, "num_input_tokens_seen": 263803736, "step": 6190 }, { "epoch": 1.4935800831876545, "grad_norm": 1.2297110557556152, "learning_rate": 7.50066015709811e-06, "loss": 0.7974, "num_input_tokens_seen": 264017640, "step": 6195 }, { "epoch": 1.4947857013683765, "grad_norm": 1.9522889852523804, "learning_rate": 7.46687755050797e-06, "loss": 0.7601, "num_input_tokens_seen": 264240528, "step": 6200 }, { "epoch": 1.4959913195490988, "grad_norm": 1.0940107107162476, "learning_rate": 7.433157832358725e-06, "loss": 0.7385, "num_input_tokens_seen": 264454344, "step": 6205 }, { "epoch": 1.497196937729821, "grad_norm": 1.2452374696731567, "learning_rate": 7.399501123597502e-06, "loss": 0.7532, "num_input_tokens_seen": 264671168, "step": 6210 }, { "epoch": 1.4984025559105432, "grad_norm": 1.3203458786010742, "learning_rate": 7.365907544945397e-06, "loss": 0.8065, "num_input_tokens_seen": 264876384, "step": 6215 }, { "epoch": 1.4996081740912652, "grad_norm": 1.1105108261108398, "learning_rate": 7.332377216897088e-06, "loss": 0.7831, "num_input_tokens_seen": 265078288, "step": 6220 }, { "epoch": 1.5008137922719875, "grad_norm": 1.6644234657287598, "learning_rate": 7.298910259720371e-06, "loss": 0.8056, "num_input_tokens_seen": 265298936, "step": 6225 }, { "epoch": 1.5020194104527096, "grad_norm": 1.2088329792022705, "learning_rate": 7.26550679345574e-06, "loss": 0.7255, "num_input_tokens_seen": 265518840, "step": 6230 }, { "epoch": 1.5032250286334317, "grad_norm": 1.212149739265442, "learning_rate": 7.23216693791596e-06, "loss": 0.7551, "num_input_tokens_seen": 265736968, "step": 6235 }, { "epoch": 1.504430646814154, "grad_norm": 4.546824932098389, "learning_rate": 7.198890812685649e-06, "loss": 0.7296, "num_input_tokens_seen": 265941792, "step": 6240 }, { "epoch": 1.5056362649948762, "grad_norm": 1.2064158916473389, "learning_rate": 7.165678537120815e-06, "loss": 0.7616, "num_input_tokens_seen": 266153880, "step": 6245 }, { "epoch": 1.5068418831755983, "grad_norm": 1.305077075958252, "learning_rate": 7.132530230348447e-06, "loss": 0.7967, "num_input_tokens_seen": 266371728, "step": 6250 }, { "epoch": 1.5080475013563204, "grad_norm": 1.0801681280136108, "learning_rate": 7.099446011266114e-06, "loss": 0.8014, "num_input_tokens_seen": 266579136, "step": 6255 }, { "epoch": 1.5092531195370427, "grad_norm": 1.1263065338134766, "learning_rate": 7.066425998541485e-06, "loss": 0.7694, "num_input_tokens_seen": 266794176, "step": 6260 }, { "epoch": 1.5104587377177647, "grad_norm": 1.7501317262649536, "learning_rate": 7.033470310611945e-06, "loss": 0.7558, "num_input_tokens_seen": 267007560, "step": 6265 }, { "epoch": 1.5116643558984868, "grad_norm": 1.2361748218536377, "learning_rate": 7.000579065684143e-06, "loss": 0.7695, "num_input_tokens_seen": 267219456, "step": 6270 }, { "epoch": 1.512869974079209, "grad_norm": 1.5580161809921265, "learning_rate": 6.967752381733608e-06, "loss": 0.8522, "num_input_tokens_seen": 267417136, "step": 6275 }, { "epoch": 1.5140755922599314, "grad_norm": 1.1279025077819824, "learning_rate": 6.934990376504269e-06, "loss": 0.7931, "num_input_tokens_seen": 267626912, "step": 6280 }, { "epoch": 1.5152812104406534, "grad_norm": 1.1919885873794556, "learning_rate": 6.902293167508092e-06, "loss": 0.7584, "num_input_tokens_seen": 267844744, "step": 6285 }, { "epoch": 1.5164868286213755, "grad_norm": 1.2096060514450073, "learning_rate": 6.8696608720245995e-06, "loss": 0.8169, "num_input_tokens_seen": 268060696, "step": 6290 }, { "epoch": 1.5176924468020978, "grad_norm": 1.2011953592300415, "learning_rate": 6.837093607100517e-06, "loss": 0.826, "num_input_tokens_seen": 268269560, "step": 6295 }, { "epoch": 1.51889806498282, "grad_norm": 1.3001834154129028, "learning_rate": 6.804591489549264e-06, "loss": 0.8006, "num_input_tokens_seen": 268487552, "step": 6300 }, { "epoch": 1.520103683163542, "grad_norm": 1.3032008409500122, "learning_rate": 6.7721546359506395e-06, "loss": 0.7474, "num_input_tokens_seen": 268702016, "step": 6305 }, { "epoch": 1.5213093013442642, "grad_norm": 1.1443711519241333, "learning_rate": 6.7397831626503146e-06, "loss": 0.7699, "num_input_tokens_seen": 268914984, "step": 6310 }, { "epoch": 1.5225149195249865, "grad_norm": 1.2309426069259644, "learning_rate": 6.707477185759484e-06, "loss": 0.7539, "num_input_tokens_seen": 269126032, "step": 6315 }, { "epoch": 1.5237205377057086, "grad_norm": 1.2731608152389526, "learning_rate": 6.6752368211543875e-06, "loss": 0.7257, "num_input_tokens_seen": 269344480, "step": 6320 }, { "epoch": 1.5249261558864307, "grad_norm": 1.202053189277649, "learning_rate": 6.643062184475932e-06, "loss": 0.707, "num_input_tokens_seen": 269548960, "step": 6325 }, { "epoch": 1.526131774067153, "grad_norm": 1.2009083032608032, "learning_rate": 6.610953391129288e-06, "loss": 0.772, "num_input_tokens_seen": 269756568, "step": 6330 }, { "epoch": 1.5273373922478752, "grad_norm": 1.1979689598083496, "learning_rate": 6.578910556283435e-06, "loss": 0.8382, "num_input_tokens_seen": 269960512, "step": 6335 }, { "epoch": 1.5285430104285973, "grad_norm": 1.4399319887161255, "learning_rate": 6.54693379487078e-06, "loss": 0.7792, "num_input_tokens_seen": 270174400, "step": 6340 }, { "epoch": 1.5297486286093194, "grad_norm": 1.2751766443252563, "learning_rate": 6.515023221586722e-06, "loss": 0.7638, "num_input_tokens_seen": 270390624, "step": 6345 }, { "epoch": 1.5309542467900417, "grad_norm": 1.929681420326233, "learning_rate": 6.483178950889282e-06, "loss": 0.7732, "num_input_tokens_seen": 270598176, "step": 6350 }, { "epoch": 1.5321598649707637, "grad_norm": 1.0662893056869507, "learning_rate": 6.451401096998635e-06, "loss": 0.7098, "num_input_tokens_seen": 270820248, "step": 6355 }, { "epoch": 1.5333654831514858, "grad_norm": 1.2035515308380127, "learning_rate": 6.4196897738967536e-06, "loss": 0.8019, "num_input_tokens_seen": 271027480, "step": 6360 }, { "epoch": 1.534571101332208, "grad_norm": 1.2498165369033813, "learning_rate": 6.388045095326958e-06, "loss": 0.7851, "num_input_tokens_seen": 271226632, "step": 6365 }, { "epoch": 1.5357767195129304, "grad_norm": 1.1396733522415161, "learning_rate": 6.3564671747935316e-06, "loss": 0.8054, "num_input_tokens_seen": 271437584, "step": 6370 }, { "epoch": 1.5369823376936524, "grad_norm": 1.153017282485962, "learning_rate": 6.324956125561299e-06, "loss": 0.8038, "num_input_tokens_seen": 271646112, "step": 6375 }, { "epoch": 1.5381879558743745, "grad_norm": 1.182814121246338, "learning_rate": 6.293512060655255e-06, "loss": 0.8037, "num_input_tokens_seen": 271847544, "step": 6380 }, { "epoch": 1.5393935740550968, "grad_norm": 1.2497626543045044, "learning_rate": 6.262135092860097e-06, "loss": 0.8121, "num_input_tokens_seen": 272048888, "step": 6385 }, { "epoch": 1.5405991922358189, "grad_norm": 1.1984546184539795, "learning_rate": 6.230825334719889e-06, "loss": 0.8317, "num_input_tokens_seen": 272250520, "step": 6390 }, { "epoch": 1.541804810416541, "grad_norm": 1.1580780744552612, "learning_rate": 6.199582898537604e-06, "loss": 0.757, "num_input_tokens_seen": 272478936, "step": 6395 }, { "epoch": 1.5430104285972632, "grad_norm": 1.4981476068496704, "learning_rate": 6.1684078963747426e-06, "loss": 0.74, "num_input_tokens_seen": 272694904, "step": 6400 }, { "epoch": 1.5442160467779855, "grad_norm": 1.1572012901306152, "learning_rate": 6.137300440050933e-06, "loss": 0.7624, "num_input_tokens_seen": 272909528, "step": 6405 }, { "epoch": 1.5454216649587076, "grad_norm": 1.2814265489578247, "learning_rate": 6.106260641143546e-06, "loss": 0.7598, "num_input_tokens_seen": 273127544, "step": 6410 }, { "epoch": 1.5466272831394297, "grad_norm": 1.3408452272415161, "learning_rate": 6.075288610987248e-06, "loss": 0.7637, "num_input_tokens_seen": 273342776, "step": 6415 }, { "epoch": 1.547832901320152, "grad_norm": 1.2514641284942627, "learning_rate": 6.044384460673641e-06, "loss": 0.8116, "num_input_tokens_seen": 273556816, "step": 6420 }, { "epoch": 1.5490385195008742, "grad_norm": 1.1772489547729492, "learning_rate": 6.013548301050864e-06, "loss": 0.7014, "num_input_tokens_seen": 273765200, "step": 6425 }, { "epoch": 1.550244137681596, "grad_norm": 1.3586758375167847, "learning_rate": 5.982780242723163e-06, "loss": 0.8416, "num_input_tokens_seen": 273970728, "step": 6430 }, { "epoch": 1.5514497558623184, "grad_norm": 1.2848085165023804, "learning_rate": 5.952080396050552e-06, "loss": 0.818, "num_input_tokens_seen": 274194160, "step": 6435 }, { "epoch": 1.5526553740430407, "grad_norm": 1.3518309593200684, "learning_rate": 5.9214488711483245e-06, "loss": 0.7796, "num_input_tokens_seen": 274402112, "step": 6440 }, { "epoch": 1.5538609922237627, "grad_norm": 1.2340294122695923, "learning_rate": 5.890885777886771e-06, "loss": 0.8041, "num_input_tokens_seen": 274608280, "step": 6445 }, { "epoch": 1.5550666104044848, "grad_norm": 1.1730895042419434, "learning_rate": 5.860391225890688e-06, "loss": 0.6862, "num_input_tokens_seen": 274817424, "step": 6450 }, { "epoch": 1.556272228585207, "grad_norm": 1.3520046472549438, "learning_rate": 5.829965324539064e-06, "loss": 0.7786, "num_input_tokens_seen": 275023528, "step": 6455 }, { "epoch": 1.5574778467659294, "grad_norm": 1.3118226528167725, "learning_rate": 5.799608182964605e-06, "loss": 0.7755, "num_input_tokens_seen": 275241184, "step": 6460 }, { "epoch": 1.5586834649466514, "grad_norm": 1.2160563468933105, "learning_rate": 5.76931991005343e-06, "loss": 0.7079, "num_input_tokens_seen": 275457936, "step": 6465 }, { "epoch": 1.5598890831273735, "grad_norm": 1.2737008333206177, "learning_rate": 5.739100614444609e-06, "loss": 0.7217, "num_input_tokens_seen": 275677496, "step": 6470 }, { "epoch": 1.5610947013080958, "grad_norm": 1.2261526584625244, "learning_rate": 5.7089504045298115e-06, "loss": 0.8144, "num_input_tokens_seen": 275888544, "step": 6475 }, { "epoch": 1.5623003194888179, "grad_norm": 1.1137715578079224, "learning_rate": 5.678869388452901e-06, "loss": 0.735, "num_input_tokens_seen": 276112128, "step": 6480 }, { "epoch": 1.56350593766954, "grad_norm": 1.2673994302749634, "learning_rate": 5.648857674109556e-06, "loss": 0.7425, "num_input_tokens_seen": 276330008, "step": 6485 }, { "epoch": 1.5647115558502622, "grad_norm": 1.4254534244537354, "learning_rate": 5.618915369146899e-06, "loss": 0.7916, "num_input_tokens_seen": 276544912, "step": 6490 }, { "epoch": 1.5659171740309845, "grad_norm": 1.176647663116455, "learning_rate": 5.589042580963064e-06, "loss": 0.7462, "num_input_tokens_seen": 276744208, "step": 6495 }, { "epoch": 1.5671227922117066, "grad_norm": 1.161970853805542, "learning_rate": 5.559239416706863e-06, "loss": 0.8021, "num_input_tokens_seen": 276957904, "step": 6500 }, { "epoch": 1.5683284103924287, "grad_norm": 1.1753110885620117, "learning_rate": 5.529505983277369e-06, "loss": 0.7924, "num_input_tokens_seen": 277162608, "step": 6505 }, { "epoch": 1.569534028573151, "grad_norm": 1.164531946182251, "learning_rate": 5.4998423873235335e-06, "loss": 0.743, "num_input_tokens_seen": 277377792, "step": 6510 }, { "epoch": 1.570739646753873, "grad_norm": 1.298555612564087, "learning_rate": 5.470248735243822e-06, "loss": 0.8141, "num_input_tokens_seen": 277581336, "step": 6515 }, { "epoch": 1.571945264934595, "grad_norm": 1.3875616788864136, "learning_rate": 5.440725133185831e-06, "loss": 0.8497, "num_input_tokens_seen": 277798984, "step": 6520 }, { "epoch": 1.5731508831153174, "grad_norm": 1.069815993309021, "learning_rate": 5.411271687045874e-06, "loss": 0.7281, "num_input_tokens_seen": 278009416, "step": 6525 }, { "epoch": 1.5743565012960397, "grad_norm": 1.1550933122634888, "learning_rate": 5.38188850246866e-06, "loss": 0.8052, "num_input_tokens_seen": 278222312, "step": 6530 }, { "epoch": 1.5755621194767617, "grad_norm": 1.225906252861023, "learning_rate": 5.352575684846856e-06, "loss": 0.7776, "num_input_tokens_seen": 278432120, "step": 6535 }, { "epoch": 1.5767677376574838, "grad_norm": 1.2616294622421265, "learning_rate": 5.323333339320738e-06, "loss": 0.7876, "num_input_tokens_seen": 278638752, "step": 6540 }, { "epoch": 1.577973355838206, "grad_norm": 1.203956127166748, "learning_rate": 5.294161570777811e-06, "loss": 0.7521, "num_input_tokens_seen": 278862552, "step": 6545 }, { "epoch": 1.5791789740189284, "grad_norm": 1.166542410850525, "learning_rate": 5.265060483852446e-06, "loss": 0.7928, "num_input_tokens_seen": 279086448, "step": 6550 }, { "epoch": 1.5803845921996502, "grad_norm": 1.141257643699646, "learning_rate": 5.236030182925475e-06, "loss": 0.801, "num_input_tokens_seen": 279305456, "step": 6555 }, { "epoch": 1.5815902103803725, "grad_norm": 1.205160140991211, "learning_rate": 5.2070707721238285e-06, "loss": 0.7888, "num_input_tokens_seen": 279514832, "step": 6560 }, { "epoch": 1.5827958285610948, "grad_norm": 1.3986281156539917, "learning_rate": 5.1781823553201855e-06, "loss": 0.7942, "num_input_tokens_seen": 279739192, "step": 6565 }, { "epoch": 1.5840014467418169, "grad_norm": 1.0493364334106445, "learning_rate": 5.149365036132558e-06, "loss": 0.7452, "num_input_tokens_seen": 279961304, "step": 6570 }, { "epoch": 1.585207064922539, "grad_norm": 1.2806085348129272, "learning_rate": 5.120618917923975e-06, "loss": 0.7511, "num_input_tokens_seen": 280172776, "step": 6575 }, { "epoch": 1.5864126831032612, "grad_norm": 1.2800973653793335, "learning_rate": 5.091944103802027e-06, "loss": 0.7991, "num_input_tokens_seen": 280386216, "step": 6580 }, { "epoch": 1.5876183012839835, "grad_norm": 1.2039763927459717, "learning_rate": 5.063340696618604e-06, "loss": 0.7295, "num_input_tokens_seen": 280617440, "step": 6585 }, { "epoch": 1.5888239194647056, "grad_norm": 1.2479844093322754, "learning_rate": 5.034808798969434e-06, "loss": 0.7635, "num_input_tokens_seen": 280839032, "step": 6590 }, { "epoch": 1.5900295376454276, "grad_norm": 1.1666340827941895, "learning_rate": 5.006348513193773e-06, "loss": 0.7975, "num_input_tokens_seen": 281059680, "step": 6595 }, { "epoch": 1.59123515582615, "grad_norm": 1.1738654375076294, "learning_rate": 4.977959941374e-06, "loss": 0.7205, "num_input_tokens_seen": 281281568, "step": 6600 }, { "epoch": 1.592440774006872, "grad_norm": 1.2309415340423584, "learning_rate": 4.949643185335287e-06, "loss": 0.7499, "num_input_tokens_seen": 281498312, "step": 6605 }, { "epoch": 1.593646392187594, "grad_norm": 1.1562498807907104, "learning_rate": 4.921398346645198e-06, "loss": 0.7139, "num_input_tokens_seen": 281700656, "step": 6610 }, { "epoch": 1.5948520103683164, "grad_norm": 1.2082377672195435, "learning_rate": 4.8932255266133455e-06, "loss": 0.79, "num_input_tokens_seen": 281913880, "step": 6615 }, { "epoch": 1.5960576285490387, "grad_norm": 1.3383643627166748, "learning_rate": 4.8651248262910205e-06, "loss": 0.7157, "num_input_tokens_seen": 282132608, "step": 6620 }, { "epoch": 1.5972632467297607, "grad_norm": 1.1922709941864014, "learning_rate": 4.837096346470849e-06, "loss": 0.7772, "num_input_tokens_seen": 282342200, "step": 6625 }, { "epoch": 1.5984688649104828, "grad_norm": 1.3023651838302612, "learning_rate": 4.809140187686392e-06, "loss": 0.7539, "num_input_tokens_seen": 282556704, "step": 6630 }, { "epoch": 1.599674483091205, "grad_norm": 1.174684762954712, "learning_rate": 4.781256450211813e-06, "loss": 0.7731, "num_input_tokens_seen": 282770384, "step": 6635 }, { "epoch": 1.6008801012719271, "grad_norm": 1.5758352279663086, "learning_rate": 4.753445234061524e-06, "loss": 0.6887, "num_input_tokens_seen": 282984152, "step": 6640 }, { "epoch": 1.6020857194526492, "grad_norm": 1.3949154615402222, "learning_rate": 4.725706638989805e-06, "loss": 0.7449, "num_input_tokens_seen": 283183800, "step": 6645 }, { "epoch": 1.6032913376333715, "grad_norm": 1.3048564195632935, "learning_rate": 4.698040764490452e-06, "loss": 0.7407, "num_input_tokens_seen": 283400968, "step": 6650 }, { "epoch": 1.6044969558140938, "grad_norm": 1.3724099397659302, "learning_rate": 4.670447709796425e-06, "loss": 0.7651, "num_input_tokens_seen": 283614648, "step": 6655 }, { "epoch": 1.6057025739948159, "grad_norm": 1.205494999885559, "learning_rate": 4.642927573879507e-06, "loss": 0.8007, "num_input_tokens_seen": 283809352, "step": 6660 }, { "epoch": 1.606908192175538, "grad_norm": 1.1890389919281006, "learning_rate": 4.6154804554499135e-06, "loss": 0.7548, "num_input_tokens_seen": 284021192, "step": 6665 }, { "epoch": 1.6081138103562602, "grad_norm": 1.1561270952224731, "learning_rate": 4.588106452955973e-06, "loss": 0.7196, "num_input_tokens_seen": 284232864, "step": 6670 }, { "epoch": 1.6093194285369823, "grad_norm": 1.2145832777023315, "learning_rate": 4.560805664583745e-06, "loss": 0.7719, "num_input_tokens_seen": 284437512, "step": 6675 }, { "epoch": 1.6105250467177044, "grad_norm": 1.129553198814392, "learning_rate": 4.533578188256707e-06, "loss": 0.7458, "num_input_tokens_seen": 284650424, "step": 6680 }, { "epoch": 1.6117306648984266, "grad_norm": 1.2756515741348267, "learning_rate": 4.5064241216353335e-06, "loss": 0.776, "num_input_tokens_seen": 284866776, "step": 6685 }, { "epoch": 1.612936283079149, "grad_norm": 1.83440363407135, "learning_rate": 4.479343562116836e-06, "loss": 0.7589, "num_input_tokens_seen": 285078160, "step": 6690 }, { "epoch": 1.614141901259871, "grad_norm": 1.231008529663086, "learning_rate": 4.452336606834742e-06, "loss": 0.7784, "num_input_tokens_seen": 285290712, "step": 6695 }, { "epoch": 1.615347519440593, "grad_norm": 1.278198003768921, "learning_rate": 4.425403352658591e-06, "loss": 0.7519, "num_input_tokens_seen": 285503832, "step": 6700 }, { "epoch": 1.6165531376213154, "grad_norm": 1.2519234418869019, "learning_rate": 4.398543896193549e-06, "loss": 0.773, "num_input_tokens_seen": 285714760, "step": 6705 }, { "epoch": 1.6177587558020377, "grad_norm": 1.2918680906295776, "learning_rate": 4.3717583337800874e-06, "loss": 0.7576, "num_input_tokens_seen": 285930640, "step": 6710 }, { "epoch": 1.6189643739827595, "grad_norm": 1.1284878253936768, "learning_rate": 4.345046761493654e-06, "loss": 0.8115, "num_input_tokens_seen": 286142640, "step": 6715 }, { "epoch": 1.6201699921634818, "grad_norm": 1.1455811262130737, "learning_rate": 4.318409275144258e-06, "loss": 0.7233, "num_input_tokens_seen": 286351288, "step": 6720 }, { "epoch": 1.621375610344204, "grad_norm": 1.2973227500915527, "learning_rate": 4.291845970276229e-06, "loss": 0.8005, "num_input_tokens_seen": 286556400, "step": 6725 }, { "epoch": 1.6225812285249261, "grad_norm": 1.4170106649398804, "learning_rate": 4.265356942167775e-06, "loss": 0.8173, "num_input_tokens_seen": 286778312, "step": 6730 }, { "epoch": 1.6237868467056482, "grad_norm": 1.2347239255905151, "learning_rate": 4.238942285830724e-06, "loss": 0.7638, "num_input_tokens_seen": 286996648, "step": 6735 }, { "epoch": 1.6249924648863705, "grad_norm": 1.6410541534423828, "learning_rate": 4.2126020960101094e-06, "loss": 0.7994, "num_input_tokens_seen": 287215048, "step": 6740 }, { "epoch": 1.6261980830670928, "grad_norm": 1.3060860633850098, "learning_rate": 4.186336467183894e-06, "loss": 0.7619, "num_input_tokens_seen": 287440368, "step": 6745 }, { "epoch": 1.6274037012478149, "grad_norm": 1.4145536422729492, "learning_rate": 4.1601454935625814e-06, "loss": 0.793, "num_input_tokens_seen": 287652800, "step": 6750 }, { "epoch": 1.628609319428537, "grad_norm": 1.5937716960906982, "learning_rate": 4.134029269088913e-06, "loss": 0.8114, "num_input_tokens_seen": 287864928, "step": 6755 }, { "epoch": 1.6298149376092592, "grad_norm": 1.559962511062622, "learning_rate": 4.107987887437504e-06, "loss": 0.7761, "num_input_tokens_seen": 288069976, "step": 6760 }, { "epoch": 1.6310205557899813, "grad_norm": 1.8200944662094116, "learning_rate": 4.082021442014539e-06, "loss": 0.8001, "num_input_tokens_seen": 288278176, "step": 6765 }, { "epoch": 1.6322261739707034, "grad_norm": 1.252773404121399, "learning_rate": 4.056130025957397e-06, "loss": 0.7719, "num_input_tokens_seen": 288491720, "step": 6770 }, { "epoch": 1.6334317921514256, "grad_norm": 1.166760802268982, "learning_rate": 4.030313732134364e-06, "loss": 0.7684, "num_input_tokens_seen": 288713000, "step": 6775 }, { "epoch": 1.634637410332148, "grad_norm": 2.8450169563293457, "learning_rate": 4.0045726531442535e-06, "loss": 0.7565, "num_input_tokens_seen": 288936176, "step": 6780 }, { "epoch": 1.63584302851287, "grad_norm": 1.2430541515350342, "learning_rate": 3.978906881316105e-06, "loss": 0.7233, "num_input_tokens_seen": 289151360, "step": 6785 }, { "epoch": 1.637048646693592, "grad_norm": 1.320282220840454, "learning_rate": 3.953316508708838e-06, "loss": 0.7982, "num_input_tokens_seen": 289362048, "step": 6790 }, { "epoch": 1.6382542648743144, "grad_norm": 1.387012004852295, "learning_rate": 3.927801627110927e-06, "loss": 0.7985, "num_input_tokens_seen": 289571608, "step": 6795 }, { "epoch": 1.6394598830550364, "grad_norm": 1.204607605934143, "learning_rate": 3.902362328040091e-06, "loss": 0.7629, "num_input_tokens_seen": 289782072, "step": 6800 }, { "epoch": 1.6406655012357585, "grad_norm": 1.2769464254379272, "learning_rate": 3.876998702742921e-06, "loss": 0.753, "num_input_tokens_seen": 289996968, "step": 6805 }, { "epoch": 1.6418711194164808, "grad_norm": 1.2991886138916016, "learning_rate": 3.851710842194595e-06, "loss": 0.7316, "num_input_tokens_seen": 290212128, "step": 6810 }, { "epoch": 1.643076737597203, "grad_norm": 2.5376298427581787, "learning_rate": 3.826498837098527e-06, "loss": 0.7766, "num_input_tokens_seen": 290424824, "step": 6815 }, { "epoch": 1.6442823557779251, "grad_norm": 1.1109745502471924, "learning_rate": 3.8013627778860665e-06, "loss": 0.7441, "num_input_tokens_seen": 290638312, "step": 6820 }, { "epoch": 1.6454879739586472, "grad_norm": 1.2438985109329224, "learning_rate": 3.7763027547161212e-06, "loss": 0.7576, "num_input_tokens_seen": 290855496, "step": 6825 }, { "epoch": 1.6466935921393695, "grad_norm": 1.2216014862060547, "learning_rate": 3.7513188574749116e-06, "loss": 0.8446, "num_input_tokens_seen": 291069064, "step": 6830 }, { "epoch": 1.6478992103200918, "grad_norm": 1.6537330150604248, "learning_rate": 3.7264111757755764e-06, "loss": 0.7814, "num_input_tokens_seen": 291287112, "step": 6835 }, { "epoch": 1.6491048285008136, "grad_norm": 1.279992938041687, "learning_rate": 3.7015797989579075e-06, "loss": 0.7846, "num_input_tokens_seen": 291501608, "step": 6840 }, { "epoch": 1.650310446681536, "grad_norm": 1.0998634099960327, "learning_rate": 3.6768248160879787e-06, "loss": 0.7276, "num_input_tokens_seen": 291714968, "step": 6845 }, { "epoch": 1.6515160648622582, "grad_norm": 1.2975636720657349, "learning_rate": 3.6521463159578606e-06, "loss": 0.7912, "num_input_tokens_seen": 291929800, "step": 6850 }, { "epoch": 1.6527216830429803, "grad_norm": 1.1825451850891113, "learning_rate": 3.627544387085308e-06, "loss": 0.7831, "num_input_tokens_seen": 292142528, "step": 6855 }, { "epoch": 1.6539273012237024, "grad_norm": 1.264003872871399, "learning_rate": 3.603019117713402e-06, "loss": 0.7585, "num_input_tokens_seen": 292359896, "step": 6860 }, { "epoch": 1.6551329194044246, "grad_norm": 1.1863354444503784, "learning_rate": 3.578570595810274e-06, "loss": 0.7568, "num_input_tokens_seen": 292574008, "step": 6865 }, { "epoch": 1.656338537585147, "grad_norm": 1.2148429155349731, "learning_rate": 3.554198909068765e-06, "loss": 0.7234, "num_input_tokens_seen": 292793552, "step": 6870 }, { "epoch": 1.657544155765869, "grad_norm": 1.2198923826217651, "learning_rate": 3.5299041449061377e-06, "loss": 0.7688, "num_input_tokens_seen": 292999824, "step": 6875 }, { "epoch": 1.658749773946591, "grad_norm": 1.3424564599990845, "learning_rate": 3.5056863904637223e-06, "loss": 0.787, "num_input_tokens_seen": 293210008, "step": 6880 }, { "epoch": 1.6599553921273134, "grad_norm": 1.2602789402008057, "learning_rate": 3.481545732606656e-06, "loss": 0.8476, "num_input_tokens_seen": 293428320, "step": 6885 }, { "epoch": 1.6611610103080354, "grad_norm": 1.4678528308868408, "learning_rate": 3.4574822579235194e-06, "loss": 0.7656, "num_input_tokens_seen": 293640112, "step": 6890 }, { "epoch": 1.6623666284887575, "grad_norm": 1.1669840812683105, "learning_rate": 3.4334960527260596e-06, "loss": 0.8092, "num_input_tokens_seen": 293856912, "step": 6895 }, { "epoch": 1.6635722466694798, "grad_norm": 1.3056279420852661, "learning_rate": 3.409587203048864e-06, "loss": 0.7893, "num_input_tokens_seen": 294070216, "step": 6900 }, { "epoch": 1.664777864850202, "grad_norm": 1.0737676620483398, "learning_rate": 3.385755794649073e-06, "loss": 0.7601, "num_input_tokens_seen": 294294584, "step": 6905 }, { "epoch": 1.6659834830309241, "grad_norm": 1.672483205795288, "learning_rate": 3.3620019130060383e-06, "loss": 0.8207, "num_input_tokens_seen": 294504496, "step": 6910 }, { "epoch": 1.6671891012116462, "grad_norm": 1.1329741477966309, "learning_rate": 3.3383256433210554e-06, "loss": 0.7843, "num_input_tokens_seen": 294709600, "step": 6915 }, { "epoch": 1.6683947193923685, "grad_norm": 1.1830151081085205, "learning_rate": 3.3147270705170263e-06, "loss": 0.7172, "num_input_tokens_seen": 294934464, "step": 6920 }, { "epoch": 1.6696003375730906, "grad_norm": 1.5698306560516357, "learning_rate": 3.2912062792381682e-06, "loss": 0.7782, "num_input_tokens_seen": 295140784, "step": 6925 }, { "epoch": 1.6708059557538126, "grad_norm": 1.2856553792953491, "learning_rate": 3.267763353849704e-06, "loss": 0.7634, "num_input_tokens_seen": 295356512, "step": 6930 }, { "epoch": 1.672011573934535, "grad_norm": 1.146692156791687, "learning_rate": 3.2443983784375824e-06, "loss": 0.8085, "num_input_tokens_seen": 295566800, "step": 6935 }, { "epoch": 1.6732171921152572, "grad_norm": 1.1469289064407349, "learning_rate": 3.2211114368081414e-06, "loss": 0.7492, "num_input_tokens_seen": 295780200, "step": 6940 }, { "epoch": 1.6744228102959793, "grad_norm": 1.2914984226226807, "learning_rate": 3.197902612487824e-06, "loss": 0.7996, "num_input_tokens_seen": 295995112, "step": 6945 }, { "epoch": 1.6756284284767013, "grad_norm": 1.2538847923278809, "learning_rate": 3.1747719887228966e-06, "loss": 0.7612, "num_input_tokens_seen": 296206376, "step": 6950 }, { "epoch": 1.6768340466574236, "grad_norm": 1.2275216579437256, "learning_rate": 3.1517196484791093e-06, "loss": 0.7807, "num_input_tokens_seen": 296415440, "step": 6955 }, { "epoch": 1.6780396648381457, "grad_norm": 1.6452786922454834, "learning_rate": 3.1287456744414525e-06, "loss": 0.8172, "num_input_tokens_seen": 296617888, "step": 6960 }, { "epoch": 1.6792452830188678, "grad_norm": 1.2149267196655273, "learning_rate": 3.105850149013784e-06, "loss": 0.7702, "num_input_tokens_seen": 296814960, "step": 6965 }, { "epoch": 1.68045090119959, "grad_norm": 1.1549654006958008, "learning_rate": 3.083033154318629e-06, "loss": 0.8007, "num_input_tokens_seen": 297024736, "step": 6970 }, { "epoch": 1.6816565193803124, "grad_norm": 1.2014344930648804, "learning_rate": 3.0602947721967978e-06, "loss": 0.7576, "num_input_tokens_seen": 297254696, "step": 6975 }, { "epoch": 1.6828621375610344, "grad_norm": 1.3584998846054077, "learning_rate": 3.0376350842071603e-06, "loss": 0.8042, "num_input_tokens_seen": 297462352, "step": 6980 }, { "epoch": 1.6840677557417565, "grad_norm": 1.3357442617416382, "learning_rate": 3.015054171626297e-06, "loss": 0.7479, "num_input_tokens_seen": 297683120, "step": 6985 }, { "epoch": 1.6852733739224788, "grad_norm": 1.1580986976623535, "learning_rate": 2.992552115448258e-06, "loss": 0.7998, "num_input_tokens_seen": 297886016, "step": 6990 }, { "epoch": 1.686478992103201, "grad_norm": 1.4386252164840698, "learning_rate": 2.9701289963842276e-06, "loss": 0.7756, "num_input_tokens_seen": 298104424, "step": 6995 }, { "epoch": 1.6876846102839231, "grad_norm": 1.06174898147583, "learning_rate": 2.947784894862268e-06, "loss": 0.7806, "num_input_tokens_seen": 298311056, "step": 7000 }, { "epoch": 1.6888902284646452, "grad_norm": 1.2828367948532104, "learning_rate": 2.9255198910270064e-06, "loss": 0.8147, "num_input_tokens_seen": 298525960, "step": 7005 }, { "epoch": 1.6900958466453675, "grad_norm": 1.2340973615646362, "learning_rate": 2.90333406473938e-06, "loss": 0.7028, "num_input_tokens_seen": 298737184, "step": 7010 }, { "epoch": 1.6913014648260896, "grad_norm": 1.2289063930511475, "learning_rate": 2.8812274955763135e-06, "loss": 0.7451, "num_input_tokens_seen": 298945848, "step": 7015 }, { "epoch": 1.6925070830068116, "grad_norm": 1.2605866193771362, "learning_rate": 2.8592002628304417e-06, "loss": 0.7625, "num_input_tokens_seen": 299163416, "step": 7020 }, { "epoch": 1.693712701187534, "grad_norm": 1.2770594358444214, "learning_rate": 2.837252445509861e-06, "loss": 0.7865, "num_input_tokens_seen": 299376336, "step": 7025 }, { "epoch": 1.6949183193682562, "grad_norm": 1.271036148071289, "learning_rate": 2.815384122337791e-06, "loss": 0.8078, "num_input_tokens_seen": 299586832, "step": 7030 }, { "epoch": 1.6961239375489783, "grad_norm": 1.1534024477005005, "learning_rate": 2.793595371752328e-06, "loss": 0.7532, "num_input_tokens_seen": 299806792, "step": 7035 }, { "epoch": 1.6973295557297003, "grad_norm": 1.2255442142486572, "learning_rate": 2.7718862719061516e-06, "loss": 0.8033, "num_input_tokens_seen": 300030976, "step": 7040 }, { "epoch": 1.6985351739104226, "grad_norm": 1.4745612144470215, "learning_rate": 2.7502569006662593e-06, "loss": 0.7978, "num_input_tokens_seen": 300229608, "step": 7045 }, { "epoch": 1.6997407920911447, "grad_norm": 1.1694791316986084, "learning_rate": 2.728707335613656e-06, "loss": 0.7583, "num_input_tokens_seen": 300453416, "step": 7050 }, { "epoch": 1.7009464102718668, "grad_norm": 2.4279778003692627, "learning_rate": 2.7072376540431154e-06, "loss": 0.7429, "num_input_tokens_seen": 300659176, "step": 7055 }, { "epoch": 1.702152028452589, "grad_norm": 1.258379340171814, "learning_rate": 2.6858479329628677e-06, "loss": 0.7588, "num_input_tokens_seen": 300875456, "step": 7060 }, { "epoch": 1.7033576466333114, "grad_norm": 1.276883602142334, "learning_rate": 2.6645382490943364e-06, "loss": 0.8153, "num_input_tokens_seen": 301082176, "step": 7065 }, { "epoch": 1.7045632648140334, "grad_norm": 1.3666375875473022, "learning_rate": 2.6433086788718727e-06, "loss": 0.7635, "num_input_tokens_seen": 301297576, "step": 7070 }, { "epoch": 1.7057688829947555, "grad_norm": 1.1365221738815308, "learning_rate": 2.622159298442478e-06, "loss": 0.7494, "num_input_tokens_seen": 301516784, "step": 7075 }, { "epoch": 1.7069745011754778, "grad_norm": 1.0726186037063599, "learning_rate": 2.601090183665511e-06, "loss": 0.7154, "num_input_tokens_seen": 301727160, "step": 7080 }, { "epoch": 1.7081801193561998, "grad_norm": 1.3404488563537598, "learning_rate": 2.580101410112437e-06, "loss": 0.7807, "num_input_tokens_seen": 301941800, "step": 7085 }, { "epoch": 1.709385737536922, "grad_norm": 1.2545300722122192, "learning_rate": 2.559193053066561e-06, "loss": 0.8095, "num_input_tokens_seen": 302159864, "step": 7090 }, { "epoch": 1.7105913557176442, "grad_norm": 1.3362168073654175, "learning_rate": 2.5383651875227317e-06, "loss": 0.7695, "num_input_tokens_seen": 302371472, "step": 7095 }, { "epoch": 1.7117969738983665, "grad_norm": 1.2579748630523682, "learning_rate": 2.5176178881870934e-06, "loss": 0.7749, "num_input_tokens_seen": 302590704, "step": 7100 }, { "epoch": 1.7130025920790886, "grad_norm": 1.231776475906372, "learning_rate": 2.4969512294768126e-06, "loss": 0.7776, "num_input_tokens_seen": 302809160, "step": 7105 }, { "epoch": 1.7142082102598106, "grad_norm": 1.7875968217849731, "learning_rate": 2.476365285519819e-06, "loss": 0.7924, "num_input_tokens_seen": 303015408, "step": 7110 }, { "epoch": 1.715413828440533, "grad_norm": 1.3618780374526978, "learning_rate": 2.4558601301545162e-06, "loss": 0.7083, "num_input_tokens_seen": 303221760, "step": 7115 }, { "epoch": 1.7166194466212552, "grad_norm": 1.1527081727981567, "learning_rate": 2.4354358369295475e-06, "loss": 0.8029, "num_input_tokens_seen": 303424704, "step": 7120 }, { "epoch": 1.717825064801977, "grad_norm": 1.4081279039382935, "learning_rate": 2.4150924791035035e-06, "loss": 0.7606, "num_input_tokens_seen": 303642560, "step": 7125 }, { "epoch": 1.7190306829826993, "grad_norm": 1.1287294626235962, "learning_rate": 2.3948301296446957e-06, "loss": 0.749, "num_input_tokens_seen": 303855792, "step": 7130 }, { "epoch": 1.7202363011634216, "grad_norm": 1.351492166519165, "learning_rate": 2.37464886123083e-06, "loss": 0.7711, "num_input_tokens_seen": 304068632, "step": 7135 }, { "epoch": 1.7214419193441437, "grad_norm": 1.125603437423706, "learning_rate": 2.354548746248836e-06, "loss": 0.7563, "num_input_tokens_seen": 304289192, "step": 7140 }, { "epoch": 1.7226475375248658, "grad_norm": 1.6024318933486938, "learning_rate": 2.3345298567945204e-06, "loss": 0.7738, "num_input_tokens_seen": 304506760, "step": 7145 }, { "epoch": 1.723853155705588, "grad_norm": 2.3387210369110107, "learning_rate": 2.3145922646723745e-06, "loss": 0.8697, "num_input_tokens_seen": 304710992, "step": 7150 }, { "epoch": 1.7250587738863103, "grad_norm": 1.2635908126831055, "learning_rate": 2.2947360413952764e-06, "loss": 0.7419, "num_input_tokens_seen": 304926976, "step": 7155 }, { "epoch": 1.7262643920670324, "grad_norm": 1.2980493307113647, "learning_rate": 2.2749612581842437e-06, "loss": 0.7653, "num_input_tokens_seen": 305146920, "step": 7160 }, { "epoch": 1.7274700102477545, "grad_norm": 1.2822463512420654, "learning_rate": 2.255267985968196e-06, "loss": 0.7446, "num_input_tokens_seen": 305349896, "step": 7165 }, { "epoch": 1.7286756284284768, "grad_norm": 1.1810208559036255, "learning_rate": 2.235656295383673e-06, "loss": 0.7403, "num_input_tokens_seen": 305553376, "step": 7170 }, { "epoch": 1.7298812466091988, "grad_norm": 1.2997260093688965, "learning_rate": 2.2161262567746034e-06, "loss": 0.8292, "num_input_tokens_seen": 305769112, "step": 7175 }, { "epoch": 1.731086864789921, "grad_norm": 1.246925950050354, "learning_rate": 2.1966779401920267e-06, "loss": 0.8015, "num_input_tokens_seen": 305989192, "step": 7180 }, { "epoch": 1.7322924829706432, "grad_norm": 1.2908066511154175, "learning_rate": 2.17731141539389e-06, "loss": 0.7155, "num_input_tokens_seen": 306200120, "step": 7185 }, { "epoch": 1.7334981011513655, "grad_norm": 1.2899476289749146, "learning_rate": 2.158026751844733e-06, "loss": 0.7865, "num_input_tokens_seen": 306421128, "step": 7190 }, { "epoch": 1.7347037193320876, "grad_norm": 1.1882871389389038, "learning_rate": 2.138824018715499e-06, "loss": 0.7575, "num_input_tokens_seen": 306634808, "step": 7195 }, { "epoch": 1.7359093375128096, "grad_norm": 1.0600775480270386, "learning_rate": 2.1197032848832456e-06, "loss": 0.7801, "num_input_tokens_seen": 306857864, "step": 7200 }, { "epoch": 1.737114955693532, "grad_norm": 1.1928852796554565, "learning_rate": 2.1006646189309144e-06, "loss": 0.7215, "num_input_tokens_seen": 307077792, "step": 7205 }, { "epoch": 1.738320573874254, "grad_norm": 1.17136812210083, "learning_rate": 2.0817080891470804e-06, "loss": 0.8529, "num_input_tokens_seen": 307291112, "step": 7210 }, { "epoch": 1.739526192054976, "grad_norm": 1.6304079294204712, "learning_rate": 2.062833763525726e-06, "loss": 0.7763, "num_input_tokens_seen": 307505088, "step": 7215 }, { "epoch": 1.7407318102356983, "grad_norm": 1.1832027435302734, "learning_rate": 2.0440417097659535e-06, "loss": 0.7278, "num_input_tokens_seen": 307722552, "step": 7220 }, { "epoch": 1.7419374284164206, "grad_norm": 1.3302998542785645, "learning_rate": 2.025331995271795e-06, "loss": 0.7917, "num_input_tokens_seen": 307929976, "step": 7225 }, { "epoch": 1.7431430465971427, "grad_norm": 1.2981916666030884, "learning_rate": 2.0067046871519335e-06, "loss": 0.7513, "num_input_tokens_seen": 308129232, "step": 7230 }, { "epoch": 1.7443486647778648, "grad_norm": 1.2100777626037598, "learning_rate": 1.9881598522194704e-06, "loss": 0.7802, "num_input_tokens_seen": 308347000, "step": 7235 }, { "epoch": 1.745554282958587, "grad_norm": 1.3850957155227661, "learning_rate": 1.969697556991695e-06, "loss": 0.7854, "num_input_tokens_seen": 308562312, "step": 7240 }, { "epoch": 1.7467599011393093, "grad_norm": 1.2744855880737305, "learning_rate": 1.951317867689842e-06, "loss": 0.7723, "num_input_tokens_seen": 308783912, "step": 7245 }, { "epoch": 1.7479655193200312, "grad_norm": 1.0807815790176392, "learning_rate": 1.933020850238845e-06, "loss": 0.7295, "num_input_tokens_seen": 308990464, "step": 7250 }, { "epoch": 1.7491711375007535, "grad_norm": 1.1983681917190552, "learning_rate": 1.9148065702671108e-06, "loss": 0.7943, "num_input_tokens_seen": 309197240, "step": 7255 }, { "epoch": 1.7503767556814758, "grad_norm": 1.188482403755188, "learning_rate": 1.8966750931062871e-06, "loss": 0.7086, "num_input_tokens_seen": 309407952, "step": 7260 }, { "epoch": 1.7515823738621978, "grad_norm": 1.697256088256836, "learning_rate": 1.8786264837910072e-06, "loss": 0.7858, "num_input_tokens_seen": 309619560, "step": 7265 }, { "epoch": 1.75278799204292, "grad_norm": 1.2997246980667114, "learning_rate": 1.8606608070586922e-06, "loss": 0.8641, "num_input_tokens_seen": 309833912, "step": 7270 }, { "epoch": 1.7539936102236422, "grad_norm": 1.3501850366592407, "learning_rate": 1.842778127349265e-06, "loss": 0.8059, "num_input_tokens_seen": 310047408, "step": 7275 }, { "epoch": 1.7551992284043645, "grad_norm": 1.129868745803833, "learning_rate": 1.8249785088049893e-06, "loss": 0.7518, "num_input_tokens_seen": 310263264, "step": 7280 }, { "epoch": 1.7564048465850866, "grad_norm": 1.1932473182678223, "learning_rate": 1.8072620152701696e-06, "loss": 0.7376, "num_input_tokens_seen": 310475224, "step": 7285 }, { "epoch": 1.7576104647658086, "grad_norm": 1.2993606328964233, "learning_rate": 1.7896287102909776e-06, "loss": 0.7782, "num_input_tokens_seen": 310691744, "step": 7290 }, { "epoch": 1.758816082946531, "grad_norm": 1.162045955657959, "learning_rate": 1.7720786571151892e-06, "loss": 0.716, "num_input_tokens_seen": 310907408, "step": 7295 }, { "epoch": 1.760021701127253, "grad_norm": 1.210849642753601, "learning_rate": 1.7546119186919784e-06, "loss": 0.7999, "num_input_tokens_seen": 311113232, "step": 7300 }, { "epoch": 1.761227319307975, "grad_norm": 1.5466002225875854, "learning_rate": 1.7372285576716717e-06, "loss": 0.7965, "num_input_tokens_seen": 311324840, "step": 7305 }, { "epoch": 1.7624329374886973, "grad_norm": 1.1124991178512573, "learning_rate": 1.7199286364055407e-06, "loss": 0.7677, "num_input_tokens_seen": 311540472, "step": 7310 }, { "epoch": 1.7636385556694196, "grad_norm": 1.359221339225769, "learning_rate": 1.7027122169455762e-06, "loss": 0.7883, "num_input_tokens_seen": 311746504, "step": 7315 }, { "epoch": 1.7648441738501417, "grad_norm": 1.30941641330719, "learning_rate": 1.6855793610442484e-06, "loss": 0.8059, "num_input_tokens_seen": 311954904, "step": 7320 }, { "epoch": 1.7660497920308638, "grad_norm": 1.0762748718261719, "learning_rate": 1.6685301301543165e-06, "loss": 0.7523, "num_input_tokens_seen": 312157880, "step": 7325 }, { "epoch": 1.767255410211586, "grad_norm": 1.1811197996139526, "learning_rate": 1.651564585428575e-06, "loss": 0.7534, "num_input_tokens_seen": 312367424, "step": 7330 }, { "epoch": 1.7684610283923081, "grad_norm": 1.2100934982299805, "learning_rate": 1.634682787719663e-06, "loss": 0.7503, "num_input_tokens_seen": 312583664, "step": 7335 }, { "epoch": 1.7696666465730302, "grad_norm": 1.5569101572036743, "learning_rate": 1.6178847975798246e-06, "loss": 0.7745, "num_input_tokens_seen": 312798048, "step": 7340 }, { "epoch": 1.7708722647537525, "grad_norm": 1.37936532497406, "learning_rate": 1.6011706752606992e-06, "loss": 0.7301, "num_input_tokens_seen": 313010720, "step": 7345 }, { "epoch": 1.7720778829344748, "grad_norm": 1.3801496028900146, "learning_rate": 1.5845404807131036e-06, "loss": 0.738, "num_input_tokens_seen": 313210136, "step": 7350 }, { "epoch": 1.7732835011151968, "grad_norm": 1.3658719062805176, "learning_rate": 1.567994273586834e-06, "loss": 0.8369, "num_input_tokens_seen": 313428936, "step": 7355 }, { "epoch": 1.774489119295919, "grad_norm": 1.3410077095031738, "learning_rate": 1.5515321132304194e-06, "loss": 0.8, "num_input_tokens_seen": 313647568, "step": 7360 }, { "epoch": 1.7756947374766412, "grad_norm": 1.227380633354187, "learning_rate": 1.5351540586909408e-06, "loss": 0.7145, "num_input_tokens_seen": 313856864, "step": 7365 }, { "epoch": 1.7769003556573633, "grad_norm": 4.534316062927246, "learning_rate": 1.5188601687137954e-06, "loss": 0.7348, "num_input_tokens_seen": 314080528, "step": 7370 }, { "epoch": 1.7781059738380853, "grad_norm": 1.4429187774658203, "learning_rate": 1.5026505017425086e-06, "loss": 0.7603, "num_input_tokens_seen": 314296304, "step": 7375 }, { "epoch": 1.7793115920188076, "grad_norm": 1.171933889389038, "learning_rate": 1.4865251159184885e-06, "loss": 0.7296, "num_input_tokens_seen": 314510432, "step": 7380 }, { "epoch": 1.78051721019953, "grad_norm": 1.2285550832748413, "learning_rate": 1.4704840690808659e-06, "loss": 0.7853, "num_input_tokens_seen": 314730280, "step": 7385 }, { "epoch": 1.781722828380252, "grad_norm": 1.232077956199646, "learning_rate": 1.4545274187662467e-06, "loss": 0.7884, "num_input_tokens_seen": 314951152, "step": 7390 }, { "epoch": 1.782928446560974, "grad_norm": 1.2002185583114624, "learning_rate": 1.4386552222085237e-06, "loss": 0.7435, "num_input_tokens_seen": 315161864, "step": 7395 }, { "epoch": 1.7841340647416963, "grad_norm": 1.5089046955108643, "learning_rate": 1.4228675363386734e-06, "loss": 0.7829, "num_input_tokens_seen": 315372416, "step": 7400 }, { "epoch": 1.7853396829224186, "grad_norm": 1.3056570291519165, "learning_rate": 1.4071644177845317e-06, "loss": 0.7722, "num_input_tokens_seen": 315593416, "step": 7405 }, { "epoch": 1.7865453011031405, "grad_norm": 1.1797047853469849, "learning_rate": 1.3915459228706297e-06, "loss": 0.7128, "num_input_tokens_seen": 315816888, "step": 7410 }, { "epoch": 1.7877509192838628, "grad_norm": 1.2840813398361206, "learning_rate": 1.3760121076179383e-06, "loss": 0.7568, "num_input_tokens_seen": 316028312, "step": 7415 }, { "epoch": 1.788956537464585, "grad_norm": 1.4333577156066895, "learning_rate": 1.3605630277437193e-06, "loss": 0.8459, "num_input_tokens_seen": 316238648, "step": 7420 }, { "epoch": 1.7901621556453071, "grad_norm": 1.308198094367981, "learning_rate": 1.3451987386612851e-06, "loss": 0.7858, "num_input_tokens_seen": 316456008, "step": 7425 }, { "epoch": 1.7913677738260292, "grad_norm": 1.222340703010559, "learning_rate": 1.3299192954798395e-06, "loss": 0.7748, "num_input_tokens_seen": 316663752, "step": 7430 }, { "epoch": 1.7925733920067515, "grad_norm": 1.2082847356796265, "learning_rate": 1.3147247530042318e-06, "loss": 0.7081, "num_input_tokens_seen": 316875312, "step": 7435 }, { "epoch": 1.7937790101874738, "grad_norm": 1.3130263090133667, "learning_rate": 1.2996151657348077e-06, "loss": 0.7548, "num_input_tokens_seen": 317087712, "step": 7440 }, { "epoch": 1.7949846283681958, "grad_norm": 1.236423134803772, "learning_rate": 1.2845905878671822e-06, "loss": 0.7883, "num_input_tokens_seen": 317295864, "step": 7445 }, { "epoch": 1.796190246548918, "grad_norm": 1.3039389848709106, "learning_rate": 1.269651073292058e-06, "loss": 0.6931, "num_input_tokens_seen": 317524448, "step": 7450 }, { "epoch": 1.7973958647296402, "grad_norm": 1.7779850959777832, "learning_rate": 1.2547966755950213e-06, "loss": 0.7798, "num_input_tokens_seen": 317735280, "step": 7455 }, { "epoch": 1.7986014829103623, "grad_norm": 1.197275996208191, "learning_rate": 1.2400274480563773e-06, "loss": 0.7436, "num_input_tokens_seen": 317952696, "step": 7460 }, { "epoch": 1.7998071010910843, "grad_norm": 1.2717430591583252, "learning_rate": 1.2253434436509227e-06, "loss": 0.7383, "num_input_tokens_seen": 318171032, "step": 7465 }, { "epoch": 1.8010127192718066, "grad_norm": 1.235499620437622, "learning_rate": 1.2107447150477735e-06, "loss": 0.7643, "num_input_tokens_seen": 318374496, "step": 7470 }, { "epoch": 1.802218337452529, "grad_norm": 1.4563218355178833, "learning_rate": 1.1962313146101883e-06, "loss": 0.7351, "num_input_tokens_seen": 318595000, "step": 7475 }, { "epoch": 1.803423955633251, "grad_norm": 1.2457001209259033, "learning_rate": 1.1818032943953534e-06, "loss": 0.7516, "num_input_tokens_seen": 318807144, "step": 7480 }, { "epoch": 1.804629573813973, "grad_norm": 1.2290685176849365, "learning_rate": 1.167460706154222e-06, "loss": 0.7978, "num_input_tokens_seen": 319033112, "step": 7485 }, { "epoch": 1.8058351919946953, "grad_norm": 1.2053683996200562, "learning_rate": 1.153203601331304e-06, "loss": 0.8035, "num_input_tokens_seen": 319246952, "step": 7490 }, { "epoch": 1.8070408101754174, "grad_norm": 1.0817500352859497, "learning_rate": 1.1390320310645076e-06, "loss": 0.7658, "num_input_tokens_seen": 319469200, "step": 7495 }, { "epoch": 1.8082464283561395, "grad_norm": 1.810150384902954, "learning_rate": 1.124946046184927e-06, "loss": 0.7888, "num_input_tokens_seen": 319673824, "step": 7500 }, { "epoch": 1.8094520465368618, "grad_norm": 1.349724292755127, "learning_rate": 1.1109456972166976e-06, "loss": 0.7796, "num_input_tokens_seen": 319885400, "step": 7505 }, { "epoch": 1.810657664717584, "grad_norm": 1.280307412147522, "learning_rate": 1.0970310343767693e-06, "loss": 0.7188, "num_input_tokens_seen": 320100416, "step": 7510 }, { "epoch": 1.8118632828983061, "grad_norm": 1.2069122791290283, "learning_rate": 1.0832021075747711e-06, "loss": 0.7494, "num_input_tokens_seen": 320328024, "step": 7515 }, { "epoch": 1.8130689010790282, "grad_norm": 1.2093557119369507, "learning_rate": 1.0694589664127868e-06, "loss": 0.7151, "num_input_tokens_seen": 320540096, "step": 7520 }, { "epoch": 1.8142745192597505, "grad_norm": 1.3928128480911255, "learning_rate": 1.0558016601852266e-06, "loss": 0.8017, "num_input_tokens_seen": 320762392, "step": 7525 }, { "epoch": 1.8154801374404728, "grad_norm": 1.2013825178146362, "learning_rate": 1.0422302378786025e-06, "loss": 0.771, "num_input_tokens_seen": 320961376, "step": 7530 }, { "epoch": 1.8166857556211946, "grad_norm": 1.289758563041687, "learning_rate": 1.0287447481713951e-06, "loss": 0.7887, "num_input_tokens_seen": 321167280, "step": 7535 }, { "epoch": 1.817891373801917, "grad_norm": 1.166558027267456, "learning_rate": 1.0153452394338458e-06, "loss": 0.8105, "num_input_tokens_seen": 321373336, "step": 7540 }, { "epoch": 1.8190969919826392, "grad_norm": 1.2280970811843872, "learning_rate": 1.002031759727795e-06, "loss": 0.7319, "num_input_tokens_seen": 321595912, "step": 7545 }, { "epoch": 1.8203026101633613, "grad_norm": 1.203267216682434, "learning_rate": 9.888043568065247e-07, "loss": 0.7854, "num_input_tokens_seen": 321809032, "step": 7550 }, { "epoch": 1.8215082283440833, "grad_norm": 1.0540038347244263, "learning_rate": 9.756630781145527e-07, "loss": 0.734, "num_input_tokens_seen": 322029352, "step": 7555 }, { "epoch": 1.8227138465248056, "grad_norm": 1.22452712059021, "learning_rate": 9.626079707875045e-07, "loss": 0.8147, "num_input_tokens_seen": 322241152, "step": 7560 }, { "epoch": 1.823919464705528, "grad_norm": 1.3003278970718384, "learning_rate": 9.496390816519097e-07, "loss": 0.7998, "num_input_tokens_seen": 322440304, "step": 7565 }, { "epoch": 1.82512508288625, "grad_norm": 1.4745782613754272, "learning_rate": 9.36756457225052e-07, "loss": 0.7582, "num_input_tokens_seen": 322658104, "step": 7570 }, { "epoch": 1.826330701066972, "grad_norm": 1.3537230491638184, "learning_rate": 9.239601437147944e-07, "loss": 0.7989, "num_input_tokens_seen": 322866616, "step": 7575 }, { "epoch": 1.8275363192476943, "grad_norm": 1.363696813583374, "learning_rate": 9.112501870194273e-07, "loss": 0.7928, "num_input_tokens_seen": 323084272, "step": 7580 }, { "epoch": 1.8287419374284164, "grad_norm": 1.2390482425689697, "learning_rate": 8.986266327274784e-07, "loss": 0.7825, "num_input_tokens_seen": 323309456, "step": 7585 }, { "epoch": 1.8299475556091385, "grad_norm": 1.1271288394927979, "learning_rate": 8.8608952611757e-07, "loss": 0.7605, "num_input_tokens_seen": 323533392, "step": 7590 }, { "epoch": 1.8311531737898608, "grad_norm": 1.495709776878357, "learning_rate": 8.736389121582561e-07, "loss": 0.7699, "num_input_tokens_seen": 323754128, "step": 7595 }, { "epoch": 1.832358791970583, "grad_norm": 1.143614411354065, "learning_rate": 8.612748355078498e-07, "loss": 0.8582, "num_input_tokens_seen": 323958744, "step": 7600 }, { "epoch": 1.8335644101513051, "grad_norm": 1.2313352823257446, "learning_rate": 8.489973405142681e-07, "loss": 0.8066, "num_input_tokens_seen": 324163752, "step": 7605 }, { "epoch": 1.8347700283320272, "grad_norm": 1.4584070444107056, "learning_rate": 8.368064712148821e-07, "loss": 0.7908, "num_input_tokens_seen": 324379792, "step": 7610 }, { "epoch": 1.8359756465127495, "grad_norm": 1.17233145236969, "learning_rate": 8.247022713363417e-07, "loss": 0.8023, "num_input_tokens_seen": 324600560, "step": 7615 }, { "epoch": 1.8371812646934715, "grad_norm": 1.3379162549972534, "learning_rate": 8.126847842944319e-07, "loss": 0.7609, "num_input_tokens_seen": 324804440, "step": 7620 }, { "epoch": 1.8383868828741936, "grad_norm": 1.525472640991211, "learning_rate": 8.007540531939112e-07, "loss": 0.744, "num_input_tokens_seen": 325024192, "step": 7625 }, { "epoch": 1.839592501054916, "grad_norm": 1.1654099225997925, "learning_rate": 7.889101208283539e-07, "loss": 0.7455, "num_input_tokens_seen": 325234352, "step": 7630 }, { "epoch": 1.8407981192356382, "grad_norm": 1.4289826154708862, "learning_rate": 7.771530296800167e-07, "loss": 0.7926, "num_input_tokens_seen": 325454904, "step": 7635 }, { "epoch": 1.8420037374163603, "grad_norm": 1.2645341157913208, "learning_rate": 7.654828219196553e-07, "loss": 0.7496, "num_input_tokens_seen": 325664488, "step": 7640 }, { "epoch": 1.8432093555970823, "grad_norm": 1.1354546546936035, "learning_rate": 7.538995394063996e-07, "loss": 0.7583, "num_input_tokens_seen": 325877184, "step": 7645 }, { "epoch": 1.8444149737778046, "grad_norm": 1.129745602607727, "learning_rate": 7.424032236875849e-07, "loss": 0.7929, "num_input_tokens_seen": 326079176, "step": 7650 }, { "epoch": 1.8456205919585267, "grad_norm": 1.2182037830352783, "learning_rate": 7.309939159986206e-07, "loss": 0.7355, "num_input_tokens_seen": 326297656, "step": 7655 }, { "epoch": 1.8468262101392487, "grad_norm": 1.2107971906661987, "learning_rate": 7.196716572628131e-07, "loss": 0.8733, "num_input_tokens_seen": 326511496, "step": 7660 }, { "epoch": 1.848031828319971, "grad_norm": 1.632023572921753, "learning_rate": 7.08436488091263e-07, "loss": 0.8065, "num_input_tokens_seen": 326719632, "step": 7665 }, { "epoch": 1.8492374465006933, "grad_norm": 1.4219262599945068, "learning_rate": 6.972884487826792e-07, "loss": 0.8252, "num_input_tokens_seen": 326937112, "step": 7670 }, { "epoch": 1.8504430646814154, "grad_norm": 1.3737879991531372, "learning_rate": 6.862275793232564e-07, "loss": 0.8059, "num_input_tokens_seen": 327142056, "step": 7675 }, { "epoch": 1.8516486828621375, "grad_norm": 1.196880578994751, "learning_rate": 6.752539193865232e-07, "loss": 0.7321, "num_input_tokens_seen": 327358800, "step": 7680 }, { "epoch": 1.8528543010428598, "grad_norm": 1.123176097869873, "learning_rate": 6.643675083331968e-07, "loss": 0.7305, "num_input_tokens_seen": 327561112, "step": 7685 }, { "epoch": 1.854059919223582, "grad_norm": 1.2582488059997559, "learning_rate": 6.535683852110563e-07, "loss": 0.7903, "num_input_tokens_seen": 327760840, "step": 7690 }, { "epoch": 1.855265537404304, "grad_norm": 1.4382667541503906, "learning_rate": 6.428565887547921e-07, "loss": 0.7588, "num_input_tokens_seen": 327975368, "step": 7695 }, { "epoch": 1.8564711555850262, "grad_norm": 1.2697174549102783, "learning_rate": 6.322321573858592e-07, "loss": 0.7715, "num_input_tokens_seen": 328187960, "step": 7700 }, { "epoch": 1.8576767737657485, "grad_norm": 1.2168370485305786, "learning_rate": 6.216951292123574e-07, "loss": 0.8075, "num_input_tokens_seen": 328401584, "step": 7705 }, { "epoch": 1.8588823919464705, "grad_norm": 1.273916482925415, "learning_rate": 6.11245542028882e-07, "loss": 0.7555, "num_input_tokens_seen": 328620312, "step": 7710 }, { "epoch": 1.8600880101271926, "grad_norm": 1.4386879205703735, "learning_rate": 6.008834333163876e-07, "loss": 0.7344, "num_input_tokens_seen": 328830864, "step": 7715 }, { "epoch": 1.861293628307915, "grad_norm": 1.2526341676712036, "learning_rate": 5.906088402420712e-07, "loss": 0.8058, "num_input_tokens_seen": 329037712, "step": 7720 }, { "epoch": 1.8624992464886372, "grad_norm": 1.2262234687805176, "learning_rate": 5.804217996592115e-07, "loss": 0.7447, "num_input_tokens_seen": 329251312, "step": 7725 }, { "epoch": 1.8637048646693593, "grad_norm": 1.3696452379226685, "learning_rate": 5.70322348107058e-07, "loss": 0.761, "num_input_tokens_seen": 329459928, "step": 7730 }, { "epoch": 1.8649104828500813, "grad_norm": 1.1781162023544312, "learning_rate": 5.603105218106836e-07, "loss": 0.8259, "num_input_tokens_seen": 329661888, "step": 7735 }, { "epoch": 1.8661161010308036, "grad_norm": 1.2499477863311768, "learning_rate": 5.50386356680882e-07, "loss": 0.7594, "num_input_tokens_seen": 329886808, "step": 7740 }, { "epoch": 1.8673217192115257, "grad_norm": 1.3065365552902222, "learning_rate": 5.405498883139987e-07, "loss": 0.763, "num_input_tokens_seen": 330085824, "step": 7745 }, { "epoch": 1.8685273373922477, "grad_norm": 1.2730084657669067, "learning_rate": 5.308011519918444e-07, "loss": 0.7654, "num_input_tokens_seen": 330297296, "step": 7750 }, { "epoch": 1.86973295557297, "grad_norm": 1.1980284452438354, "learning_rate": 5.21140182681537e-07, "loss": 0.7762, "num_input_tokens_seen": 330517888, "step": 7755 }, { "epoch": 1.8709385737536923, "grad_norm": 1.2424657344818115, "learning_rate": 5.115670150353941e-07, "loss": 0.7709, "num_input_tokens_seen": 330722056, "step": 7760 }, { "epoch": 1.8721441919344144, "grad_norm": 1.1736173629760742, "learning_rate": 5.020816833907982e-07, "loss": 0.7966, "num_input_tokens_seen": 330937136, "step": 7765 }, { "epoch": 1.8733498101151365, "grad_norm": 1.2429529428482056, "learning_rate": 4.926842217700845e-07, "loss": 0.769, "num_input_tokens_seen": 331146136, "step": 7770 }, { "epoch": 1.8745554282958587, "grad_norm": 1.3555958271026611, "learning_rate": 4.833746638804093e-07, "loss": 0.7078, "num_input_tokens_seen": 331358376, "step": 7775 }, { "epoch": 1.8757610464765808, "grad_norm": 1.3319071531295776, "learning_rate": 4.741530431136315e-07, "loss": 0.7649, "num_input_tokens_seen": 331576824, "step": 7780 }, { "epoch": 1.8769666646573029, "grad_norm": 1.1155213117599487, "learning_rate": 4.650193925461982e-07, "loss": 0.7639, "num_input_tokens_seen": 331774728, "step": 7785 }, { "epoch": 1.8781722828380252, "grad_norm": 1.2029117345809937, "learning_rate": 4.559737449390145e-07, "loss": 0.7838, "num_input_tokens_seen": 331980392, "step": 7790 }, { "epoch": 1.8793779010187475, "grad_norm": 1.2548964023590088, "learning_rate": 4.47016132737349e-07, "loss": 0.8122, "num_input_tokens_seen": 332190896, "step": 7795 }, { "epoch": 1.8805835191994695, "grad_norm": 1.412028193473816, "learning_rate": 4.3814658807067865e-07, "loss": 0.7493, "num_input_tokens_seen": 332396928, "step": 7800 }, { "epoch": 1.8817891373801916, "grad_norm": 1.2237025499343872, "learning_rate": 4.2936514275261354e-07, "loss": 0.7768, "num_input_tokens_seen": 332602784, "step": 7805 }, { "epoch": 1.882994755560914, "grad_norm": 1.3775051832199097, "learning_rate": 4.206718282807609e-07, "loss": 0.7914, "num_input_tokens_seen": 332812464, "step": 7810 }, { "epoch": 1.8842003737416362, "grad_norm": 1.2393053770065308, "learning_rate": 4.1206667583661705e-07, "loss": 0.7074, "num_input_tokens_seen": 333027288, "step": 7815 }, { "epoch": 1.885405991922358, "grad_norm": 4.288917064666748, "learning_rate": 4.0354971628545345e-07, "loss": 0.7806, "num_input_tokens_seen": 333240392, "step": 7820 }, { "epoch": 1.8866116101030803, "grad_norm": 1.4643956422805786, "learning_rate": 3.951209801762168e-07, "loss": 0.7774, "num_input_tokens_seen": 333456728, "step": 7825 }, { "epoch": 1.8878172282838026, "grad_norm": 1.3703731298446655, "learning_rate": 3.867804977413986e-07, "loss": 0.8184, "num_input_tokens_seen": 333668928, "step": 7830 }, { "epoch": 1.8890228464645247, "grad_norm": 1.7461374998092651, "learning_rate": 3.785282988969435e-07, "loss": 0.8471, "num_input_tokens_seen": 333879080, "step": 7835 }, { "epoch": 1.8902284646452467, "grad_norm": 1.4222255945205688, "learning_rate": 3.7036441324213857e-07, "loss": 0.7019, "num_input_tokens_seen": 334096168, "step": 7840 }, { "epoch": 1.891434082825969, "grad_norm": 1.2670111656188965, "learning_rate": 3.622888700595101e-07, "loss": 0.7973, "num_input_tokens_seen": 334298400, "step": 7845 }, { "epoch": 1.8926397010066913, "grad_norm": 1.2581616640090942, "learning_rate": 3.5430169831470737e-07, "loss": 0.7596, "num_input_tokens_seen": 334519648, "step": 7850 }, { "epoch": 1.8938453191874134, "grad_norm": 1.5669320821762085, "learning_rate": 3.464029266564056e-07, "loss": 0.7873, "num_input_tokens_seen": 334724032, "step": 7855 }, { "epoch": 1.8950509373681355, "grad_norm": 1.3047648668289185, "learning_rate": 3.3859258341621125e-07, "loss": 0.7078, "num_input_tokens_seen": 334943712, "step": 7860 }, { "epoch": 1.8962565555488577, "grad_norm": 1.3401533365249634, "learning_rate": 3.3087069660854286e-07, "loss": 0.7518, "num_input_tokens_seen": 335159168, "step": 7865 }, { "epoch": 1.8974621737295798, "grad_norm": 1.229848027229309, "learning_rate": 3.232372939305478e-07, "loss": 0.774, "num_input_tokens_seen": 335375264, "step": 7870 }, { "epoch": 1.8986677919103019, "grad_norm": 1.2235866785049438, "learning_rate": 3.156924027619884e-07, "loss": 0.7554, "num_input_tokens_seen": 335592280, "step": 7875 }, { "epoch": 1.8998734100910242, "grad_norm": 1.1256613731384277, "learning_rate": 3.0823605016515875e-07, "loss": 0.7598, "num_input_tokens_seen": 335809960, "step": 7880 }, { "epoch": 1.9010790282717465, "grad_norm": 1.2484843730926514, "learning_rate": 3.008682628847709e-07, "loss": 0.7658, "num_input_tokens_seen": 336027056, "step": 7885 }, { "epoch": 1.9022846464524685, "grad_norm": 1.2252024412155151, "learning_rate": 2.9358906734787974e-07, "loss": 0.7592, "num_input_tokens_seen": 336243240, "step": 7890 }, { "epoch": 1.9034902646331906, "grad_norm": 1.6444954872131348, "learning_rate": 2.8639848966376116e-07, "loss": 0.7602, "num_input_tokens_seen": 336458080, "step": 7895 }, { "epoch": 1.9046958828139129, "grad_norm": 1.4392935037612915, "learning_rate": 2.792965556238425e-07, "loss": 0.8616, "num_input_tokens_seen": 336667280, "step": 7900 }, { "epoch": 1.905901500994635, "grad_norm": 1.111231803894043, "learning_rate": 2.722832907015971e-07, "loss": 0.741, "num_input_tokens_seen": 336877240, "step": 7905 }, { "epoch": 1.907107119175357, "grad_norm": 1.1384365558624268, "learning_rate": 2.6535872005246374e-07, "loss": 0.7475, "num_input_tokens_seen": 337100920, "step": 7910 }, { "epoch": 1.9083127373560793, "grad_norm": 1.3425742387771606, "learning_rate": 2.585228685137414e-07, "loss": 0.8264, "num_input_tokens_seen": 337328512, "step": 7915 }, { "epoch": 1.9095183555368016, "grad_norm": 1.2245519161224365, "learning_rate": 2.517757606045085e-07, "loss": 0.8152, "num_input_tokens_seen": 337543536, "step": 7920 }, { "epoch": 1.9107239737175237, "grad_norm": 2.2603209018707275, "learning_rate": 2.4511742052553966e-07, "loss": 0.7708, "num_input_tokens_seen": 337760816, "step": 7925 }, { "epoch": 1.9119295918982457, "grad_norm": 1.2692008018493652, "learning_rate": 2.385478721592116e-07, "loss": 0.7944, "num_input_tokens_seen": 337977704, "step": 7930 }, { "epoch": 1.913135210078968, "grad_norm": 1.2433571815490723, "learning_rate": 2.320671390694168e-07, "loss": 0.7044, "num_input_tokens_seen": 338198736, "step": 7935 }, { "epoch": 1.9143408282596903, "grad_norm": 1.2858800888061523, "learning_rate": 2.256752445014859e-07, "loss": 0.7885, "num_input_tokens_seen": 338403536, "step": 7940 }, { "epoch": 1.9155464464404122, "grad_norm": 1.7531826496124268, "learning_rate": 2.1937221138209896e-07, "loss": 0.7587, "num_input_tokens_seen": 338620096, "step": 7945 }, { "epoch": 1.9167520646211345, "grad_norm": 1.2397584915161133, "learning_rate": 2.1315806231920198e-07, "loss": 0.7074, "num_input_tokens_seen": 338820832, "step": 7950 }, { "epoch": 1.9179576828018567, "grad_norm": 1.2662335634231567, "learning_rate": 2.0703281960193221e-07, "loss": 0.7928, "num_input_tokens_seen": 339026392, "step": 7955 }, { "epoch": 1.9191633009825788, "grad_norm": 1.3926514387130737, "learning_rate": 2.0099650520053191e-07, "loss": 0.7719, "num_input_tokens_seen": 339242560, "step": 7960 }, { "epoch": 1.9203689191633009, "grad_norm": 1.2877957820892334, "learning_rate": 1.9504914076627912e-07, "loss": 0.8043, "num_input_tokens_seen": 339450160, "step": 7965 }, { "epoch": 1.9215745373440232, "grad_norm": 1.1825233697891235, "learning_rate": 1.8919074763138754e-07, "loss": 0.7806, "num_input_tokens_seen": 339665448, "step": 7970 }, { "epoch": 1.9227801555247455, "grad_norm": 1.934077262878418, "learning_rate": 1.834213468089596e-07, "loss": 0.8313, "num_input_tokens_seen": 339865336, "step": 7975 }, { "epoch": 1.9239857737054675, "grad_norm": 1.1284459829330444, "learning_rate": 1.7774095899288912e-07, "loss": 0.7432, "num_input_tokens_seen": 340092288, "step": 7980 }, { "epoch": 1.9251913918861896, "grad_norm": 1.2420666217803955, "learning_rate": 1.7214960455780305e-07, "loss": 0.7722, "num_input_tokens_seen": 340316632, "step": 7985 }, { "epoch": 1.9263970100669119, "grad_norm": 1.25881826877594, "learning_rate": 1.6664730355896997e-07, "loss": 0.7509, "num_input_tokens_seen": 340523944, "step": 7990 }, { "epoch": 1.927602628247634, "grad_norm": 1.29314386844635, "learning_rate": 1.612340757322417e-07, "loss": 0.8244, "num_input_tokens_seen": 340733920, "step": 7995 }, { "epoch": 1.928808246428356, "grad_norm": 1.1809169054031372, "learning_rate": 1.5590994049398677e-07, "loss": 0.77, "num_input_tokens_seen": 340953720, "step": 8000 }, { "epoch": 1.9300138646090783, "grad_norm": 1.2310028076171875, "learning_rate": 1.5067491694100154e-07, "loss": 0.7444, "num_input_tokens_seen": 341158216, "step": 8005 }, { "epoch": 1.9312194827898006, "grad_norm": 1.9220391511917114, "learning_rate": 1.455290238504603e-07, "loss": 0.7379, "num_input_tokens_seen": 341375512, "step": 8010 }, { "epoch": 1.9324251009705227, "grad_norm": 1.3316396474838257, "learning_rate": 1.4047227967984023e-07, "loss": 0.7471, "num_input_tokens_seen": 341588224, "step": 8015 }, { "epoch": 1.9336307191512447, "grad_norm": 1.3437633514404297, "learning_rate": 1.3550470256685488e-07, "loss": 0.8101, "num_input_tokens_seen": 341798824, "step": 8020 }, { "epoch": 1.934836337331967, "grad_norm": 1.363929271697998, "learning_rate": 1.3062631032939032e-07, "loss": 0.7667, "num_input_tokens_seen": 342014272, "step": 8025 }, { "epoch": 1.936041955512689, "grad_norm": 1.6635732650756836, "learning_rate": 1.2583712046544127e-07, "loss": 0.7175, "num_input_tokens_seen": 342232672, "step": 8030 }, { "epoch": 1.9372475736934112, "grad_norm": 1.369596004486084, "learning_rate": 1.2113715015304728e-07, "loss": 0.6871, "num_input_tokens_seen": 342436960, "step": 8035 }, { "epoch": 1.9384531918741335, "grad_norm": 1.3870971202850342, "learning_rate": 1.1652641625023165e-07, "loss": 0.7386, "num_input_tokens_seen": 342657696, "step": 8040 }, { "epoch": 1.9396588100548557, "grad_norm": 1.267480492591858, "learning_rate": 1.1200493529494593e-07, "loss": 0.8408, "num_input_tokens_seen": 342865272, "step": 8045 }, { "epoch": 1.9408644282355778, "grad_norm": 1.3030672073364258, "learning_rate": 1.0757272350500059e-07, "loss": 0.8363, "num_input_tokens_seen": 343071616, "step": 8050 }, { "epoch": 1.9420700464162999, "grad_norm": 1.1753603219985962, "learning_rate": 1.0322979677800936e-07, "loss": 0.6914, "num_input_tokens_seen": 343284040, "step": 8055 }, { "epoch": 1.9432756645970222, "grad_norm": 1.3663969039916992, "learning_rate": 9.89761706913478e-08, "loss": 0.7651, "num_input_tokens_seen": 343506872, "step": 8060 }, { "epoch": 1.9444812827777442, "grad_norm": 1.0867863893508911, "learning_rate": 9.481186050207258e-08, "loss": 0.7364, "num_input_tokens_seen": 343722152, "step": 8065 }, { "epoch": 1.9456869009584663, "grad_norm": 1.1085820198059082, "learning_rate": 9.073688114688284e-08, "loss": 0.776, "num_input_tokens_seen": 343940792, "step": 8070 }, { "epoch": 1.9468925191391886, "grad_norm": 1.7150299549102783, "learning_rate": 8.675124724206174e-08, "loss": 0.7763, "num_input_tokens_seen": 344153936, "step": 8075 }, { "epoch": 1.9480981373199109, "grad_norm": 1.2732090950012207, "learning_rate": 8.285497308342937e-08, "loss": 0.7802, "num_input_tokens_seen": 344352792, "step": 8080 }, { "epoch": 1.949303755500633, "grad_norm": 1.2661283016204834, "learning_rate": 7.904807264627889e-08, "loss": 0.8202, "num_input_tokens_seen": 344576632, "step": 8085 }, { "epoch": 1.950509373681355, "grad_norm": 1.5215775966644287, "learning_rate": 7.533055958534318e-08, "loss": 0.7893, "num_input_tokens_seen": 344798912, "step": 8090 }, { "epoch": 1.9517149918620773, "grad_norm": 1.2088274955749512, "learning_rate": 7.17024472347283e-08, "loss": 0.7167, "num_input_tokens_seen": 345007296, "step": 8095 }, { "epoch": 1.9529206100427996, "grad_norm": 1.412129521369934, "learning_rate": 6.816374860788566e-08, "loss": 0.7636, "num_input_tokens_seen": 345223376, "step": 8100 }, { "epoch": 1.9541262282235214, "grad_norm": 1.2035419940948486, "learning_rate": 6.471447639754268e-08, "loss": 0.7809, "num_input_tokens_seen": 345432232, "step": 8105 }, { "epoch": 1.9553318464042437, "grad_norm": 1.2374058961868286, "learning_rate": 6.135464297567783e-08, "loss": 0.8121, "num_input_tokens_seen": 345640560, "step": 8110 }, { "epoch": 1.956537464584966, "grad_norm": 1.2151460647583008, "learning_rate": 5.8084260393462265e-08, "loss": 0.755, "num_input_tokens_seen": 345862408, "step": 8115 }, { "epoch": 1.957743082765688, "grad_norm": 1.539618730545044, "learning_rate": 5.49033403812238e-08, "loss": 0.7747, "num_input_tokens_seen": 346078496, "step": 8120 }, { "epoch": 1.9589487009464102, "grad_norm": 1.22273588180542, "learning_rate": 5.18118943484025e-08, "loss": 0.7624, "num_input_tokens_seen": 346289432, "step": 8125 }, { "epoch": 1.9601543191271324, "grad_norm": 2.4519598484039307, "learning_rate": 4.880993338350626e-08, "loss": 0.7941, "num_input_tokens_seen": 346498968, "step": 8130 }, { "epoch": 1.9613599373078547, "grad_norm": 1.381422519683838, "learning_rate": 4.589746825408303e-08, "loss": 0.726, "num_input_tokens_seen": 346717968, "step": 8135 }, { "epoch": 1.9625655554885768, "grad_norm": 1.3372561931610107, "learning_rate": 4.3074509406662575e-08, "loss": 0.7835, "num_input_tokens_seen": 346929816, "step": 8140 }, { "epoch": 1.9637711736692989, "grad_norm": 1.1733534336090088, "learning_rate": 4.034106696674256e-08, "loss": 0.7873, "num_input_tokens_seen": 347143688, "step": 8145 }, { "epoch": 1.9649767918500212, "grad_norm": 1.4351152181625366, "learning_rate": 3.769715073872748e-08, "loss": 0.7958, "num_input_tokens_seen": 347353568, "step": 8150 }, { "epoch": 1.9661824100307432, "grad_norm": 1.2121278047561646, "learning_rate": 3.514277020591483e-08, "loss": 0.7114, "num_input_tokens_seen": 347570288, "step": 8155 }, { "epoch": 1.9673880282114653, "grad_norm": 2.3380539417266846, "learning_rate": 3.2677934530450646e-08, "loss": 0.7232, "num_input_tokens_seen": 347779392, "step": 8160 }, { "epoch": 1.9685936463921876, "grad_norm": 1.2221498489379883, "learning_rate": 3.0302652553296226e-08, "loss": 0.7782, "num_input_tokens_seen": 347982088, "step": 8165 }, { "epoch": 1.9697992645729099, "grad_norm": 1.2077677249908447, "learning_rate": 2.801693279420037e-08, "loss": 0.7564, "num_input_tokens_seen": 348196448, "step": 8170 }, { "epoch": 1.971004882753632, "grad_norm": 1.229386329650879, "learning_rate": 2.5820783451663276e-08, "loss": 0.7654, "num_input_tokens_seen": 348414864, "step": 8175 }, { "epoch": 1.972210500934354, "grad_norm": 1.2667386531829834, "learning_rate": 2.3714212402917156e-08, "loss": 0.7574, "num_input_tokens_seen": 348619544, "step": 8180 }, { "epoch": 1.9734161191150763, "grad_norm": 1.1851519346237183, "learning_rate": 2.169722720388179e-08, "loss": 0.8048, "num_input_tokens_seen": 348824464, "step": 8185 }, { "epoch": 1.9746217372957984, "grad_norm": 2.6571383476257324, "learning_rate": 1.9769835089158974e-08, "loss": 0.7266, "num_input_tokens_seen": 349033472, "step": 8190 }, { "epoch": 1.9758273554765204, "grad_norm": 1.2532614469528198, "learning_rate": 1.7932042971985363e-08, "loss": 0.7818, "num_input_tokens_seen": 349260576, "step": 8195 }, { "epoch": 1.9770329736572427, "grad_norm": 1.2295736074447632, "learning_rate": 1.6183857444221352e-08, "loss": 0.7689, "num_input_tokens_seen": 349468080, "step": 8200 }, { "epoch": 1.978238591837965, "grad_norm": 1.5051919221878052, "learning_rate": 1.452528477632331e-08, "loss": 0.8164, "num_input_tokens_seen": 349679136, "step": 8205 }, { "epoch": 1.979444210018687, "grad_norm": 1.2129673957824707, "learning_rate": 1.2956330917318626e-08, "loss": 0.749, "num_input_tokens_seen": 349891872, "step": 8210 }, { "epoch": 1.9806498281994092, "grad_norm": 1.3315292596817017, "learning_rate": 1.1477001494786255e-08, "loss": 0.8011, "num_input_tokens_seen": 350101728, "step": 8215 }, { "epoch": 1.9818554463801314, "grad_norm": 1.3172804117202759, "learning_rate": 1.0087301814842853e-08, "loss": 0.7847, "num_input_tokens_seen": 350302416, "step": 8220 }, { "epoch": 1.9830610645608537, "grad_norm": 1.1456546783447266, "learning_rate": 8.787236862112247e-09, "loss": 0.7514, "num_input_tokens_seen": 350505544, "step": 8225 }, { "epoch": 1.9842666827415756, "grad_norm": 1.1131523847579956, "learning_rate": 7.576811299714327e-09, "loss": 0.7249, "num_input_tokens_seen": 350721288, "step": 8230 }, { "epoch": 1.9854723009222979, "grad_norm": 1.2367700338363647, "learning_rate": 6.456029469245617e-09, "loss": 0.7813, "num_input_tokens_seen": 350929912, "step": 8235 }, { "epoch": 1.9866779191030202, "grad_norm": 1.2594212293624878, "learning_rate": 5.424895390770957e-09, "loss": 0.7634, "num_input_tokens_seen": 351150808, "step": 8240 }, { "epoch": 1.9878835372837422, "grad_norm": 1.1975210905075073, "learning_rate": 4.483412762795736e-09, "loss": 0.6662, "num_input_tokens_seen": 351373424, "step": 8245 }, { "epoch": 1.9890891554644643, "grad_norm": 1.2396987676620483, "learning_rate": 3.631584962268675e-09, "loss": 0.7665, "num_input_tokens_seen": 351592304, "step": 8250 }, { "epoch": 1.9902947736451866, "grad_norm": 11.888749122619629, "learning_rate": 2.8694150445512933e-09, "loss": 0.7932, "num_input_tokens_seen": 351811264, "step": 8255 }, { "epoch": 1.9915003918259089, "grad_norm": 1.1473942995071411, "learning_rate": 2.1969057434262363e-09, "loss": 0.7449, "num_input_tokens_seen": 352020016, "step": 8260 }, { "epoch": 1.992706010006631, "grad_norm": 1.36952543258667, "learning_rate": 1.6140594710722935e-09, "loss": 0.7971, "num_input_tokens_seen": 352239440, "step": 8265 }, { "epoch": 1.993911628187353, "grad_norm": 1.2333790063858032, "learning_rate": 1.1208783180644e-09, "loss": 0.7048, "num_input_tokens_seen": 352450328, "step": 8270 }, { "epoch": 1.9951172463680753, "grad_norm": 1.1522952318191528, "learning_rate": 7.173640533625347e-10, "loss": 0.7488, "num_input_tokens_seen": 352674872, "step": 8275 }, { "epoch": 1.9963228645487974, "grad_norm": 1.6225148439407349, "learning_rate": 4.035181243061681e-10, "loss": 0.7412, "num_input_tokens_seen": 352884320, "step": 8280 }, { "epoch": 1.9975284827295194, "grad_norm": 1.3897708654403687, "learning_rate": 1.7934165660871138e-10, "loss": 0.8378, "num_input_tokens_seen": 353081112, "step": 8285 }, { "epoch": 1.9987341009102417, "grad_norm": 1.275547742843628, "learning_rate": 4.4835454357516905e-11, "loss": 0.7353, "num_input_tokens_seen": 353293320, "step": 8290 } ], "logging_steps": 5, "max_steps": 8294, "num_input_tokens_seen": 353454104, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.529750459499028e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }