{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5765350245027385, "eval_steps": 250, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002882675122513693, "grad_norm": 6585.90112269416, "learning_rate": 0.0, "loss": 70.0206, "num_input_tokens_seen": 134792, "step": 1 }, { "epoch": 0.0002882675122513693, "eval_websight_new_IoU": 0.0, "eval_websight_new_MAE_x": 30.116134643554688, "eval_websight_new_MAE_y": 81.22258377075195, "eval_websight_new_NUM_probability": 8.40946690061628e-08, "eval_websight_new_inside_bbox": 0.0, "eval_websight_new_loss": 30257.947265625, "eval_websight_new_loss_ce": 3.8668471574783325, "eval_websight_new_loss_xval": 30184.0, "eval_websight_new_runtime": 36.6864, "eval_websight_new_samples_per_second": 1.363, "eval_websight_new_steps_per_second": 0.055, "num_input_tokens_seen": 134792, "step": 1 }, { "epoch": 0.0002882675122513693, "eval_seeclick_IoU": 0.0, "eval_seeclick_MAE_x": 35.54628944396973, "eval_seeclick_MAE_y": 98.56820297241211, "eval_seeclick_NUM_probability": 9.84709807028139e-08, "eval_seeclick_inside_bbox": 0.0, "eval_seeclick_loss": 26904.9765625, "eval_seeclick_loss_ce": 4.496192216873169, "eval_seeclick_loss_xval": 26752.0, "eval_seeclick_runtime": 61.4943, "eval_seeclick_samples_per_second": 0.813, "eval_seeclick_steps_per_second": 0.033, "num_input_tokens_seen": 134792, "step": 1 }, { "epoch": 0.0002882675122513693, "eval_icons_IoU": 0.0, "eval_icons_MAE_x": 40.47983169555664, "eval_icons_MAE_y": 118.87767791748047, "eval_icons_NUM_probability": 6.818678244258081e-08, "eval_icons_inside_bbox": 0.0, "eval_icons_loss": 32003.8046875, "eval_icons_loss_ce": 3.8045945167541504, "eval_icons_loss_xval": 32144.0, "eval_icons_runtime": 64.6968, "eval_icons_samples_per_second": 0.773, "eval_icons_steps_per_second": 0.031, "num_input_tokens_seen": 134792, "step": 1 }, { "epoch": 0.0002882675122513693, "loss": 32243.8203125, "loss_ce": 3.820674419403076, "loss_xval": 32256.0, "num_input_tokens_seen": 134792, "step": 1 }, { "epoch": 0.0005765350245027386, "grad_norm": 996706.798337614, "learning_rate": 1.255369169267456e-05, "loss": 31499.8438, "num_input_tokens_seen": 269736, "step": 2 }, { "epoch": 0.0005765350245027386, "loss": 31795.837890625, "loss_ce": 3.8380894660949707, "loss_xval": 31744.0, "num_input_tokens_seen": 269736, "step": 2 }, { "epoch": 0.0008648025367541078, "grad_norm": 786586.824957904, "learning_rate": 1.9897130578503877e-05, "loss": 29204.1328, "num_input_tokens_seen": 442296, "step": 3 }, { "epoch": 0.0008648025367541078, "loss": 32579.794921875, "loss_ce": 3.79455304145813, "loss_xval": 32512.0, "num_input_tokens_seen": 442296, "step": 3 }, { "epoch": 0.0011530700490054772, "grad_norm": 269341.23696228134, "learning_rate": 2.510738338534912e-05, "loss": 6028.1396, "num_input_tokens_seen": 577064, "step": 4 }, { "epoch": 0.0011530700490054772, "loss": 5972.423828125, "loss_ce": 4.423999309539795, "loss_xval": 5952.0, "num_input_tokens_seen": 577064, "step": 4 }, { "epoch": 0.0014413375612568463, "grad_norm": 18537.21188035262, "learning_rate": 2.9148769435775147e-05, "loss": 121.8909, "num_input_tokens_seen": 712088, "step": 5 }, { "epoch": 0.0014413375612568463, "loss": 133.62535095214844, "loss_ce": 4.250359535217285, "loss_xval": 129.0, "num_input_tokens_seen": 712088, "step": 5 }, { "epoch": 0.0017296050735082155, "grad_norm": 32400.299293533124, "learning_rate": 3.2450822271178436e-05, "loss": 392.5652, "num_input_tokens_seen": 884592, "step": 6 }, { "epoch": 0.0017296050735082155, "loss": 344.605712890625, "loss_ce": 4.355719566345215, "loss_xval": 340.0, "num_input_tokens_seen": 884592, "step": 6 }, { "epoch": 0.002017872585759585, "grad_norm": 42990.192777167664, "learning_rate": 3.524266816342358e-05, "loss": 486.4419, "num_input_tokens_seen": 1019368, "step": 7 }, { "epoch": 0.002017872585759585, "loss": 469.3589782714844, "loss_ce": 5.35897159576416, "loss_xval": 464.0, "num_input_tokens_seen": 1019368, "step": 7 }, { "epoch": 0.0023061400980109543, "grad_norm": 22176.080277855566, "learning_rate": 3.766107507802368e-05, "loss": 163.1039, "num_input_tokens_seen": 1154456, "step": 8 }, { "epoch": 0.0023061400980109543, "loss": 207.9622344970703, "loss_ce": 4.587244033813477, "loss_xval": 203.0, "num_input_tokens_seen": 1154456, "step": 8 }, { "epoch": 0.0025944076102623233, "grad_norm": 5706.931463243618, "learning_rate": 3.9794261157007754e-05, "loss": 57.5576, "num_input_tokens_seen": 1326888, "step": 9 }, { "epoch": 0.0025944076102623233, "loss": 39.85100555419922, "loss_ce": 4.835380554199219, "loss_xval": 35.0, "num_input_tokens_seen": 1326888, "step": 9 }, { "epoch": 0.0028826751225136927, "grad_norm": 2176.5381898893447, "learning_rate": 4.1702461128449717e-05, "loss": 19.5141, "num_input_tokens_seen": 1461696, "step": 10 }, { "epoch": 0.0028826751225136927, "loss": 24.284374237060547, "loss_ce": 5.823436737060547, "loss_xval": 18.5, "num_input_tokens_seen": 1461696, "step": 10 }, { "epoch": 0.003170942634765062, "grad_norm": 945.057485234417, "learning_rate": 4.342863797226275e-05, "loss": 7.9961, "num_input_tokens_seen": 1596760, "step": 11 }, { "epoch": 0.003170942634765062, "loss": 8.878573417663574, "loss_ce": 5.153963088989258, "loss_xval": 3.71875, "num_input_tokens_seen": 1596760, "step": 11 }, { "epoch": 0.003459210147016431, "grad_norm": 1958.6007838874298, "learning_rate": 4.5004513963852995e-05, "loss": 18.0951, "num_input_tokens_seen": 1769296, "step": 12 }, { "epoch": 0.003459210147016431, "loss": 13.637873649597168, "loss_ce": 5.114436149597168, "loss_xval": 8.5, "num_input_tokens_seen": 1769296, "step": 12 }, { "epoch": 0.0037474776592678004, "grad_norm": 3655.5592110276825, "learning_rate": 4.6454179348870823e-05, "loss": 40.8945, "num_input_tokens_seen": 1904080, "step": 13 }, { "epoch": 0.0037474776592678004, "loss": 47.78251647949219, "loss_ce": 6.032514572143555, "loss_xval": 41.75, "num_input_tokens_seen": 1904080, "step": 13 }, { "epoch": 0.00403574517151917, "grad_norm": 3601.49256025649, "learning_rate": 4.779635985609814e-05, "loss": 42.2144, "num_input_tokens_seen": 2039112, "step": 14 }, { "epoch": 0.00403574517151917, "loss": 44.717384338378906, "loss_ce": 5.5298848152160645, "loss_xval": 39.25, "num_input_tokens_seen": 2039112, "step": 14 }, { "epoch": 0.004324012683770539, "grad_norm": 3510.379349357715, "learning_rate": 4.904590001427903e-05, "loss": 41.0742, "num_input_tokens_seen": 2211640, "step": 15 }, { "epoch": 0.004324012683770539, "loss": 29.72612762451172, "loss_ce": 5.2730021476745605, "loss_xval": 24.5, "num_input_tokens_seen": 2211640, "step": 15 }, { "epoch": 0.004612280196021909, "grad_norm": 2298.404288795249, "learning_rate": 5.021476677069824e-05, "loss": 22.6203, "num_input_tokens_seen": 2346448, "step": 16 }, { "epoch": 0.004612280196021909, "loss": 27.352113723754883, "loss_ce": 6.406801223754883, "loss_xval": 21.0, "num_input_tokens_seen": 2346448, "step": 16 }, { "epoch": 0.004900547708273278, "grad_norm": 532.1801093126115, "learning_rate": 5.1312748314320346e-05, "loss": 8.118, "num_input_tokens_seen": 2481456, "step": 17 }, { "epoch": 0.004900547708273278, "loss": 8.194659233093262, "loss_ce": 6.041337966918945, "loss_xval": 2.15625, "num_input_tokens_seen": 2481456, "step": 17 }, { "epoch": 0.0051888152205246466, "grad_norm": 837.9495849069981, "learning_rate": 5.2347952849682313e-05, "loss": 10.8715, "num_input_tokens_seen": 2654080, "step": 18 }, { "epoch": 0.0051888152205246466, "loss": 11.446012496948242, "loss_ce": 6.5553879737854, "loss_xval": 4.875, "num_input_tokens_seen": 2654080, "step": 18 }, { "epoch": 0.005477082732776016, "grad_norm": 1683.4434230931104, "learning_rate": 5.332717233660044e-05, "loss": 17.1665, "num_input_tokens_seen": 2788840, "step": 19 }, { "epoch": 0.005477082732776016, "loss": 17.23530387878418, "loss_ce": 7.58686637878418, "loss_xval": 9.625, "num_input_tokens_seen": 2788840, "step": 19 }, { "epoch": 0.005765350245027385, "grad_norm": 2513.5637155552545, "learning_rate": 5.4256152821124276e-05, "loss": 26.6961, "num_input_tokens_seen": 2923840, "step": 20 }, { "epoch": 0.005765350245027385, "loss": 24.286407470703125, "loss_ce": 6.841094970703125, "loss_xval": 17.5, "num_input_tokens_seen": 2923840, "step": 20 }, { "epoch": 0.006053617757278754, "grad_norm": 2736.6744740093, "learning_rate": 5.513979874192746e-05, "loss": 30.3839, "num_input_tokens_seen": 3096336, "step": 21 }, { "epoch": 0.006053617757278754, "loss": 30.83609962463379, "loss_ce": 7.554849624633789, "loss_xval": 23.25, "num_input_tokens_seen": 3096336, "step": 21 }, { "epoch": 0.006341885269530124, "grad_norm": 1821.5514426743405, "learning_rate": 5.598232966493732e-05, "loss": 18.9078, "num_input_tokens_seen": 3231216, "step": 22 }, { "epoch": 0.006341885269530124, "loss": 20.68878746032715, "loss_ce": 8.735663414001465, "loss_xval": 11.9375, "num_input_tokens_seen": 3231216, "step": 22 }, { "epoch": 0.006630152781781493, "grad_norm": 943.9392413077285, "learning_rate": 5.6787402149051605e-05, "loss": 10.9002, "num_input_tokens_seen": 3366376, "step": 23 }, { "epoch": 0.006630152781781493, "loss": 9.826754570007324, "loss_ce": 7.108004570007324, "loss_xval": 2.71875, "num_input_tokens_seen": 3366376, "step": 23 }, { "epoch": 0.006918420294032862, "grad_norm": 517.680000027031, "learning_rate": 5.755820565652756e-05, "loss": 10.216, "num_input_tokens_seen": 3538864, "step": 24 }, { "epoch": 0.006918420294032862, "loss": 8.601203918457031, "loss_ce": 7.817756175994873, "loss_xval": 0.78515625, "num_input_tokens_seen": 3538864, "step": 24 }, { "epoch": 0.007206687806284232, "grad_norm": 1570.5899753940823, "learning_rate": 5.8297538871550295e-05, "loss": 15.9455, "num_input_tokens_seen": 3673728, "step": 25 }, { "epoch": 0.007206687806284232, "loss": 18.513473510742188, "loss_ce": 8.568161010742188, "loss_xval": 9.9375, "num_input_tokens_seen": 3673728, "step": 25 }, { "epoch": 0.007494955318535601, "grad_norm": 1736.7074884499743, "learning_rate": 5.900787104154538e-05, "loss": 17.7182, "num_input_tokens_seen": 3808768, "step": 26 }, { "epoch": 0.007494955318535601, "loss": 16.724374771118164, "loss_ce": 7.150156497955322, "loss_xval": 9.5625, "num_input_tokens_seen": 3808768, "step": 26 }, { "epoch": 0.007783222830786971, "grad_norm": 1604.4374265767592, "learning_rate": 5.9691391735511625e-05, "loss": 17.0378, "num_input_tokens_seen": 3981328, "step": 27 }, { "epoch": 0.007783222830786971, "loss": 17.871387481689453, "loss_ce": 7.668261528015137, "loss_xval": 10.1875, "num_input_tokens_seen": 3981328, "step": 27 }, { "epoch": 0.00807149034303834, "grad_norm": 1119.1745907415943, "learning_rate": 6.03500515487727e-05, "loss": 12.7019, "num_input_tokens_seen": 4116112, "step": 28 }, { "epoch": 0.00807149034303834, "loss": 11.71876049041748, "loss_ce": 8.15430736541748, "loss_xval": 3.5625, "num_input_tokens_seen": 4116112, "step": 28 }, { "epoch": 0.00835975785528971, "grad_norm": 652.7927458908872, "learning_rate": 6.09855956617039e-05, "loss": 9.0132, "num_input_tokens_seen": 4251240, "step": 29 }, { "epoch": 0.00835975785528971, "loss": 8.91965103149414, "loss_ce": 6.815158843994141, "loss_xval": 2.109375, "num_input_tokens_seen": 4251240, "step": 29 }, { "epoch": 0.008648025367541078, "grad_norm": 309.08013570955944, "learning_rate": 6.159959170695358e-05, "loss": 8.2797, "num_input_tokens_seen": 4423704, "step": 30 }, { "epoch": 0.008648025367541078, "loss": 7.2059736251831055, "loss_ce": 6.9381513595581055, "loss_xval": 0.267578125, "num_input_tokens_seen": 4423704, "step": 30 }, { "epoch": 0.008936292879792447, "grad_norm": 821.4961563326211, "learning_rate": 6.219345306558267e-05, "loss": 10.3026, "num_input_tokens_seen": 4558472, "step": 31 }, { "epoch": 0.008936292879792447, "loss": 12.730304718017578, "loss_ce": 7.624835968017578, "loss_xval": 5.09375, "num_input_tokens_seen": 4558472, "step": 31 }, { "epoch": 0.009224560392043817, "grad_norm": 810.3899443561476, "learning_rate": 6.276845846337281e-05, "loss": 9.2442, "num_input_tokens_seen": 4693568, "step": 32 }, { "epoch": 0.009224560392043817, "loss": 9.102476119995117, "loss_ce": 6.311460494995117, "loss_xval": 2.796875, "num_input_tokens_seen": 4693568, "step": 32 }, { "epoch": 0.009512827904295185, "grad_norm": 1126.8158352879109, "learning_rate": 6.332576855076663e-05, "loss": 12.4498, "num_input_tokens_seen": 4865944, "step": 33 }, { "epoch": 0.009512827904295185, "loss": 9.520813941955566, "loss_ce": 6.290345191955566, "loss_xval": 3.234375, "num_input_tokens_seen": 4865944, "step": 33 }, { "epoch": 0.009801095416546555, "grad_norm": 946.1031004612911, "learning_rate": 6.386644000699491e-05, "loss": 10.5841, "num_input_tokens_seen": 5000728, "step": 34 }, { "epoch": 0.009801095416546555, "loss": 13.091533660888672, "loss_ce": 6.704814434051514, "loss_xval": 6.375, "num_input_tokens_seen": 5000728, "step": 34 }, { "epoch": 0.010089362928797925, "grad_norm": 254.32293701379814, "learning_rate": 6.439143759919874e-05, "loss": 6.2691, "num_input_tokens_seen": 5135872, "step": 35 }, { "epoch": 0.010089362928797925, "loss": 6.317028999328613, "loss_ce": 5.744763374328613, "loss_xval": 0.5703125, "num_input_tokens_seen": 5135872, "step": 35 }, { "epoch": 0.010377630441049293, "grad_norm": 96.83144759509878, "learning_rate": 6.490164454235687e-05, "loss": 6.4683, "num_input_tokens_seen": 5308288, "step": 36 }, { "epoch": 0.010377630441049293, "loss": 6.021927833557129, "loss_ce": 5.571000099182129, "loss_xval": 0.451171875, "num_input_tokens_seen": 5308288, "step": 36 }, { "epoch": 0.010665897953300663, "grad_norm": 329.47055076201457, "learning_rate": 6.539787143947167e-05, "loss": 6.44, "num_input_tokens_seen": 5443160, "step": 37 }, { "epoch": 0.010665897953300663, "loss": 6.1701226234436035, "loss_ce": 5.8368706703186035, "loss_xval": 0.333984375, "num_input_tokens_seen": 5443160, "step": 37 }, { "epoch": 0.010954165465552033, "grad_norm": 850.6431793469337, "learning_rate": 6.5880864029275e-05, "loss": 8.3984, "num_input_tokens_seen": 5578416, "step": 38 }, { "epoch": 0.010954165465552033, "loss": 7.8684587478637695, "loss_ce": 5.1809587478637695, "loss_xval": 2.6875, "num_input_tokens_seen": 5578416, "step": 38 }, { "epoch": 0.011242432977803401, "grad_norm": 692.1496892412431, "learning_rate": 6.63513099273747e-05, "loss": 7.6066, "num_input_tokens_seen": 5750856, "step": 39 }, { "epoch": 0.011242432977803401, "loss": 8.899791717529297, "loss_ce": 4.835338592529297, "loss_xval": 4.0625, "num_input_tokens_seen": 5750856, "step": 39 }, { "epoch": 0.01153070049005477, "grad_norm": 468.94665746005865, "learning_rate": 6.680984451379883e-05, "loss": 6.0939, "num_input_tokens_seen": 5885624, "step": 40 }, { "epoch": 0.01153070049005477, "loss": 5.63425350189209, "loss_ce": 4.90231990814209, "loss_xval": 0.73046875, "num_input_tokens_seen": 5885624, "step": 40 }, { "epoch": 0.01181896800230614, "grad_norm": 426.5270199999428, "learning_rate": 6.725705609344598e-05, "loss": 5.3084, "num_input_tokens_seen": 6020824, "step": 41 }, { "epoch": 0.01181896800230614, "loss": 5.0226240158081055, "loss_ce": 4.4718427658081055, "loss_xval": 0.55078125, "num_input_tokens_seen": 6020824, "step": 41 }, { "epoch": 0.012107235514557509, "grad_norm": 75.92312869871212, "learning_rate": 6.769349043460202e-05, "loss": 4.6539, "num_input_tokens_seen": 6193280, "step": 42 }, { "epoch": 0.012107235514557509, "loss": 4.303558826446533, "loss_ce": 4.052459716796875, "loss_xval": 0.251953125, "num_input_tokens_seen": 6193280, "step": 42 }, { "epoch": 0.012395503026808878, "grad_norm": 397.7092047833716, "learning_rate": 6.81196547733565e-05, "loss": 5.0739, "num_input_tokens_seen": 6328136, "step": 43 }, { "epoch": 0.012395503026808878, "loss": 5.631143093109131, "loss_ce": 4.236611843109131, "loss_xval": 1.390625, "num_input_tokens_seen": 6328136, "step": 43 }, { "epoch": 0.012683770539060248, "grad_norm": 430.60447024117013, "learning_rate": 6.853602135761187e-05, "loss": 4.6489, "num_input_tokens_seen": 6463240, "step": 44 }, { "epoch": 0.012683770539060248, "loss": 4.990880966186523, "loss_ce": 3.8263304233551025, "loss_xval": 1.1640625, "num_input_tokens_seen": 6463240, "step": 44 }, { "epoch": 0.012972038051311616, "grad_norm": 540.73323665623, "learning_rate": 6.89430305927829e-05, "loss": 5.161, "num_input_tokens_seen": 6635752, "step": 45 }, { "epoch": 0.012972038051311616, "loss": 4.3311381340026855, "loss_ce": 3.3467631340026855, "loss_xval": 0.984375, "num_input_tokens_seen": 6635752, "step": 45 }, { "epoch": 0.013260305563562986, "grad_norm": 458.0472938948524, "learning_rate": 6.934109384172616e-05, "loss": 4.7264, "num_input_tokens_seen": 6770488, "step": 46 }, { "epoch": 0.013260305563562986, "loss": 5.214451789855957, "loss_ce": 3.6080069541931152, "loss_xval": 1.609375, "num_input_tokens_seen": 6770488, "step": 46 }, { "epoch": 0.013548573075814356, "grad_norm": 129.32422015778562, "learning_rate": 6.973059592352828e-05, "loss": 3.311, "num_input_tokens_seen": 6905640, "step": 47 }, { "epoch": 0.013548573075814356, "loss": 3.4464430809020996, "loss_ce": 3.2276930809020996, "loss_xval": 0.21875, "num_input_tokens_seen": 6905640, "step": 47 }, { "epoch": 0.013836840588065724, "grad_norm": 65.53517362370098, "learning_rate": 7.011189734920213e-05, "loss": 3.3215, "num_input_tokens_seen": 7078176, "step": 48 }, { "epoch": 0.013836840588065724, "loss": 3.060483455657959, "loss_ce": 2.844907283782959, "loss_xval": 0.2158203125, "num_input_tokens_seen": 7078176, "step": 48 }, { "epoch": 0.014125108100317094, "grad_norm": 215.52012341313568, "learning_rate": 7.048533632684716e-05, "loss": 3.4409, "num_input_tokens_seen": 7213000, "step": 49 }, { "epoch": 0.014125108100317094, "loss": 3.4835710525512695, "loss_ce": 3.1407976150512695, "loss_xval": 0.34375, "num_input_tokens_seen": 7213000, "step": 49 }, { "epoch": 0.014413375612568464, "grad_norm": 431.0270260191144, "learning_rate": 7.085123056422486e-05, "loss": 3.7639, "num_input_tokens_seen": 7348040, "step": 50 }, { "epoch": 0.014413375612568464, "loss": 3.667371988296509, "loss_ce": 2.829481363296509, "loss_xval": 0.8359375, "num_input_tokens_seen": 7348040, "step": 50 }, { "epoch": 0.014701643124819834, "grad_norm": 378.86249167362797, "learning_rate": 7.120987889282422e-05, "loss": 3.703, "num_input_tokens_seen": 7520472, "step": 51 }, { "epoch": 0.014701643124819834, "eval_websight_new_IoU": 0.0, "eval_websight_new_MAE_x": 0.5620081424713135, "eval_websight_new_MAE_y": 0.8609068095684052, "eval_websight_new_NUM_probability": 2.2496185010822956e-05, "eval_websight_new_inside_bbox": 0.0, "eval_websight_new_loss": 2.959818124771118, "eval_websight_new_loss_ce": 2.5184309482574463, "eval_websight_new_loss_xval": 0.4293212890625, "eval_websight_new_runtime": 35.5903, "eval_websight_new_samples_per_second": 1.405, "eval_websight_new_steps_per_second": 0.056, "num_input_tokens_seen": 7520472, "step": 51 }, { "epoch": 0.014701643124819834, "eval_seeclick_IoU": 0.007790456060320139, "eval_seeclick_MAE_x": 0.39892005920410156, "eval_seeclick_MAE_y": 0.5314352512359619, "eval_seeclick_NUM_probability": 3.157063019898487e-05, "eval_seeclick_inside_bbox": 0.015625, "eval_seeclick_loss": 3.310542345046997, "eval_seeclick_loss_ce": 2.78228759765625, "eval_seeclick_loss_xval": 0.504058837890625, "eval_seeclick_runtime": 63.5135, "eval_seeclick_samples_per_second": 0.787, "eval_seeclick_steps_per_second": 0.031, "num_input_tokens_seen": 7520472, "step": 51 }, { "epoch": 0.014701643124819834, "eval_icons_IoU": 9.042318561114371e-05, "eval_icons_MAE_x": 0.8792010545730591, "eval_icons_MAE_y": 0.6797044575214386, "eval_icons_NUM_probability": 1.8500017176847905e-05, "eval_icons_inside_bbox": 0.0, "eval_icons_loss": 3.088502883911133, "eval_icons_loss_ce": 2.412390947341919, "eval_icons_loss_xval": 0.6337890625, "eval_icons_runtime": 67.6782, "eval_icons_samples_per_second": 0.739, "eval_icons_steps_per_second": 0.03, "num_input_tokens_seen": 7520472, "step": 51 }, { "epoch": 0.014701643124819834, "loss": 2.99432373046875, "loss_ce": 2.43255615234375, "loss_xval": 0.5625, "num_input_tokens_seen": 7520472, "step": 51 }, { "epoch": 0.014989910637071202, "grad_norm": 217.68238108622742, "learning_rate": 7.156156273421994e-05, "loss": 3.0923, "num_input_tokens_seen": 7655320, "step": 52 }, { "epoch": 0.014989910637071202, "loss": 3.246007204055786, "loss_ce": 2.754307985305786, "loss_xval": 0.4921875, "num_input_tokens_seen": 7655320, "step": 52 }, { "epoch": 0.015278178149322572, "grad_norm": 91.05016392033016, "learning_rate": 7.190654742675073e-05, "loss": 2.5477, "num_input_tokens_seen": 7790448, "step": 53 }, { "epoch": 0.015278178149322572, "loss": 2.598111391067505, "loss_ce": 2.440823793411255, "loss_xval": 0.1572265625, "num_input_tokens_seen": 7790448, "step": 53 }, { "epoch": 0.015566445661573941, "grad_norm": 144.81034266553996, "learning_rate": 7.224508342818619e-05, "loss": 2.71, "num_input_tokens_seen": 7962992, "step": 54 }, { "epoch": 0.015566445661573941, "loss": 2.4725775718688965, "loss_ce": 2.2418036460876465, "loss_xval": 0.23046875, "num_input_tokens_seen": 7962992, "step": 54 }, { "epoch": 0.01585471317382531, "grad_norm": 232.08300955372366, "learning_rate": 7.25774074080379e-05, "loss": 2.7493, "num_input_tokens_seen": 8097832, "step": 55 }, { "epoch": 0.01585471317382531, "loss": 2.8998870849609375, "loss_ce": 2.3862152099609375, "loss_xval": 0.515625, "num_input_tokens_seen": 8097832, "step": 55 }, { "epoch": 0.01614298068607668, "grad_norm": 312.2394215634939, "learning_rate": 7.290374324144727e-05, "loss": 2.719, "num_input_tokens_seen": 8232800, "step": 56 }, { "epoch": 0.01614298068607668, "loss": 2.6347908973693848, "loss_ce": 2.1626229286193848, "loss_xval": 0.47265625, "num_input_tokens_seen": 8232800, "step": 56 }, { "epoch": 0.016431248198328047, "grad_norm": 235.52491076478066, "learning_rate": 7.322430291510432e-05, "loss": 2.5759, "num_input_tokens_seen": 8405208, "step": 57 }, { "epoch": 0.016431248198328047, "loss": 2.5117573738098145, "loss_ce": 2.0200581550598145, "loss_xval": 0.4921875, "num_input_tokens_seen": 8405208, "step": 57 }, { "epoch": 0.01671951571057942, "grad_norm": 63.56214683438855, "learning_rate": 7.353928735437845e-05, "loss": 2.2731, "num_input_tokens_seen": 8539984, "step": 58 }, { "epoch": 0.01671951571057942, "loss": 2.37797212600708, "loss_ce": 2.14963960647583, "loss_xval": 0.228515625, "num_input_tokens_seen": 8539984, "step": 58 }, { "epoch": 0.017007783222830787, "grad_norm": 37.67195118941927, "learning_rate": 7.38488871797435e-05, "loss": 2.0322, "num_input_tokens_seen": 8675088, "step": 59 }, { "epoch": 0.017007783222830787, "loss": 2.0352678298950195, "loss_ce": 1.9146623611450195, "loss_xval": 0.12060546875, "num_input_tokens_seen": 8675088, "step": 59 }, { "epoch": 0.017296050735082155, "grad_norm": 167.7360509976244, "learning_rate": 7.415328339962814e-05, "loss": 2.2125, "num_input_tokens_seen": 8847512, "step": 60 }, { "epoch": 0.017296050735082155, "loss": 2.0584282875061035, "loss_ce": 1.800371766090393, "loss_xval": 0.2578125, "num_input_tokens_seen": 8847512, "step": 60 }, { "epoch": 0.017584318247333527, "grad_norm": 250.3157839443666, "learning_rate": 7.445264804599805e-05, "loss": 2.318, "num_input_tokens_seen": 8982392, "step": 61 }, { "epoch": 0.017584318247333527, "loss": 2.417539596557617, "loss_ce": 1.9382915496826172, "loss_xval": 0.478515625, "num_input_tokens_seen": 8982392, "step": 61 }, { "epoch": 0.017872585759584895, "grad_norm": 178.8772377202071, "learning_rate": 7.474714475825724e-05, "loss": 1.9919, "num_input_tokens_seen": 9117408, "step": 62 }, { "epoch": 0.017872585759584895, "loss": 2.056333303451538, "loss_ce": 1.7208842039108276, "loss_xval": 0.3359375, "num_input_tokens_seen": 9117408, "step": 62 }, { "epoch": 0.018160853271836263, "grad_norm": 92.49710835233134, "learning_rate": 7.503692932043134e-05, "loss": 1.91, "num_input_tokens_seen": 9289880, "step": 63 }, { "epoch": 0.018160853271836263, "loss": 1.787151575088501, "loss_ce": 1.618694543838501, "loss_xval": 0.16796875, "num_input_tokens_seen": 9289880, "step": 63 }, { "epoch": 0.018449120784087635, "grad_norm": 36.307905662307306, "learning_rate": 7.532215015604735e-05, "loss": 1.814, "num_input_tokens_seen": 9424680, "step": 64 }, { "epoch": 0.018449120784087635, "loss": 1.890938401222229, "loss_ce": 1.742561936378479, "loss_xval": 0.1484375, "num_input_tokens_seen": 9424680, "step": 64 }, { "epoch": 0.018737388296339003, "grad_norm": 188.18119861000676, "learning_rate": 7.560294878464597e-05, "loss": 1.8272, "num_input_tokens_seen": 9559816, "step": 65 }, { "epoch": 0.018737388296339003, "loss": 1.7736570835113525, "loss_ce": 1.5479490756988525, "loss_xval": 0.2255859375, "num_input_tokens_seen": 9559816, "step": 65 }, { "epoch": 0.01902565580859037, "grad_norm": 159.85223743496317, "learning_rate": 7.587946024344119e-05, "loss": 1.834, "num_input_tokens_seen": 9732376, "step": 66 }, { "epoch": 0.01902565580859037, "loss": 1.7517627477645874, "loss_ce": 1.4713672399520874, "loss_xval": 0.28125, "num_input_tokens_seen": 9732376, "step": 66 }, { "epoch": 0.019313923320841742, "grad_norm": 101.15058869594668, "learning_rate": 7.615181347727268e-05, "loss": 1.7053, "num_input_tokens_seen": 9867176, "step": 67 }, { "epoch": 0.019313923320841742, "loss": 1.7496159076690674, "loss_ce": 1.5835392475128174, "loss_xval": 0.166015625, "num_input_tokens_seen": 9867176, "step": 67 }, { "epoch": 0.01960219083309311, "grad_norm": 58.32331425616856, "learning_rate": 7.642013169966947e-05, "loss": 1.543, "num_input_tokens_seen": 10002352, "step": 68 }, { "epoch": 0.01960219083309311, "loss": 1.5576841831207275, "loss_ce": 1.4476988315582275, "loss_xval": 0.10986328125, "num_input_tokens_seen": 10002352, "step": 68 }, { "epoch": 0.01989045834534448, "grad_norm": 44.59367162259951, "learning_rate": 7.66845327275555e-05, "loss": 1.5735, "num_input_tokens_seen": 10174928, "step": 69 }, { "epoch": 0.01989045834534448, "loss": 1.4471509456634521, "loss_ce": 1.3455884456634521, "loss_xval": 0.1015625, "num_input_tokens_seen": 10174928, "step": 69 }, { "epoch": 0.02017872585759585, "grad_norm": 140.15921471971498, "learning_rate": 7.69451292918733e-05, "loss": 1.6784, "num_input_tokens_seen": 10309696, "step": 70 }, { "epoch": 0.02017872585759585, "loss": 1.7239234447479248, "loss_ce": 1.4675757884979248, "loss_xval": 0.255859375, "num_input_tokens_seen": 10309696, "step": 70 }, { "epoch": 0.020466993369847218, "grad_norm": 121.97678463125082, "learning_rate": 7.720202932617523e-05, "loss": 1.4855, "num_input_tokens_seen": 10444840, "step": 71 }, { "epoch": 0.020466993369847218, "loss": 1.5414938926696777, "loss_ce": 1.3424582481384277, "loss_xval": 0.19921875, "num_input_tokens_seen": 10444840, "step": 71 }, { "epoch": 0.020755260882098586, "grad_norm": 71.6510231372808, "learning_rate": 7.745533623503144e-05, "loss": 1.4846, "num_input_tokens_seen": 10617216, "step": 72 }, { "epoch": 0.020755260882098586, "loss": 1.3716990947723389, "loss_ce": 1.2538402080535889, "loss_xval": 0.11767578125, "num_input_tokens_seen": 10617216, "step": 72 }, { "epoch": 0.021043528394349958, "grad_norm": 17.112653827119473, "learning_rate": 7.770514914392505e-05, "loss": 1.4478, "num_input_tokens_seen": 10752048, "step": 73 }, { "epoch": 0.021043528394349958, "loss": 1.5130277872085571, "loss_ce": 1.3716398477554321, "loss_xval": 0.1416015625, "num_input_tokens_seen": 10752048, "step": 73 }, { "epoch": 0.021331795906601326, "grad_norm": 54.30781148516282, "learning_rate": 7.795156313214625e-05, "loss": 1.3234, "num_input_tokens_seen": 10887048, "step": 74 }, { "epoch": 0.021331795906601326, "loss": 1.3435890674591064, "loss_ce": 1.2606728076934814, "loss_xval": 0.0830078125, "num_input_tokens_seen": 10887048, "step": 74 }, { "epoch": 0.021620063418852694, "grad_norm": 126.25618466514295, "learning_rate": 7.819466945005417e-05, "loss": 1.4311, "num_input_tokens_seen": 11059536, "step": 75 }, { "epoch": 0.021620063418852694, "loss": 1.3155971765518188, "loss_ce": 1.1788784265518188, "loss_xval": 0.13671875, "num_input_tokens_seen": 11059536, "step": 75 }, { "epoch": 0.021908330931104066, "grad_norm": 71.84571532321051, "learning_rate": 7.843455572194957e-05, "loss": 1.3668, "num_input_tokens_seen": 11194248, "step": 76 }, { "epoch": 0.021908330931104066, "loss": 1.4305455684661865, "loss_ce": 1.2662389278411865, "loss_xval": 0.1640625, "num_input_tokens_seen": 11194248, "step": 76 }, { "epoch": 0.022196598443355434, "grad_norm": 18.12151091821412, "learning_rate": 7.867130613568635e-05, "loss": 1.2159, "num_input_tokens_seen": 11329488, "step": 77 }, { "epoch": 0.022196598443355434, "loss": 1.2611539363861084, "loss_ce": 1.1754605770111084, "loss_xval": 0.0859375, "num_input_tokens_seen": 11329488, "step": 77 }, { "epoch": 0.022484865955606802, "grad_norm": 16.94095251304994, "learning_rate": 7.890500162004926e-05, "loss": 1.2511, "num_input_tokens_seen": 11501960, "step": 78 }, { "epoch": 0.022484865955606802, "loss": 1.1699042320251465, "loss_ce": 1.1077704429626465, "loss_xval": 0.06201171875, "num_input_tokens_seen": 11501960, "step": 78 }, { "epoch": 0.022773133467858173, "grad_norm": 49.28829387349209, "learning_rate": 7.913572001083273e-05, "loss": 1.2514, "num_input_tokens_seen": 11636752, "step": 79 }, { "epoch": 0.022773133467858173, "loss": 1.2829480171203613, "loss_ce": 1.1588635444641113, "loss_xval": 0.1240234375, "num_input_tokens_seen": 11636752, "step": 79 }, { "epoch": 0.02306140098010954, "grad_norm": 86.79137128570692, "learning_rate": 7.93635362064734e-05, "loss": 1.1792, "num_input_tokens_seen": 11771712, "step": 80 }, { "epoch": 0.02306140098010954, "loss": 1.2060190439224243, "loss_ce": 1.0835214853286743, "loss_xval": 0.12255859375, "num_input_tokens_seen": 11771712, "step": 80 }, { "epoch": 0.02334966849236091, "grad_norm": 36.119976096556634, "learning_rate": 7.958852231401551e-05, "loss": 1.1689, "num_input_tokens_seen": 11944424, "step": 81 }, { "epoch": 0.02334966849236091, "loss": 1.1105940341949463, "loss_ce": 1.0218183994293213, "loss_xval": 0.0888671875, "num_input_tokens_seen": 11944424, "step": 81 }, { "epoch": 0.02363793600461228, "grad_norm": 12.644434454126534, "learning_rate": 7.981074778612054e-05, "loss": 1.1655, "num_input_tokens_seen": 12079392, "step": 82 }, { "epoch": 0.02363793600461228, "loss": 1.2264204025268555, "loss_ce": 1.1139326095581055, "loss_xval": 0.1123046875, "num_input_tokens_seen": 12079392, "step": 82 }, { "epoch": 0.02392620351686365, "grad_norm": 15.168981426164274, "learning_rate": 8.003027954977264e-05, "loss": 1.0712, "num_input_tokens_seen": 12214464, "step": 83 }, { "epoch": 0.02392620351686365, "loss": 1.1029493808746338, "loss_ce": 1.0238783359527588, "loss_xval": 0.0791015625, "num_input_tokens_seen": 12214464, "step": 83 }, { "epoch": 0.024214471029115017, "grad_norm": 81.32602157915647, "learning_rate": 8.024718212727658e-05, "loss": 1.1252, "num_input_tokens_seen": 12387112, "step": 84 }, { "epoch": 0.024214471029115017, "loss": 1.02632737159729, "loss_ce": 0.9486294984817505, "loss_xval": 0.07763671875, "num_input_tokens_seen": 12387112, "step": 84 }, { "epoch": 0.02450273854136639, "grad_norm": 43.813459977991975, "learning_rate": 8.04615177500955e-05, "loss": 1.1105, "num_input_tokens_seen": 12521856, "step": 85 }, { "epoch": 0.02450273854136639, "loss": 1.1658377647399902, "loss_ce": 1.0337576866149902, "loss_xval": 0.1318359375, "num_input_tokens_seen": 12521856, "step": 85 }, { "epoch": 0.024791006053617757, "grad_norm": 11.656507104084508, "learning_rate": 8.067334646603104e-05, "loss": 0.9894, "num_input_tokens_seen": 12657040, "step": 86 }, { "epoch": 0.024791006053617757, "loss": 1.0080392360687256, "loss_ce": 0.9314705729484558, "loss_xval": 0.07666015625, "num_input_tokens_seen": 12657040, "step": 86 }, { "epoch": 0.025079273565869125, "grad_norm": 12.676570492524768, "learning_rate": 8.088272624020777e-05, "loss": 1.025, "num_input_tokens_seen": 12829568, "step": 87 }, { "epoch": 0.025079273565869125, "loss": 0.9655556678771973, "loss_ce": 0.9059548377990723, "loss_xval": 0.0595703125, "num_input_tokens_seen": 12829568, "step": 87 }, { "epoch": 0.025367541078120497, "grad_norm": 40.00413507298051, "learning_rate": 8.108971305028644e-05, "loss": 1.0396, "num_input_tokens_seen": 12964304, "step": 88 }, { "epoch": 0.025367541078120497, "loss": 1.0686454772949219, "loss_ce": 0.9660453796386719, "loss_xval": 0.1025390625, "num_input_tokens_seen": 12964304, "step": 88 }, { "epoch": 0.025655808590371865, "grad_norm": 57.04743386781687, "learning_rate": 8.129436097629779e-05, "loss": 0.9546, "num_input_tokens_seen": 13099336, "step": 89 }, { "epoch": 0.025655808590371865, "loss": 0.9784692525863647, "loss_ce": 0.8800195455551147, "loss_xval": 0.0986328125, "num_input_tokens_seen": 13099336, "step": 89 }, { "epoch": 0.025944076102623233, "grad_norm": 11.991983730016383, "learning_rate": 8.149672228545746e-05, "loss": 0.9712, "num_input_tokens_seen": 13271816, "step": 90 }, { "epoch": 0.025944076102623233, "loss": 0.9001408815383911, "loss_ce": 0.8415776491165161, "loss_xval": 0.05859375, "num_input_tokens_seen": 13271816, "step": 90 }, { "epoch": 0.026232343614874604, "grad_norm": 17.076861856400765, "learning_rate": 8.16968475122944e-05, "loss": 0.939, "num_input_tokens_seen": 13406616, "step": 91 }, { "epoch": 0.026232343614874604, "loss": 0.9755129218101501, "loss_ce": 0.9001650214195251, "loss_xval": 0.0751953125, "num_input_tokens_seen": 13406616, "step": 91 }, { "epoch": 0.026520611127125972, "grad_norm": 13.426874489381975, "learning_rate": 8.189478553440074e-05, "loss": 0.8669, "num_input_tokens_seen": 13541608, "step": 92 }, { "epoch": 0.026520611127125972, "loss": 0.8719437122344971, "loss_ce": 0.8101913928985596, "loss_xval": 0.061767578125, "num_input_tokens_seen": 13541608, "step": 92 }, { "epoch": 0.02680887863937734, "grad_norm": 45.10735406441355, "learning_rate": 8.209058364408656e-05, "loss": 0.9199, "num_input_tokens_seen": 13714096, "step": 93 }, { "epoch": 0.02680887863937734, "loss": 0.8327312469482422, "loss_ce": 0.7779521942138672, "loss_xval": 0.0546875, "num_input_tokens_seen": 13714096, "step": 93 }, { "epoch": 0.027097146151628712, "grad_norm": 22.312072425981693, "learning_rate": 8.228428761620285e-05, "loss": 0.8977, "num_input_tokens_seen": 13848928, "step": 94 }, { "epoch": 0.027097146151628712, "loss": 0.9612225294113159, "loss_ce": 0.8525189161300659, "loss_xval": 0.10888671875, "num_input_tokens_seen": 13848928, "step": 94 }, { "epoch": 0.02738541366388008, "grad_norm": 14.377576713290015, "learning_rate": 8.247594177237559e-05, "loss": 0.8293, "num_input_tokens_seen": 13984080, "step": 95 }, { "epoch": 0.02738541366388008, "loss": 0.8508036732673645, "loss_ce": 0.7708476185798645, "loss_xval": 0.080078125, "num_input_tokens_seen": 13984080, "step": 95 }, { "epoch": 0.02767368117613145, "grad_norm": 22.570750319320567, "learning_rate": 8.266558904187667e-05, "loss": 0.842, "num_input_tokens_seen": 14156520, "step": 96 }, { "epoch": 0.02767368117613145, "loss": 0.7710494995117188, "loss_ce": 0.7205734252929688, "loss_xval": 0.050537109375, "num_input_tokens_seen": 14156520, "step": 96 }, { "epoch": 0.02796194868838282, "grad_norm": 36.281544865910995, "learning_rate": 8.285327101934069e-05, "loss": 0.8365, "num_input_tokens_seen": 14291312, "step": 97 }, { "epoch": 0.02796194868838282, "loss": 0.8652549982070923, "loss_ce": 0.7860618829727173, "loss_xval": 0.0791015625, "num_input_tokens_seen": 14291312, "step": 97 }, { "epoch": 0.028250216200634188, "grad_norm": 10.406571859201414, "learning_rate": 8.303902801952174e-05, "loss": 0.7746, "num_input_tokens_seen": 14426480, "step": 98 }, { "epoch": 0.028250216200634188, "loss": 0.8105872869491577, "loss_ce": 0.7177222967147827, "loss_xval": 0.0927734375, "num_input_tokens_seen": 14426480, "step": 98 }, { "epoch": 0.02853848371288556, "grad_norm": 11.45020509791116, "learning_rate": 8.322289912927049e-05, "loss": 0.7788, "num_input_tokens_seen": 14598976, "step": 99 }, { "epoch": 0.02853848371288556, "loss": 0.7215403318405151, "loss_ce": 0.6690806150436401, "loss_xval": 0.052490234375, "num_input_tokens_seen": 14598976, "step": 99 }, { "epoch": 0.028826751225136928, "grad_norm": 33.00790033062601, "learning_rate": 8.340492225689943e-05, "loss": 0.7793, "num_input_tokens_seen": 14733720, "step": 100 }, { "epoch": 0.028826751225136928, "loss": 0.8156184554100037, "loss_ce": 0.7315425276756287, "loss_xval": 0.083984375, "num_input_tokens_seen": 14733720, "step": 100 }, { "epoch": 0.029115018737388296, "grad_norm": 20.940949519344258, "learning_rate": 8.358513417909158e-05, "loss": 0.694, "num_input_tokens_seen": 14868704, "step": 101 }, { "epoch": 0.029115018737388296, "loss": 0.7094327807426453, "loss_ce": 0.6524564623832703, "loss_xval": 0.056884765625, "num_input_tokens_seen": 14868704, "step": 101 }, { "epoch": 0.029403286249639667, "grad_norm": 11.7677303756099, "learning_rate": 8.376357058549878e-05, "loss": 0.7215, "num_input_tokens_seen": 15041320, "step": 102 }, { "epoch": 0.029403286249639667, "loss": 0.6571950912475586, "loss_ce": 0.6112051010131836, "loss_xval": 0.0458984375, "num_input_tokens_seen": 15041320, "step": 102 }, { "epoch": 0.029691553761891035, "grad_norm": 13.477928271939529, "learning_rate": 8.394026612116405e-05, "loss": 0.7139, "num_input_tokens_seen": 15176128, "step": 103 }, { "epoch": 0.029691553761891035, "loss": 0.7441321611404419, "loss_ce": 0.6728125810623169, "loss_xval": 0.0712890625, "num_input_tokens_seen": 15176128, "step": 103 }, { "epoch": 0.029979821274142403, "grad_norm": 28.21428885187996, "learning_rate": 8.41152544268945e-05, "loss": 0.6557, "num_input_tokens_seen": 15311280, "step": 104 }, { "epoch": 0.029979821274142403, "loss": 0.6761811971664429, "loss_ce": 0.6040834188461304, "loss_xval": 0.072265625, "num_input_tokens_seen": 15311280, "step": 104 }, { "epoch": 0.030268088786393775, "grad_norm": 10.481020095069384, "learning_rate": 8.42885681777026e-05, "loss": 0.6775, "num_input_tokens_seen": 15483784, "step": 105 }, { "epoch": 0.030268088786393775, "loss": 0.6159076690673828, "loss_ce": 0.5645313262939453, "loss_xval": 0.05126953125, "num_input_tokens_seen": 15483784, "step": 105 }, { "epoch": 0.030556356298645143, "grad_norm": 12.736851354361377, "learning_rate": 8.446023911942528e-05, "loss": 0.6781, "num_input_tokens_seen": 15618472, "step": 106 }, { "epoch": 0.030556356298645143, "loss": 0.6953324675559998, "loss_ce": 0.6300248503684998, "loss_xval": 0.0654296875, "num_input_tokens_seen": 15618472, "step": 106 }, { "epoch": 0.03084462381089651, "grad_norm": 15.301730909445215, "learning_rate": 8.463029810362388e-05, "loss": 0.6077, "num_input_tokens_seen": 15753616, "step": 107 }, { "epoch": 0.03084462381089651, "loss": 0.6367394924163818, "loss_ce": 0.5607202053070068, "loss_xval": 0.076171875, "num_input_tokens_seen": 15753616, "step": 107 }, { "epoch": 0.031132891323147883, "grad_norm": 30.79147422211605, "learning_rate": 8.479877512086075e-05, "loss": 0.617, "num_input_tokens_seen": 15926144, "step": 108 }, { "epoch": 0.031132891323147883, "loss": 0.5589581727981567, "loss_ce": 0.5138074159622192, "loss_xval": 0.045166015625, "num_input_tokens_seen": 15926144, "step": 108 }, { "epoch": 0.03142115883539925, "grad_norm": 8.250685702158895, "learning_rate": 8.496569933244227e-05, "loss": 0.6087, "num_input_tokens_seen": 16061008, "step": 109 }, { "epoch": 0.03142115883539925, "loss": 0.631111741065979, "loss_ce": 0.564949631690979, "loss_xval": 0.06640625, "num_input_tokens_seen": 16061008, "step": 109 }, { "epoch": 0.03170942634765062, "grad_norm": 23.093026007657294, "learning_rate": 8.513109910071247e-05, "loss": 0.5623, "num_input_tokens_seen": 16196128, "step": 110 }, { "epoch": 0.03170942634765062, "loss": 0.5758240222930908, "loss_ce": 0.5295593738555908, "loss_xval": 0.04638671875, "num_input_tokens_seen": 16196128, "step": 110 }, { "epoch": 0.03199769385990199, "grad_norm": 19.06137292165344, "learning_rate": 8.529500201797555e-05, "loss": 0.5789, "num_input_tokens_seen": 16368512, "step": 111 }, { "epoch": 0.03199769385990199, "loss": 0.5283578634262085, "loss_ce": 0.4861825108528137, "loss_xval": 0.042236328125, "num_input_tokens_seen": 16368512, "step": 111 }, { "epoch": 0.03228596137215336, "grad_norm": 10.639807503403174, "learning_rate": 8.545743493412182e-05, "loss": 0.5752, "num_input_tokens_seen": 16503288, "step": 112 }, { "epoch": 0.03228596137215336, "loss": 0.5936908721923828, "loss_ce": 0.5266132950782776, "loss_xval": 0.06689453125, "num_input_tokens_seen": 16503288, "step": 112 }, { "epoch": 0.03257422888440473, "grad_norm": 16.230556762656512, "learning_rate": 8.561842398302536e-05, "loss": 0.5299, "num_input_tokens_seen": 16638280, "step": 113 }, { "epoch": 0.03257422888440473, "loss": 0.5637577772140503, "loss_ce": 0.5001896619796753, "loss_xval": 0.0634765625, "num_input_tokens_seen": 16638280, "step": 113 }, { "epoch": 0.032862496396656095, "grad_norm": 20.814059821840747, "learning_rate": 8.577799460777888e-05, "loss": 0.5511, "num_input_tokens_seen": 16810784, "step": 114 }, { "epoch": 0.032862496396656095, "loss": 0.5012564659118652, "loss_ce": 0.4504447281360626, "loss_xval": 0.05078125, "num_input_tokens_seen": 16810784, "step": 114 }, { "epoch": 0.03315076390890746, "grad_norm": 7.387134587767617, "learning_rate": 8.593617158482676e-05, "loss": 0.5444, "num_input_tokens_seen": 16945520, "step": 115 }, { "epoch": 0.03315076390890746, "loss": 0.560407280921936, "loss_ce": 0.507184624671936, "loss_xval": 0.05322265625, "num_input_tokens_seen": 16945520, "step": 115 }, { "epoch": 0.03343903142115884, "grad_norm": 6.797458685600256, "learning_rate": 8.609297904705301e-05, "loss": 0.5066, "num_input_tokens_seen": 17080640, "step": 116 }, { "epoch": 0.03343903142115884, "loss": 0.5233986377716064, "loss_ce": 0.47078636288642883, "loss_xval": 0.052734375, "num_input_tokens_seen": 17080640, "step": 116 }, { "epoch": 0.033727298933410206, "grad_norm": 18.86772578309578, "learning_rate": 8.624844050587858e-05, "loss": 0.5119, "num_input_tokens_seen": 17253208, "step": 117 }, { "epoch": 0.033727298933410206, "loss": 0.47286179661750793, "loss_ce": 0.42783311009407043, "loss_xval": 0.044921875, "num_input_tokens_seen": 17253208, "step": 117 }, { "epoch": 0.034015566445661574, "grad_norm": 8.023085850245478, "learning_rate": 8.640257887241806e-05, "loss": 0.5306, "num_input_tokens_seen": 17387920, "step": 118 }, { "epoch": 0.034015566445661574, "loss": 0.5475594997406006, "loss_ce": 0.4861886501312256, "loss_xval": 0.061279296875, "num_input_tokens_seen": 17387920, "step": 118 }, { "epoch": 0.03430383395791294, "grad_norm": 6.890785610877055, "learning_rate": 8.655541647774393e-05, "loss": 0.4852, "num_input_tokens_seen": 17522904, "step": 119 }, { "epoch": 0.03430383395791294, "loss": 0.5079690217971802, "loss_ce": 0.4557229280471802, "loss_xval": 0.05224609375, "num_input_tokens_seen": 17522904, "step": 119 }, { "epoch": 0.03459210147016431, "grad_norm": 11.321006785298598, "learning_rate": 8.67069750923027e-05, "loss": 0.4889, "num_input_tokens_seen": 17695320, "step": 120 }, { "epoch": 0.03459210147016431, "loss": 0.44401293992996216, "loss_ce": 0.40258532762527466, "loss_xval": 0.04150390625, "num_input_tokens_seen": 17695320, "step": 120 }, { "epoch": 0.03488036898241568, "grad_norm": 13.101992884551423, "learning_rate": 8.68572759445255e-05, "loss": 0.5142, "num_input_tokens_seen": 17830080, "step": 121 }, { "epoch": 0.03488036898241568, "loss": 0.5295823812484741, "loss_ce": 0.4632371664047241, "loss_xval": 0.06640625, "num_input_tokens_seen": 17830080, "step": 121 }, { "epoch": 0.035168636494667053, "grad_norm": 7.865552687957865, "learning_rate": 8.700633973867262e-05, "loss": 0.4699, "num_input_tokens_seen": 17965200, "step": 122 }, { "epoch": 0.035168636494667053, "loss": 0.49172335863113403, "loss_ce": 0.4352964162826538, "loss_xval": 0.056396484375, "num_input_tokens_seen": 17965200, "step": 122 }, { "epoch": 0.03545690400691842, "grad_norm": 8.34823959747774, "learning_rate": 8.715418667194984e-05, "loss": 0.4788, "num_input_tokens_seen": 18137616, "step": 123 }, { "epoch": 0.03545690400691842, "loss": 0.4347504675388336, "loss_ce": 0.3913239538669586, "loss_xval": 0.04345703125, "num_input_tokens_seen": 18137616, "step": 123 }, { "epoch": 0.03574517151916979, "grad_norm": 19.73006652507538, "learning_rate": 8.73008364509318e-05, "loss": 0.5042, "num_input_tokens_seen": 18272416, "step": 124 }, { "epoch": 0.03574517151916979, "loss": 0.5189793705940247, "loss_ce": 0.44375354051589966, "loss_xval": 0.0751953125, "num_input_tokens_seen": 18272416, "step": 124 }, { "epoch": 0.03603343903142116, "grad_norm": 7.744786447742085, "learning_rate": 8.744630830732546e-05, "loss": 0.4524, "num_input_tokens_seen": 18407576, "step": 125 }, { "epoch": 0.03603343903142116, "loss": 0.47686684131622314, "loss_ce": 0.42333900928497314, "loss_xval": 0.053466796875, "num_input_tokens_seen": 18407576, "step": 125 }, { "epoch": 0.036321706543672526, "grad_norm": 13.281162567225987, "learning_rate": 8.75906210131059e-05, "loss": 0.4611, "num_input_tokens_seen": 18580112, "step": 126 }, { "epoch": 0.036321706543672526, "loss": 0.41387027502059937, "loss_ce": 0.37471622228622437, "loss_xval": 0.0390625, "num_input_tokens_seen": 18580112, "step": 126 }, { "epoch": 0.036609974055923894, "grad_norm": 15.583006378460968, "learning_rate": 8.773379289505366e-05, "loss": 0.4688, "num_input_tokens_seen": 18714824, "step": 127 }, { "epoch": 0.036609974055923894, "loss": 0.4860740303993225, "loss_ce": 0.4178062081336975, "loss_xval": 0.068359375, "num_input_tokens_seen": 18714824, "step": 127 }, { "epoch": 0.03689824156817527, "grad_norm": 24.714679606403106, "learning_rate": 8.787584184872191e-05, "loss": 0.4365, "num_input_tokens_seen": 18850080, "step": 128 }, { "epoch": 0.03689824156817527, "loss": 0.45160603523254395, "loss_ce": 0.40163350105285645, "loss_xval": 0.050048828125, "num_input_tokens_seen": 18850080, "step": 128 }, { "epoch": 0.03718650908042664, "grad_norm": 8.236941080781937, "learning_rate": 8.801678535186036e-05, "loss": 0.4549, "num_input_tokens_seen": 19022432, "step": 129 }, { "epoch": 0.03718650908042664, "loss": 0.40648266673088074, "loss_ce": 0.35817334055900574, "loss_xval": 0.04833984375, "num_input_tokens_seen": 19022432, "step": 129 }, { "epoch": 0.037474776592678005, "grad_norm": 8.741920747317353, "learning_rate": 8.815664047732053e-05, "loss": 0.447, "num_input_tokens_seen": 19157248, "step": 130 }, { "epoch": 0.037474776592678005, "loss": 0.4431639313697815, "loss_ce": 0.3914671540260315, "loss_xval": 0.0517578125, "num_input_tokens_seen": 19157248, "step": 130 }, { "epoch": 0.03776304410492937, "grad_norm": 11.13044720072461, "learning_rate": 8.829542390546686e-05, "loss": 0.4074, "num_input_tokens_seen": 19292208, "step": 131 }, { "epoch": 0.03776304410492937, "loss": 0.43208760023117065, "loss_ce": 0.38637226819992065, "loss_xval": 0.045654296875, "num_input_tokens_seen": 19292208, "step": 131 }, { "epoch": 0.03805131161718074, "grad_norm": 7.227212476089253, "learning_rate": 8.843315193611575e-05, "loss": 0.4255, "num_input_tokens_seen": 19464680, "step": 132 }, { "epoch": 0.03805131161718074, "loss": 0.3858657479286194, "loss_ce": 0.3452315926551819, "loss_xval": 0.04052734375, "num_input_tokens_seen": 19464680, "step": 132 }, { "epoch": 0.038339579129432116, "grad_norm": 7.444913961645206, "learning_rate": 8.856984050002403e-05, "loss": 0.4414, "num_input_tokens_seen": 19599448, "step": 133 }, { "epoch": 0.038339579129432116, "loss": 0.4538487195968628, "loss_ce": 0.3836582899093628, "loss_xval": 0.0703125, "num_input_tokens_seen": 19599448, "step": 133 }, { "epoch": 0.038627846641683485, "grad_norm": 8.496712439073113, "learning_rate": 8.870550516994724e-05, "loss": 0.3984, "num_input_tokens_seen": 19734512, "step": 134 }, { "epoch": 0.038627846641683485, "loss": 0.4241769313812256, "loss_ce": 0.3713052272796631, "loss_xval": 0.052978515625, "num_input_tokens_seen": 19734512, "step": 134 }, { "epoch": 0.03891611415393485, "grad_norm": 10.293290447905349, "learning_rate": 8.884016117128679e-05, "loss": 0.3947, "num_input_tokens_seen": 19907056, "step": 135 }, { "epoch": 0.03891611415393485, "loss": 0.3680115342140198, "loss_ce": 0.3244324326515198, "loss_xval": 0.04345703125, "num_input_tokens_seen": 19907056, "step": 135 }, { "epoch": 0.03920438166618622, "grad_norm": 7.220263516510419, "learning_rate": 8.897382339234403e-05, "loss": 0.4308, "num_input_tokens_seen": 20041840, "step": 136 }, { "epoch": 0.03920438166618622, "loss": 0.44666624069213867, "loss_ce": 0.37360715866088867, "loss_xval": 0.0732421875, "num_input_tokens_seen": 20041840, "step": 136 }, { "epoch": 0.03949264917843759, "grad_norm": 8.10053991289463, "learning_rate": 8.910650639419907e-05, "loss": 0.3892, "num_input_tokens_seen": 20177016, "step": 137 }, { "epoch": 0.03949264917843759, "loss": 0.4107305407524109, "loss_ce": 0.3623601794242859, "loss_xval": 0.04833984375, "num_input_tokens_seen": 20177016, "step": 137 }, { "epoch": 0.03978091669068896, "grad_norm": 17.842547960477276, "learning_rate": 8.923822442023005e-05, "loss": 0.3782, "num_input_tokens_seen": 20349472, "step": 138 }, { "epoch": 0.03978091669068896, "loss": 0.3521159589290619, "loss_ce": 0.3164714276790619, "loss_xval": 0.03564453125, "num_input_tokens_seen": 20349472, "step": 138 }, { "epoch": 0.04006918420294033, "grad_norm": 6.227925946167984, "learning_rate": 8.936899140528881e-05, "loss": 0.405, "num_input_tokens_seen": 20484288, "step": 139 }, { "epoch": 0.04006918420294033, "loss": 0.4243292212486267, "loss_ce": 0.3574804663658142, "loss_xval": 0.06689453125, "num_input_tokens_seen": 20484288, "step": 139 }, { "epoch": 0.0403574517151917, "grad_norm": 19.35797505142977, "learning_rate": 8.949882098454784e-05, "loss": 0.3686, "num_input_tokens_seen": 20619376, "step": 140 }, { "epoch": 0.0403574517151917, "loss": 0.388784259557724, "loss_ce": 0.341237872838974, "loss_xval": 0.047607421875, "num_input_tokens_seen": 20619376, "step": 140 }, { "epoch": 0.04064571922744307, "grad_norm": 17.99154294533077, "learning_rate": 8.962772650203216e-05, "loss": 0.378, "num_input_tokens_seen": 20791928, "step": 141 }, { "epoch": 0.04064571922744307, "loss": 0.33298397064208984, "loss_ce": 0.29975032806396484, "loss_xval": 0.033203125, "num_input_tokens_seen": 20791928, "step": 141 }, { "epoch": 0.040933986739694436, "grad_norm": 16.729983956716715, "learning_rate": 8.97557210188498e-05, "loss": 0.3874, "num_input_tokens_seen": 20926808, "step": 142 }, { "epoch": 0.040933986739694436, "loss": 0.39781898260116577, "loss_ce": 0.34123939275741577, "loss_xval": 0.056640625, "num_input_tokens_seen": 20926808, "step": 142 }, { "epoch": 0.041222254251945804, "grad_norm": 20.555304511946005, "learning_rate": 8.988281732113356e-05, "loss": 0.3588, "num_input_tokens_seen": 21061904, "step": 143 }, { "epoch": 0.041222254251945804, "loss": 0.3827919661998749, "loss_ce": 0.3249458968639374, "loss_xval": 0.057861328125, "num_input_tokens_seen": 21061904, "step": 143 }, { "epoch": 0.04151052176419717, "grad_norm": 11.077156251039169, "learning_rate": 9.000902792770599e-05, "loss": 0.3618, "num_input_tokens_seen": 21234296, "step": 144 }, { "epoch": 0.04151052176419717, "loss": 0.3408665657043457, "loss_ce": 0.2942509651184082, "loss_xval": 0.046630859375, "num_input_tokens_seen": 21234296, "step": 144 }, { "epoch": 0.04179878927644855, "grad_norm": 7.537437415570367, "learning_rate": 9.013436509747905e-05, "loss": 0.3696, "num_input_tokens_seen": 21369176, "step": 145 }, { "epoch": 0.04179878927644855, "loss": 0.37368881702423096, "loss_ce": 0.31585800647735596, "loss_xval": 0.057861328125, "num_input_tokens_seen": 21369176, "step": 145 }, { "epoch": 0.042087056788699916, "grad_norm": 22.75373071527674, "learning_rate": 9.025884083659961e-05, "loss": 0.3445, "num_input_tokens_seen": 21504168, "step": 146 }, { "epoch": 0.042087056788699916, "loss": 0.3730278015136719, "loss_ce": 0.3152580261230469, "loss_xval": 0.057861328125, "num_input_tokens_seen": 21504168, "step": 146 }, { "epoch": 0.042375324300951284, "grad_norm": 8.750173390173343, "learning_rate": 9.038246690535104e-05, "loss": 0.3446, "num_input_tokens_seen": 21676760, "step": 147 }, { "epoch": 0.042375324300951284, "loss": 0.319990873336792, "loss_ce": 0.279219388961792, "loss_xval": 0.040771484375, "num_input_tokens_seen": 21676760, "step": 147 }, { "epoch": 0.04266359181320265, "grad_norm": 26.65442808332653, "learning_rate": 9.050525482482079e-05, "loss": 0.3634, "num_input_tokens_seen": 21811504, "step": 148 }, { "epoch": 0.04266359181320265, "loss": 0.3779270052909851, "loss_ce": 0.3187229037284851, "loss_xval": 0.05908203125, "num_input_tokens_seen": 21811504, "step": 148 }, { "epoch": 0.04295185932545402, "grad_norm": 9.765345028862136, "learning_rate": 9.062721588334353e-05, "loss": 0.328, "num_input_tokens_seen": 21946624, "step": 149 }, { "epoch": 0.04295185932545402, "loss": 0.35912925004959106, "loss_ce": 0.31243735551834106, "loss_xval": 0.046630859375, "num_input_tokens_seen": 21946624, "step": 149 }, { "epoch": 0.04324012683770539, "grad_norm": 19.40125110129968, "learning_rate": 9.074836114272874e-05, "loss": 0.3332, "num_input_tokens_seen": 22119168, "step": 150 }, { "epoch": 0.04324012683770539, "loss": 0.30681443214416504, "loss_ce": 0.26843857765197754, "loss_xval": 0.038330078125, "num_input_tokens_seen": 22119168, "step": 150 }, { "epoch": 0.04352839434995676, "grad_norm": 28.48434500578458, "learning_rate": 9.086870144428141e-05, "loss": 0.3599, "num_input_tokens_seen": 22254016, "step": 151 }, { "epoch": 0.04352839434995676, "loss": 0.37152981758117676, "loss_ce": 0.30808377265930176, "loss_xval": 0.0634765625, "num_input_tokens_seen": 22254016, "step": 151 }, { "epoch": 0.04381666186220813, "grad_norm": 7.423383361353423, "learning_rate": 9.098824741462412e-05, "loss": 0.318, "num_input_tokens_seen": 22389096, "step": 152 }, { "epoch": 0.04381666186220813, "loss": 0.35003936290740967, "loss_ce": 0.29791533946990967, "loss_xval": 0.05224609375, "num_input_tokens_seen": 22389096, "step": 152 }, { "epoch": 0.0441049293744595, "grad_norm": 27.525794406351512, "learning_rate": 9.110700947132808e-05, "loss": 0.3236, "num_input_tokens_seen": 22561800, "step": 153 }, { "epoch": 0.0441049293744595, "loss": 0.29091864824295044, "loss_ce": 0.25377875566482544, "loss_xval": 0.037109375, "num_input_tokens_seen": 22561800, "step": 153 }, { "epoch": 0.04439319688671087, "grad_norm": 11.347675941007187, "learning_rate": 9.12249978283609e-05, "loss": 0.3372, "num_input_tokens_seen": 22696600, "step": 154 }, { "epoch": 0.04439319688671087, "loss": 0.34830331802368164, "loss_ce": 0.28949594497680664, "loss_xval": 0.058837890625, "num_input_tokens_seen": 22696600, "step": 154 }, { "epoch": 0.044681464398962235, "grad_norm": 14.974404743639585, "learning_rate": 9.134222250135783e-05, "loss": 0.3118, "num_input_tokens_seen": 22831720, "step": 155 }, { "epoch": 0.044681464398962235, "loss": 0.33259499073028564, "loss_ce": 0.28230202198028564, "loss_xval": 0.05029296875, "num_input_tokens_seen": 22831720, "step": 155 }, { "epoch": 0.044969731911213603, "grad_norm": 14.382221862767326, "learning_rate": 9.145869331272382e-05, "loss": 0.3275, "num_input_tokens_seen": 23004192, "step": 156 }, { "epoch": 0.044969731911213603, "loss": 0.2943967580795288, "loss_ce": 0.2529538869857788, "loss_xval": 0.04150390625, "num_input_tokens_seen": 23004192, "step": 156 }, { "epoch": 0.04525799942346498, "grad_norm": 9.718539618915857, "learning_rate": 9.157441989657229e-05, "loss": 0.326, "num_input_tokens_seen": 23138984, "step": 157 }, { "epoch": 0.04525799942346498, "loss": 0.324166864156723, "loss_ce": 0.26999813318252563, "loss_xval": 0.05419921875, "num_input_tokens_seen": 23138984, "step": 157 }, { "epoch": 0.04554626693571635, "grad_norm": 26.006683493223683, "learning_rate": 9.168941170350729e-05, "loss": 0.3042, "num_input_tokens_seen": 23274144, "step": 158 }, { "epoch": 0.04554626693571635, "loss": 0.31835854053497314, "loss_ce": 0.27255165576934814, "loss_xval": 0.0458984375, "num_input_tokens_seen": 23274144, "step": 158 }, { "epoch": 0.045834534447967715, "grad_norm": 6.575263387107517, "learning_rate": 9.18036780052546e-05, "loss": 0.3047, "num_input_tokens_seen": 23446704, "step": 159 }, { "epoch": 0.045834534447967715, "loss": 0.2826117277145386, "loss_ce": 0.24026857316493988, "loss_xval": 0.042236328125, "num_input_tokens_seen": 23446704, "step": 159 }, { "epoch": 0.04612280196021908, "grad_norm": 22.489834908558066, "learning_rate": 9.191722789914795e-05, "loss": 0.3228, "num_input_tokens_seen": 23581568, "step": 160 }, { "epoch": 0.04612280196021908, "loss": 0.32048097252845764, "loss_ce": 0.26686158776283264, "loss_xval": 0.0537109375, "num_input_tokens_seen": 23581568, "step": 160 }, { "epoch": 0.04641106947247045, "grad_norm": 8.378774904416845, "learning_rate": 9.203007031247519e-05, "loss": 0.2805, "num_input_tokens_seen": 23716744, "step": 161 }, { "epoch": 0.04641106947247045, "loss": 0.29976916313171387, "loss_ce": 0.25937914848327637, "loss_xval": 0.040283203125, "num_input_tokens_seen": 23716744, "step": 161 }, { "epoch": 0.04669933698472182, "grad_norm": 7.8011862892598565, "learning_rate": 9.214221400669005e-05, "loss": 0.2955, "num_input_tokens_seen": 23889208, "step": 162 }, { "epoch": 0.04669933698472182, "loss": 0.25634777545928955, "loss_ce": 0.22622694075107574, "loss_xval": 0.0301513671875, "num_input_tokens_seen": 23889208, "step": 162 }, { "epoch": 0.046987604496973194, "grad_norm": 37.03342557004555, "learning_rate": 9.225366758149434e-05, "loss": 0.3199, "num_input_tokens_seen": 24023880, "step": 163 }, { "epoch": 0.046987604496973194, "loss": 0.3286735713481903, "loss_ce": 0.2644645869731903, "loss_xval": 0.064453125, "num_input_tokens_seen": 24023880, "step": 163 }, { "epoch": 0.04727587200922456, "grad_norm": 10.660900550482202, "learning_rate": 9.236443947879511e-05, "loss": 0.285, "num_input_tokens_seen": 24158960, "step": 164 }, { "epoch": 0.04727587200922456, "loss": 0.3078290522098541, "loss_ce": 0.2633801996707916, "loss_xval": 0.04443359375, "num_input_tokens_seen": 24158960, "step": 164 }, { "epoch": 0.04756413952147593, "grad_norm": 35.74031519686993, "learning_rate": 9.247453798654176e-05, "loss": 0.2879, "num_input_tokens_seen": 24331240, "step": 165 }, { "epoch": 0.04756413952147593, "loss": 0.2579522728919983, "loss_ce": 0.2230096459388733, "loss_xval": 0.034912109375, "num_input_tokens_seen": 24331240, "step": 165 }, { "epoch": 0.0478524070337273, "grad_norm": 10.026673974037537, "learning_rate": 9.25839712424472e-05, "loss": 0.3088, "num_input_tokens_seen": 24466016, "step": 166 }, { "epoch": 0.0478524070337273, "loss": 0.3210294842720032, "loss_ce": 0.2577055096626282, "loss_xval": 0.0634765625, "num_input_tokens_seen": 24466016, "step": 166 }, { "epoch": 0.048140674545978666, "grad_norm": 30.320189176787448, "learning_rate": 9.269274723759701e-05, "loss": 0.2739, "num_input_tokens_seen": 24601288, "step": 167 }, { "epoch": 0.048140674545978666, "loss": 0.29633641242980957, "loss_ce": 0.24610450863838196, "loss_xval": 0.05029296875, "num_input_tokens_seen": 24601288, "step": 167 }, { "epoch": 0.048428942058230035, "grad_norm": 11.424628650689558, "learning_rate": 9.280087381995114e-05, "loss": 0.2855, "num_input_tokens_seen": 24773760, "step": 168 }, { "epoch": 0.048428942058230035, "loss": 0.2584998905658722, "loss_ce": 0.2163856327533722, "loss_xval": 0.0419921875, "num_input_tokens_seen": 24773760, "step": 168 }, { "epoch": 0.04871720957048141, "grad_norm": 5.036083595566636, "learning_rate": 9.290835869774165e-05, "loss": 0.2927, "num_input_tokens_seen": 24908480, "step": 169 }, { "epoch": 0.04871720957048141, "loss": 0.2933216691017151, "loss_ce": 0.2397633194923401, "loss_xval": 0.053466796875, "num_input_tokens_seen": 24908480, "step": 169 }, { "epoch": 0.04900547708273278, "grad_norm": 27.447329626580203, "learning_rate": 9.301520944277006e-05, "loss": 0.2688, "num_input_tokens_seen": 25043544, "step": 170 }, { "epoch": 0.04900547708273278, "loss": 0.2911906838417053, "loss_ce": 0.23934133350849152, "loss_xval": 0.0517578125, "num_input_tokens_seen": 25043544, "step": 170 }, { "epoch": 0.049293744594984146, "grad_norm": 12.281459529846494, "learning_rate": 9.31214334936082e-05, "loss": 0.2634, "num_input_tokens_seen": 25216144, "step": 171 }, { "epoch": 0.049293744594984146, "loss": 0.23304098844528198, "loss_ce": 0.20543783903121948, "loss_xval": 0.027587890625, "num_input_tokens_seen": 25216144, "step": 171 }, { "epoch": 0.049582012107235514, "grad_norm": 31.969966090566412, "learning_rate": 9.32270381587056e-05, "loss": 0.2875, "num_input_tokens_seen": 25350928, "step": 172 }, { "epoch": 0.049582012107235514, "loss": 0.3008914291858673, "loss_ce": 0.2368045151233673, "loss_xval": 0.06396484375, "num_input_tokens_seen": 25350928, "step": 172 }, { "epoch": 0.04987027961948688, "grad_norm": 12.583456227749283, "learning_rate": 9.333203061940695e-05, "loss": 0.254, "num_input_tokens_seen": 25485920, "step": 173 }, { "epoch": 0.04987027961948688, "loss": 0.27169662714004517, "loss_ce": 0.22727830708026886, "loss_xval": 0.04443359375, "num_input_tokens_seen": 25485920, "step": 173 }, { "epoch": 0.05015854713173825, "grad_norm": 24.09416200574407, "learning_rate": 9.343641793288233e-05, "loss": 0.257, "num_input_tokens_seen": 25658352, "step": 174 }, { "epoch": 0.05015854713173825, "loss": 0.24052289128303528, "loss_ce": 0.20500043034553528, "loss_xval": 0.03564453125, "num_input_tokens_seen": 25658352, "step": 174 }, { "epoch": 0.050446814643989625, "grad_norm": 14.657839942966513, "learning_rate": 9.35402070349739e-05, "loss": 0.2797, "num_input_tokens_seen": 25793096, "step": 175 }, { "epoch": 0.050446814643989625, "loss": 0.27785447239875793, "loss_ce": 0.22736310958862305, "loss_xval": 0.050537109375, "num_input_tokens_seen": 25793096, "step": 175 }, { "epoch": 0.05073508215624099, "grad_norm": 19.861551728685, "learning_rate": 9.364340474296099e-05, "loss": 0.2496, "num_input_tokens_seen": 25928248, "step": 176 }, { "epoch": 0.05073508215624099, "loss": 0.26949241757392883, "loss_ce": 0.22600486874580383, "loss_xval": 0.04345703125, "num_input_tokens_seen": 25928248, "step": 176 }, { "epoch": 0.05102334966849236, "grad_norm": 16.02692098193626, "learning_rate": 9.374601775824737e-05, "loss": 0.2526, "num_input_tokens_seen": 26100808, "step": 177 }, { "epoch": 0.05102334966849236, "loss": 0.22894251346588135, "loss_ce": 0.19715844094753265, "loss_xval": 0.03173828125, "num_input_tokens_seen": 26100808, "step": 177 }, { "epoch": 0.05131161718074373, "grad_norm": 12.014856064552857, "learning_rate": 9.384805266897235e-05, "loss": 0.2682, "num_input_tokens_seen": 26235648, "step": 178 }, { "epoch": 0.05131161718074373, "loss": 0.27641862630844116, "loss_ce": 0.22081559896469116, "loss_xval": 0.0556640625, "num_input_tokens_seen": 26235648, "step": 178 }, { "epoch": 0.0515998846929951, "grad_norm": 17.513701257593887, "learning_rate": 9.394951595254911e-05, "loss": 0.2362, "num_input_tokens_seen": 26370816, "step": 179 }, { "epoch": 0.0515998846929951, "loss": 0.25344526767730713, "loss_ce": 0.21006453037261963, "loss_xval": 0.04345703125, "num_input_tokens_seen": 26370816, "step": 179 }, { "epoch": 0.051888152205246466, "grad_norm": 6.160409492259628, "learning_rate": 9.405041397813203e-05, "loss": 0.2412, "num_input_tokens_seen": 26543240, "step": 180 }, { "epoch": 0.051888152205246466, "loss": 0.21339178085327148, "loss_ce": 0.18534612655639648, "loss_xval": 0.028076171875, "num_input_tokens_seen": 26543240, "step": 180 }, { "epoch": 0.05217641971749784, "grad_norm": 20.522481305314027, "learning_rate": 9.415075300901591e-05, "loss": 0.2485, "num_input_tokens_seen": 26678072, "step": 181 }, { "epoch": 0.05217641971749784, "loss": 0.25265252590179443, "loss_ce": 0.21207940578460693, "loss_xval": 0.04052734375, "num_input_tokens_seen": 26678072, "step": 181 }, { "epoch": 0.05246468722974921, "grad_norm": 6.0065164673896705, "learning_rate": 9.425053920496895e-05, "loss": 0.2401, "num_input_tokens_seen": 26813064, "step": 182 }, { "epoch": 0.05246468722974921, "loss": 0.26624372601509094, "loss_ce": 0.21688155829906464, "loss_xval": 0.04931640625, "num_input_tokens_seen": 26813064, "step": 182 }, { "epoch": 0.05275295474200058, "grad_norm": 31.84749340439083, "learning_rate": 9.434977862450192e-05, "loss": 0.2362, "num_input_tokens_seen": 26985560, "step": 183 }, { "epoch": 0.05275295474200058, "loss": 0.22047150135040283, "loss_ce": 0.18293488025665283, "loss_xval": 0.03759765625, "num_input_tokens_seen": 26985560, "step": 183 }, { "epoch": 0.053041222254251945, "grad_norm": 5.518518017772564, "learning_rate": 9.44484772270753e-05, "loss": 0.2407, "num_input_tokens_seen": 27120344, "step": 184 }, { "epoch": 0.053041222254251945, "loss": 0.24068568646907806, "loss_ce": 0.19535183906555176, "loss_xval": 0.04541015625, "num_input_tokens_seen": 27120344, "step": 184 }, { "epoch": 0.05332948976650331, "grad_norm": 42.021018230561694, "learning_rate": 9.454664087524682e-05, "loss": 0.2365, "num_input_tokens_seen": 27255400, "step": 185 }, { "epoch": 0.05332948976650331, "loss": 0.257222980260849, "loss_ce": 0.207692950963974, "loss_xval": 0.049560546875, "num_input_tokens_seen": 27255400, "step": 185 }, { "epoch": 0.05361775727875468, "grad_norm": 6.447048391106758, "learning_rate": 9.464427533676112e-05, "loss": 0.2219, "num_input_tokens_seen": 27428048, "step": 186 }, { "epoch": 0.05361775727875468, "loss": 0.20040246844291687, "loss_ce": 0.17156335711479187, "loss_xval": 0.02880859375, "num_input_tokens_seen": 27428048, "step": 186 }, { "epoch": 0.053906024791006056, "grad_norm": 27.203776139916368, "learning_rate": 9.474138628658309e-05, "loss": 0.2491, "num_input_tokens_seen": 27562848, "step": 187 }, { "epoch": 0.053906024791006056, "loss": 0.2541777789592743, "loss_ce": 0.2019316852092743, "loss_xval": 0.05224609375, "num_input_tokens_seen": 27562848, "step": 187 }, { "epoch": 0.054194292303257424, "grad_norm": 15.749244789077474, "learning_rate": 9.48379793088774e-05, "loss": 0.2185, "num_input_tokens_seen": 27697960, "step": 188 }, { "epoch": 0.054194292303257424, "loss": 0.23451972007751465, "loss_ce": 0.19426703453063965, "loss_xval": 0.040283203125, "num_input_tokens_seen": 27697960, "step": 188 }, { "epoch": 0.05448255981550879, "grad_norm": 35.73821338153578, "learning_rate": 9.493405989893521e-05, "loss": 0.2304, "num_input_tokens_seen": 27870424, "step": 189 }, { "epoch": 0.05448255981550879, "loss": 0.19829484820365906, "loss_ce": 0.16846391558647156, "loss_xval": 0.02978515625, "num_input_tokens_seen": 27870424, "step": 189 }, { "epoch": 0.05477082732776016, "grad_norm": 9.214313193062441, "learning_rate": 9.502963346505015e-05, "loss": 0.2347, "num_input_tokens_seen": 28005296, "step": 190 }, { "epoch": 0.05477082732776016, "loss": 0.241441547870636, "loss_ce": 0.1921709179878235, "loss_xval": 0.04931640625, "num_input_tokens_seen": 28005296, "step": 190 }, { "epoch": 0.05505909484001153, "grad_norm": 31.476319932486625, "learning_rate": 9.512470533034511e-05, "loss": 0.2196, "num_input_tokens_seen": 28140448, "step": 191 }, { "epoch": 0.05505909484001153, "loss": 0.23726461827754974, "loss_ce": 0.19161030650138855, "loss_xval": 0.045654296875, "num_input_tokens_seen": 28140448, "step": 191 }, { "epoch": 0.0553473623522629, "grad_norm": 5.069492722780138, "learning_rate": 9.521928073455124e-05, "loss": 0.2243, "num_input_tokens_seen": 28312832, "step": 192 }, { "epoch": 0.0553473623522629, "loss": 0.1921926885843277, "loss_ce": 0.16705383360385895, "loss_xval": 0.025146484375, "num_input_tokens_seen": 28312832, "step": 192 }, { "epoch": 0.05563562986451427, "grad_norm": 14.335192287118785, "learning_rate": 9.531336483574082e-05, "loss": 0.23, "num_input_tokens_seen": 28447600, "step": 193 }, { "epoch": 0.05563562986451427, "loss": 0.2333214282989502, "loss_ce": 0.1864311695098877, "loss_xval": 0.046875, "num_input_tokens_seen": 28447600, "step": 193 }, { "epoch": 0.05592389737676564, "grad_norm": 20.486809993451025, "learning_rate": 9.540696271201525e-05, "loss": 0.2041, "num_input_tokens_seen": 28582696, "step": 194 }, { "epoch": 0.05592389737676564, "loss": 0.22749033570289612, "loss_ce": 0.18444529175758362, "loss_xval": 0.04296875, "num_input_tokens_seen": 28582696, "step": 194 }, { "epoch": 0.05621216488901701, "grad_norm": 34.51668792130806, "learning_rate": 9.550007936314986e-05, "loss": 0.212, "num_input_tokens_seen": 28755208, "step": 195 }, { "epoch": 0.05621216488901701, "loss": 0.18501949310302734, "loss_ce": 0.15514278411865234, "loss_xval": 0.0299072265625, "num_input_tokens_seen": 28755208, "step": 195 }, { "epoch": 0.056500432401268376, "grad_norm": 17.745854925138033, "learning_rate": 9.559271971219628e-05, "loss": 0.2279, "num_input_tokens_seen": 28890120, "step": 196 }, { "epoch": 0.056500432401268376, "loss": 0.241400346159935, "loss_ce": 0.184027299284935, "loss_xval": 0.057373046875, "num_input_tokens_seen": 28890120, "step": 196 }, { "epoch": 0.056788699913519744, "grad_norm": 54.68747170055578, "learning_rate": 9.568488860704453e-05, "loss": 0.2097, "num_input_tokens_seen": 29025136, "step": 197 }, { "epoch": 0.056788699913519744, "loss": 0.21792283654212952, "loss_ce": 0.17619004845619202, "loss_xval": 0.041748046875, "num_input_tokens_seen": 29025136, "step": 197 }, { "epoch": 0.05707696742577112, "grad_norm": 15.10090377209552, "learning_rate": 9.577659082194505e-05, "loss": 0.1959, "num_input_tokens_seen": 29197896, "step": 198 }, { "epoch": 0.05707696742577112, "loss": 0.17415854334831238, "loss_ce": 0.15132376551628113, "loss_xval": 0.0228271484375, "num_input_tokens_seen": 29197896, "step": 198 }, { "epoch": 0.05736523493802249, "grad_norm": 37.689905888594836, "learning_rate": 9.586783105899282e-05, "loss": 0.2207, "num_input_tokens_seen": 29332752, "step": 199 }, { "epoch": 0.05736523493802249, "loss": 0.22800716757774353, "loss_ce": 0.17243465781211853, "loss_xval": 0.0556640625, "num_input_tokens_seen": 29332752, "step": 199 }, { "epoch": 0.057653502450273855, "grad_norm": 13.807671874727676, "learning_rate": 9.595861394957398e-05, "loss": 0.1868, "num_input_tokens_seen": 29467792, "step": 200 }, { "epoch": 0.057653502450273855, "loss": 0.20190562307834625, "loss_ce": 0.16923655569553375, "loss_xval": 0.03271484375, "num_input_tokens_seen": 29467792, "step": 200 }, { "epoch": 0.05794176996252522, "grad_norm": 44.84280562592291, "learning_rate": 9.604894405577657e-05, "loss": 0.2107, "num_input_tokens_seen": 29640320, "step": 201 }, { "epoch": 0.05794176996252522, "loss": 0.1847914755344391, "loss_ce": 0.1469191610813141, "loss_xval": 0.037841796875, "num_input_tokens_seen": 29640320, "step": 201 }, { "epoch": 0.05823003747477659, "grad_norm": 28.05197649963492, "learning_rate": 9.613882587176614e-05, "loss": 0.2106, "num_input_tokens_seen": 29775112, "step": 202 }, { "epoch": 0.05823003747477659, "loss": 0.21230345964431763, "loss_ce": 0.16901427507400513, "loss_xval": 0.043212890625, "num_input_tokens_seen": 29775112, "step": 202 }, { "epoch": 0.05851830498702796, "grad_norm": 44.536626160823914, "learning_rate": 9.622826382512748e-05, "loss": 0.1915, "num_input_tokens_seen": 29910208, "step": 203 }, { "epoch": 0.05851830498702796, "loss": 0.20585274696350098, "loss_ce": 0.16460824012756348, "loss_xval": 0.041259765625, "num_input_tokens_seen": 29910208, "step": 203 }, { "epoch": 0.058806572499279335, "grad_norm": 61.770898696786716, "learning_rate": 9.631726227817333e-05, "loss": 0.204, "num_input_tokens_seen": 30082816, "step": 204 }, { "epoch": 0.058806572499279335, "loss": 0.18048658967018127, "loss_ce": 0.14203444123268127, "loss_xval": 0.03857421875, "num_input_tokens_seen": 30082816, "step": 204 }, { "epoch": 0.0590948400115307, "grad_norm": 5.133503721665133, "learning_rate": 9.640582552922112e-05, "loss": 0.2018, "num_input_tokens_seen": 30217600, "step": 205 }, { "epoch": 0.0590948400115307, "loss": 0.21230003237724304, "loss_ce": 0.16855308413505554, "loss_xval": 0.043701171875, "num_input_tokens_seen": 30217600, "step": 205 }, { "epoch": 0.05938310752378207, "grad_norm": 41.13106645832814, "learning_rate": 9.64939578138386e-05, "loss": 0.1953, "num_input_tokens_seen": 30352520, "step": 206 }, { "epoch": 0.05938310752378207, "loss": 0.20415394008159637, "loss_ce": 0.16251270473003387, "loss_xval": 0.041748046875, "num_input_tokens_seen": 30352520, "step": 206 }, { "epoch": 0.05967137503603344, "grad_norm": 7.532245333884863, "learning_rate": 9.658166330605936e-05, "loss": 0.1872, "num_input_tokens_seen": 30525000, "step": 207 }, { "epoch": 0.05967137503603344, "loss": 0.16164681315422058, "loss_ce": 0.13517281413078308, "loss_xval": 0.0264892578125, "num_input_tokens_seen": 30525000, "step": 207 }, { "epoch": 0.05995964254828481, "grad_norm": 68.5759865275484, "learning_rate": 9.666894611956906e-05, "loss": 0.2288, "num_input_tokens_seen": 30659808, "step": 208 }, { "epoch": 0.05995964254828481, "loss": 0.23044301569461823, "loss_ce": 0.16342641413211823, "loss_xval": 0.06689453125, "num_input_tokens_seen": 30659808, "step": 208 }, { "epoch": 0.060247910060536175, "grad_norm": 41.614219942429486, "learning_rate": 9.67558103088632e-05, "loss": 0.1869, "num_input_tokens_seen": 30794960, "step": 209 }, { "epoch": 0.060247910060536175, "loss": 0.1920596957206726, "loss_ce": 0.1534244418144226, "loss_xval": 0.03857421875, "num_input_tokens_seen": 30794960, "step": 209 }, { "epoch": 0.06053617757278755, "grad_norm": 54.20840230591495, "learning_rate": 9.684225987037716e-05, "loss": 0.1939, "num_input_tokens_seen": 30967408, "step": 210 }, { "epoch": 0.06053617757278755, "loss": 0.1660287082195282, "loss_ce": 0.1323220431804657, "loss_xval": 0.03369140625, "num_input_tokens_seen": 30967408, "step": 210 }, { "epoch": 0.06082444508503892, "grad_norm": 78.17547740458902, "learning_rate": 9.692829874358969e-05, "loss": 0.2254, "num_input_tokens_seen": 31102128, "step": 211 }, { "epoch": 0.06082444508503892, "loss": 0.21866479516029358, "loss_ce": 0.15173974633216858, "loss_xval": 0.06689453125, "num_input_tokens_seen": 31102128, "step": 211 }, { "epoch": 0.061112712597290286, "grad_norm": 31.53452634225132, "learning_rate": 9.701393081209986e-05, "loss": 0.1722, "num_input_tokens_seen": 31237240, "step": 212 }, { "epoch": 0.061112712597290286, "loss": 0.1789591759443283, "loss_ce": 0.1464884728193283, "loss_xval": 0.032470703125, "num_input_tokens_seen": 31237240, "step": 212 }, { "epoch": 0.061400980109541654, "grad_norm": 113.61471944977563, "learning_rate": 9.709915990467911e-05, "loss": 0.2605, "num_input_tokens_seen": 31409720, "step": 213 }, { "epoch": 0.061400980109541654, "loss": 0.20028024911880493, "loss_ce": 0.13012033700942993, "loss_xval": 0.0703125, "num_input_tokens_seen": 31409720, "step": 213 }, { "epoch": 0.06168924762179302, "grad_norm": 13.533712490533789, "learning_rate": 9.718398979629844e-05, "loss": 0.1859, "num_input_tokens_seen": 31544496, "step": 214 }, { "epoch": 0.06168924762179302, "loss": 0.18483048677444458, "loss_ce": 0.14607316255569458, "loss_xval": 0.038818359375, "num_input_tokens_seen": 31544496, "step": 214 }, { "epoch": 0.06197751513404439, "grad_norm": 153.1001234370429, "learning_rate": 9.726842420913164e-05, "loss": 0.3144, "num_input_tokens_seen": 31679576, "step": 215 }, { "epoch": 0.06197751513404439, "loss": 0.24575066566467285, "loss_ce": 0.13930535316467285, "loss_xval": 0.1064453125, "num_input_tokens_seen": 31679576, "step": 215 }, { "epoch": 0.062265782646295766, "grad_norm": 114.7913479047939, "learning_rate": 9.735246681353531e-05, "loss": 0.2533, "num_input_tokens_seen": 31852216, "step": 216 }, { "epoch": 0.062265782646295766, "loss": 0.1870504915714264, "loss_ce": 0.12320771813392639, "loss_xval": 0.06396484375, "num_input_tokens_seen": 31852216, "step": 216 }, { "epoch": 0.06255405015854713, "grad_norm": 127.62253929979651, "learning_rate": 9.743612122900626e-05, "loss": 0.2881, "num_input_tokens_seen": 31987000, "step": 217 }, { "epoch": 0.06255405015854713, "loss": 0.2577114701271057, "loss_ce": 0.14381985366344452, "loss_xval": 0.11376953125, "num_input_tokens_seen": 31987000, "step": 217 }, { "epoch": 0.0628423176707985, "grad_norm": 235.00377257321716, "learning_rate": 9.751939102511684e-05, "loss": 0.4947, "num_input_tokens_seen": 32122032, "step": 218 }, { "epoch": 0.0628423176707985, "loss": 0.3868434727191925, "loss_ce": 0.14050555229187012, "loss_xval": 0.24609375, "num_input_tokens_seen": 32122032, "step": 218 }, { "epoch": 0.06313058518304987, "grad_norm": 12.837730790336758, "learning_rate": 9.760227972242893e-05, "loss": 0.1585, "num_input_tokens_seen": 32294488, "step": 219 }, { "epoch": 0.06313058518304987, "loss": 0.1404755860567093, "loss_ce": 0.11587841808795929, "loss_xval": 0.024658203125, "num_input_tokens_seen": 32294488, "step": 219 }, { "epoch": 0.06341885269530124, "grad_norm": 265.78254131812344, "learning_rate": 9.768479079338703e-05, "loss": 0.6116, "num_input_tokens_seen": 32429264, "step": 220 }, { "epoch": 0.06341885269530124, "loss": 0.46776074171066284, "loss_ce": 0.14061228930950165, "loss_xval": 0.328125, "num_input_tokens_seen": 32429264, "step": 220 }, { "epoch": 0.0637071202075526, "grad_norm": 161.92075515006673, "learning_rate": 9.776692766319115e-05, "loss": 0.3099, "num_input_tokens_seen": 32564304, "step": 221 }, { "epoch": 0.0637071202075526, "loss": 0.30831146240234375, "loss_ce": 0.12667082250118256, "loss_xval": 0.181640625, "num_input_tokens_seen": 32564304, "step": 221 }, { "epoch": 0.06399538771980398, "grad_norm": 213.97148511685998, "learning_rate": 9.784869371065011e-05, "loss": 0.4637, "num_input_tokens_seen": 32736864, "step": 222 }, { "epoch": 0.06399538771980398, "loss": 0.28785890340805054, "loss_ce": 0.11268798261880875, "loss_xval": 0.1748046875, "num_input_tokens_seen": 32736864, "step": 222 }, { "epoch": 0.06428365523205534, "grad_norm": 279.0437182260339, "learning_rate": 9.793009226901534e-05, "loss": 0.6737, "num_input_tokens_seen": 32871640, "step": 223 }, { "epoch": 0.06428365523205534, "loss": 0.5465670824050903, "loss_ce": 0.14129364490509033, "loss_xval": 0.40625, "num_input_tokens_seen": 32871640, "step": 223 }, { "epoch": 0.06457192274430672, "grad_norm": 148.45369429539775, "learning_rate": 9.801112662679638e-05, "loss": 0.3168, "num_input_tokens_seen": 33006704, "step": 224 }, { "epoch": 0.06457192274430672, "loss": 0.21143169701099396, "loss_ce": 0.12302227318286896, "loss_xval": 0.08837890625, "num_input_tokens_seen": 33006704, "step": 224 }, { "epoch": 0.06486019025655809, "grad_norm": 444.77590098553196, "learning_rate": 9.809180002855806e-05, "loss": 1.4688, "num_input_tokens_seen": 33179216, "step": 225 }, { "epoch": 0.06486019025655809, "loss": 0.8774189949035645, "loss_ce": 0.10886429250240326, "loss_xval": 0.76953125, "num_input_tokens_seen": 33179216, "step": 225 }, { "epoch": 0.06514845776880945, "grad_norm": 31.205263410938144, "learning_rate": 9.817211567569992e-05, "loss": 0.1857, "num_input_tokens_seen": 33313952, "step": 226 }, { "epoch": 0.06514845776880945, "loss": 0.18774542212486267, "loss_ce": 0.13681158423423767, "loss_xval": 0.051025390625, "num_input_tokens_seen": 33313952, "step": 226 }, { "epoch": 0.06543672528106083, "grad_norm": 598.3519060203606, "learning_rate": 9.825207672721861e-05, "loss": 2.4385, "num_input_tokens_seen": 33449056, "step": 227 }, { "epoch": 0.06543672528106083, "loss": 1.5101361274719238, "loss_ce": 0.12537051737308502, "loss_xval": 1.3828125, "num_input_tokens_seen": 33449056, "step": 227 }, { "epoch": 0.06572499279331219, "grad_norm": 384.2944927117941, "learning_rate": 9.833168630045344e-05, "loss": 1.1605, "num_input_tokens_seen": 33621544, "step": 228 }, { "epoch": 0.06572499279331219, "loss": 0.7385019063949585, "loss_ce": 0.11203701794147491, "loss_xval": 0.625, "num_input_tokens_seen": 33621544, "step": 228 }, { "epoch": 0.06601326030556356, "grad_norm": 505.26874849553155, "learning_rate": 9.841094747181556e-05, "loss": 1.8865, "num_input_tokens_seen": 33756344, "step": 229 }, { "epoch": 0.06601326030556356, "loss": 1.398691177368164, "loss_ce": 0.13599595427513123, "loss_xval": 1.265625, "num_input_tokens_seen": 33756344, "step": 229 }, { "epoch": 0.06630152781781493, "grad_norm": 634.3239419924421, "learning_rate": 9.848986327750132e-05, "loss": 2.831, "num_input_tokens_seen": 33891360, "step": 230 }, { "epoch": 0.06630152781781493, "loss": 1.8530738353729248, "loss_ce": 0.1235816702246666, "loss_xval": 1.7265625, "num_input_tokens_seen": 33891360, "step": 230 }, { "epoch": 0.0665897953300663, "grad_norm": 320.3658674116094, "learning_rate": 9.856843671419021e-05, "loss": 0.8727, "num_input_tokens_seen": 34063912, "step": 231 }, { "epoch": 0.0665897953300663, "loss": 0.7934414744377136, "loss_ce": 0.11863675713539124, "loss_xval": 0.67578125, "num_input_tokens_seen": 34063912, "step": 231 }, { "epoch": 0.06687806284231768, "grad_norm": 714.0511131829795, "learning_rate": 9.864667073972757e-05, "loss": 3.6725, "num_input_tokens_seen": 34198672, "step": 232 }, { "epoch": 0.06687806284231768, "loss": 2.818943500518799, "loss_ce": 0.13925601541996002, "loss_xval": 2.6875, "num_input_tokens_seen": 34198672, "step": 232 }, { "epoch": 0.06716633035456904, "grad_norm": 193.6951410568273, "learning_rate": 9.872456827379282e-05, "loss": 0.4295, "num_input_tokens_seen": 34333616, "step": 233 }, { "epoch": 0.06716633035456904, "loss": 0.4650017321109772, "loss_ce": 0.13553395867347717, "loss_xval": 0.330078125, "num_input_tokens_seen": 34333616, "step": 233 }, { "epoch": 0.06745459786682041, "grad_norm": 719.8870289760067, "learning_rate": 9.880213219855314e-05, "loss": 3.872, "num_input_tokens_seen": 34506024, "step": 234 }, { "epoch": 0.06745459786682041, "loss": 2.788909435272217, "loss_ce": 0.12094061076641083, "loss_xval": 2.671875, "num_input_tokens_seen": 34506024, "step": 234 }, { "epoch": 0.06774286537907177, "grad_norm": 181.98869361563385, "learning_rate": 9.887936535930344e-05, "loss": 0.4589, "num_input_tokens_seen": 34640728, "step": 235 }, { "epoch": 0.06774286537907177, "loss": 0.5512518882751465, "loss_ce": 0.14207220077514648, "loss_xval": 0.41015625, "num_input_tokens_seen": 34640728, "step": 235 }, { "epoch": 0.06803113289132315, "grad_norm": 703.1804675651726, "learning_rate": 9.895627056509262e-05, "loss": 4.0544, "num_input_tokens_seen": 34775784, "step": 236 }, { "epoch": 0.06803113289132315, "loss": 3.0673837661743164, "loss_ce": 0.14746198058128357, "loss_xval": 2.921875, "num_input_tokens_seen": 34775784, "step": 236 }, { "epoch": 0.06831940040357452, "grad_norm": 192.37560557603808, "learning_rate": 9.90328505893366e-05, "loss": 0.5742, "num_input_tokens_seen": 34948304, "step": 237 }, { "epoch": 0.06831940040357452, "loss": 0.7511662244796753, "loss_ce": 0.14130297303199768, "loss_xval": 0.609375, "num_input_tokens_seen": 34948304, "step": 237 }, { "epoch": 0.06860766791582588, "grad_norm": 670.4064003150474, "learning_rate": 9.91091081704185e-05, "loss": 3.8927, "num_input_tokens_seen": 35083072, "step": 238 }, { "epoch": 0.06860766791582588, "loss": 3.181490898132324, "loss_ce": 0.16391250491142273, "loss_xval": 3.015625, "num_input_tokens_seen": 35083072, "step": 238 }, { "epoch": 0.06889593542807726, "grad_norm": 184.37943098250278, "learning_rate": 9.918504601227611e-05, "loss": 0.5801, "num_input_tokens_seen": 35218160, "step": 239 }, { "epoch": 0.06889593542807726, "loss": 0.808303713798523, "loss_ce": 0.15058887004852295, "loss_xval": 0.65625, "num_input_tokens_seen": 35218160, "step": 239 }, { "epoch": 0.06918420294032862, "grad_norm": 486.97199962741814, "learning_rate": 9.926066678497726e-05, "loss": 2.5227, "num_input_tokens_seen": 35390600, "step": 240 }, { "epoch": 0.06918420294032862, "loss": 2.4577555656433105, "loss_ce": 0.15697449445724487, "loss_xval": 2.296875, "num_input_tokens_seen": 35390600, "step": 240 }, { "epoch": 0.06947247045258, "grad_norm": 291.8847677589011, "learning_rate": 9.933597312528319e-05, "loss": 1.0913, "num_input_tokens_seen": 35525384, "step": 241 }, { "epoch": 0.06947247045258, "loss": 1.3332394361495972, "loss_ce": 0.2033565640449524, "loss_xval": 1.1328125, "num_input_tokens_seen": 35525384, "step": 241 }, { "epoch": 0.06976073796483136, "grad_norm": 248.55809130995584, "learning_rate": 9.941096763720006e-05, "loss": 0.8804, "num_input_tokens_seen": 35660456, "step": 242 }, { "epoch": 0.06976073796483136, "loss": 0.9395829439163208, "loss_ce": 0.1744462549686432, "loss_xval": 0.765625, "num_input_tokens_seen": 35660456, "step": 242 }, { "epoch": 0.07004900547708273, "grad_norm": 344.70726428259974, "learning_rate": 9.948565289251937e-05, "loss": 1.5722, "num_input_tokens_seen": 35832752, "step": 243 }, { "epoch": 0.07004900547708273, "loss": 1.5424445867538452, "loss_ce": 0.19088204205036163, "loss_xval": 1.3515625, "num_input_tokens_seen": 35832752, "step": 243 }, { "epoch": 0.07033727298933411, "grad_norm": 35.03505347015722, "learning_rate": 9.956003143134718e-05, "loss": 0.2743, "num_input_tokens_seen": 35967456, "step": 244 }, { "epoch": 0.07033727298933411, "loss": 0.3062777519226074, "loss_ce": 0.2516207695007324, "loss_xval": 0.0546875, "num_input_tokens_seen": 35967456, "step": 244 }, { "epoch": 0.07062554050158547, "grad_norm": 324.27535084781664, "learning_rate": 9.963410576262232e-05, "loss": 1.4836, "num_input_tokens_seen": 36102544, "step": 245 }, { "epoch": 0.07062554050158547, "loss": 1.3415637016296387, "loss_ce": 0.18629023432731628, "loss_xval": 1.15625, "num_input_tokens_seen": 36102544, "step": 245 }, { "epoch": 0.07091380801383684, "grad_norm": 192.40093602923258, "learning_rate": 9.97078783646244e-05, "loss": 0.7443, "num_input_tokens_seen": 36275024, "step": 246 }, { "epoch": 0.07091380801383684, "loss": 0.865687370300293, "loss_ce": 0.19967174530029297, "loss_xval": 0.6640625, "num_input_tokens_seen": 36275024, "step": 246 }, { "epoch": 0.0712020755260882, "grad_norm": 169.35354206434812, "learning_rate": 9.978135168547127e-05, "loss": 0.6059, "num_input_tokens_seen": 36409848, "step": 247 }, { "epoch": 0.0712020755260882, "loss": 0.6389449834823608, "loss_ce": 0.2558884024620056, "loss_xval": 0.3828125, "num_input_tokens_seen": 36409848, "step": 247 }, { "epoch": 0.07149034303833958, "grad_norm": 236.26593297962225, "learning_rate": 9.985452814360636e-05, "loss": 0.9012, "num_input_tokens_seen": 36544976, "step": 248 }, { "epoch": 0.07149034303833958, "loss": 1.0095927715301514, "loss_ce": 0.1878153681755066, "loss_xval": 0.8203125, "num_input_tokens_seen": 36544976, "step": 248 }, { "epoch": 0.07177861055059095, "grad_norm": 48.60897485178449, "learning_rate": 9.992741012827652e-05, "loss": 0.2498, "num_input_tokens_seen": 36717640, "step": 249 }, { "epoch": 0.07177861055059095, "loss": 0.20433716475963593, "loss_ce": 0.17581848800182343, "loss_xval": 0.028564453125, "num_input_tokens_seen": 36717640, "step": 249 }, { "epoch": 0.07206687806284232, "grad_norm": 179.656878403396, "learning_rate": 0.0001, "loss": 0.6386, "num_input_tokens_seen": 36852424, "step": 250 }, { "epoch": 0.07206687806284232, "eval_websight_new_IoU": 0.0, "eval_websight_new_MAE_x": 0.6111758947372437, "eval_websight_new_MAE_y": 0.5629599392414093, "eval_websight_new_NUM_probability": 0.6707842648029327, "eval_websight_new_inside_bbox": 0.0, "eval_websight_new_loss": 0.5729646682739258, "eval_websight_new_loss_ce": 0.1514492705464363, "eval_websight_new_loss_xval": 0.4156494140625, "eval_websight_new_runtime": 35.618, "eval_websight_new_samples_per_second": 1.404, "eval_websight_new_steps_per_second": 0.056, "num_input_tokens_seen": 36852424, "step": 250 }, { "epoch": 0.07206687806284232, "eval_seeclick_IoU": 0.0, "eval_seeclick_MAE_x": 0.5661886930465698, "eval_seeclick_MAE_y": 0.5124521255493164, "eval_seeclick_NUM_probability": 0.6034610271453857, "eval_seeclick_inside_bbox": 0.0, "eval_seeclick_loss": 0.5685375332832336, "eval_seeclick_loss_ce": 0.22021447867155075, "eval_seeclick_loss_xval": 0.3525390625, "eval_seeclick_runtime": 64.6475, "eval_seeclick_samples_per_second": 0.773, "eval_seeclick_steps_per_second": 0.031, "num_input_tokens_seen": 36852424, "step": 250 }, { "epoch": 0.07206687806284232, "eval_icons_IoU": 0.0, "eval_icons_MAE_x": 0.6235586404800415, "eval_icons_MAE_y": 0.6090098023414612, "eval_icons_NUM_probability": 0.7112331092357635, "eval_icons_inside_bbox": 0.0, "eval_icons_loss": 0.5959538817405701, "eval_icons_loss_ce": 0.18405180424451828, "eval_icons_loss_xval": 0.41455078125, "eval_icons_runtime": 64.7835, "eval_icons_samples_per_second": 0.772, "eval_icons_steps_per_second": 0.031, "num_input_tokens_seen": 36852424, "step": 250 }, { "epoch": 0.07206687806284232, "loss": 0.5958322882652283, "loss_ce": 0.18811748921871185, "loss_xval": 0.408203125, "num_input_tokens_seen": 36852424, "step": 250 }, { "epoch": 0.07235514557509369, "grad_norm": 178.65768699117723, "learning_rate": 0.0001, "loss": 0.5964, "num_input_tokens_seen": 36987608, "step": 251 }, { "epoch": 0.07235514557509369, "loss": 0.589705228805542, "loss_ce": 0.15757633745670319, "loss_xval": 0.431640625, "num_input_tokens_seen": 36987608, "step": 251 }, { "epoch": 0.07264341308734505, "grad_norm": 38.66699265104924, "learning_rate": 0.0001, "loss": 0.2277, "num_input_tokens_seen": 37160176, "step": 252 }, { "epoch": 0.07264341308734505, "loss": 0.20238327980041504, "loss_ce": 0.17584824562072754, "loss_xval": 0.0264892578125, "num_input_tokens_seen": 37160176, "step": 252 }, { "epoch": 0.07293168059959643, "grad_norm": 185.87773020429194, "learning_rate": 0.0001, "loss": 0.636, "num_input_tokens_seen": 37295120, "step": 253 }, { "epoch": 0.07293168059959643, "loss": 0.6577854752540588, "loss_ce": 0.20099833607673645, "loss_xval": 0.45703125, "num_input_tokens_seen": 37295120, "step": 253 }, { "epoch": 0.07321994811184779, "grad_norm": 102.75700880750043, "learning_rate": 0.0001, "loss": 0.3128, "num_input_tokens_seen": 37430168, "step": 254 }, { "epoch": 0.07321994811184779, "loss": 0.2869405746459961, "loss_ce": 0.1482686698436737, "loss_xval": 0.138671875, "num_input_tokens_seen": 37430168, "step": 254 }, { "epoch": 0.07350821562409916, "grad_norm": 87.97296152813514, "learning_rate": 0.0001, "loss": 0.3084, "num_input_tokens_seen": 37602528, "step": 255 }, { "epoch": 0.07350821562409916, "loss": 0.28620052337646484, "loss_ce": 0.17829035222530365, "loss_xval": 0.10791015625, "num_input_tokens_seen": 37602528, "step": 255 }, { "epoch": 0.07379648313635054, "grad_norm": 164.32807477823272, "learning_rate": 0.0001, "loss": 0.539, "num_input_tokens_seen": 37737296, "step": 256 }, { "epoch": 0.07379648313635054, "loss": 0.5527489185333252, "loss_ce": 0.1921532154083252, "loss_xval": 0.361328125, "num_input_tokens_seen": 37737296, "step": 256 }, { "epoch": 0.0740847506486019, "grad_norm": 29.183798965456198, "learning_rate": 0.0001, "loss": 0.1747, "num_input_tokens_seen": 37872304, "step": 257 }, { "epoch": 0.0740847506486019, "loss": 0.1750638484954834, "loss_ce": 0.1369626522064209, "loss_xval": 0.0380859375, "num_input_tokens_seen": 37872304, "step": 257 }, { "epoch": 0.07437301816085327, "grad_norm": 113.89670618085759, "learning_rate": 0.0001, "loss": 0.3648, "num_input_tokens_seen": 38044744, "step": 258 }, { "epoch": 0.07437301816085327, "loss": 0.37308743596076965, "loss_ce": 0.15116359293460846, "loss_xval": 0.2216796875, "num_input_tokens_seen": 38044744, "step": 258 }, { "epoch": 0.07466128567310464, "grad_norm": 114.85316241618703, "learning_rate": 0.0001, "loss": 0.3363, "num_input_tokens_seen": 38179512, "step": 259 }, { "epoch": 0.07466128567310464, "loss": 0.34536540508270264, "loss_ce": 0.16409097611904144, "loss_xval": 0.181640625, "num_input_tokens_seen": 38179512, "step": 259 }, { "epoch": 0.07494955318535601, "grad_norm": 14.752637398271474, "learning_rate": 0.0001, "loss": 0.1474, "num_input_tokens_seen": 38314592, "step": 260 }, { "epoch": 0.07494955318535601, "loss": 0.13977420330047607, "loss_ce": 0.12003696709871292, "loss_xval": 0.019775390625, "num_input_tokens_seen": 38314592, "step": 260 }, { "epoch": 0.07523782069760739, "grad_norm": 120.3078403264174, "learning_rate": 0.0001, "loss": 0.3626, "num_input_tokens_seen": 38487080, "step": 261 }, { "epoch": 0.07523782069760739, "loss": 0.31943660974502563, "loss_ce": 0.12986139953136444, "loss_xval": 0.189453125, "num_input_tokens_seen": 38487080, "step": 261 }, { "epoch": 0.07552608820985875, "grad_norm": 79.9536443457315, "learning_rate": 0.0001, "loss": 0.2404, "num_input_tokens_seen": 38621784, "step": 262 }, { "epoch": 0.07552608820985875, "loss": 0.2604348063468933, "loss_ce": 0.1555764079093933, "loss_xval": 0.10498046875, "num_input_tokens_seen": 38621784, "step": 262 }, { "epoch": 0.07581435572211012, "grad_norm": 67.24270419291493, "learning_rate": 0.0001, "loss": 0.1914, "num_input_tokens_seen": 38756824, "step": 263 }, { "epoch": 0.07581435572211012, "loss": 0.17697831988334656, "loss_ce": 0.11179277300834656, "loss_xval": 0.0654296875, "num_input_tokens_seen": 38756824, "step": 263 }, { "epoch": 0.07610262323436148, "grad_norm": 109.17180729202337, "learning_rate": 0.0001, "loss": 0.3252, "num_input_tokens_seen": 38929360, "step": 264 }, { "epoch": 0.07610262323436148, "loss": 0.3131757378578186, "loss_ce": 0.120792917907238, "loss_xval": 0.1923828125, "num_input_tokens_seen": 38929360, "step": 264 }, { "epoch": 0.07639089074661286, "grad_norm": 17.105499866069646, "learning_rate": 0.0001, "loss": 0.1472, "num_input_tokens_seen": 39064128, "step": 265 }, { "epoch": 0.07639089074661286, "loss": 0.17089056968688965, "loss_ce": 0.14578986167907715, "loss_xval": 0.025146484375, "num_input_tokens_seen": 39064128, "step": 265 }, { "epoch": 0.07667915825886423, "grad_norm": 82.84505450129119, "learning_rate": 0.0001, "loss": 0.2117, "num_input_tokens_seen": 39199264, "step": 266 }, { "epoch": 0.07667915825886423, "loss": 0.20540225505828857, "loss_ce": 0.09560001641511917, "loss_xval": 0.10986328125, "num_input_tokens_seen": 39199264, "step": 266 }, { "epoch": 0.0769674257711156, "grad_norm": 70.99727079552451, "learning_rate": 0.0001, "loss": 0.2063, "num_input_tokens_seen": 39371720, "step": 267 }, { "epoch": 0.0769674257711156, "loss": 0.1930188685655594, "loss_ce": 0.10299200564622879, "loss_xval": 0.08984375, "num_input_tokens_seen": 39371720, "step": 267 }, { "epoch": 0.07725569328336697, "grad_norm": 24.615430995589435, "learning_rate": 0.0001, "loss": 0.1345, "num_input_tokens_seen": 39506472, "step": 268 }, { "epoch": 0.07725569328336697, "loss": 0.15381276607513428, "loss_ce": 0.13003194332122803, "loss_xval": 0.0238037109375, "num_input_tokens_seen": 39506472, "step": 268 }, { "epoch": 0.07754396079561833, "grad_norm": 81.15937577572338, "learning_rate": 0.0001, "loss": 0.1952, "num_input_tokens_seen": 39641416, "step": 269 }, { "epoch": 0.07754396079561833, "loss": 0.19636116921901703, "loss_ce": 0.08729134500026703, "loss_xval": 0.10888671875, "num_input_tokens_seen": 39641416, "step": 269 }, { "epoch": 0.0778322283078697, "grad_norm": 33.693167336460625, "learning_rate": 0.0001, "loss": 0.1362, "num_input_tokens_seen": 39814000, "step": 270 }, { "epoch": 0.0778322283078697, "loss": 0.1107083410024643, "loss_ce": 0.0908261388540268, "loss_xval": 0.0198974609375, "num_input_tokens_seen": 39814000, "step": 270 }, { "epoch": 0.07812049582012107, "grad_norm": 49.838891424712216, "learning_rate": 0.0001, "loss": 0.1504, "num_input_tokens_seen": 39948960, "step": 271 }, { "epoch": 0.07812049582012107, "loss": 0.16035011410713196, "loss_ce": 0.11991433054208755, "loss_xval": 0.04052734375, "num_input_tokens_seen": 39948960, "step": 271 }, { "epoch": 0.07840876333237244, "grad_norm": 71.84009907388065, "learning_rate": 0.0001, "loss": 0.1673, "num_input_tokens_seen": 40084048, "step": 272 }, { "epoch": 0.07840876333237244, "loss": 0.158762127161026, "loss_ce": 0.079172283411026, "loss_xval": 0.07958984375, "num_input_tokens_seen": 40084048, "step": 272 }, { "epoch": 0.07869703084462382, "grad_norm": 14.134734889641313, "learning_rate": 0.0001, "loss": 0.1095, "num_input_tokens_seen": 40256624, "step": 273 }, { "epoch": 0.07869703084462382, "loss": 0.09496979415416718, "loss_ce": 0.08466248214244843, "loss_xval": 0.01031494140625, "num_input_tokens_seen": 40256624, "step": 273 }, { "epoch": 0.07898529835687518, "grad_norm": 65.02772231537142, "learning_rate": 0.0001, "loss": 0.1654, "num_input_tokens_seen": 40391416, "step": 274 }, { "epoch": 0.07898529835687518, "loss": 0.1933080554008484, "loss_ce": 0.10480707883834839, "loss_xval": 0.08837890625, "num_input_tokens_seen": 40391416, "step": 274 }, { "epoch": 0.07927356586912655, "grad_norm": 22.172397413986033, "learning_rate": 0.0001, "loss": 0.0932, "num_input_tokens_seen": 40526544, "step": 275 }, { "epoch": 0.07927356586912655, "loss": 0.09251715242862701, "loss_ce": 0.07400824129581451, "loss_xval": 0.0185546875, "num_input_tokens_seen": 40526544, "step": 275 }, { "epoch": 0.07956183338137791, "grad_norm": 30.37442186919003, "learning_rate": 0.0001, "loss": 0.1126, "num_input_tokens_seen": 40699040, "step": 276 }, { "epoch": 0.07956183338137791, "loss": 0.10694645345211029, "loss_ce": 0.07613895833492279, "loss_xval": 0.03076171875, "num_input_tokens_seen": 40699040, "step": 276 }, { "epoch": 0.07985010089362929, "grad_norm": 47.63916778777146, "learning_rate": 0.0001, "loss": 0.1285, "num_input_tokens_seen": 40833864, "step": 277 }, { "epoch": 0.07985010089362929, "loss": 0.12656483054161072, "loss_ce": 0.09027943015098572, "loss_xval": 0.036376953125, "num_input_tokens_seen": 40833864, "step": 277 }, { "epoch": 0.08013836840588066, "grad_norm": 5.830462727623499, "learning_rate": 0.0001, "loss": 0.0816, "num_input_tokens_seen": 40968944, "step": 278 }, { "epoch": 0.08013836840588066, "loss": 0.0807839035987854, "loss_ce": 0.06702810525894165, "loss_xval": 0.01373291015625, "num_input_tokens_seen": 40968944, "step": 278 }, { "epoch": 0.08042663591813203, "grad_norm": 50.914697191412415, "learning_rate": 0.0001, "loss": 0.131, "num_input_tokens_seen": 41141552, "step": 279 }, { "epoch": 0.08042663591813203, "loss": 0.10976387560367584, "loss_ce": 0.07019783556461334, "loss_xval": 0.03955078125, "num_input_tokens_seen": 41141552, "step": 279 }, { "epoch": 0.0807149034303834, "grad_norm": 27.192581330339348, "learning_rate": 0.0001, "loss": 0.096, "num_input_tokens_seen": 41276352, "step": 280 }, { "epoch": 0.0807149034303834, "loss": 0.11412344127893448, "loss_ce": 0.08633718639612198, "loss_xval": 0.02783203125, "num_input_tokens_seen": 41276352, "step": 280 }, { "epoch": 0.08100317094263476, "grad_norm": 35.97693953893691, "learning_rate": 0.0001, "loss": 0.093, "num_input_tokens_seen": 41411352, "step": 281 }, { "epoch": 0.08100317094263476, "loss": 0.08904524892568588, "loss_ce": 0.06298323720693588, "loss_xval": 0.026123046875, "num_input_tokens_seen": 41411352, "step": 281 }, { "epoch": 0.08129143845488614, "grad_norm": 39.86205075250757, "learning_rate": 0.0001, "loss": 0.115, "num_input_tokens_seen": 41583784, "step": 282 }, { "epoch": 0.08129143845488614, "loss": 0.1079903095960617, "loss_ce": 0.0660133808851242, "loss_xval": 0.0419921875, "num_input_tokens_seen": 41583784, "step": 282 }, { "epoch": 0.0815797059671375, "grad_norm": 16.584565294970684, "learning_rate": 0.0001, "loss": 0.0897, "num_input_tokens_seen": 41718576, "step": 283 }, { "epoch": 0.0815797059671375, "loss": 0.1088862419128418, "loss_ce": 0.08702802658081055, "loss_xval": 0.0218505859375, "num_input_tokens_seen": 41718576, "step": 283 }, { "epoch": 0.08186797347938887, "grad_norm": 39.839454046307246, "learning_rate": 0.0001, "loss": 0.093, "num_input_tokens_seen": 41853736, "step": 284 }, { "epoch": 0.08186797347938887, "loss": 0.09195973724126816, "loss_ce": 0.05681874603033066, "loss_xval": 0.03515625, "num_input_tokens_seen": 41853736, "step": 284 }, { "epoch": 0.08215624099164025, "grad_norm": 4.847042953173711, "learning_rate": 0.0001, "loss": 0.079, "num_input_tokens_seen": 42026296, "step": 285 }, { "epoch": 0.08215624099164025, "loss": 0.06571406126022339, "loss_ce": 0.056989844888448715, "loss_xval": 0.00872802734375, "num_input_tokens_seen": 42026296, "step": 285 }, { "epoch": 0.08244450850389161, "grad_norm": 34.08495666742016, "learning_rate": 0.0001, "loss": 0.0979, "num_input_tokens_seen": 42161136, "step": 286 }, { "epoch": 0.08244450850389161, "loss": 0.11285368353128433, "loss_ce": 0.08119169622659683, "loss_xval": 0.03173828125, "num_input_tokens_seen": 42161136, "step": 286 }, { "epoch": 0.08273277601614298, "grad_norm": 17.053837094150378, "learning_rate": 0.0001, "loss": 0.0704, "num_input_tokens_seen": 42296312, "step": 287 }, { "epoch": 0.08273277601614298, "loss": 0.07522790133953094, "loss_ce": 0.056322261691093445, "loss_xval": 0.0189208984375, "num_input_tokens_seen": 42296312, "step": 287 }, { "epoch": 0.08302104352839434, "grad_norm": 26.03619151270456, "learning_rate": 0.0001, "loss": 0.088, "num_input_tokens_seen": 42468952, "step": 288 }, { "epoch": 0.08302104352839434, "loss": 0.0716395452618599, "loss_ce": 0.051970966160297394, "loss_xval": 0.0196533203125, "num_input_tokens_seen": 42468952, "step": 288 }, { "epoch": 0.08330931104064572, "grad_norm": 19.104599746913536, "learning_rate": 0.0001, "loss": 0.0795, "num_input_tokens_seen": 42603720, "step": 289 }, { "epoch": 0.08330931104064572, "loss": 0.09239511936903, "loss_ce": 0.07012491673231125, "loss_xval": 0.022216796875, "num_input_tokens_seen": 42603720, "step": 289 }, { "epoch": 0.0835975785528971, "grad_norm": 12.898381782846887, "learning_rate": 0.0001, "loss": 0.0611, "num_input_tokens_seen": 42738816, "step": 290 }, { "epoch": 0.0835975785528971, "loss": 0.062205493450164795, "loss_ce": 0.046939074993133545, "loss_xval": 0.0152587890625, "num_input_tokens_seen": 42738816, "step": 290 }, { "epoch": 0.08388584606514846, "grad_norm": 20.1050497393893, "learning_rate": 0.0001, "loss": 0.0789, "num_input_tokens_seen": 42911264, "step": 291 }, { "epoch": 0.08388584606514846, "loss": 0.0620073564350605, "loss_ce": 0.0473284013569355, "loss_xval": 0.0146484375, "num_input_tokens_seen": 42911264, "step": 291 }, { "epoch": 0.08417411357739983, "grad_norm": 2.8626189497123953, "learning_rate": 0.0001, "loss": 0.0681, "num_input_tokens_seen": 43046240, "step": 292 }, { "epoch": 0.08417411357739983, "loss": 0.08566500246524811, "loss_ce": 0.07007052004337311, "loss_xval": 0.015625, "num_input_tokens_seen": 43046240, "step": 292 }, { "epoch": 0.08446238108965119, "grad_norm": 26.39635415241842, "learning_rate": 0.0001, "loss": 0.0655, "num_input_tokens_seen": 43181360, "step": 293 }, { "epoch": 0.08446238108965119, "loss": 0.06426546722650528, "loss_ce": 0.04400942474603653, "loss_xval": 0.020263671875, "num_input_tokens_seen": 43181360, "step": 293 }, { "epoch": 0.08475064860190257, "grad_norm": 7.208164119900281, "learning_rate": 0.0001, "loss": 0.0653, "num_input_tokens_seen": 43353920, "step": 294 }, { "epoch": 0.08475064860190257, "loss": 0.053133852779865265, "loss_ce": 0.042292483150959015, "loss_xval": 0.0108642578125, "num_input_tokens_seen": 43353920, "step": 294 }, { "epoch": 0.08503891611415393, "grad_norm": 26.98415268638157, "learning_rate": 0.0001, "loss": 0.073, "num_input_tokens_seen": 43488696, "step": 295 }, { "epoch": 0.08503891611415393, "loss": 0.08344084024429321, "loss_ce": 0.06291013956069946, "loss_xval": 0.0205078125, "num_input_tokens_seen": 43488696, "step": 295 }, { "epoch": 0.0853271836264053, "grad_norm": 10.145044575777948, "learning_rate": 0.0001, "loss": 0.053, "num_input_tokens_seen": 43623704, "step": 296 }, { "epoch": 0.0853271836264053, "loss": 0.05334699898958206, "loss_ce": 0.04183042794466019, "loss_xval": 0.01153564453125, "num_input_tokens_seen": 43623704, "step": 296 }, { "epoch": 0.08561545113865668, "grad_norm": 22.607482953404165, "learning_rate": 0.0001, "loss": 0.0696, "num_input_tokens_seen": 43796312, "step": 297 }, { "epoch": 0.08561545113865668, "loss": 0.056711770594120026, "loss_ce": 0.039812661707401276, "loss_xval": 0.016845703125, "num_input_tokens_seen": 43796312, "step": 297 }, { "epoch": 0.08590371865090804, "grad_norm": 13.95908121236066, "learning_rate": 0.0001, "loss": 0.0629, "num_input_tokens_seen": 43931104, "step": 298 }, { "epoch": 0.08590371865090804, "loss": 0.07877244055271149, "loss_ce": 0.057333845645189285, "loss_xval": 0.021484375, "num_input_tokens_seen": 43931104, "step": 298 }, { "epoch": 0.08619198616315941, "grad_norm": 21.14603428598075, "learning_rate": 0.0001, "loss": 0.0536, "num_input_tokens_seen": 44066328, "step": 299 }, { "epoch": 0.08619198616315941, "loss": 0.05007364600896835, "loss_ce": 0.03374292701482773, "loss_xval": 0.016357421875, "num_input_tokens_seen": 44066328, "step": 299 }, { "epoch": 0.08648025367541078, "grad_norm": 16.200678478671318, "learning_rate": 0.0001, "loss": 0.0663, "num_input_tokens_seen": 44238784, "step": 300 }, { "epoch": 0.08648025367541078, "loss": 0.04962464049458504, "loss_ce": 0.03678818419575691, "loss_xval": 0.0128173828125, "num_input_tokens_seen": 44238784, "step": 300 }, { "epoch": 0.08676852118766215, "grad_norm": 22.293967998334683, "learning_rate": 0.0001, "loss": 0.0643, "num_input_tokens_seen": 44373552, "step": 301 }, { "epoch": 0.08676852118766215, "loss": 0.07732580602169037, "loss_ce": 0.05923651158809662, "loss_xval": 0.01806640625, "num_input_tokens_seen": 44373552, "step": 301 }, { "epoch": 0.08705678869991353, "grad_norm": 15.846001468722628, "learning_rate": 0.0001, "loss": 0.0481, "num_input_tokens_seen": 44508624, "step": 302 }, { "epoch": 0.08705678869991353, "loss": 0.0516769215464592, "loss_ce": 0.0360976979136467, "loss_xval": 0.01556396484375, "num_input_tokens_seen": 44508624, "step": 302 }, { "epoch": 0.08734505621216489, "grad_norm": 20.491333852475268, "learning_rate": 0.0001, "loss": 0.0634, "num_input_tokens_seen": 44681000, "step": 303 }, { "epoch": 0.08734505621216489, "loss": 0.053738269954919815, "loss_ce": 0.034756336361169815, "loss_xval": 0.01904296875, "num_input_tokens_seen": 44681000, "step": 303 }, { "epoch": 0.08763332372441626, "grad_norm": 13.708979707336596, "learning_rate": 0.0001, "loss": 0.0586, "num_input_tokens_seen": 44815696, "step": 304 }, { "epoch": 0.08763332372441626, "loss": 0.0743015930056572, "loss_ce": 0.05784880369901657, "loss_xval": 0.0164794921875, "num_input_tokens_seen": 44815696, "step": 304 }, { "epoch": 0.08792159123666762, "grad_norm": 15.17868299260897, "learning_rate": 0.0001, "loss": 0.0459, "num_input_tokens_seen": 44950768, "step": 305 }, { "epoch": 0.08792159123666762, "loss": 0.05023224651813507, "loss_ce": 0.03353913128376007, "loss_xval": 0.0167236328125, "num_input_tokens_seen": 44950768, "step": 305 }, { "epoch": 0.088209858748919, "grad_norm": 14.183441934506382, "learning_rate": 0.0001, "loss": 0.0524, "num_input_tokens_seen": 45123376, "step": 306 }, { "epoch": 0.088209858748919, "loss": 0.039744533598423004, "loss_ce": 0.029437221586704254, "loss_xval": 0.01031494140625, "num_input_tokens_seen": 45123376, "step": 306 }, { "epoch": 0.08849812626117036, "grad_norm": 15.764900216413375, "learning_rate": 0.0001, "loss": 0.0557, "num_input_tokens_seen": 45258176, "step": 307 }, { "epoch": 0.08849812626117036, "loss": 0.06498374789953232, "loss_ce": 0.05014457553625107, "loss_xval": 0.01483154296875, "num_input_tokens_seen": 45258176, "step": 307 }, { "epoch": 0.08878639377342173, "grad_norm": 18.067172708638466, "learning_rate": 0.0001, "loss": 0.0422, "num_input_tokens_seen": 45393264, "step": 308 }, { "epoch": 0.08878639377342173, "loss": 0.043388769030570984, "loss_ce": 0.028351232409477234, "loss_xval": 0.0150146484375, "num_input_tokens_seen": 45393264, "step": 308 }, { "epoch": 0.08907466128567311, "grad_norm": 26.07091528719645, "learning_rate": 0.0001, "loss": 0.0596, "num_input_tokens_seen": 45565840, "step": 309 }, { "epoch": 0.08907466128567311, "loss": 0.043060220777988434, "loss_ce": 0.027099527418613434, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 45565840, "step": 309 }, { "epoch": 0.08936292879792447, "grad_norm": 11.447550254988329, "learning_rate": 0.0001, "loss": 0.0516, "num_input_tokens_seen": 45700504, "step": 310 }, { "epoch": 0.08936292879792447, "loss": 0.06326834857463837, "loss_ce": 0.04712073877453804, "loss_xval": 0.01611328125, "num_input_tokens_seen": 45700504, "step": 310 }, { "epoch": 0.08965119631017585, "grad_norm": 33.1345531958039, "learning_rate": 0.0001, "loss": 0.0523, "num_input_tokens_seen": 45835696, "step": 311 }, { "epoch": 0.08965119631017585, "loss": 0.04893258213996887, "loss_ce": 0.02565530315041542, "loss_xval": 0.0233154296875, "num_input_tokens_seen": 45835696, "step": 311 }, { "epoch": 0.08993946382242721, "grad_norm": 5.875168912525042, "learning_rate": 0.0001, "loss": 0.047, "num_input_tokens_seen": 46008144, "step": 312 }, { "epoch": 0.08993946382242721, "loss": 0.0328424833714962, "loss_ce": 0.02461036667227745, "loss_xval": 0.00823974609375, "num_input_tokens_seen": 46008144, "step": 312 }, { "epoch": 0.09022773133467858, "grad_norm": 29.067938055683037, "learning_rate": 0.0001, "loss": 0.0599, "num_input_tokens_seen": 46142920, "step": 313 }, { "epoch": 0.09022773133467858, "loss": 0.07382102310657501, "loss_ce": 0.046645116060972214, "loss_xval": 0.0272216796875, "num_input_tokens_seen": 46142920, "step": 313 }, { "epoch": 0.09051599884692996, "grad_norm": 14.145042558138448, "learning_rate": 0.0001, "loss": 0.0353, "num_input_tokens_seen": 46278128, "step": 314 }, { "epoch": 0.09051599884692996, "loss": 0.03773703798651695, "loss_ce": 0.023843908682465553, "loss_xval": 0.013916015625, "num_input_tokens_seen": 46278128, "step": 314 }, { "epoch": 0.09080426635918132, "grad_norm": 26.968528320820493, "learning_rate": 0.0001, "loss": 0.0522, "num_input_tokens_seen": 46450640, "step": 315 }, { "epoch": 0.09080426635918132, "loss": 0.039110615849494934, "loss_ce": 0.023127034306526184, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 46450640, "step": 315 }, { "epoch": 0.0910925338714327, "grad_norm": 21.67591186653818, "learning_rate": 0.0001, "loss": 0.0508, "num_input_tokens_seen": 46585496, "step": 316 }, { "epoch": 0.0910925338714327, "loss": 0.056679822504520416, "loss_ce": 0.04201994091272354, "loss_xval": 0.0146484375, "num_input_tokens_seen": 46585496, "step": 316 }, { "epoch": 0.09138080138368405, "grad_norm": 26.61148582926146, "learning_rate": 0.0001, "loss": 0.0416, "num_input_tokens_seen": 46720720, "step": 317 }, { "epoch": 0.09138080138368405, "loss": 0.04069584235548973, "loss_ce": 0.02201908454298973, "loss_xval": 0.0186767578125, "num_input_tokens_seen": 46720720, "step": 317 }, { "epoch": 0.09166906889593543, "grad_norm": 41.72680946967378, "learning_rate": 0.0001, "loss": 0.0668, "num_input_tokens_seen": 46893176, "step": 318 }, { "epoch": 0.09166906889593543, "loss": 0.04476112499833107, "loss_ce": 0.01906532794237137, "loss_xval": 0.025634765625, "num_input_tokens_seen": 46893176, "step": 318 }, { "epoch": 0.09195733640818679, "grad_norm": 2.8774329155971197, "learning_rate": 0.0001, "loss": 0.0397, "num_input_tokens_seen": 47027912, "step": 319 }, { "epoch": 0.09195733640818679, "loss": 0.05090821534395218, "loss_ce": 0.03768647834658623, "loss_xval": 0.01324462890625, "num_input_tokens_seen": 47027912, "step": 319 }, { "epoch": 0.09224560392043817, "grad_norm": 45.18862966542774, "learning_rate": 0.0001, "loss": 0.0591, "num_input_tokens_seen": 47163008, "step": 320 }, { "epoch": 0.09224560392043817, "loss": 0.05182437598705292, "loss_ce": 0.018575472757220268, "loss_xval": 0.033203125, "num_input_tokens_seen": 47163008, "step": 320 }, { "epoch": 0.09253387143268954, "grad_norm": 21.166922384966146, "learning_rate": 0.0001, "loss": 0.0423, "num_input_tokens_seen": 47335528, "step": 321 }, { "epoch": 0.09253387143268954, "loss": 0.028793595731258392, "loss_ce": 0.017944596707820892, "loss_xval": 0.0108642578125, "num_input_tokens_seen": 47335528, "step": 321 }, { "epoch": 0.0928221389449409, "grad_norm": 38.85691929951026, "learning_rate": 0.0001, "loss": 0.0605, "num_input_tokens_seen": 47470296, "step": 322 }, { "epoch": 0.0928221389449409, "loss": 0.07056182622909546, "loss_ce": 0.03659576177597046, "loss_xval": 0.033935546875, "num_input_tokens_seen": 47470296, "step": 322 }, { "epoch": 0.09311040645719228, "grad_norm": 45.36783852672683, "learning_rate": 0.0001, "loss": 0.0599, "num_input_tokens_seen": 47605336, "step": 323 }, { "epoch": 0.09311040645719228, "loss": 0.04938621446490288, "loss_ce": 0.017876815050840378, "loss_xval": 0.031494140625, "num_input_tokens_seen": 47605336, "step": 323 }, { "epoch": 0.09339867396944364, "grad_norm": 17.114573694016176, "learning_rate": 0.0001, "loss": 0.0453, "num_input_tokens_seen": 47777800, "step": 324 }, { "epoch": 0.09339867396944364, "loss": 0.026957228779792786, "loss_ce": 0.01701231300830841, "loss_xval": 0.00994873046875, "num_input_tokens_seen": 47777800, "step": 324 }, { "epoch": 0.09368694148169501, "grad_norm": 64.38369471444744, "learning_rate": 0.0001, "loss": 0.0997, "num_input_tokens_seen": 47912616, "step": 325 }, { "epoch": 0.09368694148169501, "loss": 0.08857642859220505, "loss_ce": 0.03495704382658005, "loss_xval": 0.0537109375, "num_input_tokens_seen": 47912616, "step": 325 }, { "epoch": 0.09397520899394639, "grad_norm": 8.95093976787684, "learning_rate": 0.0001, "loss": 0.0257, "num_input_tokens_seen": 48047752, "step": 326 }, { "epoch": 0.09397520899394639, "loss": 0.027160916477441788, "loss_ce": 0.016243252903223038, "loss_xval": 0.01092529296875, "num_input_tokens_seen": 48047752, "step": 326 }, { "epoch": 0.09426347650619775, "grad_norm": 73.2897144597942, "learning_rate": 0.0001, "loss": 0.1164, "num_input_tokens_seen": 48220264, "step": 327 }, { "epoch": 0.09426347650619775, "loss": 0.07901627570390701, "loss_ce": 0.014685220085084438, "loss_xval": 0.064453125, "num_input_tokens_seen": 48220264, "step": 327 }, { "epoch": 0.09455174401844912, "grad_norm": 48.71043719154727, "learning_rate": 0.0001, "loss": 0.0726, "num_input_tokens_seen": 48355104, "step": 328 }, { "epoch": 0.09455174401844912, "loss": 0.0679316371679306, "loss_ce": 0.0327753871679306, "loss_xval": 0.03515625, "num_input_tokens_seen": 48355104, "step": 328 }, { "epoch": 0.09484001153070049, "grad_norm": 59.90845726737974, "learning_rate": 0.0001, "loss": 0.0818, "num_input_tokens_seen": 48490152, "step": 329 }, { "epoch": 0.09484001153070049, "loss": 0.06156587600708008, "loss_ce": 0.015392780303955078, "loss_xval": 0.046142578125, "num_input_tokens_seen": 48490152, "step": 329 }, { "epoch": 0.09512827904295186, "grad_norm": 95.59853445230955, "learning_rate": 0.0001, "loss": 0.1703, "num_input_tokens_seen": 48662608, "step": 330 }, { "epoch": 0.09512827904295186, "loss": 0.11032046377658844, "loss_ce": 0.01553286612033844, "loss_xval": 0.0947265625, "num_input_tokens_seen": 48662608, "step": 330 }, { "epoch": 0.09541654655520324, "grad_norm": 16.26928208251906, "learning_rate": 0.0001, "loss": 0.0408, "num_input_tokens_seen": 48797416, "step": 331 }, { "epoch": 0.09541654655520324, "loss": 0.048342738300561905, "loss_ce": 0.03313354030251503, "loss_xval": 0.01519775390625, "num_input_tokens_seen": 48797416, "step": 331 }, { "epoch": 0.0957048140674546, "grad_norm": 122.17471882082314, "learning_rate": 0.0001, "loss": 0.2516, "num_input_tokens_seen": 48932560, "step": 332 }, { "epoch": 0.0957048140674546, "loss": 0.17840638756752014, "loss_ce": 0.01605287566781044, "loss_xval": 0.162109375, "num_input_tokens_seen": 48932560, "step": 332 }, { "epoch": 0.09599308157970597, "grad_norm": 52.88764880613315, "learning_rate": 0.0001, "loss": 0.0842, "num_input_tokens_seen": 49105232, "step": 333 }, { "epoch": 0.09599308157970597, "loss": 0.05301094055175781, "loss_ce": 0.017061231657862663, "loss_xval": 0.035888671875, "num_input_tokens_seen": 49105232, "step": 333 }, { "epoch": 0.09628134909195733, "grad_norm": 115.37258236272477, "learning_rate": 0.0001, "loss": 0.2361, "num_input_tokens_seen": 49240080, "step": 334 }, { "epoch": 0.09628134909195733, "loss": 0.20233498513698578, "loss_ce": 0.03271828591823578, "loss_xval": 0.169921875, "num_input_tokens_seen": 49240080, "step": 334 }, { "epoch": 0.09656961660420871, "grad_norm": 135.4813468564314, "learning_rate": 0.0001, "loss": 0.2934, "num_input_tokens_seen": 49375304, "step": 335 }, { "epoch": 0.09656961660420871, "loss": 0.21803660690784454, "loss_ce": 0.015644025057554245, "loss_xval": 0.2021484375, "num_input_tokens_seen": 49375304, "step": 335 }, { "epoch": 0.09685788411646007, "grad_norm": 45.24827134034747, "learning_rate": 0.0001, "loss": 0.0666, "num_input_tokens_seen": 49547824, "step": 336 }, { "epoch": 0.09685788411646007, "loss": 0.04550432041287422, "loss_ce": 0.014895188622176647, "loss_xval": 0.0306396484375, "num_input_tokens_seen": 49547824, "step": 336 }, { "epoch": 0.09714615162871144, "grad_norm": 174.6835376599181, "learning_rate": 0.0001, "loss": 0.4999, "num_input_tokens_seen": 49682744, "step": 337 }, { "epoch": 0.09714615162871144, "loss": 0.34568923711776733, "loss_ce": 0.028062283992767334, "loss_xval": 0.318359375, "num_input_tokens_seen": 49682744, "step": 337 }, { "epoch": 0.09743441914096282, "grad_norm": 34.17773463839456, "learning_rate": 0.0001, "loss": 0.0419, "num_input_tokens_seen": 49817848, "step": 338 }, { "epoch": 0.09743441914096282, "loss": 0.03003338724374771, "loss_ce": 0.013782776892185211, "loss_xval": 0.0162353515625, "num_input_tokens_seen": 49817848, "step": 338 }, { "epoch": 0.09772268665321418, "grad_norm": 185.13851267540525, "learning_rate": 0.0001, "loss": 0.5886, "num_input_tokens_seen": 49990280, "step": 339 }, { "epoch": 0.09772268665321418, "loss": 0.4133293032646179, "loss_ce": 0.014159374870359898, "loss_xval": 0.3984375, "num_input_tokens_seen": 49990280, "step": 339 }, { "epoch": 0.09801095416546556, "grad_norm": 150.00087701324276, "learning_rate": 0.0001, "loss": 0.3805, "num_input_tokens_seen": 50125128, "step": 340 }, { "epoch": 0.09801095416546556, "loss": 0.26180437207221985, "loss_ce": 0.03206804767251015, "loss_xval": 0.2294921875, "num_input_tokens_seen": 50125128, "step": 340 }, { "epoch": 0.09829922167771692, "grad_norm": 129.76070714652064, "learning_rate": 0.0001, "loss": 0.2883, "num_input_tokens_seen": 50260248, "step": 341 }, { "epoch": 0.09829922167771692, "loss": 0.21383962035179138, "loss_ce": 0.015231214463710785, "loss_xval": 0.1982421875, "num_input_tokens_seen": 50260248, "step": 341 }, { "epoch": 0.09858748918996829, "grad_norm": 234.05736326944017, "learning_rate": 0.0001, "loss": 0.9074, "num_input_tokens_seen": 50432816, "step": 342 }, { "epoch": 0.09858748918996829, "loss": 0.6195493936538696, "loss_ce": 0.017986856400966644, "loss_xval": 0.6015625, "num_input_tokens_seen": 50432816, "step": 342 }, { "epoch": 0.09887575670221967, "grad_norm": 43.17761008074917, "learning_rate": 0.0001, "loss": 0.0801, "num_input_tokens_seen": 50567648, "step": 343 }, { "epoch": 0.09887575670221967, "loss": 0.1131780743598938, "loss_ce": 0.0380437970161438, "loss_xval": 0.0751953125, "num_input_tokens_seen": 50567648, "step": 343 }, { "epoch": 0.09916402421447103, "grad_norm": 293.5641993425453, "learning_rate": 0.0001, "loss": 1.3699, "num_input_tokens_seen": 50702776, "step": 344 }, { "epoch": 0.09916402421447103, "loss": 1.069295048713684, "loss_ce": 0.013631051406264305, "loss_xval": 1.0546875, "num_input_tokens_seen": 50702776, "step": 344 }, { "epoch": 0.0994522917267224, "grad_norm": 131.71555163423727, "learning_rate": 0.0001, "loss": 0.3338, "num_input_tokens_seen": 50875256, "step": 345 }, { "epoch": 0.0994522917267224, "loss": 0.2189842015504837, "loss_ce": 0.0239158496260643, "loss_xval": 0.1953125, "num_input_tokens_seen": 50875256, "step": 345 }, { "epoch": 0.09974055923897376, "grad_norm": 233.3297271775485, "learning_rate": 0.0001, "loss": 0.8797, "num_input_tokens_seen": 51010072, "step": 346 }, { "epoch": 0.09974055923897376, "loss": 0.7266819477081299, "loss_ce": 0.038693614304065704, "loss_xval": 0.6875, "num_input_tokens_seen": 51010072, "step": 346 }, { "epoch": 0.10002882675122514, "grad_norm": 225.78961503277478, "learning_rate": 0.0001, "loss": 0.8029, "num_input_tokens_seen": 51145272, "step": 347 }, { "epoch": 0.10002882675122514, "loss": 0.6846581697463989, "loss_ce": 0.019130853936076164, "loss_xval": 0.6640625, "num_input_tokens_seen": 51145272, "step": 347 }, { "epoch": 0.1003170942634765, "grad_norm": 147.2061227303931, "learning_rate": 0.0001, "loss": 0.4121, "num_input_tokens_seen": 51317720, "step": 348 }, { "epoch": 0.1003170942634765, "loss": 0.32414305210113525, "loss_ce": 0.03166259080171585, "loss_xval": 0.29296875, "num_input_tokens_seen": 51317720, "step": 348 }, { "epoch": 0.10060536177572788, "grad_norm": 263.8637916525708, "learning_rate": 0.0001, "loss": 1.165, "num_input_tokens_seen": 51452536, "step": 349 }, { "epoch": 0.10060536177572788, "loss": 0.889431357383728, "loss_ce": 0.04763442277908325, "loss_xval": 0.84375, "num_input_tokens_seen": 51452536, "step": 349 }, { "epoch": 0.10089362928797925, "grad_norm": 73.43080285637714, "learning_rate": 0.0001, "loss": 0.1414, "num_input_tokens_seen": 51587552, "step": 350 }, { "epoch": 0.10089362928797925, "loss": 0.11348080635070801, "loss_ce": 0.02668880857527256, "loss_xval": 0.0869140625, "num_input_tokens_seen": 51587552, "step": 350 }, { "epoch": 0.10118189680023061, "grad_norm": 307.91044587597645, "learning_rate": 0.0001, "loss": 1.6928, "num_input_tokens_seen": 51760016, "step": 351 }, { "epoch": 0.10118189680023061, "loss": 1.2498633861541748, "loss_ce": 0.04380872845649719, "loss_xval": 1.203125, "num_input_tokens_seen": 51760016, "step": 351 }, { "epoch": 0.10147016431248199, "grad_norm": 22.52344498921592, "learning_rate": 0.0001, "loss": 0.0778, "num_input_tokens_seen": 51894824, "step": 352 }, { "epoch": 0.10147016431248199, "loss": 0.08678604662418365, "loss_ce": 0.051645055413246155, "loss_xval": 0.03515625, "num_input_tokens_seen": 51894824, "step": 352 }, { "epoch": 0.10175843182473335, "grad_norm": 314.1793084218993, "learning_rate": 0.0001, "loss": 1.7642, "num_input_tokens_seen": 52029744, "step": 353 }, { "epoch": 0.10175843182473335, "loss": 1.4714159965515137, "loss_ce": 0.02805667743086815, "loss_xval": 1.4453125, "num_input_tokens_seen": 52029744, "step": 353 }, { "epoch": 0.10204669933698472, "grad_norm": 85.04274380544564, "learning_rate": 0.0001, "loss": 0.1873, "num_input_tokens_seen": 52202232, "step": 354 }, { "epoch": 0.10204669933698472, "loss": 0.15907925367355347, "loss_ce": 0.03090541996061802, "loss_xval": 0.1279296875, "num_input_tokens_seen": 52202232, "step": 354 }, { "epoch": 0.1023349668492361, "grad_norm": 282.4662109698405, "learning_rate": 0.0001, "loss": 1.365, "num_input_tokens_seen": 52337056, "step": 355 }, { "epoch": 0.1023349668492361, "loss": 1.2567670345306396, "loss_ce": 0.05559512972831726, "loss_xval": 1.203125, "num_input_tokens_seen": 52337056, "step": 355 }, { "epoch": 0.10262323436148746, "grad_norm": 102.65515390331076, "learning_rate": 0.0001, "loss": 0.2295, "num_input_tokens_seen": 52472048, "step": 356 }, { "epoch": 0.10262323436148746, "loss": 0.2835312783718109, "loss_ce": 0.025718793272972107, "loss_xval": 0.2578125, "num_input_tokens_seen": 52472048, "step": 356 }, { "epoch": 0.10291150187373883, "grad_norm": 252.48243156019015, "learning_rate": 0.0001, "loss": 1.344, "num_input_tokens_seen": 52644424, "step": 357 }, { "epoch": 0.10291150187373883, "loss": 0.7913997173309326, "loss_ce": 0.029192738234996796, "loss_xval": 0.76171875, "num_input_tokens_seen": 52644424, "step": 357 }, { "epoch": 0.1031997693859902, "grad_norm": 90.84763123217391, "learning_rate": 0.0001, "loss": 0.2428, "num_input_tokens_seen": 52779144, "step": 358 }, { "epoch": 0.1031997693859902, "loss": 0.2381390482187271, "loss_ce": 0.05472839996218681, "loss_xval": 0.18359375, "num_input_tokens_seen": 52779144, "step": 358 }, { "epoch": 0.10348803689824157, "grad_norm": 328.37470612355446, "learning_rate": 0.0001, "loss": 2.1571, "num_input_tokens_seen": 52914216, "step": 359 }, { "epoch": 0.10348803689824157, "loss": 1.3795331716537476, "loss_ce": 0.027970610186457634, "loss_xval": 1.3515625, "num_input_tokens_seen": 52914216, "step": 359 }, { "epoch": 0.10377630441049293, "grad_norm": 188.38399390828792, "learning_rate": 0.0001, "loss": 0.7471, "num_input_tokens_seen": 53086680, "step": 360 }, { "epoch": 0.10377630441049293, "loss": 0.49086835980415344, "loss_ce": 0.03334883227944374, "loss_xval": 0.45703125, "num_input_tokens_seen": 53086680, "step": 360 }, { "epoch": 0.1040645719227443, "grad_norm": 305.99410572222166, "learning_rate": 0.0001, "loss": 1.6008, "num_input_tokens_seen": 53221528, "step": 361 }, { "epoch": 0.1040645719227443, "loss": 1.3899487257003784, "loss_ce": 0.08233153820037842, "loss_xval": 1.3046875, "num_input_tokens_seen": 53221528, "step": 361 }, { "epoch": 0.10435283943499568, "grad_norm": 259.50968601472374, "learning_rate": 0.0001, "loss": 1.1558, "num_input_tokens_seen": 53356560, "step": 362 }, { "epoch": 0.10435283943499568, "loss": 1.188092589378357, "loss_ce": 0.04844410717487335, "loss_xval": 1.140625, "num_input_tokens_seen": 53356560, "step": 362 }, { "epoch": 0.10464110694724704, "grad_norm": 194.27041230322703, "learning_rate": 0.0001, "loss": 0.8075, "num_input_tokens_seen": 53529184, "step": 363 }, { "epoch": 0.10464110694724704, "loss": 0.5688841342926025, "loss_ce": 0.03690172731876373, "loss_xval": 0.53125, "num_input_tokens_seen": 53529184, "step": 363 }, { "epoch": 0.10492937445949842, "grad_norm": 167.4660911295136, "learning_rate": 0.0001, "loss": 0.6972, "num_input_tokens_seen": 53664024, "step": 364 }, { "epoch": 0.10492937445949842, "loss": 0.770238995552063, "loss_ce": 0.0729733482003212, "loss_xval": 0.6953125, "num_input_tokens_seen": 53664024, "step": 364 }, { "epoch": 0.10521764197174978, "grad_norm": 236.66961574420125, "learning_rate": 0.0001, "loss": 1.3009, "num_input_tokens_seen": 53799152, "step": 365 }, { "epoch": 0.10521764197174978, "loss": 0.8208091259002686, "loss_ce": 0.03858250379562378, "loss_xval": 0.78125, "num_input_tokens_seen": 53799152, "step": 365 }, { "epoch": 0.10550590948400115, "grad_norm": 186.46485981451747, "learning_rate": 0.0001, "loss": 1.0544, "num_input_tokens_seen": 53971760, "step": 366 }, { "epoch": 0.10550590948400115, "loss": 0.4651481509208679, "loss_ce": 0.03814617544412613, "loss_xval": 0.427734375, "num_input_tokens_seen": 53971760, "step": 366 }, { "epoch": 0.10579417699625253, "grad_norm": 198.19441855401575, "learning_rate": 0.0001, "loss": 0.9633, "num_input_tokens_seen": 54106544, "step": 367 }, { "epoch": 0.10579417699625253, "loss": 0.6854445934295654, "loss_ce": 0.06679220497608185, "loss_xval": 0.6171875, "num_input_tokens_seen": 54106544, "step": 367 }, { "epoch": 0.10608244450850389, "grad_norm": 173.81988841584263, "learning_rate": 0.0001, "loss": 0.7825, "num_input_tokens_seen": 54241616, "step": 368 }, { "epoch": 0.10608244450850389, "loss": 0.26582324504852295, "loss_ce": 0.03242482244968414, "loss_xval": 0.2333984375, "num_input_tokens_seen": 54241616, "step": 368 }, { "epoch": 0.10637071202075526, "grad_norm": 203.49345931635668, "learning_rate": 0.0001, "loss": 1.0004, "num_input_tokens_seen": 54414272, "step": 369 }, { "epoch": 0.10637071202075526, "loss": 0.8784339427947998, "loss_ce": 0.04493783041834831, "loss_xval": 0.83203125, "num_input_tokens_seen": 54414272, "step": 369 }, { "epoch": 0.10665897953300663, "grad_norm": 349.5785229713507, "learning_rate": 0.0001, "loss": 1.8979, "num_input_tokens_seen": 54548968, "step": 370 }, { "epoch": 0.10665897953300663, "loss": 1.4802519083023071, "loss_ce": 0.07986137270927429, "loss_xval": 1.3984375, "num_input_tokens_seen": 54548968, "step": 370 }, { "epoch": 0.106947247045258, "grad_norm": 74.2585336466779, "learning_rate": 0.0001, "loss": 0.1741, "num_input_tokens_seen": 54683992, "step": 371 }, { "epoch": 0.106947247045258, "loss": 0.19732186198234558, "loss_ce": 0.06493661552667618, "loss_xval": 0.1328125, "num_input_tokens_seen": 54683992, "step": 371 }, { "epoch": 0.10723551455750936, "grad_norm": 349.3312247600097, "learning_rate": 0.0001, "loss": 1.9217, "num_input_tokens_seen": 54856472, "step": 372 }, { "epoch": 0.10723551455750936, "loss": 1.735950231552124, "loss_ce": 0.09630171954631805, "loss_xval": 1.640625, "num_input_tokens_seen": 54856472, "step": 372 }, { "epoch": 0.10752378206976074, "grad_norm": 11.985561447706766, "learning_rate": 0.0001, "loss": 0.1713, "num_input_tokens_seen": 54991288, "step": 373 }, { "epoch": 0.10752378206976074, "loss": 0.19064179062843323, "loss_ce": 0.15305939316749573, "loss_xval": 0.03759765625, "num_input_tokens_seen": 54991288, "step": 373 }, { "epoch": 0.10781204958201211, "grad_norm": 330.89079551948697, "learning_rate": 0.0001, "loss": 1.8265, "num_input_tokens_seen": 55126536, "step": 374 }, { "epoch": 0.10781204958201211, "loss": 1.5767598152160645, "loss_ce": 0.08359581232070923, "loss_xval": 1.4921875, "num_input_tokens_seen": 55126536, "step": 374 }, { "epoch": 0.10810031709426347, "grad_norm": 24.35537299172033, "learning_rate": 0.0001, "loss": 0.1601, "num_input_tokens_seen": 55299168, "step": 375 }, { "epoch": 0.10810031709426347, "loss": 0.126111701130867, "loss_ce": 0.0854165107011795, "loss_xval": 0.040771484375, "num_input_tokens_seen": 55299168, "step": 375 }, { "epoch": 0.10838858460651485, "grad_norm": 285.65327944843983, "learning_rate": 0.0001, "loss": 1.5406, "num_input_tokens_seen": 55433896, "step": 376 }, { "epoch": 0.10838858460651485, "loss": 1.349381446838379, "loss_ce": 0.1408853530883789, "loss_xval": 1.2109375, "num_input_tokens_seen": 55433896, "step": 376 }, { "epoch": 0.10867685211876621, "grad_norm": 24.788341756279642, "learning_rate": 0.0001, "loss": 0.1178, "num_input_tokens_seen": 55568936, "step": 377 }, { "epoch": 0.10867685211876621, "loss": 0.12482555210590363, "loss_ce": 0.08234508335590363, "loss_xval": 0.04248046875, "num_input_tokens_seen": 55568936, "step": 377 }, { "epoch": 0.10896511963101758, "grad_norm": 249.08100726158384, "learning_rate": 0.0001, "loss": 1.3048, "num_input_tokens_seen": 55741328, "step": 378 }, { "epoch": 0.10896511963101758, "loss": 1.1989914178848267, "loss_ce": 0.09742889553308487, "loss_xval": 1.1015625, "num_input_tokens_seen": 55741328, "step": 378 }, { "epoch": 0.10925338714326896, "grad_norm": 41.92556961556827, "learning_rate": 0.0001, "loss": 0.1731, "num_input_tokens_seen": 55876224, "step": 379 }, { "epoch": 0.10925338714326896, "loss": 0.23099157214164734, "loss_ce": 0.14755651354789734, "loss_xval": 0.08349609375, "num_input_tokens_seen": 55876224, "step": 379 }, { "epoch": 0.10954165465552032, "grad_norm": 200.56550278580963, "learning_rate": 0.0001, "loss": 0.9848, "num_input_tokens_seen": 56011368, "step": 380 }, { "epoch": 0.10954165465552032, "loss": 1.0198489427566528, "loss_ce": 0.09260280430316925, "loss_xval": 0.92578125, "num_input_tokens_seen": 56011368, "step": 380 }, { "epoch": 0.1098299221677717, "grad_norm": 85.41435392852556, "learning_rate": 0.0001, "loss": 0.3012, "num_input_tokens_seen": 56183928, "step": 381 }, { "epoch": 0.1098299221677717, "loss": 0.28134530782699585, "loss_ce": 0.09213632345199585, "loss_xval": 0.189453125, "num_input_tokens_seen": 56183928, "step": 381 }, { "epoch": 0.11011818968002306, "grad_norm": 138.8160607265775, "learning_rate": 0.0001, "loss": 0.5655, "num_input_tokens_seen": 56318752, "step": 382 }, { "epoch": 0.11011818968002306, "loss": 0.6360880136489868, "loss_ce": 0.131449356675148, "loss_xval": 0.50390625, "num_input_tokens_seen": 56318752, "step": 382 }, { "epoch": 0.11040645719227443, "grad_norm": 125.28655562326834, "learning_rate": 0.0001, "loss": 0.4428, "num_input_tokens_seen": 56453792, "step": 383 }, { "epoch": 0.11040645719227443, "loss": 0.40254080295562744, "loss_ce": 0.07270681858062744, "loss_xval": 0.330078125, "num_input_tokens_seen": 56453792, "step": 383 }, { "epoch": 0.1106947247045258, "grad_norm": 81.8855747466578, "learning_rate": 0.0001, "loss": 0.2815, "num_input_tokens_seen": 56626256, "step": 384 }, { "epoch": 0.1106947247045258, "loss": 0.1878707855939865, "loss_ce": 0.06659393757581711, "loss_xval": 0.12109375, "num_input_tokens_seen": 56626256, "step": 384 }, { "epoch": 0.11098299221677717, "grad_norm": 134.69538035742877, "learning_rate": 0.0001, "loss": 0.5708, "num_input_tokens_seen": 56761000, "step": 385 }, { "epoch": 0.11098299221677717, "loss": 0.6164034605026245, "loss_ce": 0.1088351458311081, "loss_xval": 0.5078125, "num_input_tokens_seen": 56761000, "step": 385 }, { "epoch": 0.11127125972902854, "grad_norm": 51.9899502391935, "learning_rate": 0.0001, "loss": 0.1245, "num_input_tokens_seen": 56896072, "step": 386 }, { "epoch": 0.11127125972902854, "loss": 0.11218064278364182, "loss_ce": 0.050565652549266815, "loss_xval": 0.0615234375, "num_input_tokens_seen": 56896072, "step": 386 }, { "epoch": 0.1115595272412799, "grad_norm": 134.96651808205962, "learning_rate": 0.0001, "loss": 0.5115, "num_input_tokens_seen": 57068584, "step": 387 }, { "epoch": 0.1115595272412799, "loss": 0.45479512214660645, "loss_ce": 0.04708027094602585, "loss_xval": 0.408203125, "num_input_tokens_seen": 57068584, "step": 387 }, { "epoch": 0.11184779475353128, "grad_norm": 3.9921421710123757, "learning_rate": 0.0001, "loss": 0.0814, "num_input_tokens_seen": 57203448, "step": 388 }, { "epoch": 0.11184779475353128, "loss": 0.11016294360160828, "loss_ce": 0.08075925707817078, "loss_xval": 0.0294189453125, "num_input_tokens_seen": 57203448, "step": 388 }, { "epoch": 0.11213606226578264, "grad_norm": 154.34349074142602, "learning_rate": 0.0001, "loss": 0.5716, "num_input_tokens_seen": 57338608, "step": 389 }, { "epoch": 0.11213606226578264, "loss": 0.5179411172866821, "loss_ce": 0.03551921993494034, "loss_xval": 0.482421875, "num_input_tokens_seen": 57338608, "step": 389 }, { "epoch": 0.11242432977803402, "grad_norm": 28.336565736169785, "learning_rate": 0.0001, "loss": 0.0975, "num_input_tokens_seen": 57511192, "step": 390 }, { "epoch": 0.11242432977803402, "loss": 0.10524661839008331, "loss_ce": 0.036337923258543015, "loss_xval": 0.06884765625, "num_input_tokens_seen": 57511192, "step": 390 }, { "epoch": 0.11271259729028539, "grad_norm": 111.97474819267906, "learning_rate": 0.0001, "loss": 0.3659, "num_input_tokens_seen": 57646056, "step": 391 }, { "epoch": 0.11271259729028539, "loss": 0.434956431388855, "loss_ce": 0.05531773343682289, "loss_xval": 0.37890625, "num_input_tokens_seen": 57646056, "step": 391 }, { "epoch": 0.11300086480253675, "grad_norm": 54.641254485706455, "learning_rate": 0.0001, "loss": 0.1127, "num_input_tokens_seen": 57781256, "step": 392 }, { "epoch": 0.11300086480253675, "loss": 0.11866104602813721, "loss_ce": 0.02973281964659691, "loss_xval": 0.0888671875, "num_input_tokens_seen": 57781256, "step": 392 }, { "epoch": 0.11328913231478813, "grad_norm": 65.2115128081605, "learning_rate": 0.0001, "loss": 0.1522, "num_input_tokens_seen": 57953776, "step": 393 }, { "epoch": 0.11328913231478813, "loss": 0.16455772519111633, "loss_ce": 0.02844933047890663, "loss_xval": 0.1357421875, "num_input_tokens_seen": 57953776, "step": 393 }, { "epoch": 0.11357739982703949, "grad_norm": 72.45753153535337, "learning_rate": 0.0001, "loss": 0.1703, "num_input_tokens_seen": 58088568, "step": 394 }, { "epoch": 0.11357739982703949, "loss": 0.16719621419906616, "loss_ce": 0.045064859092235565, "loss_xval": 0.1220703125, "num_input_tokens_seen": 58088568, "step": 394 }, { "epoch": 0.11386566733929086, "grad_norm": 16.350567249204943, "learning_rate": 0.0001, "loss": 0.0372, "num_input_tokens_seen": 58223688, "step": 395 }, { "epoch": 0.11386566733929086, "loss": 0.03724553436040878, "loss_ce": 0.022444508969783783, "loss_xval": 0.0147705078125, "num_input_tokens_seen": 58223688, "step": 395 }, { "epoch": 0.11415393485154224, "grad_norm": 80.20883394198827, "learning_rate": 0.0001, "loss": 0.1971, "num_input_tokens_seen": 58396152, "step": 396 }, { "epoch": 0.11415393485154224, "loss": 0.1776159405708313, "loss_ce": 0.0247839093208313, "loss_xval": 0.15234375, "num_input_tokens_seen": 58396152, "step": 396 }, { "epoch": 0.1144422023637936, "grad_norm": 13.45347570730963, "learning_rate": 0.0001, "loss": 0.0415, "num_input_tokens_seen": 58531008, "step": 397 }, { "epoch": 0.1144422023637936, "loss": 0.05418435111641884, "loss_ce": 0.03697243705391884, "loss_xval": 0.0172119140625, "num_input_tokens_seen": 58531008, "step": 397 }, { "epoch": 0.11473046987604497, "grad_norm": 66.82835410580104, "learning_rate": 0.0001, "loss": 0.1362, "num_input_tokens_seen": 58666032, "step": 398 }, { "epoch": 0.11473046987604497, "loss": 0.1477038562297821, "loss_ce": 0.017088618129491806, "loss_xval": 0.130859375, "num_input_tokens_seen": 58666032, "step": 398 }, { "epoch": 0.11501873738829634, "grad_norm": 33.58390379792348, "learning_rate": 0.0001, "loss": 0.0668, "num_input_tokens_seen": 58838640, "step": 399 }, { "epoch": 0.11501873738829634, "loss": 0.04878139868378639, "loss_ce": 0.019392970949411392, "loss_xval": 0.0294189453125, "num_input_tokens_seen": 58838640, "step": 399 }, { "epoch": 0.11530700490054771, "grad_norm": 37.13352731577908, "learning_rate": 0.0001, "loss": 0.0724, "num_input_tokens_seen": 58973440, "step": 400 }, { "epoch": 0.11530700490054771, "loss": 0.08155344426631927, "loss_ce": 0.037638649344444275, "loss_xval": 0.0439453125, "num_input_tokens_seen": 58973440, "step": 400 }, { "epoch": 0.11559527241279907, "grad_norm": 58.5907348005972, "learning_rate": 0.0001, "loss": 0.1073, "num_input_tokens_seen": 59108488, "step": 401 }, { "epoch": 0.11559527241279907, "loss": 0.0918554961681366, "loss_ce": 0.011746861040592194, "loss_xval": 0.080078125, "num_input_tokens_seen": 59108488, "step": 401 }, { "epoch": 0.11588353992505045, "grad_norm": 21.688355763409646, "learning_rate": 0.0001, "loss": 0.0501, "num_input_tokens_seen": 59281024, "step": 402 }, { "epoch": 0.11588353992505045, "loss": 0.028341641649603844, "loss_ce": 0.016180386766791344, "loss_xval": 0.01214599609375, "num_input_tokens_seen": 59281024, "step": 402 }, { "epoch": 0.11617180743730182, "grad_norm": 62.076919731958114, "learning_rate": 0.0001, "loss": 0.1351, "num_input_tokens_seen": 59415824, "step": 403 }, { "epoch": 0.11617180743730182, "loss": 0.15151304006576538, "loss_ce": 0.032372407615184784, "loss_xval": 0.119140625, "num_input_tokens_seen": 59415824, "step": 403 }, { "epoch": 0.11646007494955318, "grad_norm": 10.21746319986935, "learning_rate": 0.0001, "loss": 0.0221, "num_input_tokens_seen": 59550856, "step": 404 }, { "epoch": 0.11646007494955318, "loss": 0.019688857719302177, "loss_ce": 0.010304702445864677, "loss_xval": 0.0093994140625, "num_input_tokens_seen": 59550856, "step": 404 }, { "epoch": 0.11674834246180456, "grad_norm": 51.93656716076114, "learning_rate": 0.0001, "loss": 0.0985, "num_input_tokens_seen": 59723384, "step": 405 }, { "epoch": 0.11674834246180456, "loss": 0.11036922037601471, "loss_ce": 0.013872633688151836, "loss_xval": 0.0966796875, "num_input_tokens_seen": 59723384, "step": 405 }, { "epoch": 0.11703660997405592, "grad_norm": 7.218077949279075, "learning_rate": 0.0001, "loss": 0.0292, "num_input_tokens_seen": 59858224, "step": 406 }, { "epoch": 0.11703660997405592, "loss": 0.03941994905471802, "loss_ce": 0.03063851408660412, "loss_xval": 0.0087890625, "num_input_tokens_seen": 59858224, "step": 406 }, { "epoch": 0.1173248774863073, "grad_norm": 38.771615811302745, "learning_rate": 0.0001, "loss": 0.0552, "num_input_tokens_seen": 59993184, "step": 407 }, { "epoch": 0.1173248774863073, "loss": 0.0620470829308033, "loss_ce": 0.009465297684073448, "loss_xval": 0.052490234375, "num_input_tokens_seen": 59993184, "step": 407 }, { "epoch": 0.11761314499855867, "grad_norm": 21.05248979073788, "learning_rate": 0.0001, "loss": 0.039, "num_input_tokens_seen": 60165568, "step": 408 }, { "epoch": 0.11761314499855867, "loss": 0.027896784245967865, "loss_ce": 0.010372065007686615, "loss_xval": 0.017578125, "num_input_tokens_seen": 60165568, "step": 408 }, { "epoch": 0.11790141251081003, "grad_norm": 24.493599892108804, "learning_rate": 0.0001, "loss": 0.0415, "num_input_tokens_seen": 60300328, "step": 409 }, { "epoch": 0.11790141251081003, "loss": 0.05128861218690872, "loss_ce": 0.027080543339252472, "loss_xval": 0.024169921875, "num_input_tokens_seen": 60300328, "step": 409 }, { "epoch": 0.1181896800230614, "grad_norm": 31.759586149499057, "learning_rate": 0.0001, "loss": 0.0422, "num_input_tokens_seen": 60435400, "step": 410 }, { "epoch": 0.1181896800230614, "loss": 0.03685798496007919, "loss_ce": 0.008232494816184044, "loss_xval": 0.028564453125, "num_input_tokens_seen": 60435400, "step": 410 }, { "epoch": 0.11847794753531277, "grad_norm": 16.79595672478788, "learning_rate": 0.0001, "loss": 0.0321, "num_input_tokens_seen": 60607824, "step": 411 }, { "epoch": 0.11847794753531277, "loss": 0.021884309127926826, "loss_ce": 0.009623872116208076, "loss_xval": 0.01226806640625, "num_input_tokens_seen": 60607824, "step": 411 }, { "epoch": 0.11876621504756414, "grad_norm": 33.97780050017176, "learning_rate": 0.0001, "loss": 0.0523, "num_input_tokens_seen": 60742616, "step": 412 }, { "epoch": 0.11876621504756414, "loss": 0.05837390199303627, "loss_ce": 0.02347705140709877, "loss_xval": 0.034912109375, "num_input_tokens_seen": 60742616, "step": 412 }, { "epoch": 0.1190544825598155, "grad_norm": 8.233141537208109, "learning_rate": 0.0001, "loss": 0.0163, "num_input_tokens_seen": 60877904, "step": 413 }, { "epoch": 0.1190544825598155, "loss": 0.015995457768440247, "loss_ce": 0.007229284383356571, "loss_xval": 0.0087890625, "num_input_tokens_seen": 60877904, "step": 413 }, { "epoch": 0.11934275007206688, "grad_norm": 34.01052905899437, "learning_rate": 0.0001, "loss": 0.0532, "num_input_tokens_seen": 61050352, "step": 414 }, { "epoch": 0.11934275007206688, "loss": 0.042255695909261703, "loss_ce": 0.007847124710679054, "loss_xval": 0.034423828125, "num_input_tokens_seen": 61050352, "step": 414 }, { "epoch": 0.11963101758431825, "grad_norm": 3.9841818309777546, "learning_rate": 0.0001, "loss": 0.0209, "num_input_tokens_seen": 61185088, "step": 415 }, { "epoch": 0.11963101758431825, "loss": 0.029205160215497017, "loss_ce": 0.02410300448536873, "loss_xval": 0.005096435546875, "num_input_tokens_seen": 61185088, "step": 415 }, { "epoch": 0.11991928509656961, "grad_norm": 33.199360451976254, "learning_rate": 0.0001, "loss": 0.0404, "num_input_tokens_seen": 61320072, "step": 416 }, { "epoch": 0.11991928509656961, "loss": 0.04171193018555641, "loss_ce": 0.006433609873056412, "loss_xval": 0.03515625, "num_input_tokens_seen": 61320072, "step": 416 }, { "epoch": 0.12020755260882099, "grad_norm": 5.067381489080464, "learning_rate": 0.0001, "loss": 0.0203, "num_input_tokens_seen": 61492440, "step": 417 }, { "epoch": 0.12020755260882099, "loss": 0.011221684515476227, "loss_ce": 0.007321156561374664, "loss_xval": 0.00390625, "num_input_tokens_seen": 61492440, "step": 417 }, { "epoch": 0.12049582012107235, "grad_norm": 26.45059905845838, "learning_rate": 0.0001, "loss": 0.0375, "num_input_tokens_seen": 61627200, "step": 418 }, { "epoch": 0.12049582012107235, "loss": 0.04936272278428078, "loss_ce": 0.022110525518655777, "loss_xval": 0.0272216796875, "num_input_tokens_seen": 61627200, "step": 418 }, { "epoch": 0.12078408763332373, "grad_norm": 1.786488274094738, "learning_rate": 0.0001, "loss": 0.011, "num_input_tokens_seen": 61762344, "step": 419 }, { "epoch": 0.12078408763332373, "loss": 0.010375022888183594, "loss_ce": 0.005908012855798006, "loss_xval": 0.00445556640625, "num_input_tokens_seen": 61762344, "step": 419 }, { "epoch": 0.1210723551455751, "grad_norm": 21.591453869968998, "learning_rate": 0.0001, "loss": 0.0339, "num_input_tokens_seen": 61934872, "step": 420 }, { "epoch": 0.1210723551455751, "loss": 0.026244094595313072, "loss_ce": 0.006491592153906822, "loss_xval": 0.019775390625, "num_input_tokens_seen": 61934872, "step": 420 }, { "epoch": 0.12136062265782646, "grad_norm": 1.327132608773384, "learning_rate": 0.0001, "loss": 0.0191, "num_input_tokens_seen": 62069656, "step": 421 }, { "epoch": 0.12136062265782646, "loss": 0.029985252767801285, "loss_ce": 0.02338201180100441, "loss_xval": 0.006591796875, "num_input_tokens_seen": 62069656, "step": 421 }, { "epoch": 0.12164889017007784, "grad_norm": 15.53135837910297, "learning_rate": 0.0001, "loss": 0.0162, "num_input_tokens_seen": 62204656, "step": 422 }, { "epoch": 0.12164889017007784, "loss": 0.017050063237547874, "loss_ce": 0.0061705466359853745, "loss_xval": 0.0108642578125, "num_input_tokens_seen": 62204656, "step": 422 }, { "epoch": 0.1219371576823292, "grad_norm": 3.8104541358240582, "learning_rate": 0.0001, "loss": 0.0182, "num_input_tokens_seen": 62377208, "step": 423 }, { "epoch": 0.1219371576823292, "loss": 0.008296707645058632, "loss_ce": 0.005021790042519569, "loss_xval": 0.0032806396484375, "num_input_tokens_seen": 62377208, "step": 423 }, { "epoch": 0.12222542519458057, "grad_norm": 13.700976362376961, "learning_rate": 0.0001, "loss": 0.0221, "num_input_tokens_seen": 62512016, "step": 424 }, { "epoch": 0.12222542519458057, "loss": 0.02982976660132408, "loss_ce": 0.019648339599370956, "loss_xval": 0.01019287109375, "num_input_tokens_seen": 62512016, "step": 424 }, { "epoch": 0.12251369270683193, "grad_norm": 5.461601173582089, "learning_rate": 0.0001, "loss": 0.01, "num_input_tokens_seen": 62645520, "step": 425 }, { "epoch": 0.12251369270683193, "loss": 0.009898142889142036, "loss_ce": 0.0057897139340639114, "loss_xval": 0.004119873046875, "num_input_tokens_seen": 62645520, "step": 425 }, { "epoch": 0.12280196021908331, "grad_norm": 12.868724266222875, "learning_rate": 0.0001, "loss": 0.0219, "num_input_tokens_seen": 62818008, "step": 426 }, { "epoch": 0.12280196021908331, "loss": 0.011952612549066544, "loss_ce": 0.004845831543207169, "loss_xval": 0.007110595703125, "num_input_tokens_seen": 62818008, "step": 426 }, { "epoch": 0.12309022773133468, "grad_norm": 5.962625423686631, "learning_rate": 0.0001, "loss": 0.0178, "num_input_tokens_seen": 62952776, "step": 427 }, { "epoch": 0.12309022773133468, "loss": 0.027931518852710724, "loss_ce": 0.02284843474626541, "loss_xval": 0.005096435546875, "num_input_tokens_seen": 62952776, "step": 427 }, { "epoch": 0.12337849524358604, "grad_norm": 13.40354193419642, "learning_rate": 0.0001, "loss": 0.0129, "num_input_tokens_seen": 63087864, "step": 428 }, { "epoch": 0.12337849524358604, "loss": 0.012452416121959686, "loss_ce": 0.004254631698131561, "loss_xval": 0.0081787109375, "num_input_tokens_seen": 63087864, "step": 428 }, { "epoch": 0.12366676275583742, "grad_norm": 3.119445339769395, "learning_rate": 0.0001, "loss": 0.0171, "num_input_tokens_seen": 63260320, "step": 429 }, { "epoch": 0.12366676275583742, "loss": 0.007456950377672911, "loss_ce": 0.004637889098376036, "loss_xval": 0.0028228759765625, "num_input_tokens_seen": 63260320, "step": 429 }, { "epoch": 0.12395503026808878, "grad_norm": 11.337685211261718, "learning_rate": 0.0001, "loss": 0.0175, "num_input_tokens_seen": 63395120, "step": 430 }, { "epoch": 0.12395503026808878, "loss": 0.023988714441657066, "loss_ce": 0.01637076400220394, "loss_xval": 0.00762939453125, "num_input_tokens_seen": 63395120, "step": 430 }, { "epoch": 0.12424329778034016, "grad_norm": 0.620352290348227, "learning_rate": 0.0001, "loss": 0.0073, "num_input_tokens_seen": 63530144, "step": 431 }, { "epoch": 0.12424329778034016, "loss": 0.007503397762775421, "loss_ce": 0.0034550500568002462, "loss_xval": 0.004058837890625, "num_input_tokens_seen": 63530144, "step": 431 }, { "epoch": 0.12453156529259153, "grad_norm": 8.636674289000593, "learning_rate": 0.0001, "loss": 0.0176, "num_input_tokens_seen": 63702656, "step": 432 }, { "epoch": 0.12453156529259153, "loss": 0.009306371212005615, "loss_ce": 0.004097401164472103, "loss_xval": 0.005218505859375, "num_input_tokens_seen": 63702656, "step": 432 }, { "epoch": 0.12481983280484289, "grad_norm": 1.1493436039330713, "learning_rate": 0.0001, "loss": 0.0128, "num_input_tokens_seen": 63837392, "step": 433 }, { "epoch": 0.12481983280484289, "loss": 0.019667595624923706, "loss_ce": 0.01628205180168152, "loss_xval": 0.003387451171875, "num_input_tokens_seen": 63837392, "step": 433 }, { "epoch": 0.12510810031709427, "grad_norm": 6.650570588966674, "learning_rate": 0.0001, "loss": 0.0077, "num_input_tokens_seen": 63972432, "step": 434 }, { "epoch": 0.12510810031709427, "loss": 0.006774544715881348, "loss_ce": 0.002935052150860429, "loss_xval": 0.00384521484375, "num_input_tokens_seen": 63972432, "step": 434 }, { "epoch": 0.12539636782934563, "grad_norm": 1.5652798425645655, "learning_rate": 0.0001, "loss": 0.0154, "num_input_tokens_seen": 64144888, "step": 435 }, { "epoch": 0.12539636782934563, "loss": 0.008809241466224194, "loss_ce": 0.005465658847242594, "loss_xval": 0.0033416748046875, "num_input_tokens_seen": 64144888, "step": 435 }, { "epoch": 0.125684635341597, "grad_norm": 7.078695658340988, "learning_rate": 0.0001, "loss": 0.0149, "num_input_tokens_seen": 64279712, "step": 436 }, { "epoch": 0.125684635341597, "loss": 0.02221812680363655, "loss_ce": 0.0176176019012928, "loss_xval": 0.004608154296875, "num_input_tokens_seen": 64279712, "step": 436 }, { "epoch": 0.12597290285384838, "grad_norm": 4.718172704478871, "learning_rate": 0.0001, "loss": 0.0067, "num_input_tokens_seen": 64414904, "step": 437 }, { "epoch": 0.12597290285384838, "loss": 0.006065657362341881, "loss_ce": 0.0030663516372442245, "loss_xval": 0.0030059814453125, "num_input_tokens_seen": 64414904, "step": 437 }, { "epoch": 0.12626117036609974, "grad_norm": 6.2712386779629785, "learning_rate": 0.0001, "loss": 0.0142, "num_input_tokens_seen": 64587336, "step": 438 }, { "epoch": 0.12626117036609974, "loss": 0.0074829827062785625, "loss_ce": 0.0033840904943645, "loss_xval": 0.00408935546875, "num_input_tokens_seen": 64587336, "step": 438 }, { "epoch": 0.1265494378783511, "grad_norm": 10.047732594806881, "learning_rate": 0.0001, "loss": 0.0157, "num_input_tokens_seen": 64722136, "step": 439 }, { "epoch": 0.1265494378783511, "loss": 0.02131899818778038, "loss_ce": 0.01652010902762413, "loss_xval": 0.004791259765625, "num_input_tokens_seen": 64722136, "step": 439 }, { "epoch": 0.1268377053906025, "grad_norm": 0.806131296148929, "learning_rate": 0.0001, "loss": 0.0055, "num_input_tokens_seen": 64857232, "step": 440 }, { "epoch": 0.1268377053906025, "loss": 0.005497447215020657, "loss_ce": 0.002595416735857725, "loss_xval": 0.002899169921875, "num_input_tokens_seen": 64857232, "step": 440 }, { "epoch": 0.12712597290285385, "grad_norm": 9.806963011163788, "learning_rate": 0.0001, "loss": 0.0158, "num_input_tokens_seen": 65029712, "step": 441 }, { "epoch": 0.12712597290285385, "loss": 0.00795245636254549, "loss_ce": 0.0030295890755951405, "loss_xval": 0.004913330078125, "num_input_tokens_seen": 65029712, "step": 441 }, { "epoch": 0.1274142404151052, "grad_norm": 6.870364954781495, "learning_rate": 0.0001, "loss": 0.0133, "num_input_tokens_seen": 65164528, "step": 442 }, { "epoch": 0.1274142404151052, "loss": 0.02025909721851349, "loss_ce": 0.01654548943042755, "loss_xval": 0.0037078857421875, "num_input_tokens_seen": 65164528, "step": 442 }, { "epoch": 0.12770250792735657, "grad_norm": 5.060108314966909, "learning_rate": 0.0001, "loss": 0.0061, "num_input_tokens_seen": 65299784, "step": 443 }, { "epoch": 0.12770250792735657, "loss": 0.005887602921575308, "loss_ce": 0.0022655476350337267, "loss_xval": 0.0036163330078125, "num_input_tokens_seen": 65299784, "step": 443 }, { "epoch": 0.12799077543960796, "grad_norm": 9.71208773417023, "learning_rate": 0.0001, "loss": 0.0154, "num_input_tokens_seen": 65472424, "step": 444 }, { "epoch": 0.12799077543960796, "loss": 0.007333572953939438, "loss_ce": 0.002883728127926588, "loss_xval": 0.00445556640625, "num_input_tokens_seen": 65472424, "step": 444 }, { "epoch": 0.12827904295185932, "grad_norm": 0.5369327335786107, "learning_rate": 0.0001, "loss": 0.0122, "num_input_tokens_seen": 65607192, "step": 445 }, { "epoch": 0.12827904295185932, "loss": 0.01974431797862053, "loss_ce": 0.016584794968366623, "loss_xval": 0.0031585693359375, "num_input_tokens_seen": 65607192, "step": 445 }, { "epoch": 0.12856731046411068, "grad_norm": 11.725794535136131, "learning_rate": 0.0001, "loss": 0.0084, "num_input_tokens_seen": 65742360, "step": 446 }, { "epoch": 0.12856731046411068, "loss": 0.008027734234929085, "loss_ce": 0.0024067778140306473, "loss_xval": 0.005615234375, "num_input_tokens_seen": 65742360, "step": 446 }, { "epoch": 0.12885557797636207, "grad_norm": 8.498973502599162, "learning_rate": 0.0001, "loss": 0.0135, "num_input_tokens_seen": 65914968, "step": 447 }, { "epoch": 0.12885557797636207, "loss": 0.006696837022900581, "loss_ce": 0.003360884264111519, "loss_xval": 0.0033416748046875, "num_input_tokens_seen": 65914968, "step": 447 }, { "epoch": 0.12914384548861343, "grad_norm": 7.918505058803784, "learning_rate": 0.0001, "loss": 0.0145, "num_input_tokens_seen": 66049832, "step": 448 }, { "epoch": 0.12914384548861343, "loss": 0.023593148216605186, "loss_ce": 0.01844712160527706, "loss_xval": 0.005157470703125, "num_input_tokens_seen": 66049832, "step": 448 }, { "epoch": 0.1294321130008648, "grad_norm": 15.69806868410208, "learning_rate": 0.0001, "loss": 0.0122, "num_input_tokens_seen": 66184864, "step": 449 }, { "epoch": 0.1294321130008648, "loss": 0.009837846271693707, "loss_ce": 0.0018842025892809033, "loss_xval": 0.0079345703125, "num_input_tokens_seen": 66184864, "step": 449 }, { "epoch": 0.12972038051311618, "grad_norm": 3.43071398992182, "learning_rate": 0.0001, "loss": 0.0114, "num_input_tokens_seen": 66357288, "step": 450 }, { "epoch": 0.12972038051311618, "loss": 0.0050545441918075085, "loss_ce": 0.0021372544579207897, "loss_xval": 0.0029144287109375, "num_input_tokens_seen": 66357288, "step": 450 }, { "epoch": 0.13000864802536755, "grad_norm": 13.550465489619242, "learning_rate": 0.0001, "loss": 0.0163, "num_input_tokens_seen": 66492088, "step": 451 }, { "epoch": 0.13000864802536755, "loss": 0.02118588052690029, "loss_ce": 0.015063291415572166, "loss_xval": 0.006134033203125, "num_input_tokens_seen": 66492088, "step": 451 }, { "epoch": 0.1302969155376189, "grad_norm": 13.399452499456793, "learning_rate": 0.0001, "loss": 0.0101, "num_input_tokens_seen": 66627072, "step": 452 }, { "epoch": 0.1302969155376189, "loss": 0.009072705172002316, "loss_ce": 0.002290173899382353, "loss_xval": 0.00677490234375, "num_input_tokens_seen": 66627072, "step": 452 }, { "epoch": 0.13058518304987027, "grad_norm": 4.240862155234506, "learning_rate": 0.0001, "loss": 0.0133, "num_input_tokens_seen": 66799488, "step": 453 }, { "epoch": 0.13058518304987027, "loss": 0.004820866510272026, "loss_ce": 0.0020142027642577887, "loss_xval": 0.0028076171875, "num_input_tokens_seen": 66799488, "step": 453 }, { "epoch": 0.13087345056212166, "grad_norm": 16.168784353206163, "learning_rate": 0.0001, "loss": 0.0189, "num_input_tokens_seen": 66934336, "step": 454 }, { "epoch": 0.13087345056212166, "loss": 0.024435311555862427, "loss_ce": 0.016409188508987427, "loss_xval": 0.008056640625, "num_input_tokens_seen": 66934336, "step": 454 }, { "epoch": 0.13116171807437302, "grad_norm": 3.851014867434399, "learning_rate": 0.0001, "loss": 0.0044, "num_input_tokens_seen": 67069320, "step": 455 }, { "epoch": 0.13116171807437302, "loss": 0.004337950609624386, "loss_ce": 0.0016276079695671797, "loss_xval": 0.002716064453125, "num_input_tokens_seen": 67069320, "step": 455 }, { "epoch": 0.13144998558662438, "grad_norm": 18.412053523066213, "learning_rate": 0.0001, "loss": 0.0218, "num_input_tokens_seen": 67241768, "step": 456 }, { "epoch": 0.13144998558662438, "loss": 0.01089160144329071, "loss_ce": 0.0026671146042644978, "loss_xval": 0.00823974609375, "num_input_tokens_seen": 67241768, "step": 456 }, { "epoch": 0.13173825309887577, "grad_norm": 19.223558233035803, "learning_rate": 0.0001, "loss": 0.0216, "num_input_tokens_seen": 67376624, "step": 457 }, { "epoch": 0.13173825309887577, "loss": 0.02506151795387268, "loss_ce": 0.015362650156021118, "loss_xval": 0.00970458984375, "num_input_tokens_seen": 67376624, "step": 457 }, { "epoch": 0.13202652061112713, "grad_norm": 9.699651714957488, "learning_rate": 0.0001, "loss": 0.0072, "num_input_tokens_seen": 67511632, "step": 458 }, { "epoch": 0.13202652061112713, "loss": 0.005822490900754929, "loss_ce": 0.001746486988849938, "loss_xval": 0.00408935546875, "num_input_tokens_seen": 67511632, "step": 458 }, { "epoch": 0.1323147881233785, "grad_norm": 34.073273051818276, "learning_rate": 0.0001, "loss": 0.0445, "num_input_tokens_seen": 67684128, "step": 459 }, { "epoch": 0.1323147881233785, "loss": 0.026470568031072617, "loss_ce": 0.002163316821679473, "loss_xval": 0.0242919921875, "num_input_tokens_seen": 67684128, "step": 459 }, { "epoch": 0.13260305563562985, "grad_norm": 15.624662552836503, "learning_rate": 0.0001, "loss": 0.0176, "num_input_tokens_seen": 67818912, "step": 460 }, { "epoch": 0.13260305563562985, "loss": 0.02330195903778076, "loss_ce": 0.015149950981140137, "loss_xval": 0.0081787109375, "num_input_tokens_seen": 67818912, "step": 460 }, { "epoch": 0.13289132314788124, "grad_norm": 31.190077918049116, "learning_rate": 0.0001, "loss": 0.0311, "num_input_tokens_seen": 67954008, "step": 461 }, { "epoch": 0.13289132314788124, "loss": 0.0190433319658041, "loss_ce": 0.0015491305384784937, "loss_xval": 0.0174560546875, "num_input_tokens_seen": 67954008, "step": 461 }, { "epoch": 0.1331795906601326, "grad_norm": 47.20861973030621, "learning_rate": 0.0001, "loss": 0.0742, "num_input_tokens_seen": 68126440, "step": 462 }, { "epoch": 0.1331795906601326, "loss": 0.04881232604384422, "loss_ce": 0.0018457742407917976, "loss_xval": 0.046875, "num_input_tokens_seen": 68126440, "step": 462 }, { "epoch": 0.13346785817238396, "grad_norm": 2.6336244187033753, "learning_rate": 0.0001, "loss": 0.0114, "num_input_tokens_seen": 68261192, "step": 463 }, { "epoch": 0.13346785817238396, "loss": 0.018522823229432106, "loss_ce": 0.014969432726502419, "loss_xval": 0.0035552978515625, "num_input_tokens_seen": 68261192, "step": 463 }, { "epoch": 0.13375612568463535, "grad_norm": 57.78556207273457, "learning_rate": 0.0001, "loss": 0.0957, "num_input_tokens_seen": 68396264, "step": 464 }, { "epoch": 0.13375612568463535, "loss": 0.06289799511432648, "loss_ce": 0.001466113142669201, "loss_xval": 0.0615234375, "num_input_tokens_seen": 68396264, "step": 464 }, { "epoch": 0.1340443931968867, "grad_norm": 58.46829235355628, "learning_rate": 0.0001, "loss": 0.1064, "num_input_tokens_seen": 68568856, "step": 465 }, { "epoch": 0.1340443931968867, "loss": 0.08666277676820755, "loss_ce": 0.002129080705344677, "loss_xval": 0.08447265625, "num_input_tokens_seen": 68568856, "step": 465 }, { "epoch": 0.13433266070913807, "grad_norm": 12.440288342385827, "learning_rate": 0.0001, "loss": 0.0193, "num_input_tokens_seen": 68703656, "step": 466 }, { "epoch": 0.13433266070913807, "loss": 0.020905818790197372, "loss_ce": 0.015027369372546673, "loss_xval": 0.005889892578125, "num_input_tokens_seen": 68703656, "step": 466 }, { "epoch": 0.13462092822138946, "grad_norm": 77.7980371215157, "learning_rate": 0.0001, "loss": 0.1704, "num_input_tokens_seen": 68838648, "step": 467 }, { "epoch": 0.13462092822138946, "loss": 0.11293500661849976, "loss_ce": 0.001667913980782032, "loss_xval": 0.111328125, "num_input_tokens_seen": 68838648, "step": 467 }, { "epoch": 0.13490919573364082, "grad_norm": 55.77808662197712, "learning_rate": 0.0001, "loss": 0.0969, "num_input_tokens_seen": 69011080, "step": 468 }, { "epoch": 0.13490919573364082, "loss": 0.0929902121424675, "loss_ce": 0.0019257587846368551, "loss_xval": 0.0908203125, "num_input_tokens_seen": 69011080, "step": 468 }, { "epoch": 0.13519746324589219, "grad_norm": 36.90146649165042, "learning_rate": 0.0001, "loss": 0.0613, "num_input_tokens_seen": 69145880, "step": 469 }, { "epoch": 0.13519746324589219, "loss": 0.03158336132764816, "loss_ce": 0.016259726136922836, "loss_xval": 0.01531982421875, "num_input_tokens_seen": 69145880, "step": 469 }, { "epoch": 0.13548573075814355, "grad_norm": 92.9330163112296, "learning_rate": 0.0001, "loss": 0.2396, "num_input_tokens_seen": 69280872, "step": 470 }, { "epoch": 0.13548573075814355, "loss": 0.17209458351135254, "loss_ce": 0.0018064901232719421, "loss_xval": 0.169921875, "num_input_tokens_seen": 69280872, "step": 470 }, { "epoch": 0.13577399827039494, "grad_norm": 35.17731963697524, "learning_rate": 0.0001, "loss": 0.052, "num_input_tokens_seen": 69453408, "step": 471 }, { "epoch": 0.13577399827039494, "loss": 0.06354454159736633, "loss_ce": 0.0019295515958219767, "loss_xval": 0.0615234375, "num_input_tokens_seen": 69453408, "step": 471 }, { "epoch": 0.1360622657826463, "grad_norm": 78.21099175332745, "learning_rate": 0.0001, "loss": 0.1981, "num_input_tokens_seen": 69588224, "step": 472 }, { "epoch": 0.1360622657826463, "loss": 0.10047487169504166, "loss_ce": 0.017772231251001358, "loss_xval": 0.08251953125, "num_input_tokens_seen": 69588224, "step": 472 }, { "epoch": 0.13635053329489766, "grad_norm": 109.31947715057484, "learning_rate": 0.0001, "loss": 0.3297, "num_input_tokens_seen": 69723480, "step": 473 }, { "epoch": 0.13635053329489766, "loss": 0.27944549918174744, "loss_ce": 0.0019796781707555056, "loss_xval": 0.27734375, "num_input_tokens_seen": 69723480, "step": 473 }, { "epoch": 0.13663880080714905, "grad_norm": 4.425859631788829, "learning_rate": 0.0001, "loss": 0.0315, "num_input_tokens_seen": 69895896, "step": 474 }, { "epoch": 0.13663880080714905, "loss": 0.024713829159736633, "loss_ce": 0.0023749619722366333, "loss_xval": 0.0223388671875, "num_input_tokens_seen": 69895896, "step": 474 }, { "epoch": 0.1369270683194004, "grad_norm": 121.91800725272553, "learning_rate": 0.0001, "loss": 0.4467, "num_input_tokens_seen": 70030768, "step": 475 }, { "epoch": 0.1369270683194004, "loss": 0.25983697175979614, "loss_ce": 0.01789361983537674, "loss_xval": 0.2421875, "num_input_tokens_seen": 70030768, "step": 475 }, { "epoch": 0.13721533583165177, "grad_norm": 102.73057111430288, "learning_rate": 0.0001, "loss": 0.2963, "num_input_tokens_seen": 70165784, "step": 476 }, { "epoch": 0.13721533583165177, "loss": 0.3070387840270996, "loss_ce": 0.002839577617123723, "loss_xval": 0.3046875, "num_input_tokens_seen": 70165784, "step": 476 }, { "epoch": 0.13750360334390313, "grad_norm": 54.2162435395872, "learning_rate": 0.0001, "loss": 0.126, "num_input_tokens_seen": 70338232, "step": 477 }, { "epoch": 0.13750360334390313, "loss": 0.022026684135198593, "loss_ce": 0.004997876472771168, "loss_xval": 0.01708984375, "num_input_tokens_seen": 70338232, "step": 477 }, { "epoch": 0.13779187085615452, "grad_norm": 154.58136133365142, "learning_rate": 0.0001, "loss": 0.6935, "num_input_tokens_seen": 70472952, "step": 478 }, { "epoch": 0.13779187085615452, "loss": 0.506955623626709, "loss_ce": 0.01940680667757988, "loss_xval": 0.48828125, "num_input_tokens_seen": 70472952, "step": 478 }, { "epoch": 0.13808013836840588, "grad_norm": 48.70800586926015, "learning_rate": 0.0001, "loss": 0.0818, "num_input_tokens_seen": 70607984, "step": 479 }, { "epoch": 0.13808013836840588, "loss": 0.12578168511390686, "loss_ce": 0.003101026639342308, "loss_xval": 0.12255859375, "num_input_tokens_seen": 70607984, "step": 479 }, { "epoch": 0.13836840588065724, "grad_norm": 143.0906218557431, "learning_rate": 0.0001, "loss": 0.643, "num_input_tokens_seen": 70780496, "step": 480 }, { "epoch": 0.13836840588065724, "loss": 0.3119392395019531, "loss_ce": 0.004077905789017677, "loss_xval": 0.30859375, "num_input_tokens_seen": 70780496, "step": 480 }, { "epoch": 0.13865667339290863, "grad_norm": 157.58702639731285, "learning_rate": 0.0001, "loss": 0.7503, "num_input_tokens_seen": 70915288, "step": 481 }, { "epoch": 0.13865667339290863, "loss": 0.6724344491958618, "loss_ce": 0.0200907364487648, "loss_xval": 0.65234375, "num_input_tokens_seen": 70915288, "step": 481 }, { "epoch": 0.13894494090516, "grad_norm": 49.559912277556116, "learning_rate": 0.0001, "loss": 0.1068, "num_input_tokens_seen": 71050288, "step": 482 }, { "epoch": 0.13894494090516, "loss": 0.024329371750354767, "loss_ce": 0.0039360010996460915, "loss_xval": 0.0203857421875, "num_input_tokens_seen": 71050288, "step": 482 }, { "epoch": 0.13923320841741135, "grad_norm": 207.90175767039054, "learning_rate": 0.0001, "loss": 1.3603, "num_input_tokens_seen": 71222816, "step": 483 }, { "epoch": 0.13923320841741135, "loss": 0.9580897092819214, "loss_ce": 0.004964680410921574, "loss_xval": 0.953125, "num_input_tokens_seen": 71222816, "step": 483 }, { "epoch": 0.1395214759296627, "grad_norm": 68.2165654788575, "learning_rate": 0.0001, "loss": 0.1706, "num_input_tokens_seen": 71357624, "step": 484 }, { "epoch": 0.1395214759296627, "loss": 0.18781864643096924, "loss_ce": 0.01801883429288864, "loss_xval": 0.169921875, "num_input_tokens_seen": 71357624, "step": 484 }, { "epoch": 0.1398097434419141, "grad_norm": 194.29247828283172, "learning_rate": 0.0001, "loss": 1.242, "num_input_tokens_seen": 71492848, "step": 485 }, { "epoch": 0.1398097434419141, "loss": 0.8029749989509583, "loss_ce": 0.0046351575292646885, "loss_xval": 0.796875, "num_input_tokens_seen": 71492848, "step": 485 }, { "epoch": 0.14009801095416546, "grad_norm": 186.8894329562773, "learning_rate": 0.0001, "loss": 1.195, "num_input_tokens_seen": 71665440, "step": 486 }, { "epoch": 0.14009801095416546, "loss": 1.040757179260254, "loss_ce": 0.009507151320576668, "loss_xval": 1.03125, "num_input_tokens_seen": 71665440, "step": 486 }, { "epoch": 0.14038627846641683, "grad_norm": 97.3981341751347, "learning_rate": 0.0001, "loss": 0.365, "num_input_tokens_seen": 71800264, "step": 487 }, { "epoch": 0.14038627846641683, "loss": 0.25503528118133545, "loss_ce": 0.02139267325401306, "loss_xval": 0.2333984375, "num_input_tokens_seen": 71800264, "step": 487 }, { "epoch": 0.14067454597866821, "grad_norm": 240.16409642375302, "learning_rate": 0.0001, "loss": 1.9637, "num_input_tokens_seen": 71935360, "step": 488 }, { "epoch": 0.14067454597866821, "loss": 1.6441001892089844, "loss_ce": 0.007381412200629711, "loss_xval": 1.640625, "num_input_tokens_seen": 71935360, "step": 488 }, { "epoch": 0.14096281349091958, "grad_norm": 3.504384559163347, "learning_rate": 0.0001, "loss": 0.0421, "num_input_tokens_seen": 72107824, "step": 489 }, { "epoch": 0.14096281349091958, "loss": 0.027433183044195175, "loss_ce": 0.005040910094976425, "loss_xval": 0.0223388671875, "num_input_tokens_seen": 72107824, "step": 489 }, { "epoch": 0.14125108100317094, "grad_norm": 251.4256804889384, "learning_rate": 0.0001, "loss": 1.9899, "num_input_tokens_seen": 72242616, "step": 490 }, { "epoch": 0.14125108100317094, "loss": 1.516852617263794, "loss_ce": 0.016852546483278275, "loss_xval": 1.5, "num_input_tokens_seen": 72242616, "step": 490 }, { "epoch": 0.14153934851542233, "grad_norm": 120.34478539836553, "learning_rate": 0.0001, "loss": 0.4263, "num_input_tokens_seen": 72377696, "step": 491 }, { "epoch": 0.14153934851542233, "loss": 0.3436959981918335, "loss_ce": 0.008979195728898048, "loss_xval": 0.333984375, "num_input_tokens_seen": 72377696, "step": 491 }, { "epoch": 0.1418276160276737, "grad_norm": 191.44123174477102, "learning_rate": 0.0001, "loss": 1.1365, "num_input_tokens_seen": 72550192, "step": 492 }, { "epoch": 0.1418276160276737, "loss": 0.9527350068092346, "loss_ce": 0.006445930339396, "loss_xval": 0.9453125, "num_input_tokens_seen": 72550192, "step": 492 }, { "epoch": 0.14211588353992505, "grad_norm": 142.707840044168, "learning_rate": 0.0001, "loss": 0.8139, "num_input_tokens_seen": 72685120, "step": 493 }, { "epoch": 0.14211588353992505, "loss": 0.5713205933570862, "loss_ce": 0.031037403270602226, "loss_xval": 0.5390625, "num_input_tokens_seen": 72685120, "step": 493 }, { "epoch": 0.1424041510521764, "grad_norm": 83.07267900422377, "learning_rate": 0.0001, "loss": 0.3097, "num_input_tokens_seen": 72820296, "step": 494 }, { "epoch": 0.1424041510521764, "loss": 0.1652831882238388, "loss_ce": 0.009643547236919403, "loss_xval": 0.1552734375, "num_input_tokens_seen": 72820296, "step": 494 }, { "epoch": 0.1426924185644278, "grad_norm": 1183.2500602177688, "learning_rate": 0.0001, "loss": 16.1702, "num_input_tokens_seen": 72992760, "step": 495 }, { "epoch": 0.1426924185644278, "loss": 7.769603729248047, "loss_ce": 0.03132236748933792, "loss_xval": 7.75, "num_input_tokens_seen": 72992760, "step": 495 }, { "epoch": 0.14298068607667916, "grad_norm": 468.23328751112655, "learning_rate": 0.0001, "loss": 2.9613, "num_input_tokens_seen": 73127528, "step": 496 }, { "epoch": 0.14298068607667916, "loss": 2.834989547729492, "loss_ce": 0.2217084765434265, "loss_xval": 2.609375, "num_input_tokens_seen": 73127528, "step": 496 }, { "epoch": 0.14326895358893052, "grad_norm": 3547.408541295576, "learning_rate": 0.0001, "loss": 63.6958, "num_input_tokens_seen": 73262672, "step": 497 }, { "epoch": 0.14326895358893052, "loss": 68.32894134521484, "loss_ce": 0.5789394378662109, "loss_xval": 68.0, "num_input_tokens_seen": 73262672, "step": 497 }, { "epoch": 0.1435572211011819, "grad_norm": 2957.9835631037604, "learning_rate": 0.0001, "loss": 88.0121, "num_input_tokens_seen": 73435112, "step": 498 }, { "epoch": 0.1435572211011819, "loss": 90.78068542480469, "loss_ce": 1.7181835174560547, "loss_xval": 89.0, "num_input_tokens_seen": 73435112, "step": 498 }, { "epoch": 0.14384548861343327, "grad_norm": 144.12515736515584, "learning_rate": 0.0001, "loss": 3.3137, "num_input_tokens_seen": 73569888, "step": 499 }, { "epoch": 0.14384548861343327, "loss": 3.01613450050354, "loss_ce": 2.08986496925354, "loss_xval": 0.92578125, "num_input_tokens_seen": 73569888, "step": 499 }, { "epoch": 0.14413375612568463, "grad_norm": 2182.1687365844186, "learning_rate": 0.0001, "loss": 42.8611, "num_input_tokens_seen": 73704904, "step": 500 }, { "epoch": 0.14413375612568463, "eval_websight_new_IoU": 0.0, "eval_websight_new_MAE_x": 16.08986473083496, "eval_websight_new_MAE_y": 14.622143268585205, "eval_websight_new_NUM_probability": 0.00971380015835166, "eval_websight_new_inside_bbox": 0.0, "eval_websight_new_loss": 261.8992614746094, "eval_websight_new_loss_ce": 1.582156479358673, "eval_websight_new_loss_xval": 260.5, "eval_websight_new_runtime": 36.8782, "eval_websight_new_samples_per_second": 1.356, "eval_websight_new_steps_per_second": 0.054, "num_input_tokens_seen": 73704904, "step": 500 }, { "epoch": 0.14413375612568463, "eval_seeclick_IoU": 0.0, "eval_seeclick_MAE_x": 15.54664421081543, "eval_seeclick_MAE_y": 14.642383098602295, "eval_seeclick_NUM_probability": 0.02366485446691513, "eval_seeclick_inside_bbox": 0.0, "eval_seeclick_loss": 251.6716766357422, "eval_seeclick_loss_ce": 1.5070286989212036, "eval_seeclick_loss_xval": 248.625, "eval_seeclick_runtime": 66.5403, "eval_seeclick_samples_per_second": 0.751, "eval_seeclick_steps_per_second": 0.03, "num_input_tokens_seen": 73704904, "step": 500 }, { "epoch": 0.14413375612568463, "eval_icons_IoU": 0.0, "eval_icons_MAE_x": 16.21195411682129, "eval_icons_MAE_y": 14.966073036193848, "eval_icons_NUM_probability": 0.017178870271891356, "eval_icons_inside_bbox": 0.0, "eval_icons_loss": 252.5450439453125, "eval_icons_loss_ce": 1.3136942386627197, "eval_icons_loss_xval": 250.5625, "eval_icons_runtime": 63.7918, "eval_icons_samples_per_second": 0.784, "eval_icons_steps_per_second": 0.031, "num_input_tokens_seen": 73704904, "step": 500 }, { "epoch": 0.14413375612568463, "loss": 254.4305419921875, "loss_ce": 1.3055286407470703, "loss_xval": 253.0, "num_input_tokens_seen": 73704904, "step": 500 }, { "epoch": 0.144422023637936, "grad_norm": 5415.594835990045, "learning_rate": 0.0001, "loss": 249.5239, "num_input_tokens_seen": 73877312, "step": 501 }, { "epoch": 0.144422023637936, "loss": 248.81724548339844, "loss_ce": 1.3172447681427002, "loss_xval": 248.0, "num_input_tokens_seen": 73877312, "step": 501 }, { "epoch": 0.14471029115018738, "grad_norm": 437.05624804314454, "learning_rate": 0.0001, "loss": 5.4789, "num_input_tokens_seen": 74012064, "step": 502 }, { "epoch": 0.14471029115018738, "loss": 6.111766815185547, "loss_ce": 0.8031734228134155, "loss_xval": 5.3125, "num_input_tokens_seen": 74012064, "step": 502 }, { "epoch": 0.14499855866243874, "grad_norm": 6017.803408310277, "learning_rate": 0.0001, "loss": 35.1959, "num_input_tokens_seen": 74147200, "step": 503 }, { "epoch": 0.14499855866243874, "loss": 24.930065155029297, "loss_ce": 0.7581892609596252, "loss_xval": 24.125, "num_input_tokens_seen": 74147200, "step": 503 }, { "epoch": 0.1452868261746901, "grad_norm": 4925.158357501441, "learning_rate": 0.0001, "loss": 231.4661, "num_input_tokens_seen": 74319696, "step": 504 }, { "epoch": 0.1452868261746901, "loss": 234.67117309570312, "loss_ce": 2.17117977142334, "loss_xval": 232.0, "num_input_tokens_seen": 74319696, "step": 504 }, { "epoch": 0.1455750936869415, "grad_norm": 6083.288980314157, "learning_rate": 0.0001, "loss": 33.7177, "num_input_tokens_seen": 74454544, "step": 505 }, { "epoch": 0.1455750936869415, "loss": 40.13539505004883, "loss_ce": 1.9478960037231445, "loss_xval": 38.25, "num_input_tokens_seen": 74454544, "step": 505 }, { "epoch": 0.14586336119919285, "grad_norm": 1590.7858721635075, "learning_rate": 0.0001, "loss": 35.5616, "num_input_tokens_seen": 74589784, "step": 506 }, { "epoch": 0.14586336119919285, "loss": 31.166776657104492, "loss_ce": 2.5730271339416504, "loss_xval": 28.625, "num_input_tokens_seen": 74589784, "step": 506 }, { "epoch": 0.14615162871144421, "grad_norm": 386.09725857908813, "learning_rate": 0.0001, "loss": 5.0356, "num_input_tokens_seen": 74762248, "step": 507 }, { "epoch": 0.14615162871144421, "loss": 4.796041488647461, "loss_ce": 3.13100266456604, "loss_xval": 1.6640625, "num_input_tokens_seen": 74762248, "step": 507 }, { "epoch": 0.14643989622369558, "grad_norm": 948.5710892886063, "learning_rate": 0.0001, "loss": 19.8862, "num_input_tokens_seen": 74897016, "step": 508 }, { "epoch": 0.14643989622369558, "loss": 19.3146915435791, "loss_ce": 2.8303165435791016, "loss_xval": 16.5, "num_input_tokens_seen": 74897016, "step": 508 }, { "epoch": 0.14672816373594696, "grad_norm": 748.4834585091971, "learning_rate": 0.0001, "loss": 5.8444, "num_input_tokens_seen": 75032128, "step": 509 }, { "epoch": 0.14672816373594696, "loss": 6.053933143615723, "loss_ce": 2.7746365070343018, "loss_xval": 3.28125, "num_input_tokens_seen": 75032128, "step": 509 }, { "epoch": 0.14701643124819833, "grad_norm": 676.2903234596315, "learning_rate": 0.0001, "loss": 5.9856, "num_input_tokens_seen": 75204696, "step": 510 }, { "epoch": 0.14701643124819833, "loss": 5.366189956665039, "loss_ce": 3.422830104827881, "loss_xval": 1.9453125, "num_input_tokens_seen": 75204696, "step": 510 }, { "epoch": 0.1473046987604497, "grad_norm": 894.8470585288454, "learning_rate": 0.0001, "loss": 9.989, "num_input_tokens_seen": 75339472, "step": 511 }, { "epoch": 0.1473046987604497, "loss": 10.071243286132812, "loss_ce": 2.645461320877075, "loss_xval": 7.4375, "num_input_tokens_seen": 75339472, "step": 511 }, { "epoch": 0.14759296627270108, "grad_norm": 1193.6287681118333, "learning_rate": 0.0001, "loss": 2.2828, "num_input_tokens_seen": 75474672, "step": 512 }, { "epoch": 0.14759296627270108, "loss": 2.0058786869049072, "loss_ce": 1.4199411869049072, "loss_xval": 0.5859375, "num_input_tokens_seen": 75474672, "step": 512 }, { "epoch": 0.14788123378495244, "grad_norm": 220.89561497284404, "learning_rate": 0.0001, "loss": 2.2091, "num_input_tokens_seen": 75647152, "step": 513 }, { "epoch": 0.14788123378495244, "loss": 2.175510883331299, "loss_ce": 1.0495343208312988, "loss_xval": 1.125, "num_input_tokens_seen": 75647152, "step": 513 }, { "epoch": 0.1481695012972038, "grad_norm": 245.22955417098655, "learning_rate": 0.0001, "loss": 2.1856, "num_input_tokens_seen": 75781872, "step": 514 }, { "epoch": 0.1481695012972038, "loss": 2.1912238597869873, "loss_ce": 0.5808722972869873, "loss_xval": 1.609375, "num_input_tokens_seen": 75781872, "step": 514 }, { "epoch": 0.1484577688094552, "grad_norm": 59.04888675936134, "learning_rate": 0.0001, "loss": 1.0063, "num_input_tokens_seen": 75916936, "step": 515 }, { "epoch": 0.1484577688094552, "loss": 0.8350945711135864, "loss_ce": 0.5239373445510864, "loss_xval": 0.310546875, "num_input_tokens_seen": 75916936, "step": 515 }, { "epoch": 0.14874603632170655, "grad_norm": 240.02413750733743, "learning_rate": 0.0001, "loss": 1.9709, "num_input_tokens_seen": 76089512, "step": 516 }, { "epoch": 0.14874603632170655, "loss": 1.302797794342041, "loss_ce": 0.599184513092041, "loss_xval": 0.703125, "num_input_tokens_seen": 76089512, "step": 516 }, { "epoch": 0.1490343038339579, "grad_norm": 98.03365050550292, "learning_rate": 0.0001, "loss": 0.8544, "num_input_tokens_seen": 76224288, "step": 517 }, { "epoch": 0.1490343038339579, "loss": 0.7350907325744629, "loss_ce": 0.4651932716369629, "loss_xval": 0.26953125, "num_input_tokens_seen": 76224288, "step": 517 }, { "epoch": 0.14932257134620927, "grad_norm": 90.15703552305258, "learning_rate": 0.0001, "loss": 0.7748, "num_input_tokens_seen": 76359384, "step": 518 }, { "epoch": 0.14932257134620927, "loss": 0.6386263370513916, "loss_ce": 0.4304353594779968, "loss_xval": 0.2080078125, "num_input_tokens_seen": 76359384, "step": 518 }, { "epoch": 0.14961083885846066, "grad_norm": 205.27603193926652, "learning_rate": 0.0001, "loss": 1.8418, "num_input_tokens_seen": 76531944, "step": 519 }, { "epoch": 0.14961083885846066, "loss": 1.29929518699646, "loss_ce": 0.5361114740371704, "loss_xval": 0.76171875, "num_input_tokens_seen": 76531944, "step": 519 }, { "epoch": 0.14989910637071202, "grad_norm": 78.52290789401435, "learning_rate": 0.0001, "loss": 0.7662, "num_input_tokens_seen": 76666672, "step": 520 }, { "epoch": 0.14989910637071202, "loss": 0.9968472123146057, "loss_ce": 0.4050503075122833, "loss_xval": 0.59375, "num_input_tokens_seen": 76666672, "step": 520 }, { "epoch": 0.15018737388296338, "grad_norm": 207.7463656688498, "learning_rate": 0.0001, "loss": 1.3291, "num_input_tokens_seen": 76801760, "step": 521 }, { "epoch": 0.15018737388296338, "loss": 1.097721815109253, "loss_ce": 0.37360072135925293, "loss_xval": 0.72265625, "num_input_tokens_seen": 76801760, "step": 521 }, { "epoch": 0.15047564139521477, "grad_norm": 135.28340638854004, "learning_rate": 0.0001, "loss": 1.0985, "num_input_tokens_seen": 76974288, "step": 522 }, { "epoch": 0.15047564139521477, "loss": 1.4575904607772827, "loss_ce": 0.49762946367263794, "loss_xval": 0.9609375, "num_input_tokens_seen": 76974288, "step": 522 }, { "epoch": 0.15076390890746613, "grad_norm": 24.17987630181005, "learning_rate": 0.0001, "loss": 0.53, "num_input_tokens_seen": 77109096, "step": 523 }, { "epoch": 0.15076390890746613, "loss": 0.5469415187835693, "loss_ce": 0.40826964378356934, "loss_xval": 0.138671875, "num_input_tokens_seen": 77109096, "step": 523 }, { "epoch": 0.1510521764197175, "grad_norm": 100.11571223596611, "learning_rate": 0.0001, "loss": 0.8684, "num_input_tokens_seen": 77244008, "step": 524 }, { "epoch": 0.1510521764197175, "loss": 0.9112217426300049, "loss_ce": 0.3496982455253601, "loss_xval": 0.5625, "num_input_tokens_seen": 77244008, "step": 524 }, { "epoch": 0.15134044393196885, "grad_norm": 49.317152729974175, "learning_rate": 0.0001, "loss": 0.6577, "num_input_tokens_seen": 77416440, "step": 525 }, { "epoch": 0.15134044393196885, "loss": 0.6915754079818726, "loss_ce": 0.48088207840919495, "loss_xval": 0.2109375, "num_input_tokens_seen": 77416440, "step": 525 }, { "epoch": 0.15162871144422024, "grad_norm": 37.88388740046188, "learning_rate": 0.0001, "loss": 0.5633, "num_input_tokens_seen": 77551096, "step": 526 }, { "epoch": 0.15162871144422024, "loss": 0.6765055656433105, "loss_ce": 0.40758466720581055, "loss_xval": 0.26953125, "num_input_tokens_seen": 77551096, "step": 526 }, { "epoch": 0.1519169789564716, "grad_norm": 96.29838625038265, "learning_rate": 0.0001, "loss": 0.539, "num_input_tokens_seen": 77686080, "step": 527 }, { "epoch": 0.1519169789564716, "loss": 0.3767795264720917, "loss_ce": 0.2926425635814667, "loss_xval": 0.083984375, "num_input_tokens_seen": 77686080, "step": 527 }, { "epoch": 0.15220524646872297, "grad_norm": 33.551167364865414, "learning_rate": 0.0001, "loss": 0.4591, "num_input_tokens_seen": 77858584, "step": 528 }, { "epoch": 0.15220524646872297, "loss": 0.4692811965942383, "loss_ce": 0.3921327590942383, "loss_xval": 0.0771484375, "num_input_tokens_seen": 77858584, "step": 528 }, { "epoch": 0.15249351398097435, "grad_norm": 25.602899058495524, "learning_rate": 0.0001, "loss": 0.3878, "num_input_tokens_seen": 77993448, "step": 529 }, { "epoch": 0.15249351398097435, "loss": 0.4130549132823944, "loss_ce": 0.3271784484386444, "loss_xval": 0.0859375, "num_input_tokens_seen": 77993448, "step": 529 }, { "epoch": 0.15278178149322572, "grad_norm": 25.35178128801067, "learning_rate": 0.0001, "loss": 0.3704, "num_input_tokens_seen": 78128592, "step": 530 }, { "epoch": 0.15278178149322572, "loss": 0.3342554569244385, "loss_ce": 0.2731592655181885, "loss_xval": 0.06103515625, "num_input_tokens_seen": 78128592, "step": 530 }, { "epoch": 0.15307004900547708, "grad_norm": 14.81325168312329, "learning_rate": 0.0001, "loss": 0.4114, "num_input_tokens_seen": 78301192, "step": 531 }, { "epoch": 0.15307004900547708, "loss": 0.42171674966812134, "loss_ce": 0.37812238931655884, "loss_xval": 0.043701171875, "num_input_tokens_seen": 78301192, "step": 531 }, { "epoch": 0.15335831651772847, "grad_norm": 24.1013008056333, "learning_rate": 0.0001, "loss": 0.3783, "num_input_tokens_seen": 78436040, "step": 532 }, { "epoch": 0.15335831651772847, "loss": 0.38421040773391724, "loss_ce": 0.31337910890579224, "loss_xval": 0.07080078125, "num_input_tokens_seen": 78436040, "step": 532 }, { "epoch": 0.15364658402997983, "grad_norm": 37.4499157075249, "learning_rate": 0.0001, "loss": 0.392, "num_input_tokens_seen": 78571040, "step": 533 }, { "epoch": 0.15364658402997983, "loss": 0.3791619539260864, "loss_ce": 0.2706414461135864, "loss_xval": 0.1083984375, "num_input_tokens_seen": 78571040, "step": 533 }, { "epoch": 0.1539348515422312, "grad_norm": 12.129510955905058, "learning_rate": 0.0001, "loss": 0.3663, "num_input_tokens_seen": 78743488, "step": 534 }, { "epoch": 0.1539348515422312, "loss": 0.3491227328777313, "loss_ce": 0.3058335483074188, "loss_xval": 0.043212890625, "num_input_tokens_seen": 78743488, "step": 534 }, { "epoch": 0.15422311905448255, "grad_norm": 23.735009533377152, "learning_rate": 0.0001, "loss": 0.3452, "num_input_tokens_seen": 78878208, "step": 535 }, { "epoch": 0.15422311905448255, "loss": 0.39459940791130066, "loss_ce": 0.27881571650505066, "loss_xval": 0.11572265625, "num_input_tokens_seen": 78878208, "step": 535 }, { "epoch": 0.15451138656673394, "grad_norm": 18.28117522583193, "learning_rate": 0.0001, "loss": 0.3246, "num_input_tokens_seen": 79013256, "step": 536 }, { "epoch": 0.15451138656673394, "loss": 0.2895457148551941, "loss_ce": 0.2394358515739441, "loss_xval": 0.050048828125, "num_input_tokens_seen": 79013256, "step": 536 }, { "epoch": 0.1547996540789853, "grad_norm": 6.24255692241842, "learning_rate": 0.0001, "loss": 0.3245, "num_input_tokens_seen": 79185768, "step": 537 }, { "epoch": 0.1547996540789853, "loss": 0.3146858811378479, "loss_ce": 0.2829018235206604, "loss_xval": 0.03173828125, "num_input_tokens_seen": 79185768, "step": 537 }, { "epoch": 0.15508792159123666, "grad_norm": 18.842244440998723, "learning_rate": 0.0001, "loss": 0.302, "num_input_tokens_seen": 79320536, "step": 538 }, { "epoch": 0.15508792159123666, "loss": 0.32134073972702026, "loss_ce": 0.24397867918014526, "loss_xval": 0.0771484375, "num_input_tokens_seen": 79320536, "step": 538 }, { "epoch": 0.15537618910348805, "grad_norm": 13.401811339057847, "learning_rate": 0.0001, "loss": 0.2766, "num_input_tokens_seen": 79455824, "step": 539 }, { "epoch": 0.15537618910348805, "loss": 0.255174458026886, "loss_ce": 0.2099168747663498, "loss_xval": 0.045166015625, "num_input_tokens_seen": 79455824, "step": 539 }, { "epoch": 0.1556644566157394, "grad_norm": 8.87835003264218, "learning_rate": 0.0001, "loss": 0.2796, "num_input_tokens_seen": 79628288, "step": 540 }, { "epoch": 0.1556644566157394, "loss": 0.28175222873687744, "loss_ce": 0.24963247776031494, "loss_xval": 0.0322265625, "num_input_tokens_seen": 79628288, "step": 540 }, { "epoch": 0.15595272412799077, "grad_norm": 10.764691594571945, "learning_rate": 0.0001, "loss": 0.2552, "num_input_tokens_seen": 79763104, "step": 541 }, { "epoch": 0.15595272412799077, "loss": 0.2863326072692871, "loss_ce": 0.2172713279724121, "loss_xval": 0.06884765625, "num_input_tokens_seen": 79763104, "step": 541 }, { "epoch": 0.15624099164024213, "grad_norm": 14.007259191137436, "learning_rate": 0.0001, "loss": 0.2418, "num_input_tokens_seen": 79898184, "step": 542 }, { "epoch": 0.15624099164024213, "loss": 0.21299250423908234, "loss_ce": 0.18197138607501984, "loss_xval": 0.031005859375, "num_input_tokens_seen": 79898184, "step": 542 }, { "epoch": 0.15652925915249352, "grad_norm": 7.639771826560467, "learning_rate": 0.0001, "loss": 0.2555, "num_input_tokens_seen": 80070632, "step": 543 }, { "epoch": 0.15652925915249352, "loss": 0.24649299681186676, "loss_ce": 0.22470344603061676, "loss_xval": 0.021728515625, "num_input_tokens_seen": 80070632, "step": 543 }, { "epoch": 0.15681752666474488, "grad_norm": 16.12636329610739, "learning_rate": 0.0001, "loss": 0.2312, "num_input_tokens_seen": 80205424, "step": 544 }, { "epoch": 0.15681752666474488, "loss": 0.26397019624710083, "loss_ce": 0.19713670015335083, "loss_xval": 0.06689453125, "num_input_tokens_seen": 80205424, "step": 544 }, { "epoch": 0.15710579417699624, "grad_norm": 7.993418479045426, "learning_rate": 0.0001, "loss": 0.2087, "num_input_tokens_seen": 80340528, "step": 545 }, { "epoch": 0.15710579417699624, "loss": 0.18511179089546204, "loss_ce": 0.16072824597358704, "loss_xval": 0.0244140625, "num_input_tokens_seen": 80340528, "step": 545 }, { "epoch": 0.15739406168924763, "grad_norm": 10.595619750311025, "learning_rate": 0.0001, "loss": 0.2364, "num_input_tokens_seen": 80513160, "step": 546 }, { "epoch": 0.15739406168924763, "loss": 0.24273675680160522, "loss_ce": 0.21314996480941772, "loss_xval": 0.029541015625, "num_input_tokens_seen": 80513160, "step": 546 }, { "epoch": 0.157682329201499, "grad_norm": 6.930279341659058, "learning_rate": 0.0001, "loss": 0.1913, "num_input_tokens_seen": 80648040, "step": 547 }, { "epoch": 0.157682329201499, "loss": 0.22323384881019592, "loss_ce": 0.18223348259925842, "loss_xval": 0.041015625, "num_input_tokens_seen": 80648040, "step": 547 }, { "epoch": 0.15797059671375036, "grad_norm": 8.714721085520644, "learning_rate": 0.0001, "loss": 0.1862, "num_input_tokens_seen": 80783152, "step": 548 }, { "epoch": 0.15797059671375036, "loss": 0.15961964428424835, "loss_ce": 0.13665516674518585, "loss_xval": 0.02294921875, "num_input_tokens_seen": 80783152, "step": 548 }, { "epoch": 0.15825886422600172, "grad_norm": 6.696238976718923, "learning_rate": 0.0001, "loss": 0.2109, "num_input_tokens_seen": 80955488, "step": 549 }, { "epoch": 0.15825886422600172, "loss": 0.20110364258289337, "loss_ce": 0.17869611084461212, "loss_xval": 0.0224609375, "num_input_tokens_seen": 80955488, "step": 549 }, { "epoch": 0.1585471317382531, "grad_norm": 4.674473836001116, "learning_rate": 0.0001, "loss": 0.1755, "num_input_tokens_seen": 81090160, "step": 550 }, { "epoch": 0.1585471317382531, "loss": 0.20668087899684906, "loss_ce": 0.16780148446559906, "loss_xval": 0.038818359375, "num_input_tokens_seen": 81090160, "step": 550 }, { "epoch": 0.15883539925050447, "grad_norm": 5.464329287182501, "learning_rate": 0.0001, "loss": 0.1676, "num_input_tokens_seen": 81225456, "step": 551 }, { "epoch": 0.15883539925050447, "loss": 0.14501583576202393, "loss_ce": 0.12630856037139893, "loss_xval": 0.0186767578125, "num_input_tokens_seen": 81225456, "step": 551 }, { "epoch": 0.15912366676275583, "grad_norm": 15.54738056116537, "learning_rate": 0.0001, "loss": 0.1926, "num_input_tokens_seen": 81397864, "step": 552 }, { "epoch": 0.15912366676275583, "loss": 0.19035065174102783, "loss_ce": 0.16454041004180908, "loss_xval": 0.0257568359375, "num_input_tokens_seen": 81397864, "step": 552 }, { "epoch": 0.15941193427500722, "grad_norm": 9.823333175994385, "learning_rate": 0.0001, "loss": 0.1606, "num_input_tokens_seen": 81532616, "step": 553 }, { "epoch": 0.15941193427500722, "loss": 0.18590956926345825, "loss_ce": 0.155170738697052, "loss_xval": 0.03076171875, "num_input_tokens_seen": 81532616, "step": 553 }, { "epoch": 0.15970020178725858, "grad_norm": 11.081119125897253, "learning_rate": 0.0001, "loss": 0.1459, "num_input_tokens_seen": 81667744, "step": 554 }, { "epoch": 0.15970020178725858, "loss": 0.1296243667602539, "loss_ce": 0.11304950714111328, "loss_xval": 0.0166015625, "num_input_tokens_seen": 81667744, "step": 554 }, { "epoch": 0.15998846929950994, "grad_norm": 10.365269177618455, "learning_rate": 0.0001, "loss": 0.1766, "num_input_tokens_seen": 81840440, "step": 555 }, { "epoch": 0.15998846929950994, "loss": 0.17812153697013855, "loss_ce": 0.15227314829826355, "loss_xval": 0.02587890625, "num_input_tokens_seen": 81840440, "step": 555 }, { "epoch": 0.16027673681176133, "grad_norm": 16.301766057599238, "learning_rate": 0.0001, "loss": 0.1505, "num_input_tokens_seen": 81975280, "step": 556 }, { "epoch": 0.16027673681176133, "loss": 0.1683354675769806, "loss_ce": 0.1410680115222931, "loss_xval": 0.0272216796875, "num_input_tokens_seen": 81975280, "step": 556 }, { "epoch": 0.1605650043240127, "grad_norm": 3.957116361728929, "learning_rate": 0.0001, "loss": 0.1385, "num_input_tokens_seen": 82110312, "step": 557 }, { "epoch": 0.1605650043240127, "loss": 0.12488369643688202, "loss_ce": 0.10235408693552017, "loss_xval": 0.0225830078125, "num_input_tokens_seen": 82110312, "step": 557 }, { "epoch": 0.16085327183626405, "grad_norm": 16.430070798635164, "learning_rate": 0.0001, "loss": 0.1404, "num_input_tokens_seen": 82282880, "step": 558 }, { "epoch": 0.16085327183626405, "loss": 0.13027092814445496, "loss_ce": 0.11491294205188751, "loss_xval": 0.015380859375, "num_input_tokens_seen": 82282880, "step": 558 }, { "epoch": 0.1611415393485154, "grad_norm": 5.8377270996113575, "learning_rate": 0.0001, "loss": 0.1266, "num_input_tokens_seen": 82417696, "step": 559 }, { "epoch": 0.1611415393485154, "loss": 0.14256012439727783, "loss_ce": 0.12391388416290283, "loss_xval": 0.0186767578125, "num_input_tokens_seen": 82417696, "step": 559 }, { "epoch": 0.1614298068607668, "grad_norm": 8.064025990014587, "learning_rate": 0.0001, "loss": 0.1194, "num_input_tokens_seen": 82552728, "step": 560 }, { "epoch": 0.1614298068607668, "loss": 0.10484252870082855, "loss_ce": 0.08726440370082855, "loss_xval": 0.017578125, "num_input_tokens_seen": 82552728, "step": 560 }, { "epoch": 0.16171807437301816, "grad_norm": 20.098214952476035, "learning_rate": 0.0001, "loss": 0.124, "num_input_tokens_seen": 82725352, "step": 561 }, { "epoch": 0.16171807437301816, "loss": 0.11904975771903992, "loss_ce": 0.09915992617607117, "loss_xval": 0.0198974609375, "num_input_tokens_seen": 82725352, "step": 561 }, { "epoch": 0.16200634188526952, "grad_norm": 7.901668286399249, "learning_rate": 0.0001, "loss": 0.1153, "num_input_tokens_seen": 82860184, "step": 562 }, { "epoch": 0.16200634188526952, "loss": 0.12794066965579987, "loss_ce": 0.10678437352180481, "loss_xval": 0.0211181640625, "num_input_tokens_seen": 82860184, "step": 562 }, { "epoch": 0.1622946093975209, "grad_norm": 6.294120755456603, "learning_rate": 0.0001, "loss": 0.0987, "num_input_tokens_seen": 82995280, "step": 563 }, { "epoch": 0.1622946093975209, "loss": 0.09468768537044525, "loss_ce": 0.079879030585289, "loss_xval": 0.01483154296875, "num_input_tokens_seen": 82995280, "step": 563 }, { "epoch": 0.16258287690977227, "grad_norm": 11.065605042496632, "learning_rate": 0.0001, "loss": 0.114, "num_input_tokens_seen": 83167720, "step": 564 }, { "epoch": 0.16258287690977227, "loss": 0.10833622515201569, "loss_ce": 0.08724857866764069, "loss_xval": 0.0211181640625, "num_input_tokens_seen": 83167720, "step": 564 }, { "epoch": 0.16287114442202363, "grad_norm": 9.562876080765191, "learning_rate": 0.0001, "loss": 0.0975, "num_input_tokens_seen": 83302488, "step": 565 }, { "epoch": 0.16287114442202363, "loss": 0.11148280650377274, "loss_ce": 0.09303493797779083, "loss_xval": 0.0184326171875, "num_input_tokens_seen": 83302488, "step": 565 }, { "epoch": 0.163159411934275, "grad_norm": 6.714478129953445, "learning_rate": 0.0001, "loss": 0.0921, "num_input_tokens_seen": 83437424, "step": 566 }, { "epoch": 0.163159411934275, "loss": 0.0829567164182663, "loss_ce": 0.06752245128154755, "loss_xval": 0.01544189453125, "num_input_tokens_seen": 83437424, "step": 566 }, { "epoch": 0.16344767944652638, "grad_norm": 4.429600475589518, "learning_rate": 0.0001, "loss": 0.1009, "num_input_tokens_seen": 83610000, "step": 567 }, { "epoch": 0.16344767944652638, "loss": 0.09480486810207367, "loss_ce": 0.07885180413722992, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 83610000, "step": 567 }, { "epoch": 0.16373594695877774, "grad_norm": 8.38652460181145, "learning_rate": 0.0001, "loss": 0.0913, "num_input_tokens_seen": 83744816, "step": 568 }, { "epoch": 0.16373594695877774, "loss": 0.10490494966506958, "loss_ce": 0.08726578950881958, "loss_xval": 0.017578125, "num_input_tokens_seen": 83744816, "step": 568 }, { "epoch": 0.1640242144710291, "grad_norm": 4.1872332233055545, "learning_rate": 0.0001, "loss": 0.0846, "num_input_tokens_seen": 83879928, "step": 569 }, { "epoch": 0.1640242144710291, "loss": 0.07664161920547485, "loss_ce": 0.060459673404693604, "loss_xval": 0.0162353515625, "num_input_tokens_seen": 83879928, "step": 569 }, { "epoch": 0.1643124819832805, "grad_norm": 6.256329066436033, "learning_rate": 0.0001, "loss": 0.0895, "num_input_tokens_seen": 84052392, "step": 570 }, { "epoch": 0.1643124819832805, "loss": 0.08122982084751129, "loss_ce": 0.06586040556430817, "loss_xval": 0.015380859375, "num_input_tokens_seen": 84052392, "step": 570 }, { "epoch": 0.16460074949553186, "grad_norm": 5.398141020968809, "learning_rate": 0.0001, "loss": 0.0777, "num_input_tokens_seen": 84187232, "step": 571 }, { "epoch": 0.16460074949553186, "loss": 0.0879916399717331, "loss_ce": 0.0741061419248581, "loss_xval": 0.013916015625, "num_input_tokens_seen": 84187232, "step": 571 }, { "epoch": 0.16488901700778322, "grad_norm": 4.902929545953649, "learning_rate": 0.0001, "loss": 0.0698, "num_input_tokens_seen": 84322272, "step": 572 }, { "epoch": 0.16488901700778322, "loss": 0.06459660828113556, "loss_ce": 0.05026097595691681, "loss_xval": 0.01434326171875, "num_input_tokens_seen": 84322272, "step": 572 }, { "epoch": 0.16517728452003458, "grad_norm": 9.525756844667548, "learning_rate": 0.0001, "loss": 0.08, "num_input_tokens_seen": 84494768, "step": 573 }, { "epoch": 0.16517728452003458, "loss": 0.07009953260421753, "loss_ce": 0.05596226826310158, "loss_xval": 0.01416015625, "num_input_tokens_seen": 84494768, "step": 573 }, { "epoch": 0.16546555203228597, "grad_norm": 5.313776134745736, "learning_rate": 0.0001, "loss": 0.0721, "num_input_tokens_seen": 84629568, "step": 574 }, { "epoch": 0.16546555203228597, "loss": 0.08415266871452332, "loss_ce": 0.07041975855827332, "loss_xval": 0.01373291015625, "num_input_tokens_seen": 84629568, "step": 574 }, { "epoch": 0.16575381954453733, "grad_norm": 9.375819271631455, "learning_rate": 0.0001, "loss": 0.0646, "num_input_tokens_seen": 84764472, "step": 575 }, { "epoch": 0.16575381954453733, "loss": 0.05780894309282303, "loss_ce": 0.04357249289751053, "loss_xval": 0.01422119140625, "num_input_tokens_seen": 84764472, "step": 575 }, { "epoch": 0.1660420870567887, "grad_norm": 5.726067399275158, "learning_rate": 0.0001, "loss": 0.0764, "num_input_tokens_seen": 84937008, "step": 576 }, { "epoch": 0.1660420870567887, "loss": 0.07145722210407257, "loss_ce": 0.05638917163014412, "loss_xval": 0.01507568359375, "num_input_tokens_seen": 84937008, "step": 576 }, { "epoch": 0.16633035456904008, "grad_norm": 9.691222941127505, "learning_rate": 0.0001, "loss": 0.0636, "num_input_tokens_seen": 85071872, "step": 577 }, { "epoch": 0.16633035456904008, "loss": 0.07909021526575089, "loss_ce": 0.06402216106653214, "loss_xval": 0.01507568359375, "num_input_tokens_seen": 85071872, "step": 577 }, { "epoch": 0.16661862208129144, "grad_norm": 9.189882675536726, "learning_rate": 0.0001, "loss": 0.0571, "num_input_tokens_seen": 85207072, "step": 578 }, { "epoch": 0.16661862208129144, "loss": 0.04689406231045723, "loss_ce": 0.03543471172451973, "loss_xval": 0.011474609375, "num_input_tokens_seen": 85207072, "step": 578 }, { "epoch": 0.1669068895935428, "grad_norm": 7.183323348640227, "learning_rate": 0.0001, "loss": 0.0647, "num_input_tokens_seen": 85379712, "step": 579 }, { "epoch": 0.1669068895935428, "loss": 0.0562661737203598, "loss_ce": 0.0435708612203598, "loss_xval": 0.0126953125, "num_input_tokens_seen": 85379712, "step": 579 }, { "epoch": 0.1671951571057942, "grad_norm": 7.625349683880436, "learning_rate": 0.0001, "loss": 0.0572, "num_input_tokens_seen": 85514480, "step": 580 }, { "epoch": 0.1671951571057942, "loss": 0.07185040414333344, "loss_ce": 0.058354005217552185, "loss_xval": 0.01348876953125, "num_input_tokens_seen": 85514480, "step": 580 }, { "epoch": 0.16748342461804555, "grad_norm": 7.025990709824806, "learning_rate": 0.0001, "loss": 0.048, "num_input_tokens_seen": 85649536, "step": 581 }, { "epoch": 0.16748342461804555, "loss": 0.0399930477142334, "loss_ce": 0.02949499897658825, "loss_xval": 0.010498046875, "num_input_tokens_seen": 85649536, "step": 581 }, { "epoch": 0.1677716921302969, "grad_norm": 9.258264080755461, "learning_rate": 0.0001, "loss": 0.0616, "num_input_tokens_seen": 85822208, "step": 582 }, { "epoch": 0.1677716921302969, "loss": 0.06172851473093033, "loss_ce": 0.05022338777780533, "loss_xval": 0.011474609375, "num_input_tokens_seen": 85822208, "step": 582 }, { "epoch": 0.16805995964254827, "grad_norm": 5.89116810275108, "learning_rate": 0.0001, "loss": 0.0495, "num_input_tokens_seen": 85957008, "step": 583 }, { "epoch": 0.16805995964254827, "loss": 0.06277771294116974, "loss_ce": 0.04876251146197319, "loss_xval": 0.0140380859375, "num_input_tokens_seen": 85957008, "step": 583 }, { "epoch": 0.16834822715479966, "grad_norm": 10.182576802733074, "learning_rate": 0.0001, "loss": 0.0435, "num_input_tokens_seen": 86092000, "step": 584 }, { "epoch": 0.16834822715479966, "loss": 0.03622671216726303, "loss_ce": 0.02325674146413803, "loss_xval": 0.012939453125, "num_input_tokens_seen": 86092000, "step": 584 }, { "epoch": 0.16863649466705102, "grad_norm": 15.131169214119659, "learning_rate": 0.0001, "loss": 0.052, "num_input_tokens_seen": 86264520, "step": 585 }, { "epoch": 0.16863649466705102, "loss": 0.04348108172416687, "loss_ce": 0.029843537136912346, "loss_xval": 0.01361083984375, "num_input_tokens_seen": 86264520, "step": 585 }, { "epoch": 0.16892476217930238, "grad_norm": 6.965049780112206, "learning_rate": 0.0001, "loss": 0.0414, "num_input_tokens_seen": 86399272, "step": 586 }, { "epoch": 0.16892476217930238, "loss": 0.04911273717880249, "loss_ce": 0.04123920202255249, "loss_xval": 0.00787353515625, "num_input_tokens_seen": 86399272, "step": 586 }, { "epoch": 0.16921302969155377, "grad_norm": 4.913891385632238, "learning_rate": 0.0001, "loss": 0.0341, "num_input_tokens_seen": 86534408, "step": 587 }, { "epoch": 0.16921302969155377, "loss": 0.027581913396716118, "loss_ce": 0.020135624334216118, "loss_xval": 0.0074462890625, "num_input_tokens_seen": 86534408, "step": 587 }, { "epoch": 0.16950129720380513, "grad_norm": 2.482330696638268, "learning_rate": 0.0001, "loss": 0.0473, "num_input_tokens_seen": 86706880, "step": 588 }, { "epoch": 0.16950129720380513, "loss": 0.045104727149009705, "loss_ce": 0.037139639258384705, "loss_xval": 0.0079345703125, "num_input_tokens_seen": 86706880, "step": 588 }, { "epoch": 0.1697895647160565, "grad_norm": 7.563645548775605, "learning_rate": 0.0001, "loss": 0.037, "num_input_tokens_seen": 86841632, "step": 589 }, { "epoch": 0.1697895647160565, "loss": 0.04473777860403061, "loss_ce": 0.03411003202199936, "loss_xval": 0.0106201171875, "num_input_tokens_seen": 86841632, "step": 589 }, { "epoch": 0.17007783222830786, "grad_norm": 6.679156719751855, "learning_rate": 0.0001, "loss": 0.0334, "num_input_tokens_seen": 86976736, "step": 590 }, { "epoch": 0.17007783222830786, "loss": 0.024894479662179947, "loss_ce": 0.017337564378976822, "loss_xval": 0.007568359375, "num_input_tokens_seen": 86976736, "step": 590 }, { "epoch": 0.17036609974055925, "grad_norm": 4.787149514617314, "learning_rate": 0.0001, "loss": 0.0418, "num_input_tokens_seen": 87149400, "step": 591 }, { "epoch": 0.17036609974055925, "loss": 0.04200609773397446, "loss_ce": 0.028608879074454308, "loss_xval": 0.013427734375, "num_input_tokens_seen": 87149400, "step": 591 }, { "epoch": 0.1706543672528106, "grad_norm": 11.614698173696578, "learning_rate": 0.0001, "loss": 0.0334, "num_input_tokens_seen": 87284192, "step": 592 }, { "epoch": 0.1706543672528106, "loss": 0.04323431849479675, "loss_ce": 0.03306815028190613, "loss_xval": 0.01019287109375, "num_input_tokens_seen": 87284192, "step": 592 }, { "epoch": 0.17094263476506197, "grad_norm": 2.397756579315926, "learning_rate": 0.0001, "loss": 0.0244, "num_input_tokens_seen": 87419176, "step": 593 }, { "epoch": 0.17094263476506197, "loss": 0.020117074251174927, "loss_ce": 0.013338357210159302, "loss_xval": 0.00677490234375, "num_input_tokens_seen": 87419176, "step": 593 }, { "epoch": 0.17123090227731336, "grad_norm": 9.654477039282371, "learning_rate": 0.0001, "loss": 0.034, "num_input_tokens_seen": 87591752, "step": 594 }, { "epoch": 0.17123090227731336, "loss": 0.028524693101644516, "loss_ce": 0.019094761461019516, "loss_xval": 0.0093994140625, "num_input_tokens_seen": 87591752, "step": 594 }, { "epoch": 0.17151916978956472, "grad_norm": 5.367170509753708, "learning_rate": 0.0001, "loss": 0.0299, "num_input_tokens_seen": 87726488, "step": 595 }, { "epoch": 0.17151916978956472, "loss": 0.038070403039455414, "loss_ce": 0.027030669152736664, "loss_xval": 0.01104736328125, "num_input_tokens_seen": 87726488, "step": 595 }, { "epoch": 0.17180743730181608, "grad_norm": 5.695877234150102, "learning_rate": 0.0001, "loss": 0.0245, "num_input_tokens_seen": 87861664, "step": 596 }, { "epoch": 0.17180743730181608, "loss": 0.02125311642885208, "loss_ce": 0.010629183612763882, "loss_xval": 0.0106201171875, "num_input_tokens_seen": 87861664, "step": 596 }, { "epoch": 0.17209570481406747, "grad_norm": 5.263429908122342, "learning_rate": 0.0001, "loss": 0.0362, "num_input_tokens_seen": 88034152, "step": 597 }, { "epoch": 0.17209570481406747, "loss": 0.03428371623158455, "loss_ce": 0.025364950299263, "loss_xval": 0.0089111328125, "num_input_tokens_seen": 88034152, "step": 597 }, { "epoch": 0.17238397232631883, "grad_norm": 8.564515731366683, "learning_rate": 0.0001, "loss": 0.0273, "num_input_tokens_seen": 88168984, "step": 598 }, { "epoch": 0.17238397232631883, "loss": 0.03684600442647934, "loss_ce": 0.025626981630921364, "loss_xval": 0.01123046875, "num_input_tokens_seen": 88168984, "step": 598 }, { "epoch": 0.1726722398385702, "grad_norm": 9.058382947276888, "learning_rate": 0.0001, "loss": 0.0233, "num_input_tokens_seen": 88303992, "step": 599 }, { "epoch": 0.1726722398385702, "loss": 0.016890352591872215, "loss_ce": 0.00854760967195034, "loss_xval": 0.00836181640625, "num_input_tokens_seen": 88303992, "step": 599 }, { "epoch": 0.17296050735082155, "grad_norm": 8.279263806480797, "learning_rate": 0.0001, "loss": 0.0318, "num_input_tokens_seen": 88476368, "step": 600 }, { "epoch": 0.17296050735082155, "loss": 0.026282694190740585, "loss_ce": 0.01596393808722496, "loss_xval": 0.01031494140625, "num_input_tokens_seen": 88476368, "step": 600 }, { "epoch": 0.17324877486307294, "grad_norm": 15.192892311476902, "learning_rate": 0.0001, "loss": 0.031, "num_input_tokens_seen": 88611096, "step": 601 }, { "epoch": 0.17324877486307294, "loss": 0.036679480224847794, "loss_ce": 0.023183079436421394, "loss_xval": 0.01348876953125, "num_input_tokens_seen": 88611096, "step": 601 }, { "epoch": 0.1735370423753243, "grad_norm": 4.358761804382471, "learning_rate": 0.0001, "loss": 0.0182, "num_input_tokens_seen": 88746200, "step": 602 }, { "epoch": 0.1735370423753243, "loss": 0.014774775132536888, "loss_ce": 0.008205866441130638, "loss_xval": 0.006561279296875, "num_input_tokens_seen": 88746200, "step": 602 }, { "epoch": 0.17382530988757566, "grad_norm": 20.32798700944008, "learning_rate": 0.0001, "loss": 0.0386, "num_input_tokens_seen": 88918640, "step": 603 }, { "epoch": 0.17382530988757566, "loss": 0.03068140149116516, "loss_ce": 0.014491826295852661, "loss_xval": 0.0162353515625, "num_input_tokens_seen": 88918640, "step": 603 }, { "epoch": 0.17411357739982705, "grad_norm": 1.549470192136012, "learning_rate": 0.0001, "loss": 0.0223, "num_input_tokens_seen": 89053512, "step": 604 }, { "epoch": 0.17411357739982705, "loss": 0.03184882178902626, "loss_ce": 0.023847492411732674, "loss_xval": 0.00799560546875, "num_input_tokens_seen": 89053512, "step": 604 }, { "epoch": 0.1744018449120784, "grad_norm": 22.991785780271417, "learning_rate": 0.0001, "loss": 0.0402, "num_input_tokens_seen": 89188560, "step": 605 }, { "epoch": 0.1744018449120784, "loss": 0.025367528200149536, "loss_ce": 0.006500035524368286, "loss_xval": 0.0189208984375, "num_input_tokens_seen": 89188560, "step": 605 }, { "epoch": 0.17469011242432977, "grad_norm": 10.869462051705277, "learning_rate": 0.0001, "loss": 0.0263, "num_input_tokens_seen": 89361072, "step": 606 }, { "epoch": 0.17469011242432977, "loss": 0.023846114054322243, "loss_ce": 0.013390026986598969, "loss_xval": 0.01043701171875, "num_input_tokens_seen": 89361072, "step": 606 }, { "epoch": 0.17497837993658114, "grad_norm": 23.37286302176032, "learning_rate": 0.0001, "loss": 0.0335, "num_input_tokens_seen": 89495952, "step": 607 }, { "epoch": 0.17497837993658114, "loss": 0.03683967888355255, "loss_ce": 0.0210620928555727, "loss_xval": 0.0157470703125, "num_input_tokens_seen": 89495952, "step": 607 }, { "epoch": 0.17526664744883252, "grad_norm": 14.326681445640265, "learning_rate": 0.0001, "loss": 0.0235, "num_input_tokens_seen": 89630984, "step": 608 }, { "epoch": 0.17526664744883252, "loss": 0.012337716296315193, "loss_ce": 0.004445107653737068, "loss_xval": 0.00787353515625, "num_input_tokens_seen": 89630984, "step": 608 }, { "epoch": 0.17555491496108389, "grad_norm": 26.295209187947552, "learning_rate": 0.0001, "loss": 0.041, "num_input_tokens_seen": 89803488, "step": 609 }, { "epoch": 0.17555491496108389, "loss": 0.03529471904039383, "loss_ce": 0.013344951905310154, "loss_xval": 0.02197265625, "num_input_tokens_seen": 89803488, "step": 609 }, { "epoch": 0.17584318247333525, "grad_norm": 30.32216800331154, "learning_rate": 0.0001, "loss": 0.0455, "num_input_tokens_seen": 89938248, "step": 610 }, { "epoch": 0.17584318247333525, "loss": 0.0387270450592041, "loss_ce": 0.020073173567652702, "loss_xval": 0.0186767578125, "num_input_tokens_seen": 89938248, "step": 610 }, { "epoch": 0.17613144998558664, "grad_norm": 16.927809740603625, "learning_rate": 0.0001, "loss": 0.0234, "num_input_tokens_seen": 90073272, "step": 611 }, { "epoch": 0.17613144998558664, "loss": 0.024793002754449844, "loss_ce": 0.00980124156922102, "loss_xval": 0.0150146484375, "num_input_tokens_seen": 90073272, "step": 611 }, { "epoch": 0.176419717497838, "grad_norm": 43.083889005233594, "learning_rate": 0.0001, "loss": 0.0792, "num_input_tokens_seen": 90245760, "step": 612 }, { "epoch": 0.176419717497838, "loss": 0.05786873772740364, "loss_ce": 0.01784493774175644, "loss_xval": 0.0400390625, "num_input_tokens_seen": 90245760, "step": 612 }, { "epoch": 0.17670798501008936, "grad_norm": 4.502134202248031, "learning_rate": 0.0001, "loss": 0.0221, "num_input_tokens_seen": 90380496, "step": 613 }, { "epoch": 0.17670798501008936, "loss": 0.03407268226146698, "loss_ce": 0.026391787454485893, "loss_xval": 0.0076904296875, "num_input_tokens_seen": 90380496, "step": 613 }, { "epoch": 0.17699625252234072, "grad_norm": 46.63692973207371, "learning_rate": 0.0001, "loss": 0.0823, "num_input_tokens_seen": 90515528, "step": 614 }, { "epoch": 0.17699625252234072, "loss": 0.05757968872785568, "loss_ce": 0.006340674124658108, "loss_xval": 0.05126953125, "num_input_tokens_seen": 90515528, "step": 614 }, { "epoch": 0.1772845200345921, "grad_norm": 14.615420262559676, "learning_rate": 0.0001, "loss": 0.0273, "num_input_tokens_seen": 90688016, "step": 615 }, { "epoch": 0.1772845200345921, "loss": 0.02321012318134308, "loss_ce": 0.010972573421895504, "loss_xval": 0.01220703125, "num_input_tokens_seen": 90688016, "step": 615 }, { "epoch": 0.17757278754684347, "grad_norm": 55.01904078945244, "learning_rate": 0.0001, "loss": 0.1086, "num_input_tokens_seen": 90822800, "step": 616 }, { "epoch": 0.17757278754684347, "loss": 0.06659901142120361, "loss_ce": 0.020364880561828613, "loss_xval": 0.046142578125, "num_input_tokens_seen": 90822800, "step": 616 }, { "epoch": 0.17786105505909483, "grad_norm": 34.16231678059451, "learning_rate": 0.0001, "loss": 0.056, "num_input_tokens_seen": 90956392, "step": 617 }, { "epoch": 0.17786105505909483, "loss": 0.02499392069876194, "loss_ce": 0.004318260587751865, "loss_xval": 0.0206298828125, "num_input_tokens_seen": 90956392, "step": 617 }, { "epoch": 0.17814932257134622, "grad_norm": 59.454902458580015, "learning_rate": 0.0001, "loss": 0.1291, "num_input_tokens_seen": 91128864, "step": 618 }, { "epoch": 0.17814932257134622, "loss": 0.08348599821329117, "loss_ce": 0.0071920487098395824, "loss_xval": 0.076171875, "num_input_tokens_seen": 91128864, "step": 618 }, { "epoch": 0.17843759008359758, "grad_norm": 76.31438217203026, "learning_rate": 0.0001, "loss": 0.2037, "num_input_tokens_seen": 91263672, "step": 619 }, { "epoch": 0.17843759008359758, "loss": 0.10991793125867844, "loss_ce": 0.018182091414928436, "loss_xval": 0.091796875, "num_input_tokens_seen": 91263672, "step": 619 }, { "epoch": 0.17872585759584894, "grad_norm": 33.35197047290332, "learning_rate": 0.0001, "loss": 0.0502, "num_input_tokens_seen": 91398624, "step": 620 }, { "epoch": 0.17872585759584894, "loss": 0.04731812700629234, "loss_ce": 0.003342297160997987, "loss_xval": 0.0439453125, "num_input_tokens_seen": 91398624, "step": 620 }, { "epoch": 0.17901412510810033, "grad_norm": 104.89909603419869, "learning_rate": 0.0001, "loss": 0.3838, "num_input_tokens_seen": 91571032, "step": 621 }, { "epoch": 0.17901412510810033, "loss": 0.25730788707733154, "loss_ce": 0.008772751316428185, "loss_xval": 0.248046875, "num_input_tokens_seen": 91571032, "step": 621 }, { "epoch": 0.1793023926203517, "grad_norm": 2.5475132522315587, "learning_rate": 0.0001, "loss": 0.019, "num_input_tokens_seen": 91705872, "step": 622 }, { "epoch": 0.1793023926203517, "loss": 0.024016141891479492, "loss_ce": 0.016756772994995117, "loss_xval": 0.00726318359375, "num_input_tokens_seen": 91705872, "step": 622 }, { "epoch": 0.17959066013260305, "grad_norm": 112.72775966928485, "learning_rate": 0.0001, "loss": 0.4688, "num_input_tokens_seen": 91841016, "step": 623 }, { "epoch": 0.17959066013260305, "loss": 0.2900291681289673, "loss_ce": 0.0035301523748785257, "loss_xval": 0.287109375, "num_input_tokens_seen": 91841016, "step": 623 }, { "epoch": 0.17987892764485441, "grad_norm": 29.55511489330518, "learning_rate": 0.0001, "loss": 0.0575, "num_input_tokens_seen": 92013560, "step": 624 }, { "epoch": 0.17987892764485441, "loss": 0.030054394155740738, "loss_ce": 0.009149854071438313, "loss_xval": 0.0208740234375, "num_input_tokens_seen": 92013560, "step": 624 }, { "epoch": 0.1801671951571058, "grad_norm": 137.37470881232548, "learning_rate": 0.0001, "loss": 0.7034, "num_input_tokens_seen": 92148376, "step": 625 }, { "epoch": 0.1801671951571058, "loss": 0.40648776292800903, "loss_ce": 0.01659516617655754, "loss_xval": 0.390625, "num_input_tokens_seen": 92148376, "step": 625 }, { "epoch": 0.18045546266935716, "grad_norm": 79.15124205085066, "learning_rate": 0.0001, "loss": 0.2883, "num_input_tokens_seen": 92283352, "step": 626 }, { "epoch": 0.18045546266935716, "loss": 0.09200652688741684, "loss_ce": 0.003688656259328127, "loss_xval": 0.08837890625, "num_input_tokens_seen": 92283352, "step": 626 }, { "epoch": 0.18074373018160853, "grad_norm": 135.12030726633364, "learning_rate": 0.0001, "loss": 0.6984, "num_input_tokens_seen": 92455888, "step": 627 }, { "epoch": 0.18074373018160853, "loss": 0.5769015550613403, "loss_ce": 0.008542162366211414, "loss_xval": 0.5703125, "num_input_tokens_seen": 92455888, "step": 627 }, { "epoch": 0.18103199769385991, "grad_norm": 129.47937214572832, "learning_rate": 0.0001, "loss": 0.6816, "num_input_tokens_seen": 92590720, "step": 628 }, { "epoch": 0.18103199769385991, "loss": 0.3277011215686798, "loss_ce": 0.021304648369550705, "loss_xval": 0.306640625, "num_input_tokens_seen": 92590720, "step": 628 }, { "epoch": 0.18132026520611128, "grad_norm": 103.8174512889404, "learning_rate": 0.0001, "loss": 0.425, "num_input_tokens_seen": 92725696, "step": 629 }, { "epoch": 0.18132026520611128, "loss": 0.5176563262939453, "loss_ce": 0.004228599369525909, "loss_xval": 0.51171875, "num_input_tokens_seen": 92725696, "step": 629 }, { "epoch": 0.18160853271836264, "grad_norm": 148.7103706126506, "learning_rate": 0.0001, "loss": 0.8761, "num_input_tokens_seen": 92898232, "step": 630 }, { "epoch": 0.18160853271836264, "loss": 0.683671236038208, "loss_ce": 0.011307948268949986, "loss_xval": 0.671875, "num_input_tokens_seen": 92898232, "step": 630 }, { "epoch": 0.181896800230614, "grad_norm": 81.09085148470248, "learning_rate": 0.0001, "loss": 0.2746, "num_input_tokens_seen": 93032992, "step": 631 }, { "epoch": 0.181896800230614, "loss": 0.3315165042877197, "loss_ce": 0.021213779225945473, "loss_xval": 0.310546875, "num_input_tokens_seen": 93032992, "step": 631 }, { "epoch": 0.1821850677428654, "grad_norm": 134.49367353766172, "learning_rate": 0.0001, "loss": 0.7242, "num_input_tokens_seen": 93167984, "step": 632 }, { "epoch": 0.1821850677428654, "loss": 0.6176624298095703, "loss_ce": 0.005846067331731319, "loss_xval": 0.61328125, "num_input_tokens_seen": 93167984, "step": 632 }, { "epoch": 0.18247333525511675, "grad_norm": 82.28557695859297, "learning_rate": 0.0001, "loss": 0.295, "num_input_tokens_seen": 93340568, "step": 633 }, { "epoch": 0.18247333525511675, "loss": 0.32379481196403503, "loss_ce": 0.011294808238744736, "loss_xval": 0.3125, "num_input_tokens_seen": 93340568, "step": 633 }, { "epoch": 0.1827616027673681, "grad_norm": 115.07263488502592, "learning_rate": 0.0001, "loss": 0.5633, "num_input_tokens_seen": 93475448, "step": 634 }, { "epoch": 0.1827616027673681, "loss": 0.43479472398757935, "loss_ce": 0.021464616060256958, "loss_xval": 0.4140625, "num_input_tokens_seen": 93475448, "step": 634 }, { "epoch": 0.1830498702796195, "grad_norm": 86.6727154697341, "learning_rate": 0.0001, "loss": 0.3386, "num_input_tokens_seen": 93610488, "step": 635 }, { "epoch": 0.1830498702796195, "loss": 0.44108590483665466, "loss_ce": 0.006027324125170708, "loss_xval": 0.435546875, "num_input_tokens_seen": 93610488, "step": 635 }, { "epoch": 0.18333813779187086, "grad_norm": 96.04215122807649, "learning_rate": 0.0001, "loss": 0.3946, "num_input_tokens_seen": 93782984, "step": 636 }, { "epoch": 0.18333813779187086, "loss": 0.3574924170970917, "loss_ce": 0.01032444927841425, "loss_xval": 0.34765625, "num_input_tokens_seen": 93782984, "step": 636 }, { "epoch": 0.18362640530412222, "grad_norm": 81.44290209799817, "learning_rate": 0.0001, "loss": 0.3073, "num_input_tokens_seen": 93917776, "step": 637 }, { "epoch": 0.18362640530412222, "loss": 0.3946782946586609, "loss_ce": 0.01772518828511238, "loss_xval": 0.376953125, "num_input_tokens_seen": 93917776, "step": 637 }, { "epoch": 0.18391467281637358, "grad_norm": 50.44860046713553, "learning_rate": 0.0001, "loss": 0.1365, "num_input_tokens_seen": 94052704, "step": 638 }, { "epoch": 0.18391467281637358, "loss": 0.17554911971092224, "loss_ce": 0.007824521511793137, "loss_xval": 0.16796875, "num_input_tokens_seen": 94052704, "step": 638 }, { "epoch": 0.18420294032862497, "grad_norm": 76.33064807793744, "learning_rate": 0.0001, "loss": 0.2706, "num_input_tokens_seen": 94225128, "step": 639 }, { "epoch": 0.18420294032862497, "loss": 0.27219584584236145, "loss_ce": 0.010232958011329174, "loss_xval": 0.26171875, "num_input_tokens_seen": 94225128, "step": 639 }, { "epoch": 0.18449120784087633, "grad_norm": 2.9693015176271773, "learning_rate": 0.0001, "loss": 0.028, "num_input_tokens_seen": 94359936, "step": 640 }, { "epoch": 0.18449120784087633, "loss": 0.029297899454832077, "loss_ce": 0.018612932413816452, "loss_xval": 0.01068115234375, "num_input_tokens_seen": 94359936, "step": 640 }, { "epoch": 0.1847794753531277, "grad_norm": 78.18709435660247, "learning_rate": 0.0001, "loss": 0.294, "num_input_tokens_seen": 94495104, "step": 641 }, { "epoch": 0.1847794753531277, "loss": 0.23611602187156677, "loss_ce": 0.006379688158631325, "loss_xval": 0.2294921875, "num_input_tokens_seen": 94495104, "step": 641 }, { "epoch": 0.18506774286537908, "grad_norm": 36.01208626464418, "learning_rate": 0.0001, "loss": 0.0792, "num_input_tokens_seen": 94667656, "step": 642 }, { "epoch": 0.18506774286537908, "loss": 0.084808848798275, "loss_ce": 0.008514909073710442, "loss_xval": 0.076171875, "num_input_tokens_seen": 94667656, "step": 642 }, { "epoch": 0.18535601037763044, "grad_norm": 71.56368152016078, "learning_rate": 0.0001, "loss": 0.2428, "num_input_tokens_seen": 94802504, "step": 643 }, { "epoch": 0.18535601037763044, "loss": 0.23120847344398499, "loss_ce": 0.01905028149485588, "loss_xval": 0.2119140625, "num_input_tokens_seen": 94802504, "step": 643 }, { "epoch": 0.1856442778898818, "grad_norm": 48.96488819717949, "learning_rate": 0.0001, "loss": 0.1255, "num_input_tokens_seen": 94937664, "step": 644 }, { "epoch": 0.1856442778898818, "loss": 0.14206674695014954, "loss_ce": 0.005836280062794685, "loss_xval": 0.13671875, "num_input_tokens_seen": 94937664, "step": 644 }, { "epoch": 0.1859325454021332, "grad_norm": 41.79536487740748, "learning_rate": 0.0001, "loss": 0.1032, "num_input_tokens_seen": 95110192, "step": 645 }, { "epoch": 0.1859325454021332, "loss": 0.10236231237649918, "loss_ce": 0.007940929383039474, "loss_xval": 0.09423828125, "num_input_tokens_seen": 95110192, "step": 645 }, { "epoch": 0.18622081291438455, "grad_norm": 55.58375506803601, "learning_rate": 0.0001, "loss": 0.1666, "num_input_tokens_seen": 95245008, "step": 646 }, { "epoch": 0.18622081291438455, "loss": 0.18501755595207214, "loss_ce": 0.022175753489136696, "loss_xval": 0.1630859375, "num_input_tokens_seen": 95245008, "step": 646 }, { "epoch": 0.18650908042663591, "grad_norm": 10.179949024257645, "learning_rate": 0.0001, "loss": 0.0234, "num_input_tokens_seen": 95380144, "step": 647 }, { "epoch": 0.18650908042663591, "loss": 0.020446889102458954, "loss_ce": 0.0061799222603440285, "loss_xval": 0.0142822265625, "num_input_tokens_seen": 95380144, "step": 647 }, { "epoch": 0.18679734793888728, "grad_norm": 57.440270290522434, "learning_rate": 0.0001, "loss": 0.1697, "num_input_tokens_seen": 95552656, "step": 648 }, { "epoch": 0.18679734793888728, "loss": 0.1528579443693161, "loss_ce": 0.011012241244316101, "loss_xval": 0.1416015625, "num_input_tokens_seen": 95552656, "step": 648 }, { "epoch": 0.18708561545113866, "grad_norm": 15.56417725902896, "learning_rate": 0.0001, "loss": 0.0357, "num_input_tokens_seen": 95687496, "step": 649 }, { "epoch": 0.18708561545113866, "loss": 0.04545750468969345, "loss_ce": 0.01956333965063095, "loss_xval": 0.02587890625, "num_input_tokens_seen": 95687496, "step": 649 }, { "epoch": 0.18737388296339003, "grad_norm": 47.34908410562357, "learning_rate": 0.0001, "loss": 0.1196, "num_input_tokens_seen": 95822704, "step": 650 }, { "epoch": 0.18737388296339003, "loss": 0.12299247086048126, "loss_ce": 0.008124311454594135, "loss_xval": 0.11474609375, "num_input_tokens_seen": 95822704, "step": 650 }, { "epoch": 0.1876621504756414, "grad_norm": 30.048830082668516, "learning_rate": 0.0001, "loss": 0.0648, "num_input_tokens_seen": 95995240, "step": 651 }, { "epoch": 0.1876621504756414, "loss": 0.05971190333366394, "loss_ce": 0.01140257716178894, "loss_xval": 0.04833984375, "num_input_tokens_seen": 95995240, "step": 651 }, { "epoch": 0.18795041798789278, "grad_norm": 31.16166204897759, "learning_rate": 0.0001, "loss": 0.0639, "num_input_tokens_seen": 96130024, "step": 652 }, { "epoch": 0.18795041798789278, "loss": 0.07345931231975555, "loss_ce": 0.017398525029420853, "loss_xval": 0.05615234375, "num_input_tokens_seen": 96130024, "step": 652 }, { "epoch": 0.18823868550014414, "grad_norm": 36.79897939166327, "learning_rate": 0.0001, "loss": 0.0791, "num_input_tokens_seen": 96265224, "step": 653 }, { "epoch": 0.18823868550014414, "loss": 0.07308580726385117, "loss_ce": 0.0050010960549116135, "loss_xval": 0.06787109375, "num_input_tokens_seen": 96265224, "step": 653 }, { "epoch": 0.1885269530123955, "grad_norm": 10.089448574264999, "learning_rate": 0.0001, "loss": 0.0271, "num_input_tokens_seen": 96437712, "step": 654 }, { "epoch": 0.1885269530123955, "loss": 0.018405906856060028, "loss_ce": 0.008525840938091278, "loss_xval": 0.0098876953125, "num_input_tokens_seen": 96437712, "step": 654 }, { "epoch": 0.18881522052464686, "grad_norm": 40.4976776327298, "learning_rate": 0.0001, "loss": 0.0954, "num_input_tokens_seen": 96572520, "step": 655 }, { "epoch": 0.18881522052464686, "loss": 0.09976939857006073, "loss_ce": 0.01621226966381073, "loss_xval": 0.08349609375, "num_input_tokens_seen": 96572520, "step": 655 }, { "epoch": 0.18910348803689825, "grad_norm": 6.340013313622405, "learning_rate": 0.0001, "loss": 0.0149, "num_input_tokens_seen": 96707664, "step": 656 }, { "epoch": 0.18910348803689825, "loss": 0.016466163098812103, "loss_ce": 0.005746862851083279, "loss_xval": 0.0107421875, "num_input_tokens_seen": 96707664, "step": 656 }, { "epoch": 0.1893917555491496, "grad_norm": 35.91003124190502, "learning_rate": 0.0001, "loss": 0.0812, "num_input_tokens_seen": 96880240, "step": 657 }, { "epoch": 0.1893917555491496, "loss": 0.07167474180459976, "loss_ce": 0.008564389310777187, "loss_xval": 0.06298828125, "num_input_tokens_seen": 96880240, "step": 657 }, { "epoch": 0.18968002306140097, "grad_norm": 15.718505266645101, "learning_rate": 0.0001, "loss": 0.032, "num_input_tokens_seen": 97014944, "step": 658 }, { "epoch": 0.18968002306140097, "loss": 0.04226473718881607, "loss_ce": 0.01769808493554592, "loss_xval": 0.0245361328125, "num_input_tokens_seen": 97014944, "step": 658 }, { "epoch": 0.18996829057365236, "grad_norm": 24.304796089580112, "learning_rate": 0.0001, "loss": 0.0398, "num_input_tokens_seen": 97150216, "step": 659 }, { "epoch": 0.18996829057365236, "loss": 0.04164959490299225, "loss_ce": 0.003929869271814823, "loss_xval": 0.03759765625, "num_input_tokens_seen": 97150216, "step": 659 }, { "epoch": 0.19025655808590372, "grad_norm": 22.686082472452636, "learning_rate": 0.0001, "loss": 0.0456, "num_input_tokens_seen": 97322824, "step": 660 }, { "epoch": 0.19025655808590372, "loss": 0.041275255382061005, "loss_ce": 0.013794178143143654, "loss_xval": 0.0274658203125, "num_input_tokens_seen": 97322824, "step": 660 }, { "epoch": 0.19054482559815508, "grad_norm": 10.923850233615978, "learning_rate": 0.0001, "loss": 0.0263, "num_input_tokens_seen": 97457624, "step": 661 }, { "epoch": 0.19054482559815508, "loss": 0.03429355472326279, "loss_ce": 0.020759012550115585, "loss_xval": 0.0135498046875, "num_input_tokens_seen": 97457624, "step": 661 }, { "epoch": 0.19083309311040647, "grad_norm": 24.784504672493853, "learning_rate": 0.0001, "loss": 0.0422, "num_input_tokens_seen": 97592712, "step": 662 }, { "epoch": 0.19083309311040647, "loss": 0.038780368864536285, "loss_ce": 0.00423447135835886, "loss_xval": 0.03466796875, "num_input_tokens_seen": 97592712, "step": 662 }, { "epoch": 0.19112136062265783, "grad_norm": 1.9143655089211267, "learning_rate": 0.0001, "loss": 0.0183, "num_input_tokens_seen": 97765232, "step": 663 }, { "epoch": 0.19112136062265783, "loss": 0.013931534253060818, "loss_ce": 0.009016296826303005, "loss_xval": 0.004913330078125, "num_input_tokens_seen": 97765232, "step": 663 }, { "epoch": 0.1914096281349092, "grad_norm": 25.36963085600519, "learning_rate": 0.0001, "loss": 0.0476, "num_input_tokens_seen": 97900040, "step": 664 }, { "epoch": 0.1914096281349092, "loss": 0.05295349657535553, "loss_ce": 0.01700379140675068, "loss_xval": 0.035888671875, "num_input_tokens_seen": 97900040, "step": 664 }, { "epoch": 0.19169789564716055, "grad_norm": 7.826629884765334, "learning_rate": 0.0001, "loss": 0.0125, "num_input_tokens_seen": 98035160, "step": 665 }, { "epoch": 0.19169789564716055, "loss": 0.012243002653121948, "loss_ce": 0.003486365545541048, "loss_xval": 0.00872802734375, "num_input_tokens_seen": 98035160, "step": 665 }, { "epoch": 0.19198616315941194, "grad_norm": 22.006637229215777, "learning_rate": 0.0001, "loss": 0.0409, "num_input_tokens_seen": 98207640, "step": 666 }, { "epoch": 0.19198616315941194, "loss": 0.03680358827114105, "loss_ce": 0.007033688947558403, "loss_xval": 0.02978515625, "num_input_tokens_seen": 98207640, "step": 666 }, { "epoch": 0.1922744306716633, "grad_norm": 11.690800860739543, "learning_rate": 0.0001, "loss": 0.0231, "num_input_tokens_seen": 98342408, "step": 667 }, { "epoch": 0.1922744306716633, "loss": 0.02821066603064537, "loss_ce": 0.01575949415564537, "loss_xval": 0.012451171875, "num_input_tokens_seen": 98342408, "step": 667 }, { "epoch": 0.19256269818391467, "grad_norm": 16.620870830369185, "learning_rate": 0.0001, "loss": 0.0225, "num_input_tokens_seen": 98477512, "step": 668 }, { "epoch": 0.19256269818391467, "loss": 0.02170976996421814, "loss_ce": 0.0031550817657262087, "loss_xval": 0.0185546875, "num_input_tokens_seen": 98477512, "step": 668 }, { "epoch": 0.19285096569616605, "grad_norm": 14.19915075241969, "learning_rate": 0.0001, "loss": 0.0252, "num_input_tokens_seen": 98649920, "step": 669 }, { "epoch": 0.19285096569616605, "loss": 0.016638725996017456, "loss_ce": 0.005190819036215544, "loss_xval": 0.011474609375, "num_input_tokens_seen": 98649920, "step": 669 }, { "epoch": 0.19313923320841742, "grad_norm": 11.114309643706601, "learning_rate": 0.0001, "loss": 0.0221, "num_input_tokens_seen": 98784720, "step": 670 }, { "epoch": 0.19313923320841742, "loss": 0.028857605531811714, "loss_ce": 0.016703978180885315, "loss_xval": 0.01214599609375, "num_input_tokens_seen": 98784720, "step": 670 }, { "epoch": 0.19342750072066878, "grad_norm": 15.49157272490426, "learning_rate": 0.0001, "loss": 0.0214, "num_input_tokens_seen": 98919832, "step": 671 }, { "epoch": 0.19342750072066878, "loss": 0.020468585193157196, "loss_ce": 0.003065934870392084, "loss_xval": 0.0174560546875, "num_input_tokens_seen": 98919832, "step": 671 }, { "epoch": 0.19371576823292014, "grad_norm": 6.639652599123291, "learning_rate": 0.0001, "loss": 0.0223, "num_input_tokens_seen": 99092272, "step": 672 }, { "epoch": 0.19371576823292014, "loss": 0.021392960101366043, "loss_ce": 0.014747758395969868, "loss_xval": 0.00665283203125, "num_input_tokens_seen": 99092272, "step": 672 }, { "epoch": 0.19400403574517153, "grad_norm": 15.668890241160856, "learning_rate": 0.0001, "loss": 0.0259, "num_input_tokens_seen": 99227176, "step": 673 }, { "epoch": 0.19400403574517153, "loss": 0.03154763579368591, "loss_ce": 0.013893214985728264, "loss_xval": 0.0177001953125, "num_input_tokens_seen": 99227176, "step": 673 }, { "epoch": 0.1942923032574229, "grad_norm": 4.130485453269547, "learning_rate": 0.0001, "loss": 0.0097, "num_input_tokens_seen": 99362336, "step": 674 }, { "epoch": 0.1942923032574229, "loss": 0.009054271504282951, "loss_ce": 0.0033589282538741827, "loss_xval": 0.005706787109375, "num_input_tokens_seen": 99362336, "step": 674 }, { "epoch": 0.19458057076967425, "grad_norm": 12.946791305625748, "learning_rate": 0.0001, "loss": 0.021, "num_input_tokens_seen": 99534984, "step": 675 }, { "epoch": 0.19458057076967425, "loss": 0.01664827950298786, "loss_ce": 0.004082666710019112, "loss_xval": 0.0125732421875, "num_input_tokens_seen": 99534984, "step": 675 }, { "epoch": 0.19486883828192564, "grad_norm": 2.015939391342304, "learning_rate": 0.0001, "loss": 0.0139, "num_input_tokens_seen": 99669744, "step": 676 }, { "epoch": 0.19486883828192564, "loss": 0.01962101459503174, "loss_ce": 0.015081525780260563, "loss_xval": 0.004547119140625, "num_input_tokens_seen": 99669744, "step": 676 }, { "epoch": 0.195157105794177, "grad_norm": 10.133951813447016, "learning_rate": 0.0001, "loss": 0.0146, "num_input_tokens_seen": 99804800, "step": 677 }, { "epoch": 0.195157105794177, "loss": 0.012844789773225784, "loss_ce": 0.0024726272094994783, "loss_xval": 0.0103759765625, "num_input_tokens_seen": 99804800, "step": 677 }, { "epoch": 0.19544537330642836, "grad_norm": 0.7346199890040997, "learning_rate": 0.0001, "loss": 0.0157, "num_input_tokens_seen": 99977304, "step": 678 }, { "epoch": 0.19544537330642836, "loss": 0.011482784524559975, "loss_ce": 0.008564542047679424, "loss_xval": 0.0029144287109375, "num_input_tokens_seen": 99977304, "step": 678 }, { "epoch": 0.19573364081867972, "grad_norm": 8.687525798170709, "learning_rate": 0.0001, "loss": 0.0166, "num_input_tokens_seen": 100112096, "step": 679 }, { "epoch": 0.19573364081867972, "loss": 0.022103361785411835, "loss_ce": 0.01443963497877121, "loss_xval": 0.007659912109375, "num_input_tokens_seen": 100112096, "step": 679 }, { "epoch": 0.1960219083309311, "grad_norm": 0.622549468221611, "learning_rate": 0.0001, "loss": 0.0071, "num_input_tokens_seen": 100247088, "step": 680 }, { "epoch": 0.1960219083309311, "loss": 0.006348044611513615, "loss_ce": 0.0022729947231709957, "loss_xval": 0.00408935546875, "num_input_tokens_seen": 100247088, "step": 680 }, { "epoch": 0.19631017584318247, "grad_norm": 7.192674512777773, "learning_rate": 0.0001, "loss": 0.0147, "num_input_tokens_seen": 100419584, "step": 681 }, { "epoch": 0.19631017584318247, "loss": 0.009196193888783455, "loss_ce": 0.00399676151573658, "loss_xval": 0.00518798828125, "num_input_tokens_seen": 100419584, "step": 681 }, { "epoch": 0.19659844335543383, "grad_norm": 0.7655775148366553, "learning_rate": 0.0001, "loss": 0.0116, "num_input_tokens_seen": 100554328, "step": 682 }, { "epoch": 0.19659844335543383, "loss": 0.01690344139933586, "loss_ce": 0.012867492623627186, "loss_xval": 0.0040283203125, "num_input_tokens_seen": 100554328, "step": 682 }, { "epoch": 0.19688671086768522, "grad_norm": 7.242367892671911, "learning_rate": 0.0001, "loss": 0.0106, "num_input_tokens_seen": 100689416, "step": 683 }, { "epoch": 0.19688671086768522, "loss": 0.009902188554406166, "loss_ce": 0.002890774980187416, "loss_xval": 0.00701904296875, "num_input_tokens_seen": 100689416, "step": 683 }, { "epoch": 0.19717497837993658, "grad_norm": 1.2326886099093166, "learning_rate": 0.0001, "loss": 0.0119, "num_input_tokens_seen": 100861864, "step": 684 }, { "epoch": 0.19717497837993658, "loss": 0.006663970649242401, "loss_ce": 0.0037590786814689636, "loss_xval": 0.002899169921875, "num_input_tokens_seen": 100861864, "step": 684 }, { "epoch": 0.19746324589218794, "grad_norm": 6.2741974114302375, "learning_rate": 0.0001, "loss": 0.0135, "num_input_tokens_seen": 100996592, "step": 685 }, { "epoch": 0.19746324589218794, "loss": 0.01968025602400303, "loss_ce": 0.013534778729081154, "loss_xval": 0.006134033203125, "num_input_tokens_seen": 100996592, "step": 685 }, { "epoch": 0.19775151340443933, "grad_norm": 2.38411098441469, "learning_rate": 0.0001, "loss": 0.0059, "num_input_tokens_seen": 101131656, "step": 686 }, { "epoch": 0.19775151340443933, "loss": 0.005984088871628046, "loss_ce": 0.0022304265294224024, "loss_xval": 0.003753662109375, "num_input_tokens_seen": 101131656, "step": 686 }, { "epoch": 0.1980397809166907, "grad_norm": 4.6953093680133104, "learning_rate": 0.0001, "loss": 0.0151, "num_input_tokens_seen": 101304152, "step": 687 }, { "epoch": 0.1980397809166907, "loss": 0.012748443521559238, "loss_ce": 0.009401046670973301, "loss_xval": 0.0033416748046875, "num_input_tokens_seen": 101304152, "step": 687 }, { "epoch": 0.19832804842894206, "grad_norm": 2.421385194885947, "learning_rate": 0.0001, "loss": 0.0119, "num_input_tokens_seen": 101439032, "step": 688 }, { "epoch": 0.19832804842894206, "loss": 0.0180523619055748, "loss_ce": 0.014428399503231049, "loss_xval": 0.003631591796875, "num_input_tokens_seen": 101439032, "step": 688 }, { "epoch": 0.19861631594119342, "grad_norm": 4.6660438919960985, "learning_rate": 0.0001, "loss": 0.0062, "num_input_tokens_seen": 101574136, "step": 689 }, { "epoch": 0.19861631594119342, "loss": 0.0060625383630394936, "loss_ce": 0.0021047904156148434, "loss_xval": 0.00396728515625, "num_input_tokens_seen": 101574136, "step": 689 }, { "epoch": 0.1989045834534448, "grad_norm": 4.1145697211269425, "learning_rate": 0.0001, "loss": 0.0116, "num_input_tokens_seen": 101746584, "step": 690 }, { "epoch": 0.1989045834534448, "loss": 0.007049012929201126, "loss_ce": 0.003686357755213976, "loss_xval": 0.00335693359375, "num_input_tokens_seen": 101746584, "step": 690 }, { "epoch": 0.19919285096569617, "grad_norm": 3.2637992516048917, "learning_rate": 0.0001, "loss": 0.0128, "num_input_tokens_seen": 101881448, "step": 691 }, { "epoch": 0.19919285096569617, "loss": 0.019784413278102875, "loss_ce": 0.01585432142019272, "loss_xval": 0.003936767578125, "num_input_tokens_seen": 101881448, "step": 691 }, { "epoch": 0.19948111847794753, "grad_norm": 4.695312015427915, "learning_rate": 0.0001, "loss": 0.0054, "num_input_tokens_seen": 102016520, "step": 692 }, { "epoch": 0.19948111847794753, "loss": 0.005383410956710577, "loss_ce": 0.0017766145756468177, "loss_xval": 0.00360107421875, "num_input_tokens_seen": 102016520, "step": 692 }, { "epoch": 0.19976938599019892, "grad_norm": 1.9018232659248924, "learning_rate": 0.0001, "loss": 0.0118, "num_input_tokens_seen": 102189000, "step": 693 }, { "epoch": 0.19976938599019892, "loss": 0.006288346368819475, "loss_ce": 0.0036495295353233814, "loss_xval": 0.0026397705078125, "num_input_tokens_seen": 102189000, "step": 693 }, { "epoch": 0.20005765350245028, "grad_norm": 5.098530938515272, "learning_rate": 0.0001, "loss": 0.0109, "num_input_tokens_seen": 102323736, "step": 694 }, { "epoch": 0.20005765350245028, "loss": 0.016563966870307922, "loss_ce": 0.012800769880414009, "loss_xval": 0.0037689208984375, "num_input_tokens_seen": 102323736, "step": 694 }, { "epoch": 0.20034592101470164, "grad_norm": 0.43345396702123434, "learning_rate": 0.0001, "loss": 0.0052, "num_input_tokens_seen": 102458768, "step": 695 }, { "epoch": 0.20034592101470164, "loss": 0.004343533888459206, "loss_ce": 0.0017161609139293432, "loss_xval": 0.00262451171875, "num_input_tokens_seen": 102458768, "step": 695 }, { "epoch": 0.200634188526953, "grad_norm": 5.545324349512925, "learning_rate": 0.0001, "loss": 0.0148, "num_input_tokens_seen": 102631224, "step": 696 }, { "epoch": 0.200634188526953, "loss": 0.007838984951376915, "loss_ce": 0.004875918850302696, "loss_xval": 0.002960205078125, "num_input_tokens_seen": 102631224, "step": 696 }, { "epoch": 0.2009224560392044, "grad_norm": 2.378620571444512, "learning_rate": 0.0001, "loss": 0.0105, "num_input_tokens_seen": 102766000, "step": 697 }, { "epoch": 0.2009224560392044, "loss": 0.016783371567726135, "loss_ce": 0.014165535569190979, "loss_xval": 0.00262451171875, "num_input_tokens_seen": 102766000, "step": 697 }, { "epoch": 0.20121072355145575, "grad_norm": 4.147280176248775, "learning_rate": 0.0001, "loss": 0.0054, "num_input_tokens_seen": 102901240, "step": 698 }, { "epoch": 0.20121072355145575, "loss": 0.0046518342569470406, "loss_ce": 0.0016077059553936124, "loss_xval": 0.0030517578125, "num_input_tokens_seen": 102901240, "step": 698 }, { "epoch": 0.2014989910637071, "grad_norm": 4.334942145827397, "learning_rate": 0.0001, "loss": 0.0134, "num_input_tokens_seen": 103073640, "step": 699 }, { "epoch": 0.2014989910637071, "loss": 0.008360013365745544, "loss_ce": 0.005029782652854919, "loss_xval": 0.003326416015625, "num_input_tokens_seen": 103073640, "step": 699 }, { "epoch": 0.2017872585759585, "grad_norm": 2.053810403349566, "learning_rate": 0.0001, "loss": 0.0116, "num_input_tokens_seen": 103208504, "step": 700 }, { "epoch": 0.2017872585759585, "loss": 0.018476512283086777, "loss_ce": 0.015882518142461777, "loss_xval": 0.002593994140625, "num_input_tokens_seen": 103208504, "step": 700 }, { "epoch": 0.20207552608820986, "grad_norm": 6.464624251825052, "learning_rate": 0.0001, "loss": 0.007, "num_input_tokens_seen": 103343472, "step": 701 }, { "epoch": 0.20207552608820986, "loss": 0.006439526565372944, "loss_ce": 0.0027354557532817125, "loss_xval": 0.0037078857421875, "num_input_tokens_seen": 103343472, "step": 701 }, { "epoch": 0.20236379360046122, "grad_norm": 2.7451701528896693, "learning_rate": 0.0001, "loss": 0.0103, "num_input_tokens_seen": 103515840, "step": 702 }, { "epoch": 0.20236379360046122, "loss": 0.007059911731630564, "loss_ce": 0.004708150401711464, "loss_xval": 0.002349853515625, "num_input_tokens_seen": 103515840, "step": 702 }, { "epoch": 0.20265206111271258, "grad_norm": 3.7432213865383077, "learning_rate": 0.0001, "loss": 0.0101, "num_input_tokens_seen": 103650608, "step": 703 }, { "epoch": 0.20265206111271258, "loss": 0.016106905415654182, "loss_ce": 0.01323634572327137, "loss_xval": 0.00286865234375, "num_input_tokens_seen": 103650608, "step": 703 }, { "epoch": 0.20294032862496397, "grad_norm": 5.118536747197136, "learning_rate": 0.0001, "loss": 0.0066, "num_input_tokens_seen": 103785736, "step": 704 }, { "epoch": 0.20294032862496397, "loss": 0.0067821950651705265, "loss_ce": 0.00405754754319787, "loss_xval": 0.0027313232421875, "num_input_tokens_seen": 103785736, "step": 704 }, { "epoch": 0.20322859613721533, "grad_norm": 0.7402285579455906, "learning_rate": 0.0001, "loss": 0.0091, "num_input_tokens_seen": 103958312, "step": 705 }, { "epoch": 0.20322859613721533, "loss": 0.004566242918372154, "loss_ce": 0.002834369894117117, "loss_xval": 0.00173187255859375, "num_input_tokens_seen": 103958312, "step": 705 }, { "epoch": 0.2035168636494667, "grad_norm": 3.5297354964922336, "learning_rate": 0.0001, "loss": 0.0143, "num_input_tokens_seen": 104093096, "step": 706 }, { "epoch": 0.2035168636494667, "loss": 0.018068354576826096, "loss_ce": 0.015509648248553276, "loss_xval": 0.0025634765625, "num_input_tokens_seen": 104093096, "step": 706 }, { "epoch": 0.20380513116171808, "grad_norm": 2.0238108681732356, "learning_rate": 0.0001, "loss": 0.0037, "num_input_tokens_seen": 104228072, "step": 707 }, { "epoch": 0.20380513116171808, "loss": 0.0034702238626778126, "loss_ce": 0.0015657362528145313, "loss_xval": 0.0019073486328125, "num_input_tokens_seen": 104228072, "step": 707 }, { "epoch": 0.20409339867396945, "grad_norm": 2.8823502193143353, "learning_rate": 0.0001, "loss": 0.0102, "num_input_tokens_seen": 104400696, "step": 708 }, { "epoch": 0.20409339867396945, "loss": 0.00361913931556046, "loss_ce": 0.001668875222094357, "loss_xval": 0.001953125, "num_input_tokens_seen": 104400696, "step": 708 }, { "epoch": 0.2043816661862208, "grad_norm": 3.995092893791365, "learning_rate": 0.0001, "loss": 0.0091, "num_input_tokens_seen": 104535552, "step": 709 }, { "epoch": 0.2043816661862208, "loss": 0.014175212942063808, "loss_ce": 0.011864460073411465, "loss_xval": 0.0023040771484375, "num_input_tokens_seen": 104535552, "step": 709 }, { "epoch": 0.2046699336984722, "grad_norm": 0.9230410321679317, "learning_rate": 0.0001, "loss": 0.0063, "num_input_tokens_seen": 104670800, "step": 710 }, { "epoch": 0.2046699336984722, "loss": 0.0030780972447246313, "loss_ce": 0.001169794937595725, "loss_xval": 0.0019073486328125, "num_input_tokens_seen": 104670800, "step": 710 }, { "epoch": 0.20495820121072356, "grad_norm": 6.386372898528216, "learning_rate": 0.0001, "loss": 0.0107, "num_input_tokens_seen": 104843208, "step": 711 }, { "epoch": 0.20495820121072356, "loss": 0.004922128282487392, "loss_ce": 0.0021431210916489363, "loss_xval": 0.002777099609375, "num_input_tokens_seen": 104843208, "step": 711 }, { "epoch": 0.20524646872297492, "grad_norm": 4.493493805742633, "learning_rate": 0.0001, "loss": 0.0084, "num_input_tokens_seen": 104977992, "step": 712 }, { "epoch": 0.20524646872297492, "loss": 0.01235833391547203, "loss_ce": 0.010562565177679062, "loss_xval": 0.00179290771484375, "num_input_tokens_seen": 104977992, "step": 712 }, { "epoch": 0.20553473623522628, "grad_norm": 4.1652319135334634, "learning_rate": 0.0001, "loss": 0.006, "num_input_tokens_seen": 105113024, "step": 713 }, { "epoch": 0.20553473623522628, "loss": 0.007076140493154526, "loss_ce": 0.004552717786282301, "loss_xval": 0.0025177001953125, "num_input_tokens_seen": 105113024, "step": 713 }, { "epoch": 0.20582300374747767, "grad_norm": 9.277139468460556, "learning_rate": 0.0001, "loss": 0.0172, "num_input_tokens_seen": 105285544, "step": 714 }, { "epoch": 0.20582300374747767, "loss": 0.012906506657600403, "loss_ce": 0.007316067814826965, "loss_xval": 0.005584716796875, "num_input_tokens_seen": 105285544, "step": 714 }, { "epoch": 0.20611127125972903, "grad_norm": 3.598033343833172, "learning_rate": 0.0001, "loss": 0.009, "num_input_tokens_seen": 105420280, "step": 715 }, { "epoch": 0.20611127125972903, "loss": 0.014619720168411732, "loss_ce": 0.01264942903071642, "loss_xval": 0.0019683837890625, "num_input_tokens_seen": 105420280, "step": 715 }, { "epoch": 0.2063995387719804, "grad_norm": 7.377418814876543, "learning_rate": 0.0001, "loss": 0.007, "num_input_tokens_seen": 105555360, "step": 716 }, { "epoch": 0.2063995387719804, "loss": 0.006381037645041943, "loss_ce": 0.0024461778812110424, "loss_xval": 0.003936767578125, "num_input_tokens_seen": 105555360, "step": 716 }, { "epoch": 0.20668780628423178, "grad_norm": 11.760060582895239, "learning_rate": 0.0001, "loss": 0.0171, "num_input_tokens_seen": 105728048, "step": 717 }, { "epoch": 0.20668780628423178, "loss": 0.008494282141327858, "loss_ce": 0.0015858651604503393, "loss_xval": 0.00689697265625, "num_input_tokens_seen": 105728048, "step": 717 }, { "epoch": 0.20697607379648314, "grad_norm": 3.2649079419303106, "learning_rate": 0.0001, "loss": 0.0095, "num_input_tokens_seen": 105862864, "step": 718 }, { "epoch": 0.20697607379648314, "loss": 0.015600248239934444, "loss_ce": 0.013268514536321163, "loss_xval": 0.0023345947265625, "num_input_tokens_seen": 105862864, "step": 718 }, { "epoch": 0.2072643413087345, "grad_norm": 8.78753059548886, "learning_rate": 0.0001, "loss": 0.0082, "num_input_tokens_seen": 105998024, "step": 719 }, { "epoch": 0.2072643413087345, "loss": 0.007219268009066582, "loss_ce": 0.0020961295813322067, "loss_xval": 0.005126953125, "num_input_tokens_seen": 105998024, "step": 719 }, { "epoch": 0.20755260882098586, "grad_norm": 9.84300808250944, "learning_rate": 0.0001, "loss": 0.0134, "num_input_tokens_seen": 106170576, "step": 720 }, { "epoch": 0.20755260882098586, "loss": 0.007366933859884739, "loss_ce": 0.0020110984332859516, "loss_xval": 0.00537109375, "num_input_tokens_seen": 106170576, "step": 720 }, { "epoch": 0.20784087633323725, "grad_norm": 1.824367661797582, "learning_rate": 0.0001, "loss": 0.0088, "num_input_tokens_seen": 106305344, "step": 721 }, { "epoch": 0.20784087633323725, "loss": 0.014551606960594654, "loss_ce": 0.013377157039940357, "loss_xval": 0.0011749267578125, "num_input_tokens_seen": 106305344, "step": 721 }, { "epoch": 0.2081291438454886, "grad_norm": 11.832146455499752, "learning_rate": 0.0001, "loss": 0.0112, "num_input_tokens_seen": 106440480, "step": 722 }, { "epoch": 0.2081291438454886, "loss": 0.006810145918279886, "loss_ce": 0.001046138582751155, "loss_xval": 0.005767822265625, "num_input_tokens_seen": 106440480, "step": 722 }, { "epoch": 0.20841741135773997, "grad_norm": 7.026477339332676, "learning_rate": 0.0001, "loss": 0.0109, "num_input_tokens_seen": 106612912, "step": 723 }, { "epoch": 0.20841741135773997, "loss": 0.006631569936871529, "loss_ce": 0.0018555691931396723, "loss_xval": 0.0047607421875, "num_input_tokens_seen": 106612912, "step": 723 }, { "epoch": 0.20870567886999136, "grad_norm": 8.645951427184539, "learning_rate": 0.0001, "loss": 0.0127, "num_input_tokens_seen": 106747672, "step": 724 }, { "epoch": 0.20870567886999136, "loss": 0.016665926203131676, "loss_ce": 0.0138621237128973, "loss_xval": 0.0028076171875, "num_input_tokens_seen": 106747672, "step": 724 }, { "epoch": 0.20899394638224272, "grad_norm": 16.810786634717683, "learning_rate": 0.0001, "loss": 0.0164, "num_input_tokens_seen": 106882776, "step": 725 }, { "epoch": 0.20899394638224272, "loss": 0.011873760260641575, "loss_ce": 0.0009560962789691985, "loss_xval": 0.01092529296875, "num_input_tokens_seen": 106882776, "step": 725 }, { "epoch": 0.20928221389449408, "grad_norm": 6.357963151247145, "learning_rate": 0.0001, "loss": 0.0122, "num_input_tokens_seen": 107055424, "step": 726 }, { "epoch": 0.20928221389449408, "loss": 0.008962000720202923, "loss_ce": 0.0044415839947760105, "loss_xval": 0.0045166015625, "num_input_tokens_seen": 107055424, "step": 726 }, { "epoch": 0.20957048140674547, "grad_norm": 12.00840382089868, "learning_rate": 0.0001, "loss": 0.0169, "num_input_tokens_seen": 107190240, "step": 727 }, { "epoch": 0.20957048140674547, "loss": 0.018986305221915245, "loss_ce": 0.014607032760977745, "loss_xval": 0.00439453125, "num_input_tokens_seen": 107190240, "step": 727 }, { "epoch": 0.20985874891899683, "grad_norm": 16.77374168533911, "learning_rate": 0.0001, "loss": 0.0171, "num_input_tokens_seen": 107325304, "step": 728 }, { "epoch": 0.20985874891899683, "loss": 0.013724341988563538, "loss_ce": 0.0011892463080585003, "loss_xval": 0.01251220703125, "num_input_tokens_seen": 107325304, "step": 728 }, { "epoch": 0.2101470164312482, "grad_norm": 0.7581546666650136, "learning_rate": 0.0001, "loss": 0.0116, "num_input_tokens_seen": 107497904, "step": 729 }, { "epoch": 0.2101470164312482, "loss": 0.005003034602850676, "loss_ce": 0.002600728999823332, "loss_xval": 0.0023956298828125, "num_input_tokens_seen": 107497904, "step": 729 }, { "epoch": 0.21043528394349956, "grad_norm": 22.638176333350234, "learning_rate": 0.0001, "loss": 0.0338, "num_input_tokens_seen": 107632768, "step": 730 }, { "epoch": 0.21043528394349956, "loss": 0.02648492529988289, "loss_ce": 0.01167626865208149, "loss_xval": 0.01483154296875, "num_input_tokens_seen": 107632768, "step": 730 }, { "epoch": 0.21072355145575095, "grad_norm": 24.982746535895913, "learning_rate": 0.0001, "loss": 0.0326, "num_input_tokens_seen": 107767784, "step": 731 }, { "epoch": 0.21072355145575095, "loss": 0.031112845987081528, "loss_ce": 0.001465020701289177, "loss_xval": 0.0296630859375, "num_input_tokens_seen": 107767784, "step": 731 }, { "epoch": 0.2110118189680023, "grad_norm": 1.4820439364157452, "learning_rate": 0.0001, "loss": 0.0122, "num_input_tokens_seen": 107940288, "step": 732 }, { "epoch": 0.2110118189680023, "loss": 0.004679238889366388, "loss_ce": 0.0020709396339952946, "loss_xval": 0.0026092529296875, "num_input_tokens_seen": 107940288, "step": 732 }, { "epoch": 0.21130008648025367, "grad_norm": 30.58091880038745, "learning_rate": 0.0001, "loss": 0.0558, "num_input_tokens_seen": 108075104, "step": 733 }, { "epoch": 0.21130008648025367, "loss": 0.041614316403865814, "loss_ce": 0.014255308546125889, "loss_xval": 0.02734375, "num_input_tokens_seen": 108075104, "step": 733 }, { "epoch": 0.21158835399250506, "grad_norm": 29.47286835944433, "learning_rate": 0.0001, "loss": 0.0444, "num_input_tokens_seen": 108210048, "step": 734 }, { "epoch": 0.21158835399250506, "loss": 0.0443190336227417, "loss_ce": 0.0008620026055723429, "loss_xval": 0.04345703125, "num_input_tokens_seen": 108210048, "step": 734 }, { "epoch": 0.21187662150475642, "grad_norm": 4.104779558726963, "learning_rate": 0.0001, "loss": 0.0155, "num_input_tokens_seen": 108382560, "step": 735 }, { "epoch": 0.21187662150475642, "loss": 0.005712998099625111, "loss_ce": 0.002589714713394642, "loss_xval": 0.0031280517578125, "num_input_tokens_seen": 108382560, "step": 735 }, { "epoch": 0.21216488901700778, "grad_norm": 37.49739334499608, "learning_rate": 0.0001, "loss": 0.0792, "num_input_tokens_seen": 108517344, "step": 736 }, { "epoch": 0.21216488901700778, "loss": 0.05222557112574577, "loss_ce": 0.012400129809975624, "loss_xval": 0.039794921875, "num_input_tokens_seen": 108517344, "step": 736 }, { "epoch": 0.21245315652925914, "grad_norm": 34.181805068474205, "learning_rate": 0.0001, "loss": 0.0587, "num_input_tokens_seen": 108652376, "step": 737 }, { "epoch": 0.21245315652925914, "loss": 0.06710252165794373, "loss_ce": 0.0008488551247864962, "loss_xval": 0.06640625, "num_input_tokens_seen": 108652376, "step": 737 }, { "epoch": 0.21274142404151053, "grad_norm": 7.407982752291219, "learning_rate": 0.0001, "loss": 0.0198, "num_input_tokens_seen": 108824816, "step": 738 }, { "epoch": 0.21274142404151053, "loss": 0.004085796885192394, "loss_ce": 0.001343029784038663, "loss_xval": 0.00274658203125, "num_input_tokens_seen": 108824816, "step": 738 }, { "epoch": 0.2130296915537619, "grad_norm": 45.64979852027761, "learning_rate": 0.0001, "loss": 0.115, "num_input_tokens_seen": 108959632, "step": 739 }, { "epoch": 0.2130296915537619, "loss": 0.07691901922225952, "loss_ce": 0.017562326043844223, "loss_xval": 0.059326171875, "num_input_tokens_seen": 108959632, "step": 739 }, { "epoch": 0.21331795906601325, "grad_norm": 36.967704163197965, "learning_rate": 0.0001, "loss": 0.0698, "num_input_tokens_seen": 109094712, "step": 740 }, { "epoch": 0.21331795906601325, "loss": 0.08530056476593018, "loss_ce": 0.0009499795269221067, "loss_xval": 0.08447265625, "num_input_tokens_seen": 109094712, "step": 740 }, { "epoch": 0.21360622657826464, "grad_norm": 14.27081537855223, "learning_rate": 0.0001, "loss": 0.0316, "num_input_tokens_seen": 109267256, "step": 741 }, { "epoch": 0.21360622657826464, "loss": 0.0032704181503504515, "loss_ce": 0.001323015196248889, "loss_xval": 0.00194549560546875, "num_input_tokens_seen": 109267256, "step": 741 }, { "epoch": 0.213894494090516, "grad_norm": 57.42413597809985, "learning_rate": 0.0001, "loss": 0.1725, "num_input_tokens_seen": 109401992, "step": 742 }, { "epoch": 0.213894494090516, "loss": 0.1181582510471344, "loss_ce": 0.011102594435214996, "loss_xval": 0.10693359375, "num_input_tokens_seen": 109401992, "step": 742 }, { "epoch": 0.21418276160276736, "grad_norm": 43.418170426670386, "learning_rate": 0.0001, "loss": 0.098, "num_input_tokens_seen": 109536952, "step": 743 }, { "epoch": 0.21418276160276736, "loss": 0.12911713123321533, "loss_ce": 0.0012484844774007797, "loss_xval": 0.1279296875, "num_input_tokens_seen": 109536952, "step": 743 }, { "epoch": 0.21447102911501872, "grad_norm": 19.583171126203737, "learning_rate": 0.0001, "loss": 0.0458, "num_input_tokens_seen": 109709448, "step": 744 }, { "epoch": 0.21447102911501872, "loss": 0.0038100657984614372, "loss_ce": 0.0019217908848077059, "loss_xval": 0.00189208984375, "num_input_tokens_seen": 109709448, "step": 744 }, { "epoch": 0.2147592966272701, "grad_norm": 67.0891843399185, "learning_rate": 0.0001, "loss": 0.2363, "num_input_tokens_seen": 109844328, "step": 745 }, { "epoch": 0.2147592966272701, "loss": 0.16546039283275604, "loss_ce": 0.01458149217069149, "loss_xval": 0.150390625, "num_input_tokens_seen": 109844328, "step": 745 }, { "epoch": 0.21504756413952147, "grad_norm": 41.61436187130198, "learning_rate": 0.0001, "loss": 0.0962, "num_input_tokens_seen": 109979464, "step": 746 }, { "epoch": 0.21504756413952147, "loss": 0.1428573727607727, "loss_ce": 0.002293400000780821, "loss_xval": 0.140625, "num_input_tokens_seen": 109979464, "step": 746 }, { "epoch": 0.21533583165177284, "grad_norm": 35.76768484393859, "learning_rate": 0.0001, "loss": 0.1025, "num_input_tokens_seen": 110152024, "step": 747 }, { "epoch": 0.21533583165177284, "loss": 0.010296022519469261, "loss_ce": 0.00212112651206553, "loss_xval": 0.0081787109375, "num_input_tokens_seen": 110152024, "step": 747 }, { "epoch": 0.21562409916402422, "grad_norm": 81.41444916351666, "learning_rate": 0.0001, "loss": 0.3448, "num_input_tokens_seen": 110286800, "step": 748 }, { "epoch": 0.21562409916402422, "loss": 0.2511296272277832, "loss_ce": 0.011261474341154099, "loss_xval": 0.240234375, "num_input_tokens_seen": 110286800, "step": 748 }, { "epoch": 0.21591236667627559, "grad_norm": 33.949280913156656, "learning_rate": 0.0001, "loss": 0.0738, "num_input_tokens_seen": 110421864, "step": 749 }, { "epoch": 0.21591236667627559, "loss": 0.12540201842784882, "loss_ce": 0.0011344377417117357, "loss_xval": 0.1240234375, "num_input_tokens_seen": 110421864, "step": 749 }, { "epoch": 0.21620063418852695, "grad_norm": 64.0371404773295, "learning_rate": 0.0001, "loss": 0.2511, "num_input_tokens_seen": 110594344, "step": 750 }, { "epoch": 0.21620063418852695, "eval_websight_new_IoU": 0.0, "eval_websight_new_MAE_x": 0.8172395527362823, "eval_websight_new_MAE_y": 0.8070051372051239, "eval_websight_new_NUM_probability": 0.9898514747619629, "eval_websight_new_inside_bbox": 0.0, "eval_websight_new_loss": 0.6402856707572937, "eval_websight_new_loss_ce": 0.0014999214326962829, "eval_websight_new_loss_xval": 0.638671875, "eval_websight_new_runtime": 35.9819, "eval_websight_new_samples_per_second": 1.39, "eval_websight_new_steps_per_second": 0.056, "num_input_tokens_seen": 110594344, "step": 750 }, { "epoch": 0.21620063418852695, "eval_seeclick_IoU": 0.0, "eval_seeclick_MAE_x": 0.6540522873401642, "eval_seeclick_MAE_y": 0.6515002846717834, "eval_seeclick_NUM_probability": 0.9914124011993408, "eval_seeclick_inside_bbox": 0.0, "eval_seeclick_loss": 0.4462566375732422, "eval_seeclick_loss_ce": 0.012402037624269724, "eval_seeclick_loss_xval": 0.4342041015625, "eval_seeclick_runtime": 64.6128, "eval_seeclick_samples_per_second": 0.774, "eval_seeclick_steps_per_second": 0.031, "num_input_tokens_seen": 110594344, "step": 750 }, { "epoch": 0.21620063418852695, "eval_icons_IoU": 0.0, "eval_icons_MAE_x": 1.0583945512771606, "eval_icons_MAE_y": 1.0435316562652588, "eval_icons_NUM_probability": 0.9888060688972473, "eval_icons_inside_bbox": 0.0, "eval_icons_loss": 1.1147381067276, "eval_icons_loss_ce": 0.0066304560750722885, "eval_icons_loss_xval": 1.10546875, "eval_icons_runtime": 65.9871, "eval_icons_samples_per_second": 0.758, "eval_icons_steps_per_second": 0.03, "num_input_tokens_seen": 110594344, "step": 750 }, { "epoch": 0.21620063418852695, "loss": 1.12251615524292, "loss_ce": 0.008258317597210407, "loss_xval": 1.1171875, "num_input_tokens_seen": 110594344, "step": 750 }, { "epoch": 0.21648890170077834, "grad_norm": 100.39901558775324, "learning_rate": 0.0001, "loss": 0.5285, "num_input_tokens_seen": 110729120, "step": 751 }, { "epoch": 0.21648890170077834, "loss": 0.433902382850647, "loss_ce": 0.012515652924776077, "loss_xval": 0.421875, "num_input_tokens_seen": 110729120, "step": 751 }, { "epoch": 0.2167771692130297, "grad_norm": 14.068410073467993, "learning_rate": 0.0001, "loss": 0.0389, "num_input_tokens_seen": 110864456, "step": 752 }, { "epoch": 0.2167771692130297, "loss": 0.07106943428516388, "loss_ce": 0.0024048807099461555, "loss_xval": 0.06884765625, "num_input_tokens_seen": 110864456, "step": 752 }, { "epoch": 0.21706543672528106, "grad_norm": 108.40390401257352, "learning_rate": 0.0001, "loss": 0.6725, "num_input_tokens_seen": 111037032, "step": 753 }, { "epoch": 0.21706543672528106, "loss": 0.3352723717689514, "loss_ce": 0.0029969951137900352, "loss_xval": 0.33203125, "num_input_tokens_seen": 111037032, "step": 753 }, { "epoch": 0.21735370423753242, "grad_norm": 109.21261440688775, "learning_rate": 0.0001, "loss": 0.6504, "num_input_tokens_seen": 111171856, "step": 754 }, { "epoch": 0.21735370423753242, "loss": 0.5691009759902954, "loss_ce": 0.011483820155262947, "loss_xval": 0.55859375, "num_input_tokens_seen": 111171856, "step": 754 }, { "epoch": 0.2176419717497838, "grad_norm": 41.5565355210608, "learning_rate": 0.0001, "loss": 0.145, "num_input_tokens_seen": 111306816, "step": 755 }, { "epoch": 0.2176419717497838, "loss": 0.01940779760479927, "loss_ce": 0.0028062332421541214, "loss_xval": 0.0166015625, "num_input_tokens_seen": 111306816, "step": 755 }, { "epoch": 0.21793023926203517, "grad_norm": 162.7730201135358, "learning_rate": 0.0001, "loss": 1.5395, "num_input_tokens_seen": 111479384, "step": 756 }, { "epoch": 0.21793023926203517, "loss": 1.0829627513885498, "loss_ce": 0.002884726971387863, "loss_xval": 1.078125, "num_input_tokens_seen": 111479384, "step": 756 }, { "epoch": 0.21821850677428653, "grad_norm": 65.1372279307664, "learning_rate": 0.0001, "loss": 0.2507, "num_input_tokens_seen": 111614168, "step": 757 }, { "epoch": 0.21821850677428653, "loss": 0.2695873975753784, "loss_ce": 0.01116455439478159, "loss_xval": 0.2578125, "num_input_tokens_seen": 111614168, "step": 757 }, { "epoch": 0.21850677428653792, "grad_norm": 145.76586500867106, "learning_rate": 0.0001, "loss": 1.1751, "num_input_tokens_seen": 111749232, "step": 758 }, { "epoch": 0.21850677428653792, "loss": 0.6525642275810242, "loss_ce": 0.0026618903502821922, "loss_xval": 0.6484375, "num_input_tokens_seen": 111749232, "step": 758 }, { "epoch": 0.21879504179878928, "grad_norm": 409.62868599068196, "learning_rate": 0.0001, "loss": 4.1763, "num_input_tokens_seen": 111921744, "step": 759 }, { "epoch": 0.21879504179878928, "loss": 3.8221874237060547, "loss_ce": 0.011640791781246662, "loss_xval": 3.8125, "num_input_tokens_seen": 111921744, "step": 759 }, { "epoch": 0.21908330931104064, "grad_norm": 135.94398972900095, "learning_rate": 0.0001, "loss": 0.9475, "num_input_tokens_seen": 112056560, "step": 760 }, { "epoch": 0.21908330931104064, "loss": 0.940850019454956, "loss_ce": 0.3182913661003113, "loss_xval": 0.62109375, "num_input_tokens_seen": 112056560, "step": 760 }, { "epoch": 0.219371576823292, "grad_norm": 738.893441811946, "learning_rate": 0.0001, "loss": 9.1575, "num_input_tokens_seen": 112191688, "step": 761 }, { "epoch": 0.219371576823292, "loss": 10.175888061523438, "loss_ce": 0.6133874654769897, "loss_xval": 9.5625, "num_input_tokens_seen": 112191688, "step": 761 }, { "epoch": 0.2196598443355434, "grad_norm": 832.3972251198659, "learning_rate": 0.0001, "loss": 33.4835, "num_input_tokens_seen": 112364192, "step": 762 }, { "epoch": 0.2196598443355434, "loss": 32.80980682373047, "loss_ce": 0.3723085820674896, "loss_xval": 32.5, "num_input_tokens_seen": 112364192, "step": 762 }, { "epoch": 0.21994811184779475, "grad_norm": 755.1576609542743, "learning_rate": 0.0001, "loss": 12.6196, "num_input_tokens_seen": 112498976, "step": 763 }, { "epoch": 0.21994811184779475, "loss": 11.760293960571289, "loss_ce": 0.40091896057128906, "loss_xval": 11.375, "num_input_tokens_seen": 112498976, "step": 763 }, { "epoch": 0.22023637936004611, "grad_norm": 138.2397155003218, "learning_rate": 0.0001, "loss": 0.9111, "num_input_tokens_seen": 112634040, "step": 764 }, { "epoch": 0.22023637936004611, "loss": 0.974545419216156, "loss_ce": 0.2006196528673172, "loss_xval": 0.7734375, "num_input_tokens_seen": 112634040, "step": 764 }, { "epoch": 0.2205246468722975, "grad_norm": 173.75011876445583, "learning_rate": 0.0001, "loss": 1.5548, "num_input_tokens_seen": 112806632, "step": 765 }, { "epoch": 0.2205246468722975, "loss": 1.4252033233642578, "loss_ce": 0.2347736358642578, "loss_xval": 1.1875, "num_input_tokens_seen": 112806632, "step": 765 }, { "epoch": 0.22081291438454886, "grad_norm": 80.24372426526006, "learning_rate": 0.0001, "loss": 0.4294, "num_input_tokens_seen": 112941480, "step": 766 }, { "epoch": 0.22081291438454886, "loss": 0.3662590980529785, "loss_ce": 0.21873709559440613, "loss_xval": 0.1474609375, "num_input_tokens_seen": 112941480, "step": 766 }, { "epoch": 0.22110118189680023, "grad_norm": 241.67489394610638, "learning_rate": 0.0001, "loss": 1.8493, "num_input_tokens_seen": 113076600, "step": 767 }, { "epoch": 0.22110118189680023, "loss": 1.8362443447113037, "loss_ce": 0.2532365918159485, "loss_xval": 1.5859375, "num_input_tokens_seen": 113076600, "step": 767 }, { "epoch": 0.2213894494090516, "grad_norm": 309.9244098220946, "learning_rate": 0.0001, "loss": 0.6891, "num_input_tokens_seen": 113249144, "step": 768 }, { "epoch": 0.2213894494090516, "loss": 0.7872165441513062, "loss_ce": 0.7117770910263062, "loss_xval": 0.0751953125, "num_input_tokens_seen": 113249144, "step": 768 }, { "epoch": 0.22167771692130298, "grad_norm": 159.45569658811266, "learning_rate": 0.0001, "loss": 0.7886, "num_input_tokens_seen": 113383944, "step": 769 }, { "epoch": 0.22167771692130298, "loss": 0.9091415405273438, "loss_ce": 0.17281340062618256, "loss_xval": 0.734375, "num_input_tokens_seen": 113383944, "step": 769 }, { "epoch": 0.22196598443355434, "grad_norm": 76.54296676716736, "learning_rate": 0.0001, "loss": 0.5255, "num_input_tokens_seen": 113519064, "step": 770 }, { "epoch": 0.22196598443355434, "loss": 0.4586948752403259, "loss_ce": 0.14302101731300354, "loss_xval": 0.31640625, "num_input_tokens_seen": 113519064, "step": 770 }, { "epoch": 0.2222542519458057, "grad_norm": 33.441254496122355, "learning_rate": 0.0001, "loss": 0.2996, "num_input_tokens_seen": 113691528, "step": 771 }, { "epoch": 0.2222542519458057, "loss": 0.3153448700904846, "loss_ce": 0.20883852243423462, "loss_xval": 0.1064453125, "num_input_tokens_seen": 113691528, "step": 771 }, { "epoch": 0.2225425194580571, "grad_norm": 94.5394479291071, "learning_rate": 0.0001, "loss": 0.7111, "num_input_tokens_seen": 113826248, "step": 772 }, { "epoch": 0.2225425194580571, "loss": 0.6232205033302307, "loss_ce": 0.1376248300075531, "loss_xval": 0.486328125, "num_input_tokens_seen": 113826248, "step": 772 }, { "epoch": 0.22283078697030845, "grad_norm": 31.618367864103973, "learning_rate": 0.0001, "loss": 0.2197, "num_input_tokens_seen": 113961288, "step": 773 }, { "epoch": 0.22283078697030845, "loss": 0.20101110637187958, "loss_ce": 0.10189002007246017, "loss_xval": 0.09912109375, "num_input_tokens_seen": 113961288, "step": 773 }, { "epoch": 0.2231190544825598, "grad_norm": 79.57061276729071, "learning_rate": 0.0001, "loss": 0.5602, "num_input_tokens_seen": 114133776, "step": 774 }, { "epoch": 0.2231190544825598, "loss": 0.5369690656661987, "loss_ce": 0.18931280076503754, "loss_xval": 0.34765625, "num_input_tokens_seen": 114133776, "step": 774 }, { "epoch": 0.2234073219948112, "grad_norm": 63.51037426847027, "learning_rate": 0.0001, "loss": 0.3826, "num_input_tokens_seen": 114268600, "step": 775 }, { "epoch": 0.2234073219948112, "loss": 0.4234406352043152, "loss_ce": 0.11557929962873459, "loss_xval": 0.30859375, "num_input_tokens_seen": 114268600, "step": 775 }, { "epoch": 0.22369558950706256, "grad_norm": 33.74071638554486, "learning_rate": 0.0001, "loss": 0.2024, "num_input_tokens_seen": 114403672, "step": 776 }, { "epoch": 0.22369558950706256, "loss": 0.18327657878398895, "loss_ce": 0.08977071940898895, "loss_xval": 0.09375, "num_input_tokens_seen": 114403672, "step": 776 }, { "epoch": 0.22398385701931392, "grad_norm": 73.48859012338815, "learning_rate": 0.0001, "loss": 0.4483, "num_input_tokens_seen": 114576152, "step": 777 }, { "epoch": 0.22398385701931392, "loss": 0.4633466303348541, "loss_ce": 0.10030952095985413, "loss_xval": 0.36328125, "num_input_tokens_seen": 114576152, "step": 777 }, { "epoch": 0.22427212453156528, "grad_norm": 14.942164124982677, "learning_rate": 0.0001, "loss": 0.1079, "num_input_tokens_seen": 114710920, "step": 778 }, { "epoch": 0.22427212453156528, "loss": 0.12119618058204651, "loss_ce": 0.09102955460548401, "loss_xval": 0.0301513671875, "num_input_tokens_seen": 114710920, "step": 778 }, { "epoch": 0.22456039204381667, "grad_norm": 57.038676561237565, "learning_rate": 0.0001, "loss": 0.2876, "num_input_tokens_seen": 114845984, "step": 779 }, { "epoch": 0.22456039204381667, "loss": 0.27952879667282104, "loss_ce": 0.06114499643445015, "loss_xval": 0.21875, "num_input_tokens_seen": 114845984, "step": 779 }, { "epoch": 0.22484865955606803, "grad_norm": 43.27715155294268, "learning_rate": 0.0001, "loss": 0.2181, "num_input_tokens_seen": 115018592, "step": 780 }, { "epoch": 0.22484865955606803, "loss": 0.2002837359905243, "loss_ce": 0.06808160245418549, "loss_xval": 0.1318359375, "num_input_tokens_seen": 115018592, "step": 780 }, { "epoch": 0.2251369270683194, "grad_norm": 29.573825175062915, "learning_rate": 0.0001, "loss": 0.141, "num_input_tokens_seen": 115153416, "step": 781 }, { "epoch": 0.2251369270683194, "loss": 0.16036775708198547, "loss_ce": 0.09045198559761047, "loss_xval": 0.06982421875, "num_input_tokens_seen": 115153416, "step": 781 }, { "epoch": 0.22542519458057078, "grad_norm": 55.78457083485312, "learning_rate": 0.0001, "loss": 0.2542, "num_input_tokens_seen": 115288464, "step": 782 }, { "epoch": 0.22542519458057078, "loss": 0.2591482400894165, "loss_ce": 0.0452810674905777, "loss_xval": 0.2138671875, "num_input_tokens_seen": 115288464, "step": 782 }, { "epoch": 0.22571346209282214, "grad_norm": 2.6055577715854104, "learning_rate": 0.0001, "loss": 0.0719, "num_input_tokens_seen": 115461056, "step": 783 }, { "epoch": 0.22571346209282214, "loss": 0.05857803672552109, "loss_ce": 0.04785110801458359, "loss_xval": 0.0107421875, "num_input_tokens_seen": 115461056, "step": 783 }, { "epoch": 0.2260017296050735, "grad_norm": 47.20006023553052, "learning_rate": 0.0001, "loss": 0.2008, "num_input_tokens_seen": 115595912, "step": 784 }, { "epoch": 0.2260017296050735, "loss": 0.2157151997089386, "loss_ce": 0.0576341450214386, "loss_xval": 0.158203125, "num_input_tokens_seen": 115595912, "step": 784 }, { "epoch": 0.22628999711732486, "grad_norm": 28.185111544321373, "learning_rate": 0.0001, "loss": 0.0922, "num_input_tokens_seen": 115730992, "step": 785 }, { "epoch": 0.22628999711732486, "loss": 0.08684153854846954, "loss_ce": 0.029437974095344543, "loss_xval": 0.057373046875, "num_input_tokens_seen": 115730992, "step": 785 }, { "epoch": 0.22657826462957625, "grad_norm": 29.89508500227094, "learning_rate": 0.0001, "loss": 0.1067, "num_input_tokens_seen": 115903480, "step": 786 }, { "epoch": 0.22657826462957625, "loss": 0.09849099814891815, "loss_ce": 0.04423074051737785, "loss_xval": 0.05419921875, "num_input_tokens_seen": 115903480, "step": 786 }, { "epoch": 0.22686653214182761, "grad_norm": 42.04327426869395, "learning_rate": 0.0001, "loss": 0.1484, "num_input_tokens_seen": 116038288, "step": 787 }, { "epoch": 0.22686653214182761, "loss": 0.160066157579422, "loss_ce": 0.0313430055975914, "loss_xval": 0.12890625, "num_input_tokens_seen": 116038288, "step": 787 }, { "epoch": 0.22715479965407898, "grad_norm": 9.903206689698122, "learning_rate": 0.0001, "loss": 0.0459, "num_input_tokens_seen": 116173248, "step": 788 }, { "epoch": 0.22715479965407898, "loss": 0.032735370099544525, "loss_ce": 0.018895648419857025, "loss_xval": 0.01385498046875, "num_input_tokens_seen": 116173248, "step": 788 }, { "epoch": 0.22744306716633036, "grad_norm": 39.38878364407851, "learning_rate": 0.0001, "loss": 0.1426, "num_input_tokens_seen": 116345712, "step": 789 }, { "epoch": 0.22744306716633036, "loss": 0.14974749088287354, "loss_ce": 0.03658831864595413, "loss_xval": 0.11328125, "num_input_tokens_seen": 116345712, "step": 789 }, { "epoch": 0.22773133467858173, "grad_norm": 10.849920705998569, "learning_rate": 0.0001, "loss": 0.0358, "num_input_tokens_seen": 116480424, "step": 790 }, { "epoch": 0.22773133467858173, "loss": 0.042529068887233734, "loss_ce": 0.024920426309108734, "loss_xval": 0.017578125, "num_input_tokens_seen": 116480424, "step": 790 }, { "epoch": 0.2280196021908331, "grad_norm": 29.27412447713324, "learning_rate": 0.0001, "loss": 0.0822, "num_input_tokens_seen": 116615552, "step": 791 }, { "epoch": 0.2280196021908331, "loss": 0.07675212621688843, "loss_ce": 0.012237967923283577, "loss_xval": 0.064453125, "num_input_tokens_seen": 116615552, "step": 791 }, { "epoch": 0.22830786970308448, "grad_norm": 27.650517838759587, "learning_rate": 0.0001, "loss": 0.0588, "num_input_tokens_seen": 116788096, "step": 792 }, { "epoch": 0.22830786970308448, "loss": 0.05113198608160019, "loss_ce": 0.02311684750020504, "loss_xval": 0.028076171875, "num_input_tokens_seen": 116788096, "step": 792 }, { "epoch": 0.22859613721533584, "grad_norm": 16.993351779114146, "learning_rate": 0.0001, "loss": 0.0454, "num_input_tokens_seen": 116922872, "step": 793 }, { "epoch": 0.22859613721533584, "loss": 0.04983597993850708, "loss_ce": 0.02609330229461193, "loss_xval": 0.023681640625, "num_input_tokens_seen": 116922872, "step": 793 }, { "epoch": 0.2288844047275872, "grad_norm": 24.271465227011593, "learning_rate": 0.0001, "loss": 0.0618, "num_input_tokens_seen": 117057920, "step": 794 }, { "epoch": 0.2288844047275872, "loss": 0.054998017847537994, "loss_ce": 0.008885959163308144, "loss_xval": 0.046142578125, "num_input_tokens_seen": 117057920, "step": 794 }, { "epoch": 0.22917267223983856, "grad_norm": 6.142005548753007, "learning_rate": 0.0001, "loss": 0.0321, "num_input_tokens_seen": 117230352, "step": 795 }, { "epoch": 0.22917267223983856, "loss": 0.024952255189418793, "loss_ce": 0.017422042787075043, "loss_xval": 0.007537841796875, "num_input_tokens_seen": 117230352, "step": 795 }, { "epoch": 0.22946093975208995, "grad_norm": 25.269181779510625, "learning_rate": 0.0001, "loss": 0.0654, "num_input_tokens_seen": 117365136, "step": 796 }, { "epoch": 0.22946093975208995, "loss": 0.07435447722673416, "loss_ce": 0.02610618993639946, "loss_xval": 0.04833984375, "num_input_tokens_seen": 117365136, "step": 796 }, { "epoch": 0.2297492072643413, "grad_norm": 4.149447856918701, "learning_rate": 0.0001, "loss": 0.0165, "num_input_tokens_seen": 117500200, "step": 797 }, { "epoch": 0.2297492072643413, "loss": 0.014349638484418392, "loss_ce": 0.0065791006200015545, "loss_xval": 0.007781982421875, "num_input_tokens_seen": 117500200, "step": 797 }, { "epoch": 0.23003747477659267, "grad_norm": 20.90550876360466, "learning_rate": 0.0001, "loss": 0.0545, "num_input_tokens_seen": 117672592, "step": 798 }, { "epoch": 0.23003747477659267, "loss": 0.053098104894161224, "loss_ce": 0.015897177159786224, "loss_xval": 0.037109375, "num_input_tokens_seen": 117672592, "step": 798 }, { "epoch": 0.23032574228884406, "grad_norm": 12.299732683231854, "learning_rate": 0.0001, "loss": 0.0297, "num_input_tokens_seen": 117807336, "step": 799 }, { "epoch": 0.23032574228884406, "loss": 0.040189050137996674, "loss_ce": 0.024312280118465424, "loss_xval": 0.015869140625, "num_input_tokens_seen": 117807336, "step": 799 }, { "epoch": 0.23061400980109542, "grad_norm": 12.779776228914203, "learning_rate": 0.0001, "loss": 0.0245, "num_input_tokens_seen": 117942352, "step": 800 }, { "epoch": 0.23061400980109542, "loss": 0.021533731371164322, "loss_ce": 0.005611184053122997, "loss_xval": 0.015869140625, "num_input_tokens_seen": 117942352, "step": 800 }, { "epoch": 0.23090227731334678, "grad_norm": 18.23997338878261, "learning_rate": 0.0001, "loss": 0.0437, "num_input_tokens_seen": 118114760, "step": 801 }, { "epoch": 0.23090227731334678, "loss": 0.03563398867845535, "loss_ce": 0.012455889955163002, "loss_xval": 0.023193359375, "num_input_tokens_seen": 118114760, "step": 801 }, { "epoch": 0.23119054482559814, "grad_norm": 5.126952756766402, "learning_rate": 0.0001, "loss": 0.0194, "num_input_tokens_seen": 118249608, "step": 802 }, { "epoch": 0.23119054482559814, "loss": 0.027275238186120987, "loss_ce": 0.020496521145105362, "loss_xval": 0.00677490234375, "num_input_tokens_seen": 118249608, "step": 802 }, { "epoch": 0.23147881233784953, "grad_norm": 20.333284577996896, "learning_rate": 0.0001, "loss": 0.0414, "num_input_tokens_seen": 118384776, "step": 803 }, { "epoch": 0.23147881233784953, "loss": 0.03544127941131592, "loss_ce": 0.004832149483263493, "loss_xval": 0.0306396484375, "num_input_tokens_seen": 118384776, "step": 803 }, { "epoch": 0.2317670798501009, "grad_norm": 1.0000957750891695, "learning_rate": 0.0001, "loss": 0.0192, "num_input_tokens_seen": 118557304, "step": 804 }, { "epoch": 0.2317670798501009, "loss": 0.014575895853340626, "loss_ce": 0.011049207299947739, "loss_xval": 0.0035247802734375, "num_input_tokens_seen": 118557304, "step": 804 }, { "epoch": 0.23205534736235225, "grad_norm": 18.193515679118878, "learning_rate": 0.0001, "loss": 0.0369, "num_input_tokens_seen": 118692096, "step": 805 }, { "epoch": 0.23205534736235225, "loss": 0.04329147934913635, "loss_ce": 0.017046362161636353, "loss_xval": 0.0262451171875, "num_input_tokens_seen": 118692096, "step": 805 }, { "epoch": 0.23234361487460364, "grad_norm": 5.562258804450563, "learning_rate": 0.0001, "loss": 0.0137, "num_input_tokens_seen": 118827128, "step": 806 }, { "epoch": 0.23234361487460364, "loss": 0.010109086520969868, "loss_ce": 0.004446167498826981, "loss_xval": 0.00567626953125, "num_input_tokens_seen": 118827128, "step": 806 }, { "epoch": 0.232631882386855, "grad_norm": 13.555981820540076, "learning_rate": 0.0001, "loss": 0.0296, "num_input_tokens_seen": 118999824, "step": 807 }, { "epoch": 0.232631882386855, "loss": 0.026791900396347046, "loss_ce": 0.009984343312680721, "loss_xval": 0.016845703125, "num_input_tokens_seen": 118999824, "step": 807 }, { "epoch": 0.23292014989910637, "grad_norm": 9.654390742337505, "learning_rate": 0.0001, "loss": 0.021, "num_input_tokens_seen": 119134800, "step": 808 }, { "epoch": 0.23292014989910637, "loss": 0.028388747945427895, "loss_ce": 0.01809288002550602, "loss_xval": 0.01031494140625, "num_input_tokens_seen": 119134800, "step": 808 }, { "epoch": 0.23320841741135773, "grad_norm": 9.473802505614938, "learning_rate": 0.0001, "loss": 0.0163, "num_input_tokens_seen": 119270000, "step": 809 }, { "epoch": 0.23320841741135773, "loss": 0.014459874480962753, "loss_ce": 0.003813053946942091, "loss_xval": 0.0106201171875, "num_input_tokens_seen": 119270000, "step": 809 }, { "epoch": 0.23349668492360912, "grad_norm": 12.29129249764753, "learning_rate": 0.0001, "loss": 0.0254, "num_input_tokens_seen": 119442552, "step": 810 }, { "epoch": 0.23349668492360912, "loss": 0.0196915864944458, "loss_ce": 0.007125974632799625, "loss_xval": 0.0125732421875, "num_input_tokens_seen": 119442552, "step": 810 }, { "epoch": 0.23378495243586048, "grad_norm": 6.4491102706539865, "learning_rate": 0.0001, "loss": 0.0168, "num_input_tokens_seen": 119577288, "step": 811 }, { "epoch": 0.23378495243586048, "loss": 0.02235257439315319, "loss_ce": 0.016197558492422104, "loss_xval": 0.00616455078125, "num_input_tokens_seen": 119577288, "step": 811 }, { "epoch": 0.23407321994811184, "grad_norm": 11.972644652281343, "learning_rate": 0.0001, "loss": 0.017, "num_input_tokens_seen": 119712456, "step": 812 }, { "epoch": 0.23407321994811184, "loss": 0.017476797103881836, "loss_ce": 0.0034692296758294106, "loss_xval": 0.0140380859375, "num_input_tokens_seen": 119712456, "step": 812 }, { "epoch": 0.23436148746036323, "grad_norm": 2.3991287299948647, "learning_rate": 0.0001, "loss": 0.0159, "num_input_tokens_seen": 119884928, "step": 813 }, { "epoch": 0.23436148746036323, "loss": 0.0121365487575531, "loss_ce": 0.009936422109603882, "loss_xval": 0.002197265625, "num_input_tokens_seen": 119884928, "step": 813 }, { "epoch": 0.2346497549726146, "grad_norm": 11.432710545599601, "learning_rate": 0.0001, "loss": 0.0199, "num_input_tokens_seen": 120019720, "step": 814 }, { "epoch": 0.2346497549726146, "loss": 0.02629391849040985, "loss_ce": 0.014300510287284851, "loss_xval": 0.011962890625, "num_input_tokens_seen": 120019720, "step": 814 }, { "epoch": 0.23493802248486595, "grad_norm": 0.4407423912378353, "learning_rate": 0.0001, "loss": 0.0082, "num_input_tokens_seen": 120154800, "step": 815 }, { "epoch": 0.23493802248486595, "loss": 0.004980717785656452, "loss_ce": 0.002568875439465046, "loss_xval": 0.002410888671875, "num_input_tokens_seen": 120154800, "step": 815 }, { "epoch": 0.23522628999711734, "grad_norm": 10.567351106248262, "learning_rate": 0.0001, "loss": 0.0256, "num_input_tokens_seen": 120327264, "step": 816 }, { "epoch": 0.23522628999711734, "loss": 0.023881588131189346, "loss_ce": 0.012002620846033096, "loss_xval": 0.01190185546875, "num_input_tokens_seen": 120327264, "step": 816 }, { "epoch": 0.2355145575093687, "grad_norm": 1.9443929350512452, "learning_rate": 0.0001, "loss": 0.0121, "num_input_tokens_seen": 120462024, "step": 817 }, { "epoch": 0.2355145575093687, "loss": 0.019425533711910248, "loss_ce": 0.016307972371578217, "loss_xval": 0.00311279296875, "num_input_tokens_seen": 120462024, "step": 817 }, { "epoch": 0.23580282502162006, "grad_norm": 8.887875354058915, "learning_rate": 0.0001, "loss": 0.0133, "num_input_tokens_seen": 120596992, "step": 818 }, { "epoch": 0.23580282502162006, "loss": 0.011268608272075653, "loss_ce": 0.002510063350200653, "loss_xval": 0.0087890625, "num_input_tokens_seen": 120596992, "step": 818 }, { "epoch": 0.23609109253387142, "grad_norm": 3.4867306523058232, "learning_rate": 0.0001, "loss": 0.0154, "num_input_tokens_seen": 120769480, "step": 819 }, { "epoch": 0.23609109253387142, "loss": 0.011347447521984577, "loss_ce": 0.008326207287609577, "loss_xval": 0.003021240234375, "num_input_tokens_seen": 120769480, "step": 819 }, { "epoch": 0.2363793600461228, "grad_norm": 7.558834106455437, "learning_rate": 0.0001, "loss": 0.016, "num_input_tokens_seen": 120904232, "step": 820 }, { "epoch": 0.2363793600461228, "loss": 0.023228928446769714, "loss_ce": 0.01735810935497284, "loss_xval": 0.005859375, "num_input_tokens_seen": 120904232, "step": 820 }, { "epoch": 0.23666762755837417, "grad_norm": 3.8589553586698733, "learning_rate": 0.0001, "loss": 0.0114, "num_input_tokens_seen": 121039600, "step": 821 }, { "epoch": 0.23666762755837417, "loss": 0.007957377471029758, "loss_ce": 0.005234637297689915, "loss_xval": 0.002716064453125, "num_input_tokens_seen": 121039600, "step": 821 }, { "epoch": 0.23695589507062553, "grad_norm": 6.547441002040102, "learning_rate": 0.0001, "loss": 0.0148, "num_input_tokens_seen": 121212112, "step": 822 }, { "epoch": 0.23695589507062553, "loss": 0.009982486255466938, "loss_ce": 0.0054372744634747505, "loss_xval": 0.004547119140625, "num_input_tokens_seen": 121212112, "step": 822 }, { "epoch": 0.23724416258287692, "grad_norm": 3.574675045046867, "learning_rate": 0.0001, "loss": 0.012, "num_input_tokens_seen": 121346864, "step": 823 }, { "epoch": 0.23724416258287692, "loss": 0.018714873120188713, "loss_ce": 0.0160426776856184, "loss_xval": 0.0026702880859375, "num_input_tokens_seen": 121346864, "step": 823 }, { "epoch": 0.23753243009512828, "grad_norm": 5.120405630275477, "learning_rate": 0.0001, "loss": 0.0076, "num_input_tokens_seen": 121482008, "step": 824 }, { "epoch": 0.23753243009512828, "loss": 0.006162785924971104, "loss_ce": 0.002063893945887685, "loss_xval": 0.00408935546875, "num_input_tokens_seen": 121482008, "step": 824 }, { "epoch": 0.23782069760737964, "grad_norm": 3.847733022136177, "learning_rate": 0.0001, "loss": 0.0124, "num_input_tokens_seen": 121654536, "step": 825 }, { "epoch": 0.23782069760737964, "loss": 0.006931713782250881, "loss_ce": 0.004517009947448969, "loss_xval": 0.002410888671875, "num_input_tokens_seen": 121654536, "step": 825 }, { "epoch": 0.238108965119631, "grad_norm": 4.460862200587386, "learning_rate": 0.0001, "loss": 0.0116, "num_input_tokens_seen": 121789288, "step": 826 }, { "epoch": 0.238108965119631, "loss": 0.017785396426916122, "loss_ce": 0.015078868716955185, "loss_xval": 0.0027008056640625, "num_input_tokens_seen": 121789288, "step": 826 }, { "epoch": 0.2383972326318824, "grad_norm": 3.2989980361041567, "learning_rate": 0.0001, "loss": 0.0097, "num_input_tokens_seen": 121924200, "step": 827 }, { "epoch": 0.2383972326318824, "loss": 0.0073768566362559795, "loss_ce": 0.0046989391557872295, "loss_xval": 0.002685546875, "num_input_tokens_seen": 121924200, "step": 827 }, { "epoch": 0.23868550014413376, "grad_norm": 3.652485050248189, "learning_rate": 0.0001, "loss": 0.014, "num_input_tokens_seen": 122096688, "step": 828 }, { "epoch": 0.23868550014413376, "loss": 0.011565220542252064, "loss_ce": 0.009024632163345814, "loss_xval": 0.002532958984375, "num_input_tokens_seen": 122096688, "step": 828 }, { "epoch": 0.23897376765638512, "grad_norm": 2.7805410201893728, "learning_rate": 0.0001, "loss": 0.0118, "num_input_tokens_seen": 122231512, "step": 829 }, { "epoch": 0.23897376765638512, "loss": 0.019701950252056122, "loss_ce": 0.017360679805278778, "loss_xval": 0.0023345947265625, "num_input_tokens_seen": 122231512, "step": 829 }, { "epoch": 0.2392620351686365, "grad_norm": 3.7491616956615665, "learning_rate": 0.0001, "loss": 0.0073, "num_input_tokens_seen": 122366616, "step": 830 }, { "epoch": 0.2392620351686365, "loss": 0.004523873329162598, "loss_ce": 0.001920342561788857, "loss_xval": 0.0026092529296875, "num_input_tokens_seen": 122366616, "step": 830 }, { "epoch": 0.23955030268088787, "grad_norm": 1.7379208880039498, "learning_rate": 0.0001, "loss": 0.0119, "num_input_tokens_seen": 122539112, "step": 831 }, { "epoch": 0.23955030268088787, "loss": 0.008797908201813698, "loss_ce": 0.006952548865228891, "loss_xval": 0.0018463134765625, "num_input_tokens_seen": 122539112, "step": 831 }, { "epoch": 0.23983857019313923, "grad_norm": 2.942166352575048, "learning_rate": 0.0001, "loss": 0.0112, "num_input_tokens_seen": 122673944, "step": 832 }, { "epoch": 0.23983857019313923, "loss": 0.017966344952583313, "loss_ce": 0.01571376621723175, "loss_xval": 0.00225830078125, "num_input_tokens_seen": 122673944, "step": 832 }, { "epoch": 0.2401268377053906, "grad_norm": 1.8229508953203557, "learning_rate": 0.0001, "loss": 0.0096, "num_input_tokens_seen": 122809096, "step": 833 }, { "epoch": 0.2401268377053906, "loss": 0.003673751372843981, "loss_ce": 0.0015937874559313059, "loss_xval": 0.0020751953125, "num_input_tokens_seen": 122809096, "step": 833 }, { "epoch": 0.24041510521764198, "grad_norm": 2.8603712211321946, "learning_rate": 0.0001, "loss": 0.0137, "num_input_tokens_seen": 122981720, "step": 834 }, { "epoch": 0.24041510521764198, "loss": 0.01071496307849884, "loss_ce": 0.008584454655647278, "loss_xval": 0.00213623046875, "num_input_tokens_seen": 122981720, "step": 834 }, { "epoch": 0.24070337272989334, "grad_norm": 1.672332473164387, "learning_rate": 0.0001, "loss": 0.0096, "num_input_tokens_seen": 123116504, "step": 835 }, { "epoch": 0.24070337272989334, "loss": 0.015596328303217888, "loss_ce": 0.013956962153315544, "loss_xval": 0.00164031982421875, "num_input_tokens_seen": 123116504, "step": 835 }, { "epoch": 0.2409916402421447, "grad_norm": 2.8517776188793613, "learning_rate": 0.0001, "loss": 0.0055, "num_input_tokens_seen": 123251632, "step": 836 }, { "epoch": 0.2409916402421447, "loss": 0.004265207797288895, "loss_ce": 0.0019525473471730947, "loss_xval": 0.0023193359375, "num_input_tokens_seen": 123251632, "step": 836 }, { "epoch": 0.2412799077543961, "grad_norm": 1.3730800143412834, "learning_rate": 0.0001, "loss": 0.0121, "num_input_tokens_seen": 123424136, "step": 837 }, { "epoch": 0.2412799077543961, "loss": 0.007244983687996864, "loss_ce": 0.005767742171883583, "loss_xval": 0.0014801025390625, "num_input_tokens_seen": 123424136, "step": 837 }, { "epoch": 0.24156817526664745, "grad_norm": 3.164522197042215, "learning_rate": 0.0001, "loss": 0.0113, "num_input_tokens_seen": 123558920, "step": 838 }, { "epoch": 0.24156817526664745, "loss": 0.018805041909217834, "loss_ce": 0.016788974404335022, "loss_xval": 0.00201416015625, "num_input_tokens_seen": 123558920, "step": 838 }, { "epoch": 0.2418564427788988, "grad_norm": 0.6940947497193604, "learning_rate": 0.0001, "loss": 0.0054, "num_input_tokens_seen": 123694040, "step": 839 }, { "epoch": 0.2418564427788988, "loss": 0.0029890744481235743, "loss_ce": 0.0013792722020298243, "loss_xval": 0.00160980224609375, "num_input_tokens_seen": 123694040, "step": 839 }, { "epoch": 0.2421447102911502, "grad_norm": 3.414009592985551, "learning_rate": 0.0001, "loss": 0.0131, "num_input_tokens_seen": 123866680, "step": 840 }, { "epoch": 0.2421447102911502, "loss": 0.008075650781393051, "loss_ce": 0.005059178918600082, "loss_xval": 0.003021240234375, "num_input_tokens_seen": 123866680, "step": 840 }, { "epoch": 0.24243297780340156, "grad_norm": 1.4163402518449046, "learning_rate": 0.0001, "loss": 0.0094, "num_input_tokens_seen": 124001520, "step": 841 }, { "epoch": 0.24243297780340156, "loss": 0.015565956011414528, "loss_ce": 0.014158331789076328, "loss_xval": 0.00140380859375, "num_input_tokens_seen": 124001520, "step": 841 }, { "epoch": 0.24272124531565292, "grad_norm": 1.403378922157825, "learning_rate": 0.0001, "loss": 0.0047, "num_input_tokens_seen": 124136760, "step": 842 }, { "epoch": 0.24272124531565292, "loss": 0.003177802776917815, "loss_ce": 0.001676719170063734, "loss_xval": 0.00150299072265625, "num_input_tokens_seen": 124136760, "step": 842 }, { "epoch": 0.24300951282790428, "grad_norm": 0.9423805294699967, "learning_rate": 0.0001, "loss": 0.0128, "num_input_tokens_seen": 124309304, "step": 843 }, { "epoch": 0.24300951282790428, "loss": 0.008656498044729233, "loss_ce": 0.007049556355923414, "loss_xval": 0.00160980224609375, "num_input_tokens_seen": 124309304, "step": 843 }, { "epoch": 0.24329778034015567, "grad_norm": 0.9049718050772102, "learning_rate": 0.0001, "loss": 0.0079, "num_input_tokens_seen": 124444096, "step": 844 }, { "epoch": 0.24329778034015567, "loss": 0.012917540036141872, "loss_ce": 0.01146986149251461, "loss_xval": 0.0014495849609375, "num_input_tokens_seen": 124444096, "step": 844 }, { "epoch": 0.24358604785240703, "grad_norm": 0.8273414421621751, "learning_rate": 0.0001, "loss": 0.005, "num_input_tokens_seen": 124579144, "step": 845 }, { "epoch": 0.24358604785240703, "loss": 0.002799539128318429, "loss_ce": 0.0014577193651348352, "loss_xval": 0.0013427734375, "num_input_tokens_seen": 124579144, "step": 845 }, { "epoch": 0.2438743153646584, "grad_norm": 0.7750075248220467, "learning_rate": 0.0001, "loss": 0.0132, "num_input_tokens_seen": 124751800, "step": 846 }, { "epoch": 0.2438743153646584, "loss": 0.010742152109742165, "loss_ce": 0.008784258738160133, "loss_xval": 0.001953125, "num_input_tokens_seen": 124751800, "step": 846 }, { "epoch": 0.24416258287690978, "grad_norm": 0.5817994642876541, "learning_rate": 0.0001, "loss": 0.0096, "num_input_tokens_seen": 124886816, "step": 847 }, { "epoch": 0.24416258287690978, "loss": 0.016359955072402954, "loss_ce": 0.01504865288734436, "loss_xval": 0.001312255859375, "num_input_tokens_seen": 124886816, "step": 847 }, { "epoch": 0.24445085038916115, "grad_norm": 1.7709905019760608, "learning_rate": 0.0001, "loss": 0.0044, "num_input_tokens_seen": 125021848, "step": 848 }, { "epoch": 0.24445085038916115, "loss": 0.0030480134300887585, "loss_ce": 0.0013533341698348522, "loss_xval": 0.0016937255859375, "num_input_tokens_seen": 125021848, "step": 848 }, { "epoch": 0.2447391179014125, "grad_norm": 2.165383770527757, "learning_rate": 0.0001, "loss": 0.0109, "num_input_tokens_seen": 125194344, "step": 849 }, { "epoch": 0.2447391179014125, "loss": 0.005808062851428986, "loss_ce": 0.003933139145374298, "loss_xval": 0.0018768310546875, "num_input_tokens_seen": 125194344, "step": 849 }, { "epoch": 0.24502738541366387, "grad_norm": 0.2620781135391585, "learning_rate": 0.0001, "loss": 0.0084, "num_input_tokens_seen": 125329208, "step": 850 }, { "epoch": 0.24502738541366387, "loss": 0.014183461666107178, "loss_ce": 0.012954652309417725, "loss_xval": 0.00122833251953125, "num_input_tokens_seen": 125329208, "step": 850 }, { "epoch": 0.24531565292591526, "grad_norm": 1.9324423430851303, "learning_rate": 0.0001, "loss": 0.0055, "num_input_tokens_seen": 125464352, "step": 851 }, { "epoch": 0.24531565292591526, "loss": 0.0028029857203364372, "loss_ce": 0.0011912761256098747, "loss_xval": 0.00160980224609375, "num_input_tokens_seen": 125464352, "step": 851 }, { "epoch": 0.24560392043816662, "grad_norm": 1.994122040931301, "learning_rate": 0.0001, "loss": 0.0102, "num_input_tokens_seen": 125636976, "step": 852 }, { "epoch": 0.24560392043816662, "loss": 0.006850851699709892, "loss_ce": 0.005015028640627861, "loss_xval": 0.00183868408203125, "num_input_tokens_seen": 125636976, "step": 852 }, { "epoch": 0.24589218795041798, "grad_norm": 0.6645628358683997, "learning_rate": 0.0001, "loss": 0.0108, "num_input_tokens_seen": 125771696, "step": 853 }, { "epoch": 0.24589218795041798, "loss": 0.01390957273542881, "loss_ce": 0.0125958863645792, "loss_xval": 0.001312255859375, "num_input_tokens_seen": 125771696, "step": 853 }, { "epoch": 0.24618045546266937, "grad_norm": 0.9193485395039878, "learning_rate": 0.0001, "loss": 0.0067, "num_input_tokens_seen": 125907032, "step": 854 }, { "epoch": 0.24618045546266937, "loss": 0.008167913183569908, "loss_ce": 0.006742647383362055, "loss_xval": 0.00142669677734375, "num_input_tokens_seen": 125907032, "step": 854 }, { "epoch": 0.24646872297492073, "grad_norm": 1.0377375477638926, "learning_rate": 0.0001, "loss": 0.0101, "num_input_tokens_seen": 126079496, "step": 855 }, { "epoch": 0.24646872297492073, "loss": 0.005995092913508415, "loss_ce": 0.004644213244318962, "loss_xval": 0.00135040283203125, "num_input_tokens_seen": 126079496, "step": 855 }, { "epoch": 0.2467569904871721, "grad_norm": 0.4591843203940489, "learning_rate": 0.0001, "loss": 0.0077, "num_input_tokens_seen": 126214408, "step": 856 }, { "epoch": 0.2467569904871721, "loss": 0.013096168637275696, "loss_ce": 0.01175482664257288, "loss_xval": 0.0013427734375, "num_input_tokens_seen": 126214408, "step": 856 }, { "epoch": 0.24704525799942348, "grad_norm": 0.3583351807212624, "learning_rate": 0.0001, "loss": 0.005, "num_input_tokens_seen": 126349512, "step": 857 }, { "epoch": 0.24704525799942348, "loss": 0.0021648311521857977, "loss_ce": 0.0009698771755211055, "loss_xval": 0.00119781494140625, "num_input_tokens_seen": 126349512, "step": 857 }, { "epoch": 0.24733352551167484, "grad_norm": 0.4255231077383957, "learning_rate": 0.0001, "loss": 0.0109, "num_input_tokens_seen": 126521928, "step": 858 }, { "epoch": 0.24733352551167484, "loss": 0.007635718677192926, "loss_ce": 0.0065132444724440575, "loss_xval": 0.00112152099609375, "num_input_tokens_seen": 126521928, "step": 858 }, { "epoch": 0.2476217930239262, "grad_norm": 1.9034822694333349, "learning_rate": 0.0001, "loss": 0.0072, "num_input_tokens_seen": 126656648, "step": 859 }, { "epoch": 0.2476217930239262, "loss": 0.011844690889120102, "loss_ce": 0.010460909456014633, "loss_xval": 0.00138092041015625, "num_input_tokens_seen": 126656648, "step": 859 }, { "epoch": 0.24791006053617756, "grad_norm": 2.160556800745774, "learning_rate": 0.0001, "loss": 0.0038, "num_input_tokens_seen": 126791656, "step": 860 }, { "epoch": 0.24791006053617756, "loss": 0.0028993161395192146, "loss_ce": 0.0011569531634449959, "loss_xval": 0.001739501953125, "num_input_tokens_seen": 126791656, "step": 860 }, { "epoch": 0.24819832804842895, "grad_norm": 0.8898372077055581, "learning_rate": 0.0001, "loss": 0.0107, "num_input_tokens_seen": 126964216, "step": 861 }, { "epoch": 0.24819832804842895, "loss": 0.0031397445127367973, "loss_ce": 0.002113114111125469, "loss_xval": 0.00102996826171875, "num_input_tokens_seen": 126964216, "step": 861 }, { "epoch": 0.2484865955606803, "grad_norm": 3.312099635274805, "learning_rate": 0.0001, "loss": 0.0103, "num_input_tokens_seen": 127099080, "step": 862 }, { "epoch": 0.2484865955606803, "loss": 0.01764540746808052, "loss_ce": 0.015682268887758255, "loss_xval": 0.0019683837890625, "num_input_tokens_seen": 127099080, "step": 862 }, { "epoch": 0.24877486307293167, "grad_norm": 1.5009503860441187, "learning_rate": 0.0001, "loss": 0.0045, "num_input_tokens_seen": 127234144, "step": 863 }, { "epoch": 0.24877486307293167, "loss": 0.0024071503430604935, "loss_ce": 0.0009451676160097122, "loss_xval": 0.00146484375, "num_input_tokens_seen": 127234144, "step": 863 }, { "epoch": 0.24906313058518306, "grad_norm": 3.065705952173178, "learning_rate": 0.0001, "loss": 0.0101, "num_input_tokens_seen": 127406696, "step": 864 }, { "epoch": 0.24906313058518306, "loss": 0.005417156033217907, "loss_ce": 0.003864574246108532, "loss_xval": 0.001556396484375, "num_input_tokens_seen": 127406696, "step": 864 }, { "epoch": 0.24935139809743442, "grad_norm": 5.191152496926581, "learning_rate": 0.0001, "loss": 0.01, "num_input_tokens_seen": 127541608, "step": 865 }, { "epoch": 0.24935139809743442, "loss": 0.015241998247802258, "loss_ce": 0.012771028093993664, "loss_xval": 0.002471923828125, "num_input_tokens_seen": 127541608, "step": 865 }, { "epoch": 0.24963966560968578, "grad_norm": 1.9883630746272334, "learning_rate": 0.0001, "loss": 0.0043, "num_input_tokens_seen": 127676656, "step": 866 }, { "epoch": 0.24963966560968578, "loss": 0.0024233288131654263, "loss_ce": 0.0009551472612656653, "loss_xval": 0.00146484375, "num_input_tokens_seen": 127676656, "step": 866 }, { "epoch": 0.24992793312193715, "grad_norm": 4.045749456376484, "learning_rate": 0.0001, "loss": 0.0124, "num_input_tokens_seen": 127849168, "step": 867 }, { "epoch": 0.24992793312193715, "loss": 0.005908184219151735, "loss_ce": 0.004185848403722048, "loss_xval": 0.0017242431640625, "num_input_tokens_seen": 127849168, "step": 867 }, { "epoch": 0.25021620063418853, "grad_norm": 6.916765603415226, "learning_rate": 0.0001, "loss": 0.0113, "num_input_tokens_seen": 127983992, "step": 868 }, { "epoch": 0.25021620063418853, "loss": 0.01603618450462818, "loss_ce": 0.012471349909901619, "loss_xval": 0.003570556640625, "num_input_tokens_seen": 127983992, "step": 868 }, { "epoch": 0.2505044681464399, "grad_norm": 2.7933517156089267, "learning_rate": 0.0001, "loss": 0.0035, "num_input_tokens_seen": 128118960, "step": 869 }, { "epoch": 0.2505044681464399, "loss": 0.002570713870227337, "loss_ce": 0.000868404982611537, "loss_xval": 0.00170135498046875, "num_input_tokens_seen": 128118960, "step": 869 }, { "epoch": 0.25079273565869126, "grad_norm": 5.686330272706534, "learning_rate": 0.0001, "loss": 0.0128, "num_input_tokens_seen": 128291560, "step": 870 }, { "epoch": 0.25079273565869126, "loss": 0.005967923440039158, "loss_ce": 0.0036791053134948015, "loss_xval": 0.002288818359375, "num_input_tokens_seen": 128291560, "step": 870 }, { "epoch": 0.2510810031709426, "grad_norm": 10.381696412661938, "learning_rate": 0.0001, "loss": 0.0156, "num_input_tokens_seen": 128426336, "step": 871 }, { "epoch": 0.2510810031709426, "loss": 0.020823419094085693, "loss_ce": 0.013220726512372494, "loss_xval": 0.007598876953125, "num_input_tokens_seen": 128426336, "step": 871 }, { "epoch": 0.251369270683194, "grad_norm": 5.533838864070437, "learning_rate": 0.0001, "loss": 0.0045, "num_input_tokens_seen": 128561360, "step": 872 }, { "epoch": 0.251369270683194, "loss": 0.004516711458563805, "loss_ce": 0.0008450658060610294, "loss_xval": 0.0036773681640625, "num_input_tokens_seen": 128561360, "step": 872 }, { "epoch": 0.2516575381954454, "grad_norm": 5.185474513960417, "learning_rate": 0.0001, "loss": 0.0088, "num_input_tokens_seen": 128733848, "step": 873 }, { "epoch": 0.2516575381954454, "loss": 0.003517691045999527, "loss_ce": 0.0014730134280398488, "loss_xval": 0.002044677734375, "num_input_tokens_seen": 128733848, "step": 873 }, { "epoch": 0.25194580570769676, "grad_norm": 11.881789355652334, "learning_rate": 0.0001, "loss": 0.017, "num_input_tokens_seen": 128868688, "step": 874 }, { "epoch": 0.25194580570769676, "loss": 0.020403128117322922, "loss_ce": 0.011839132755994797, "loss_xval": 0.008544921875, "num_input_tokens_seen": 128868688, "step": 874 }, { "epoch": 0.2522340732199481, "grad_norm": 8.080500825807434, "learning_rate": 0.0001, "loss": 0.0082, "num_input_tokens_seen": 129003864, "step": 875 }, { "epoch": 0.2522340732199481, "loss": 0.0072290534153580666, "loss_ce": 0.0008051031036302447, "loss_xval": 0.00640869140625, "num_input_tokens_seen": 129003864, "step": 875 }, { "epoch": 0.2525223407321995, "grad_norm": 3.481422564697328, "learning_rate": 0.0001, "loss": 0.0092, "num_input_tokens_seen": 129176360, "step": 876 }, { "epoch": 0.2525223407321995, "loss": 0.0051642959006130695, "loss_ce": 0.003819615114480257, "loss_xval": 0.0013427734375, "num_input_tokens_seen": 129176360, "step": 876 }, { "epoch": 0.25281060824445084, "grad_norm": 11.855942235668593, "learning_rate": 0.0001, "loss": 0.0183, "num_input_tokens_seen": 129311152, "step": 877 }, { "epoch": 0.25281060824445084, "loss": 0.022560758516192436, "loss_ce": 0.014446896500885487, "loss_xval": 0.00811767578125, "num_input_tokens_seen": 129311152, "step": 877 }, { "epoch": 0.2530988757567022, "grad_norm": 8.119722893152582, "learning_rate": 0.0001, "loss": 0.0082, "num_input_tokens_seen": 129446176, "step": 878 }, { "epoch": 0.2530988757567022, "loss": 0.007987035438418388, "loss_ce": 0.0008916989318095148, "loss_xval": 0.007080078125, "num_input_tokens_seen": 129446176, "step": 878 }, { "epoch": 0.25338714326895356, "grad_norm": 4.820818363547437, "learning_rate": 0.0001, "loss": 0.0104, "num_input_tokens_seen": 129618720, "step": 879 }, { "epoch": 0.25338714326895356, "loss": 0.004318573512136936, "loss_ce": 0.0032614259980618954, "loss_xval": 0.00106048583984375, "num_input_tokens_seen": 129618720, "step": 879 }, { "epoch": 0.253675410781205, "grad_norm": 14.62637741629135, "learning_rate": 0.0001, "loss": 0.0224, "num_input_tokens_seen": 129753536, "step": 880 }, { "epoch": 0.253675410781205, "loss": 0.024506494402885437, "loss_ce": 0.012139246799051762, "loss_xval": 0.01239013671875, "num_input_tokens_seen": 129753536, "step": 880 }, { "epoch": 0.25396367829345634, "grad_norm": 11.717901677148392, "learning_rate": 0.0001, "loss": 0.0121, "num_input_tokens_seen": 129888712, "step": 881 }, { "epoch": 0.25396367829345634, "loss": 0.015599137172102928, "loss_ce": 0.000851518358103931, "loss_xval": 0.0147705078125, "num_input_tokens_seen": 129888712, "step": 881 }, { "epoch": 0.2542519458057077, "grad_norm": 1.4620165698355299, "learning_rate": 0.0001, "loss": 0.0083, "num_input_tokens_seen": 130061136, "step": 882 }, { "epoch": 0.2542519458057077, "loss": 0.003730251919478178, "loss_ce": 0.0018252875888720155, "loss_xval": 0.0019073486328125, "num_input_tokens_seen": 130061136, "step": 882 }, { "epoch": 0.25454021331795906, "grad_norm": 11.906553202096525, "learning_rate": 0.0001, "loss": 0.0181, "num_input_tokens_seen": 130195896, "step": 883 }, { "epoch": 0.25454021331795906, "loss": 0.02044621855020523, "loss_ce": 0.013198292814195156, "loss_xval": 0.00726318359375, "num_input_tokens_seen": 130195896, "step": 883 }, { "epoch": 0.2548284808302104, "grad_norm": 9.009958862531548, "learning_rate": 0.0001, "loss": 0.0086, "num_input_tokens_seen": 130330992, "step": 884 }, { "epoch": 0.2548284808302104, "loss": 0.010740547440946102, "loss_ce": 0.0007994465995579958, "loss_xval": 0.00994873046875, "num_input_tokens_seen": 130330992, "step": 884 }, { "epoch": 0.2551167483424618, "grad_norm": 4.994362602119953, "learning_rate": 0.0001, "loss": 0.0109, "num_input_tokens_seen": 130503464, "step": 885 }, { "epoch": 0.2551167483424618, "loss": 0.00263015553355217, "loss_ce": 0.0014218504074960947, "loss_xval": 0.0012054443359375, "num_input_tokens_seen": 130503464, "step": 885 }, { "epoch": 0.25540501585471315, "grad_norm": 16.541549750545187, "learning_rate": 0.0001, "loss": 0.0254, "num_input_tokens_seen": 130638304, "step": 886 }, { "epoch": 0.25540501585471315, "loss": 0.0236981064081192, "loss_ce": 0.009553208947181702, "loss_xval": 0.01416015625, "num_input_tokens_seen": 130638304, "step": 886 }, { "epoch": 0.25569328336696456, "grad_norm": 14.441262893561307, "learning_rate": 0.0001, "loss": 0.0177, "num_input_tokens_seen": 130773464, "step": 887 }, { "epoch": 0.25569328336696456, "loss": 0.021735940128564835, "loss_ce": 0.0007093290332704782, "loss_xval": 0.02099609375, "num_input_tokens_seen": 130773464, "step": 887 }, { "epoch": 0.2559815508792159, "grad_norm": 1.030551625521142, "learning_rate": 0.0001, "loss": 0.0105, "num_input_tokens_seen": 130945968, "step": 888 }, { "epoch": 0.2559815508792159, "loss": 0.007245267741382122, "loss_ce": 0.0021755348425358534, "loss_xval": 0.00506591796875, "num_input_tokens_seen": 130945968, "step": 888 }, { "epoch": 0.2562698183914673, "grad_norm": 10.336845186935472, "learning_rate": 0.0001, "loss": 0.0164, "num_input_tokens_seen": 131080728, "step": 889 }, { "epoch": 0.2562698183914673, "loss": 0.017896853387355804, "loss_ce": 0.01371975801885128, "loss_xval": 0.004180908203125, "num_input_tokens_seen": 131080728, "step": 889 }, { "epoch": 0.25655808590371865, "grad_norm": 7.421496056016462, "learning_rate": 0.0001, "loss": 0.0074, "num_input_tokens_seen": 131215752, "step": 890 }, { "epoch": 0.25655808590371865, "loss": 0.010436332784593105, "loss_ce": 0.0010254747467115521, "loss_xval": 0.0093994140625, "num_input_tokens_seen": 131215752, "step": 890 }, { "epoch": 0.25684635341597, "grad_norm": 7.379148752876616, "learning_rate": 0.0001, "loss": 0.0154, "num_input_tokens_seen": 131388184, "step": 891 }, { "epoch": 0.25684635341597, "loss": 0.0033525533508509398, "loss_ce": 0.001976401312276721, "loss_xval": 0.001373291015625, "num_input_tokens_seen": 131388184, "step": 891 }, { "epoch": 0.25713462092822137, "grad_norm": 19.251412926550937, "learning_rate": 0.0001, "loss": 0.034, "num_input_tokens_seen": 131522984, "step": 892 }, { "epoch": 0.25713462092822137, "loss": 0.028828779235482216, "loss_ce": 0.01163975428789854, "loss_xval": 0.0172119140625, "num_input_tokens_seen": 131522984, "step": 892 }, { "epoch": 0.2574228884404728, "grad_norm": 15.799551940261379, "learning_rate": 0.0001, "loss": 0.0231, "num_input_tokens_seen": 131658112, "step": 893 }, { "epoch": 0.2574228884404728, "loss": 0.028818532824516296, "loss_ce": 0.0006965833017602563, "loss_xval": 0.028076171875, "num_input_tokens_seen": 131658112, "step": 893 }, { "epoch": 0.25771115595272415, "grad_norm": 1.0845714678383525, "learning_rate": 0.0001, "loss": 0.0116, "num_input_tokens_seen": 131830608, "step": 894 }, { "epoch": 0.25771115595272415, "loss": 0.006088267080485821, "loss_ce": 0.0015697581693530083, "loss_xval": 0.0045166015625, "num_input_tokens_seen": 131830608, "step": 894 }, { "epoch": 0.2579994234649755, "grad_norm": 16.644940036843188, "learning_rate": 0.0001, "loss": 0.0289, "num_input_tokens_seen": 131965448, "step": 895 }, { "epoch": 0.2579994234649755, "loss": 0.026386450976133347, "loss_ce": 0.014148902148008347, "loss_xval": 0.01220703125, "num_input_tokens_seen": 131965448, "step": 895 }, { "epoch": 0.25828769097722687, "grad_norm": 17.960460147753857, "learning_rate": 0.0001, "loss": 0.026, "num_input_tokens_seen": 132100560, "step": 896 }, { "epoch": 0.25828769097722687, "loss": 0.03656185418367386, "loss_ce": 0.0006731800967827439, "loss_xval": 0.035888671875, "num_input_tokens_seen": 132100560, "step": 896 }, { "epoch": 0.25857595848947823, "grad_norm": 5.864950391364441, "learning_rate": 0.0001, "loss": 0.016, "num_input_tokens_seen": 132273072, "step": 897 }, { "epoch": 0.25857595848947823, "loss": 0.017272870987653732, "loss_ce": 0.0020751184783875942, "loss_xval": 0.01519775390625, "num_input_tokens_seen": 132273072, "step": 897 }, { "epoch": 0.2588642260017296, "grad_norm": 5.535883983296717, "learning_rate": 0.0001, "loss": 0.0106, "num_input_tokens_seen": 132407760, "step": 898 }, { "epoch": 0.2588642260017296, "loss": 0.01192381139844656, "loss_ce": 0.010679743252694607, "loss_xval": 0.00124359130859375, "num_input_tokens_seen": 132407760, "step": 898 }, { "epoch": 0.25915249351398095, "grad_norm": 2.705241713165992, "learning_rate": 0.0001, "loss": 0.0039, "num_input_tokens_seen": 132542840, "step": 899 }, { "epoch": 0.25915249351398095, "loss": 0.004840262699872255, "loss_ce": 0.000699408701620996, "loss_xval": 0.004150390625, "num_input_tokens_seen": 132542840, "step": 899 }, { "epoch": 0.25944076102623237, "grad_norm": 12.68596683547975, "learning_rate": 0.0001, "loss": 0.0286, "num_input_tokens_seen": 132715368, "step": 900 }, { "epoch": 0.25944076102623237, "loss": 0.010654434561729431, "loss_ce": 0.008203492499887943, "loss_xval": 0.0024566650390625, "num_input_tokens_seen": 132715368, "step": 900 }, { "epoch": 0.25972902853848373, "grad_norm": 26.31430231024215, "learning_rate": 0.0001, "loss": 0.0572, "num_input_tokens_seen": 132850240, "step": 901 }, { "epoch": 0.25972902853848373, "loss": 0.046879298985004425, "loss_ce": 0.011906154453754425, "loss_xval": 0.034912109375, "num_input_tokens_seen": 132850240, "step": 901 }, { "epoch": 0.2600172960507351, "grad_norm": 26.183178589900663, "learning_rate": 0.0001, "loss": 0.0528, "num_input_tokens_seen": 132985336, "step": 902 }, { "epoch": 0.2600172960507351, "loss": 0.06659163534641266, "loss_ce": 0.0006736628711223602, "loss_xval": 0.06591796875, "num_input_tokens_seen": 132985336, "step": 902 }, { "epoch": 0.26030556356298645, "grad_norm": 14.531282187694886, "learning_rate": 0.0001, "loss": 0.0282, "num_input_tokens_seen": 133157976, "step": 903 }, { "epoch": 0.26030556356298645, "loss": 0.04092877358198166, "loss_ce": 0.0007676435052417219, "loss_xval": 0.0400390625, "num_input_tokens_seen": 133157976, "step": 903 }, { "epoch": 0.2605938310752378, "grad_norm": 4.771268098697512, "learning_rate": 0.0001, "loss": 0.0109, "num_input_tokens_seen": 133292824, "step": 904 }, { "epoch": 0.2605938310752378, "loss": 0.017029020935297012, "loss_ce": 0.009620878845453262, "loss_xval": 0.007415771484375, "num_input_tokens_seen": 133292824, "step": 904 }, { "epoch": 0.2608820985874892, "grad_norm": 7.83416493572119, "learning_rate": 0.0001, "loss": 0.0076, "num_input_tokens_seen": 133428056, "step": 905 }, { "epoch": 0.2608820985874892, "loss": 0.00283773522824049, "loss_ce": 0.0006976902368478477, "loss_xval": 0.00213623046875, "num_input_tokens_seen": 133428056, "step": 905 }, { "epoch": 0.26117036609974054, "grad_norm": 21.668543085444146, "learning_rate": 0.0001, "loss": 0.0442, "num_input_tokens_seen": 133600712, "step": 906 }, { "epoch": 0.26117036609974054, "loss": 0.01621638983488083, "loss_ce": 0.000980487558990717, "loss_xval": 0.0152587890625, "num_input_tokens_seen": 133600712, "step": 906 }, { "epoch": 0.26145863361199195, "grad_norm": 33.911068651601695, "learning_rate": 0.0001, "loss": 0.0903, "num_input_tokens_seen": 133735480, "step": 907 }, { "epoch": 0.26145863361199195, "loss": 0.08236135542392731, "loss_ce": 0.011316439136862755, "loss_xval": 0.0712890625, "num_input_tokens_seen": 133735480, "step": 907 }, { "epoch": 0.2617469011242433, "grad_norm": 33.76515245614444, "learning_rate": 0.0001, "loss": 0.0865, "num_input_tokens_seen": 133870456, "step": 908 }, { "epoch": 0.2617469011242433, "loss": 0.10329385101795197, "loss_ce": 0.0006937556900084019, "loss_xval": 0.1025390625, "num_input_tokens_seen": 133870456, "step": 908 }, { "epoch": 0.2620351686364947, "grad_norm": 22.571864793058605, "learning_rate": 0.0001, "loss": 0.0465, "num_input_tokens_seen": 134043096, "step": 909 }, { "epoch": 0.2620351686364947, "loss": 0.06501968204975128, "loss_ce": 0.0009938071016222239, "loss_xval": 0.06396484375, "num_input_tokens_seen": 134043096, "step": 909 }, { "epoch": 0.26232343614874604, "grad_norm": 11.579752563407906, "learning_rate": 0.0001, "loss": 0.0196, "num_input_tokens_seen": 134177904, "step": 910 }, { "epoch": 0.26232343614874604, "loss": 0.03199625760316849, "loss_ce": 0.013319501653313637, "loss_xval": 0.0186767578125, "num_input_tokens_seen": 134177904, "step": 910 }, { "epoch": 0.2626117036609974, "grad_norm": 10.671083819097934, "learning_rate": 0.0001, "loss": 0.0124, "num_input_tokens_seen": 134312864, "step": 911 }, { "epoch": 0.2626117036609974, "loss": 0.006020676344633102, "loss_ce": 0.0007373203989118338, "loss_xval": 0.005279541015625, "num_input_tokens_seen": 134312864, "step": 911 }, { "epoch": 0.26289997117324876, "grad_norm": 17.29284770364731, "learning_rate": 0.0001, "loss": 0.0327, "num_input_tokens_seen": 134485392, "step": 912 }, { "epoch": 0.26289997117324876, "loss": 0.011055695824325085, "loss_ce": 0.0016448370879516006, "loss_xval": 0.0093994140625, "num_input_tokens_seen": 134485392, "step": 912 }, { "epoch": 0.2631882386855001, "grad_norm": 20.517777314388695, "learning_rate": 0.0001, "loss": 0.0384, "num_input_tokens_seen": 134620280, "step": 913 }, { "epoch": 0.2631882386855001, "loss": 0.03527616709470749, "loss_ce": 0.009626143611967564, "loss_xval": 0.025634765625, "num_input_tokens_seen": 134620280, "step": 913 }, { "epoch": 0.26347650619775154, "grad_norm": 13.194845052998755, "learning_rate": 0.0001, "loss": 0.0165, "num_input_tokens_seen": 134755240, "step": 914 }, { "epoch": 0.26347650619775154, "loss": 0.02211439609527588, "loss_ce": 0.0007368340156972408, "loss_xval": 0.0213623046875, "num_input_tokens_seen": 134755240, "step": 914 }, { "epoch": 0.2637647737100029, "grad_norm": 1.4668499046176524, "learning_rate": 0.0001, "loss": 0.0096, "num_input_tokens_seen": 134927608, "step": 915 }, { "epoch": 0.2637647737100029, "loss": 0.00330064888112247, "loss_ce": 0.0014142810832709074, "loss_xval": 0.00188446044921875, "num_input_tokens_seen": 134927608, "step": 915 }, { "epoch": 0.26405304122225426, "grad_norm": 13.229827627513, "learning_rate": 0.0001, "loss": 0.0199, "num_input_tokens_seen": 135062368, "step": 916 }, { "epoch": 0.26405304122225426, "loss": 0.021753720939159393, "loss_ce": 0.010431700386106968, "loss_xval": 0.0113525390625, "num_input_tokens_seen": 135062368, "step": 916 }, { "epoch": 0.2643413087345056, "grad_norm": 15.10165445453856, "learning_rate": 0.0001, "loss": 0.0203, "num_input_tokens_seen": 135197440, "step": 917 }, { "epoch": 0.2643413087345056, "loss": 0.027548931539058685, "loss_ce": 0.0007239800179377198, "loss_xval": 0.02685546875, "num_input_tokens_seen": 135197440, "step": 917 }, { "epoch": 0.264629576246757, "grad_norm": 10.113371750330549, "learning_rate": 0.0001, "loss": 0.0166, "num_input_tokens_seen": 135369888, "step": 918 }, { "epoch": 0.264629576246757, "loss": 0.019287362694740295, "loss_ce": 0.0010988865979015827, "loss_xval": 0.0181884765625, "num_input_tokens_seen": 135369888, "step": 918 }, { "epoch": 0.26491784375900834, "grad_norm": 7.503561052273172, "learning_rate": 0.0001, "loss": 0.0095, "num_input_tokens_seen": 135504720, "step": 919 }, { "epoch": 0.26491784375900834, "loss": 0.014727404341101646, "loss_ce": 0.007189563009887934, "loss_xval": 0.007537841796875, "num_input_tokens_seen": 135504720, "step": 919 }, { "epoch": 0.2652061112712597, "grad_norm": 12.942512436123494, "learning_rate": 0.0001, "loss": 0.0155, "num_input_tokens_seen": 135639816, "step": 920 }, { "epoch": 0.2652061112712597, "loss": 0.009108353406190872, "loss_ce": 0.0009105693316087127, "loss_xval": 0.0081787109375, "num_input_tokens_seen": 135639816, "step": 920 }, { "epoch": 0.2654943787835111, "grad_norm": 23.22476813528809, "learning_rate": 0.0001, "loss": 0.049, "num_input_tokens_seen": 135812408, "step": 921 }, { "epoch": 0.2654943787835111, "loss": 0.03185856714844704, "loss_ce": 0.0026837619952857494, "loss_xval": 0.0291748046875, "num_input_tokens_seen": 135812408, "step": 921 }, { "epoch": 0.2657826462957625, "grad_norm": 30.64846542366728, "learning_rate": 0.0001, "loss": 0.0798, "num_input_tokens_seen": 135947240, "step": 922 }, { "epoch": 0.2657826462957625, "loss": 0.08230604231357574, "loss_ce": 0.013214251026511192, "loss_xval": 0.0693359375, "num_input_tokens_seen": 135947240, "step": 922 }, { "epoch": 0.26607091380801384, "grad_norm": 30.414646214325735, "learning_rate": 0.0001, "loss": 0.0749, "num_input_tokens_seen": 136082312, "step": 923 }, { "epoch": 0.26607091380801384, "loss": 0.08990393579006195, "loss_ce": 0.0007315789698623121, "loss_xval": 0.08935546875, "num_input_tokens_seen": 136082312, "step": 923 }, { "epoch": 0.2663591813202652, "grad_norm": 24.71366898539121, "learning_rate": 0.0001, "loss": 0.0544, "num_input_tokens_seen": 136254824, "step": 924 }, { "epoch": 0.2663591813202652, "loss": 0.06479089707136154, "loss_ce": 0.0011007109424099326, "loss_xval": 0.0634765625, "num_input_tokens_seen": 136254824, "step": 924 }, { "epoch": 0.26664744883251656, "grad_norm": 20.403882779981654, "learning_rate": 0.0001, "loss": 0.0387, "num_input_tokens_seen": 136389768, "step": 925 }, { "epoch": 0.26664744883251656, "loss": 0.04584236443042755, "loss_ce": 0.008092120289802551, "loss_xval": 0.037841796875, "num_input_tokens_seen": 136389768, "step": 925 }, { "epoch": 0.2669357163447679, "grad_norm": 21.71097006562949, "learning_rate": 0.0001, "loss": 0.0408, "num_input_tokens_seen": 136524832, "step": 926 }, { "epoch": 0.2669357163447679, "loss": 0.03015214577317238, "loss_ce": 0.0007484586676582694, "loss_xval": 0.0294189453125, "num_input_tokens_seen": 136524832, "step": 926 }, { "epoch": 0.2672239838570193, "grad_norm": 25.048860969375237, "learning_rate": 0.0001, "loss": 0.0588, "num_input_tokens_seen": 136697280, "step": 927 }, { "epoch": 0.2672239838570193, "loss": 0.039126113057136536, "loss_ce": 0.0023066524881869555, "loss_xval": 0.036865234375, "num_input_tokens_seen": 136697280, "step": 927 }, { "epoch": 0.2675122513692707, "grad_norm": 23.604295185328763, "learning_rate": 0.0001, "loss": 0.051, "num_input_tokens_seen": 136832016, "step": 928 }, { "epoch": 0.2675122513692707, "loss": 0.05441943556070328, "loss_ce": 0.009802735410630703, "loss_xval": 0.044677734375, "num_input_tokens_seen": 136832016, "step": 928 }, { "epoch": 0.26780051888152206, "grad_norm": 14.373659940022728, "learning_rate": 0.0001, "loss": 0.0196, "num_input_tokens_seen": 136967048, "step": 929 }, { "epoch": 0.26780051888152206, "loss": 0.026713449507951736, "loss_ce": 0.0007887678220868111, "loss_xval": 0.02587890625, "num_input_tokens_seen": 136967048, "step": 929 }, { "epoch": 0.2680887863937734, "grad_norm": 1.625112853934714, "learning_rate": 0.0001, "loss": 0.0089, "num_input_tokens_seen": 137139536, "step": 930 }, { "epoch": 0.2680887863937734, "loss": 0.004050274379551411, "loss_ce": 0.0012865260941907763, "loss_xval": 0.0027618408203125, "num_input_tokens_seen": 137139536, "step": 930 }, { "epoch": 0.2683770539060248, "grad_norm": 7.604170035840317, "learning_rate": 0.0001, "loss": 0.0118, "num_input_tokens_seen": 137274200, "step": 931 }, { "epoch": 0.2683770539060248, "loss": 0.017092669382691383, "loss_ce": 0.010817492380738258, "loss_xval": 0.00628662109375, "num_input_tokens_seen": 137274200, "step": 931 }, { "epoch": 0.26866532141827615, "grad_norm": 9.838193297363524, "learning_rate": 0.0001, "loss": 0.011, "num_input_tokens_seen": 137409296, "step": 932 }, { "epoch": 0.26866532141827615, "loss": 0.01571030355989933, "loss_ce": 0.0009321661200374365, "loss_xval": 0.0147705078125, "num_input_tokens_seen": 137409296, "step": 932 }, { "epoch": 0.2689535889305275, "grad_norm": 8.518101589222372, "learning_rate": 0.0001, "loss": 0.0139, "num_input_tokens_seen": 137581768, "step": 933 }, { "epoch": 0.2689535889305275, "loss": 0.012986189685761929, "loss_ce": 0.001626020995900035, "loss_xval": 0.0113525390625, "num_input_tokens_seen": 137581768, "step": 933 }, { "epoch": 0.2692418564427789, "grad_norm": 9.815786119364088, "learning_rate": 0.0001, "loss": 0.0153, "num_input_tokens_seen": 137716504, "step": 934 }, { "epoch": 0.2692418564427789, "loss": 0.021232970058918, "loss_ce": 0.011879331432282925, "loss_xval": 0.00933837890625, "num_input_tokens_seen": 137716504, "step": 934 }, { "epoch": 0.2695301239550303, "grad_norm": 15.504062175280504, "learning_rate": 0.0001, "loss": 0.0223, "num_input_tokens_seen": 137851616, "step": 935 }, { "epoch": 0.2695301239550303, "loss": 0.015094293281435966, "loss_ce": 0.0007739191642031074, "loss_xval": 0.01434326171875, "num_input_tokens_seen": 137851616, "step": 935 }, { "epoch": 0.26981839146728165, "grad_norm": 21.816170072043693, "learning_rate": 0.0001, "loss": 0.0501, "num_input_tokens_seen": 138024032, "step": 936 }, { "epoch": 0.26981839146728165, "loss": 0.03761507570743561, "loss_ce": 0.006273522041738033, "loss_xval": 0.03125, "num_input_tokens_seen": 138024032, "step": 936 }, { "epoch": 0.270106658979533, "grad_norm": 24.016219232791897, "learning_rate": 0.0001, "loss": 0.054, "num_input_tokens_seen": 138158760, "step": 937 }, { "epoch": 0.270106658979533, "loss": 0.057084821164608, "loss_ce": 0.008043071255087852, "loss_xval": 0.049072265625, "num_input_tokens_seen": 138158760, "step": 937 }, { "epoch": 0.27039492649178437, "grad_norm": 20.950077621507994, "learning_rate": 0.0001, "loss": 0.0387, "num_input_tokens_seen": 138293760, "step": 938 }, { "epoch": 0.27039492649178437, "loss": 0.046418577432632446, "loss_ce": 0.0007947962731122971, "loss_xval": 0.045654296875, "num_input_tokens_seen": 138293760, "step": 938 }, { "epoch": 0.27068319400403573, "grad_norm": 16.409492997422056, "learning_rate": 0.0001, "loss": 0.0306, "num_input_tokens_seen": 138466136, "step": 939 }, { "epoch": 0.27068319400403573, "loss": 0.030278921127319336, "loss_ce": 0.0010888581164181232, "loss_xval": 0.0291748046875, "num_input_tokens_seen": 138466136, "step": 939 }, { "epoch": 0.2709714615162871, "grad_norm": 15.24922810529818, "learning_rate": 0.0001, "loss": 0.0268, "num_input_tokens_seen": 138600880, "step": 940 }, { "epoch": 0.2709714615162871, "loss": 0.03236062824726105, "loss_ce": 0.012325836345553398, "loss_xval": 0.02001953125, "num_input_tokens_seen": 138600880, "step": 940 }, { "epoch": 0.2712597290285385, "grad_norm": 19.295290220045818, "learning_rate": 0.0001, "loss": 0.0333, "num_input_tokens_seen": 138735968, "step": 941 }, { "epoch": 0.2712597290285385, "loss": 0.025427913293242455, "loss_ce": 0.0008002271642908454, "loss_xval": 0.024658203125, "num_input_tokens_seen": 138735968, "step": 941 }, { "epoch": 0.27154799654078987, "grad_norm": 25.769955372399583, "learning_rate": 0.0001, "loss": 0.0632, "num_input_tokens_seen": 138908344, "step": 942 }, { "epoch": 0.27154799654078987, "loss": 0.04995029419660568, "loss_ce": 0.0012747579021379352, "loss_xval": 0.048583984375, "num_input_tokens_seen": 138908344, "step": 942 }, { "epoch": 0.27183626405304123, "grad_norm": 29.89708338319636, "learning_rate": 0.0001, "loss": 0.0834, "num_input_tokens_seen": 139043064, "step": 943 }, { "epoch": 0.27183626405304123, "loss": 0.09044340252876282, "loss_ce": 0.012196330353617668, "loss_xval": 0.078125, "num_input_tokens_seen": 139043064, "step": 943 }, { "epoch": 0.2721245315652926, "grad_norm": 29.221058418100938, "learning_rate": 0.0001, "loss": 0.0752, "num_input_tokens_seen": 139178080, "step": 944 }, { "epoch": 0.2721245315652926, "loss": 0.0861777663230896, "loss_ce": 0.0012168283574283123, "loss_xval": 0.0849609375, "num_input_tokens_seen": 139178080, "step": 944 }, { "epoch": 0.27241279907754395, "grad_norm": 26.289080802161784, "learning_rate": 0.0001, "loss": 0.0653, "num_input_tokens_seen": 139350632, "step": 945 }, { "epoch": 0.27241279907754395, "loss": 0.07178245484828949, "loss_ce": 0.0016530563589185476, "loss_xval": 0.0703125, "num_input_tokens_seen": 139350632, "step": 945 }, { "epoch": 0.2727010665897953, "grad_norm": 24.600247355148554, "learning_rate": 0.0001, "loss": 0.0575, "num_input_tokens_seen": 139485448, "step": 946 }, { "epoch": 0.2727010665897953, "loss": 0.060692962259054184, "loss_ce": 0.007531340233981609, "loss_xval": 0.05322265625, "num_input_tokens_seen": 139485448, "step": 946 }, { "epoch": 0.2729893341020467, "grad_norm": 25.384371769491704, "learning_rate": 0.0001, "loss": 0.0584, "num_input_tokens_seen": 139620448, "step": 947 }, { "epoch": 0.2729893341020467, "loss": 0.04900098592042923, "loss_ce": 0.0009663160308264196, "loss_xval": 0.048095703125, "num_input_tokens_seen": 139620448, "step": 947 }, { "epoch": 0.2732776016142981, "grad_norm": 25.47315877753157, "learning_rate": 0.0001, "loss": 0.0646, "num_input_tokens_seen": 139792856, "step": 948 }, { "epoch": 0.2732776016142981, "loss": 0.049740906804800034, "loss_ce": 0.0014315814478322864, "loss_xval": 0.04833984375, "num_input_tokens_seen": 139792856, "step": 948 }, { "epoch": 0.27356586912654945, "grad_norm": 20.431395255995, "learning_rate": 0.0001, "loss": 0.0477, "num_input_tokens_seen": 139927664, "step": 949 }, { "epoch": 0.27356586912654945, "loss": 0.05533936247229576, "loss_ce": 0.01728394255042076, "loss_xval": 0.0380859375, "num_input_tokens_seen": 139927664, "step": 949 }, { "epoch": 0.2738541366388008, "grad_norm": 9.615385475945006, "learning_rate": 0.0001, "loss": 0.0109, "num_input_tokens_seen": 140062672, "step": 950 }, { "epoch": 0.2738541366388008, "loss": 0.013754522427916527, "loss_ce": 0.0008455874631181359, "loss_xval": 0.012939453125, "num_input_tokens_seen": 140062672, "step": 950 }, { "epoch": 0.2741424041510522, "grad_norm": 2.7294140895326406, "learning_rate": 0.0001, "loss": 0.0108, "num_input_tokens_seen": 140235104, "step": 951 }, { "epoch": 0.2741424041510522, "loss": 0.003939633257687092, "loss_ce": 0.002550129545852542, "loss_xval": 0.0013885498046875, "num_input_tokens_seen": 140235104, "step": 951 }, { "epoch": 0.27443067166330354, "grad_norm": 10.861198971518537, "learning_rate": 0.0001, "loss": 0.02, "num_input_tokens_seen": 140369880, "step": 952 }, { "epoch": 0.27443067166330354, "loss": 0.027800941839814186, "loss_ce": 0.01453342568129301, "loss_xval": 0.01324462890625, "num_input_tokens_seen": 140369880, "step": 952 }, { "epoch": 0.2747189391755549, "grad_norm": 13.40011589641844, "learning_rate": 0.0001, "loss": 0.0191, "num_input_tokens_seen": 140504904, "step": 953 }, { "epoch": 0.2747189391755549, "loss": 0.02331135794520378, "loss_ce": 0.0008504212601110339, "loss_xval": 0.0224609375, "num_input_tokens_seen": 140504904, "step": 953 }, { "epoch": 0.27500720668780626, "grad_norm": 12.303345775184155, "learning_rate": 0.0001, "loss": 0.0221, "num_input_tokens_seen": 140677608, "step": 954 }, { "epoch": 0.27500720668780626, "loss": 0.02076645940542221, "loss_ce": 0.0022804364562034607, "loss_xval": 0.0184326171875, "num_input_tokens_seen": 140677608, "step": 954 }, { "epoch": 0.2752954742000577, "grad_norm": 10.961128742719117, "learning_rate": 0.0001, "loss": 0.02, "num_input_tokens_seen": 140812384, "step": 955 }, { "epoch": 0.2752954742000577, "loss": 0.02713668718934059, "loss_ce": 0.014540554955601692, "loss_xval": 0.0125732421875, "num_input_tokens_seen": 140812384, "step": 955 }, { "epoch": 0.27558374171230904, "grad_norm": 11.45692148425062, "learning_rate": 0.0001, "loss": 0.0142, "num_input_tokens_seen": 140947408, "step": 956 }, { "epoch": 0.27558374171230904, "loss": 0.011002715677022934, "loss_ce": 0.0008022149559110403, "loss_xval": 0.01019287109375, "num_input_tokens_seen": 140947408, "step": 956 }, { "epoch": 0.2758720092245604, "grad_norm": 12.276193654056682, "learning_rate": 0.0001, "loss": 0.0219, "num_input_tokens_seen": 141119968, "step": 957 }, { "epoch": 0.2758720092245604, "loss": 0.012618216685950756, "loss_ce": 0.0017005529953166842, "loss_xval": 0.01092529296875, "num_input_tokens_seen": 141119968, "step": 957 }, { "epoch": 0.27616027673681176, "grad_norm": 10.322703497206954, "learning_rate": 0.0001, "loss": 0.0169, "num_input_tokens_seen": 141254712, "step": 958 }, { "epoch": 0.27616027673681176, "loss": 0.022379161790013313, "loss_ce": 0.011713268235325813, "loss_xval": 0.01068115234375, "num_input_tokens_seen": 141254712, "step": 958 }, { "epoch": 0.2764485442490631, "grad_norm": 5.142704767137008, "learning_rate": 0.0001, "loss": 0.0041, "num_input_tokens_seen": 141389776, "step": 959 }, { "epoch": 0.2764485442490631, "loss": 0.005325574427843094, "loss_ce": 0.0007574743358418345, "loss_xval": 0.00457763671875, "num_input_tokens_seen": 141389776, "step": 959 }, { "epoch": 0.2767368117613145, "grad_norm": 0.8162921258077296, "learning_rate": 0.0001, "loss": 0.0076, "num_input_tokens_seen": 141562256, "step": 960 }, { "epoch": 0.2767368117613145, "loss": 0.0015741356182843447, "loss_ce": 0.0009709365549497306, "loss_xval": 0.00060272216796875, "num_input_tokens_seen": 141562256, "step": 960 }, { "epoch": 0.27702507927356584, "grad_norm": 4.287498292630918, "learning_rate": 0.0001, "loss": 0.006, "num_input_tokens_seen": 141697080, "step": 961 }, { "epoch": 0.27702507927356584, "loss": 0.008994442410767078, "loss_ce": 0.006450039334595203, "loss_xval": 0.0025482177734375, "num_input_tokens_seen": 141697080, "step": 961 }, { "epoch": 0.27731334678581726, "grad_norm": 4.414203081683492, "learning_rate": 0.0001, "loss": 0.0038, "num_input_tokens_seen": 141832144, "step": 962 }, { "epoch": 0.27731334678581726, "loss": 0.004648888483643532, "loss_ce": 0.000725472578778863, "loss_xval": 0.003936767578125, "num_input_tokens_seen": 141832144, "step": 962 }, { "epoch": 0.2776016142980686, "grad_norm": 3.131954367557028, "learning_rate": 0.0001, "loss": 0.0092, "num_input_tokens_seen": 142004616, "step": 963 }, { "epoch": 0.2776016142980686, "loss": 0.0035226130858063698, "loss_ce": 0.0009610439883545041, "loss_xval": 0.0025634765625, "num_input_tokens_seen": 142004616, "step": 963 }, { "epoch": 0.27788988181032, "grad_norm": 3.7388693449607406, "learning_rate": 0.0001, "loss": 0.0079, "num_input_tokens_seen": 142139456, "step": 964 }, { "epoch": 0.27788988181032, "loss": 0.013125386089086533, "loss_ce": 0.010863270610570908, "loss_xval": 0.00225830078125, "num_input_tokens_seen": 142139456, "step": 964 }, { "epoch": 0.27817814932257134, "grad_norm": 7.853043705392883, "learning_rate": 0.0001, "loss": 0.0098, "num_input_tokens_seen": 142274552, "step": 965 }, { "epoch": 0.27817814932257134, "loss": 0.0050224545411765575, "loss_ce": 0.0006698851939290762, "loss_xval": 0.004364013671875, "num_input_tokens_seen": 142274552, "step": 965 }, { "epoch": 0.2784664168348227, "grad_norm": 13.798301364906187, "learning_rate": 0.0001, "loss": 0.0246, "num_input_tokens_seen": 142446992, "step": 966 }, { "epoch": 0.2784664168348227, "loss": 0.01616474986076355, "loss_ce": 0.0009364773286506534, "loss_xval": 0.0152587890625, "num_input_tokens_seen": 142446992, "step": 966 }, { "epoch": 0.27875468434707407, "grad_norm": 21.11692774117553, "learning_rate": 0.0001, "loss": 0.048, "num_input_tokens_seen": 142581824, "step": 967 }, { "epoch": 0.27875468434707407, "loss": 0.05509456992149353, "loss_ce": 0.011607019230723381, "loss_xval": 0.04345703125, "num_input_tokens_seen": 142581824, "step": 967 }, { "epoch": 0.2790429518593254, "grad_norm": 30.422620769917025, "learning_rate": 0.0001, "loss": 0.0882, "num_input_tokens_seen": 142716784, "step": 968 }, { "epoch": 0.2790429518593254, "loss": 0.09613828361034393, "loss_ce": 0.0006793015636503696, "loss_xval": 0.095703125, "num_input_tokens_seen": 142716784, "step": 968 }, { "epoch": 0.27933121937157684, "grad_norm": 44.02363731095848, "learning_rate": 0.0001, "loss": 0.1874, "num_input_tokens_seen": 142889200, "step": 969 }, { "epoch": 0.27933121937157684, "loss": 0.19272679090499878, "loss_ce": 0.0013205313589423895, "loss_xval": 0.19140625, "num_input_tokens_seen": 142889200, "step": 969 }, { "epoch": 0.2796194868838282, "grad_norm": 62.57380169348872, "learning_rate": 0.0001, "loss": 0.379, "num_input_tokens_seen": 143023960, "step": 970 }, { "epoch": 0.2796194868838282, "loss": 0.39476674795150757, "loss_ce": 0.010245269164443016, "loss_xval": 0.384765625, "num_input_tokens_seen": 143023960, "step": 970 }, { "epoch": 0.27990775439607957, "grad_norm": 80.40325261016795, "learning_rate": 0.0001, "loss": 0.6164, "num_input_tokens_seen": 143158960, "step": 971 }, { "epoch": 0.27990775439607957, "loss": 0.5973809957504272, "loss_ce": 0.000701274024322629, "loss_xval": 0.59765625, "num_input_tokens_seen": 143158960, "step": 971 }, { "epoch": 0.28019602190833093, "grad_norm": 76.35630842860024, "learning_rate": 0.0001, "loss": 0.5774, "num_input_tokens_seen": 143331528, "step": 972 }, { "epoch": 0.28019602190833093, "loss": 0.5543111562728882, "loss_ce": 0.0010884751100093126, "loss_xval": 0.5546875, "num_input_tokens_seen": 143331528, "step": 972 }, { "epoch": 0.2804842894205823, "grad_norm": 33.64460824242602, "learning_rate": 0.0001, "loss": 0.1209, "num_input_tokens_seen": 143466336, "step": 973 }, { "epoch": 0.2804842894205823, "loss": 0.128205806016922, "loss_ce": 0.010774169117212296, "loss_xval": 0.1171875, "num_input_tokens_seen": 143466336, "step": 973 }, { "epoch": 0.28077255693283365, "grad_norm": 26.28356998850835, "learning_rate": 0.0001, "loss": 0.0743, "num_input_tokens_seen": 143601336, "step": 974 }, { "epoch": 0.28077255693283365, "loss": 0.0688517838716507, "loss_ce": 0.0009196547325700521, "loss_xval": 0.06787109375, "num_input_tokens_seen": 143601336, "step": 974 }, { "epoch": 0.281060824445085, "grad_norm": 57.38440801081276, "learning_rate": 0.0001, "loss": 0.3491, "num_input_tokens_seen": 143773952, "step": 975 }, { "epoch": 0.281060824445085, "loss": 0.35013845562934875, "loss_ce": 0.004191188141703606, "loss_xval": 0.345703125, "num_input_tokens_seen": 143773952, "step": 975 }, { "epoch": 0.28134909195733643, "grad_norm": 33.886311086551565, "learning_rate": 0.0001, "loss": 0.1299, "num_input_tokens_seen": 143908696, "step": 976 }, { "epoch": 0.28134909195733643, "loss": 0.15857642889022827, "loss_ce": 0.013068612664937973, "loss_xval": 0.1455078125, "num_input_tokens_seen": 143908696, "step": 976 }, { "epoch": 0.2816373594695878, "grad_norm": 17.302379582721162, "learning_rate": 0.0001, "loss": 0.0358, "num_input_tokens_seen": 144043712, "step": 977 }, { "epoch": 0.2816373594695878, "loss": 0.027677983045578003, "loss_ce": 0.001127689378336072, "loss_xval": 0.026611328125, "num_input_tokens_seen": 144043712, "step": 977 }, { "epoch": 0.28192562698183915, "grad_norm": 41.012904645947934, "learning_rate": 0.0001, "loss": 0.1911, "num_input_tokens_seen": 144216272, "step": 978 }, { "epoch": 0.28192562698183915, "loss": 0.2025768756866455, "loss_ce": 0.003480201121419668, "loss_xval": 0.19921875, "num_input_tokens_seen": 144216272, "step": 978 }, { "epoch": 0.2822138944940905, "grad_norm": 15.54293449596175, "learning_rate": 0.0001, "loss": 0.0381, "num_input_tokens_seen": 144351040, "step": 979 }, { "epoch": 0.2822138944940905, "loss": 0.05929769203066826, "loss_ce": 0.009187827818095684, "loss_xval": 0.050048828125, "num_input_tokens_seen": 144351040, "step": 979 }, { "epoch": 0.2825021620063419, "grad_norm": 22.751053245751585, "learning_rate": 0.0001, "loss": 0.0628, "num_input_tokens_seen": 144486072, "step": 980 }, { "epoch": 0.2825021620063419, "loss": 0.04631127417087555, "loss_ce": 0.0014504313003271818, "loss_xval": 0.044921875, "num_input_tokens_seen": 144486072, "step": 980 }, { "epoch": 0.28279042951859323, "grad_norm": 24.906885248483842, "learning_rate": 0.0001, "loss": 0.0798, "num_input_tokens_seen": 144658504, "step": 981 }, { "epoch": 0.28279042951859323, "loss": 0.09189799427986145, "loss_ce": 0.0021763185504823923, "loss_xval": 0.08984375, "num_input_tokens_seen": 144658504, "step": 981 }, { "epoch": 0.28307869703084465, "grad_norm": 9.041544339456737, "learning_rate": 0.0001, "loss": 0.0248, "num_input_tokens_seen": 144793368, "step": 982 }, { "epoch": 0.28307869703084465, "loss": 0.015240021049976349, "loss_ce": 0.011438675224781036, "loss_xval": 0.0037994384765625, "num_input_tokens_seen": 144793368, "step": 982 }, { "epoch": 0.283366964543096, "grad_norm": 29.264667801560147, "learning_rate": 0.0001, "loss": 0.1046, "num_input_tokens_seen": 144928696, "step": 983 }, { "epoch": 0.283366964543096, "loss": 0.07928408682346344, "loss_ce": 0.0017084057908505201, "loss_xval": 0.07763671875, "num_input_tokens_seen": 144928696, "step": 983 }, { "epoch": 0.2836552320553474, "grad_norm": 3.9211013715650176, "learning_rate": 0.0001, "loss": 0.0122, "num_input_tokens_seen": 145101192, "step": 984 }, { "epoch": 0.2836552320553474, "loss": 0.011925269849598408, "loss_ce": 0.002686073537915945, "loss_xval": 0.00921630859375, "num_input_tokens_seen": 145101192, "step": 984 }, { "epoch": 0.28394349956759873, "grad_norm": 33.30285734842819, "learning_rate": 0.0001, "loss": 0.1483, "num_input_tokens_seen": 145235976, "step": 985 }, { "epoch": 0.28394349956759873, "loss": 0.0838775783777237, "loss_ce": 0.008682267740368843, "loss_xval": 0.0751953125, "num_input_tokens_seen": 145235976, "step": 985 }, { "epoch": 0.2842317670798501, "grad_norm": 25.700668378333628, "learning_rate": 0.0001, "loss": 0.0873, "num_input_tokens_seen": 145371144, "step": 986 }, { "epoch": 0.2842317670798501, "loss": 0.05714728683233261, "loss_ce": 0.0016968436539173126, "loss_xval": 0.055419921875, "num_input_tokens_seen": 145371144, "step": 986 }, { "epoch": 0.28452003459210146, "grad_norm": 26.210793499594022, "learning_rate": 0.0001, "loss": 0.0977, "num_input_tokens_seen": 145543648, "step": 987 }, { "epoch": 0.28452003459210146, "loss": 0.04943012818694115, "loss_ce": 0.0018532233079895377, "loss_xval": 0.047607421875, "num_input_tokens_seen": 145543648, "step": 987 }, { "epoch": 0.2848083021043528, "grad_norm": 53.09264170885138, "learning_rate": 0.0001, "loss": 0.3692, "num_input_tokens_seen": 145678456, "step": 988 }, { "epoch": 0.2848083021043528, "loss": 0.23942574858665466, "loss_ce": 0.009933562949299812, "loss_xval": 0.2294921875, "num_input_tokens_seen": 145678456, "step": 988 }, { "epoch": 0.28509656961660423, "grad_norm": 7.6051148878985035, "learning_rate": 0.0001, "loss": 0.013, "num_input_tokens_seen": 145813592, "step": 989 }, { "epoch": 0.28509656961660423, "loss": 0.007788852788507938, "loss_ce": 0.0017158547416329384, "loss_xval": 0.006072998046875, "num_input_tokens_seen": 145813592, "step": 989 }, { "epoch": 0.2853848371288556, "grad_norm": 58.91672638655118, "learning_rate": 0.0001, "loss": 0.4474, "num_input_tokens_seen": 145986328, "step": 990 }, { "epoch": 0.2853848371288556, "loss": 0.3273075222969055, "loss_ce": 0.002112208865582943, "loss_xval": 0.32421875, "num_input_tokens_seen": 145986328, "step": 990 }, { "epoch": 0.28567310464110696, "grad_norm": 49.15384164290371, "learning_rate": 0.0001, "loss": 0.3252, "num_input_tokens_seen": 146121160, "step": 991 }, { "epoch": 0.28567310464110696, "loss": 0.23587465286254883, "loss_ce": 0.008579719811677933, "loss_xval": 0.2275390625, "num_input_tokens_seen": 146121160, "step": 991 }, { "epoch": 0.2859613721533583, "grad_norm": 31.272820081609783, "learning_rate": 0.0001, "loss": 0.1358, "num_input_tokens_seen": 146256144, "step": 992 }, { "epoch": 0.2859613721533583, "loss": 0.13801869750022888, "loss_ce": 0.00172718265093863, "loss_xval": 0.13671875, "num_input_tokens_seen": 146256144, "step": 992 }, { "epoch": 0.2862496396656097, "grad_norm": 65.15782599733767, "learning_rate": 0.0001, "loss": 0.5779, "num_input_tokens_seen": 146428640, "step": 993 }, { "epoch": 0.2862496396656097, "loss": 0.49440979957580566, "loss_ce": 0.0027106208726763725, "loss_xval": 0.4921875, "num_input_tokens_seen": 146428640, "step": 993 }, { "epoch": 0.28653790717786104, "grad_norm": 0.9831012404283345, "learning_rate": 0.0001, "loss": 0.0182, "num_input_tokens_seen": 146563456, "step": 994 }, { "epoch": 0.28653790717786104, "loss": 0.025697484612464905, "loss_ce": 0.014360204339027405, "loss_xval": 0.0113525390625, "num_input_tokens_seen": 146563456, "step": 994 }, { "epoch": 0.2868261746901124, "grad_norm": 58.8648520424778, "learning_rate": 0.0001, "loss": 0.4717, "num_input_tokens_seen": 146698480, "step": 995 }, { "epoch": 0.2868261746901124, "loss": 0.46646779775619507, "loss_ce": 0.0016240678960457444, "loss_xval": 0.46484375, "num_input_tokens_seen": 146698480, "step": 995 }, { "epoch": 0.2871144422023638, "grad_norm": 17.60872940599237, "learning_rate": 0.0001, "loss": 0.0565, "num_input_tokens_seen": 146871072, "step": 996 }, { "epoch": 0.2871144422023638, "loss": 0.04825851321220398, "loss_ce": 0.003336637280881405, "loss_xval": 0.044921875, "num_input_tokens_seen": 146871072, "step": 996 }, { "epoch": 0.2874027097146152, "grad_norm": 48.696681592737896, "learning_rate": 0.0001, "loss": 0.3301, "num_input_tokens_seen": 147005872, "step": 997 }, { "epoch": 0.2874027097146152, "loss": 0.3169158101081848, "loss_ce": 0.010275169275701046, "loss_xval": 0.306640625, "num_input_tokens_seen": 147005872, "step": 997 }, { "epoch": 0.28769097722686654, "grad_norm": 24.725614729868106, "learning_rate": 0.0001, "loss": 0.0927, "num_input_tokens_seen": 147140992, "step": 998 }, { "epoch": 0.28769097722686654, "loss": 0.09264456480741501, "loss_ce": 0.0017632140079513192, "loss_xval": 0.0908203125, "num_input_tokens_seen": 147140992, "step": 998 }, { "epoch": 0.2879792447391179, "grad_norm": 42.92469792044564, "learning_rate": 0.0001, "loss": 0.2791, "num_input_tokens_seen": 147313528, "step": 999 }, { "epoch": 0.2879792447391179, "loss": 0.22426754236221313, "loss_ce": 0.0030761342495679855, "loss_xval": 0.220703125, "num_input_tokens_seen": 147313528, "step": 999 }, { "epoch": 0.28826751225136926, "grad_norm": 29.84366516038228, "learning_rate": 0.0001, "loss": 0.145, "num_input_tokens_seen": 147448352, "step": 1000 }, { "epoch": 0.28826751225136926, "eval_websight_new_IoU": 0.0006760411197319627, "eval_websight_new_MAE_x": 0.48339928686618805, "eval_websight_new_MAE_y": 0.4158986359834671, "eval_websight_new_NUM_probability": 0.9786257445812225, "eval_websight_new_inside_bbox": 0.0, "eval_websight_new_loss": 0.21313229203224182, "eval_websight_new_loss_ce": 0.002203845651820302, "eval_websight_new_loss_xval": 0.2103271484375, "eval_websight_new_runtime": 35.3244, "eval_websight_new_samples_per_second": 1.415, "eval_websight_new_steps_per_second": 0.057, "num_input_tokens_seen": 147448352, "step": 1000 }, { "epoch": 0.28826751225136926, "eval_seeclick_IoU": 0.010920957662165165, "eval_seeclick_MAE_x": 0.3465300649404526, "eval_seeclick_MAE_y": 0.3193105012178421, "eval_seeclick_NUM_probability": 0.980810135602951, "eval_seeclick_inside_bbox": 0.0, "eval_seeclick_loss": 0.1418568640947342, "eval_seeclick_loss_ce": 0.012138516176491976, "eval_seeclick_loss_xval": 0.12896728515625, "eval_seeclick_runtime": 66.5626, "eval_seeclick_samples_per_second": 0.751, "eval_seeclick_steps_per_second": 0.03, "num_input_tokens_seen": 147448352, "step": 1000 }, { "epoch": 0.28826751225136926, "eval_icons_IoU": 0.0, "eval_icons_MAE_x": 0.41779157519340515, "eval_icons_MAE_y": 0.4029218852519989, "eval_icons_NUM_probability": 0.9756582975387573, "eval_icons_inside_bbox": 0.0, "eval_icons_loss": 0.2196388989686966, "eval_icons_loss_ce": 0.005240184138529003, "eval_icons_loss_xval": 0.21490478515625, "eval_icons_runtime": 63.1802, "eval_icons_samples_per_second": 0.791, "eval_icons_steps_per_second": 0.032, "num_input_tokens_seen": 147448352, "step": 1000 }, { "epoch": 0.28826751225136926, "loss": 0.22050818800926208, "loss_ce": 0.006885141599923372, "loss_xval": 0.2138671875, "num_input_tokens_seen": 147448352, "step": 1000 }, { "epoch": 0.2885557797636206, "grad_norm": 38.16026161198317, "learning_rate": 0.0001, "loss": 0.2163, "num_input_tokens_seen": 147583504, "step": 1001 }, { "epoch": 0.2885557797636206, "loss": 0.21293914318084717, "loss_ce": 0.0027340645901858807, "loss_xval": 0.2099609375, "num_input_tokens_seen": 147583504, "step": 1001 }, { "epoch": 0.288844047275872, "grad_norm": 34.97874588867303, "learning_rate": 0.0001, "loss": 0.2006, "num_input_tokens_seen": 147756064, "step": 1002 }, { "epoch": 0.288844047275872, "loss": 0.15403415262699127, "loss_ce": 0.0029111080802977085, "loss_xval": 0.1513671875, "num_input_tokens_seen": 147756064, "step": 1002 }, { "epoch": 0.2891323147881234, "grad_norm": 29.870015158590114, "learning_rate": 0.0001, "loss": 0.1551, "num_input_tokens_seen": 147890952, "step": 1003 }, { "epoch": 0.2891323147881234, "loss": 0.1824108362197876, "loss_ce": 0.013831727206707, "loss_xval": 0.1689453125, "num_input_tokens_seen": 147890952, "step": 1003 }, { "epoch": 0.28942058230037476, "grad_norm": 52.54062919848932, "learning_rate": 0.0001, "loss": 0.1827, "num_input_tokens_seen": 148026080, "step": 1004 }, { "epoch": 0.28942058230037476, "loss": 0.1736568808555603, "loss_ce": 0.0031246549915522337, "loss_xval": 0.1708984375, "num_input_tokens_seen": 148026080, "step": 1004 }, { "epoch": 0.2897088498126261, "grad_norm": 23.43538994398458, "learning_rate": 0.0001, "loss": 0.0984, "num_input_tokens_seen": 148198608, "step": 1005 }, { "epoch": 0.2897088498126261, "loss": 0.10520273447036743, "loss_ce": 0.002785742050036788, "loss_xval": 0.1025390625, "num_input_tokens_seen": 148198608, "step": 1005 }, { "epoch": 0.2899971173248775, "grad_norm": 24.570694864205628, "learning_rate": 0.0001, "loss": 0.1066, "num_input_tokens_seen": 148333400, "step": 1006 }, { "epoch": 0.2899971173248775, "loss": 0.1271902620792389, "loss_ce": 0.011772781610488892, "loss_xval": 0.115234375, "num_input_tokens_seen": 148333400, "step": 1006 }, { "epoch": 0.29028538483712885, "grad_norm": 21.721609732412762, "learning_rate": 0.0001, "loss": 0.0815, "num_input_tokens_seen": 148468376, "step": 1007 }, { "epoch": 0.29028538483712885, "loss": 0.07593949139118195, "loss_ce": 0.0031245506834238768, "loss_xval": 0.07275390625, "num_input_tokens_seen": 148468376, "step": 1007 }, { "epoch": 0.2905736523493802, "grad_norm": 19.166959945295666, "learning_rate": 0.0001, "loss": 0.0741, "num_input_tokens_seen": 148640952, "step": 1008 }, { "epoch": 0.2905736523493802, "loss": 0.07962779700756073, "loss_ce": 0.014259150251746178, "loss_xval": 0.0654296875, "num_input_tokens_seen": 148640952, "step": 1008 }, { "epoch": 0.29086191986163157, "grad_norm": 21.311366697639038, "learning_rate": 0.0001, "loss": 0.0842, "num_input_tokens_seen": 148775840, "step": 1009 }, { "epoch": 0.29086191986163157, "loss": 0.07820656895637512, "loss_ce": 0.013600854203104973, "loss_xval": 0.064453125, "num_input_tokens_seen": 148775840, "step": 1009 }, { "epoch": 0.291150187373883, "grad_norm": 19.015665857193742, "learning_rate": 0.0001, "loss": 0.0639, "num_input_tokens_seen": 148910984, "step": 1010 }, { "epoch": 0.291150187373883, "loss": 0.05878002196550369, "loss_ce": 0.00336010055616498, "loss_xval": 0.055419921875, "num_input_tokens_seen": 148910984, "step": 1010 }, { "epoch": 0.29143845488613435, "grad_norm": 19.15943729010692, "learning_rate": 0.0001, "loss": 0.0723, "num_input_tokens_seen": 149083560, "step": 1011 }, { "epoch": 0.29143845488613435, "loss": 0.06524774432182312, "loss_ce": 0.005860530771315098, "loss_xval": 0.059326171875, "num_input_tokens_seen": 149083560, "step": 1011 }, { "epoch": 0.2917267223983857, "grad_norm": 18.53212069603917, "learning_rate": 0.0001, "loss": 0.0679, "num_input_tokens_seen": 149218400, "step": 1012 }, { "epoch": 0.2917267223983857, "loss": 0.06433580815792084, "loss_ce": 0.014164908789098263, "loss_xval": 0.05029296875, "num_input_tokens_seen": 149218400, "step": 1012 }, { "epoch": 0.29201498991063707, "grad_norm": 16.539777806069416, "learning_rate": 0.0001, "loss": 0.0612, "num_input_tokens_seen": 149353464, "step": 1013 }, { "epoch": 0.29201498991063707, "loss": 0.0547209233045578, "loss_ce": 0.0029631140641868114, "loss_xval": 0.0517578125, "num_input_tokens_seen": 149353464, "step": 1013 }, { "epoch": 0.29230325742288843, "grad_norm": 18.267120678013114, "learning_rate": 0.0001, "loss": 0.0727, "num_input_tokens_seen": 149525984, "step": 1014 }, { "epoch": 0.29230325742288843, "loss": 0.05868646502494812, "loss_ce": 0.0076000383123755455, "loss_xval": 0.051025390625, "num_input_tokens_seen": 149525984, "step": 1014 }, { "epoch": 0.2925915249351398, "grad_norm": 12.986732799219686, "learning_rate": 0.0001, "loss": 0.0425, "num_input_tokens_seen": 149660760, "step": 1015 }, { "epoch": 0.2925915249351398, "loss": 0.056522078812122345, "loss_ce": 0.01465196069329977, "loss_xval": 0.0419921875, "num_input_tokens_seen": 149660760, "step": 1015 }, { "epoch": 0.29287979244739115, "grad_norm": 16.2358668446718, "learning_rate": 0.0001, "loss": 0.052, "num_input_tokens_seen": 149795824, "step": 1016 }, { "epoch": 0.29287979244739115, "loss": 0.04977504909038544, "loss_ce": 0.002167628612369299, "loss_xval": 0.047607421875, "num_input_tokens_seen": 149795824, "step": 1016 }, { "epoch": 0.29316805995964257, "grad_norm": 10.364906246580064, "learning_rate": 0.0001, "loss": 0.0348, "num_input_tokens_seen": 149968296, "step": 1017 }, { "epoch": 0.29316805995964257, "loss": 0.039077188819646835, "loss_ce": 0.016387369483709335, "loss_xval": 0.022705078125, "num_input_tokens_seen": 149968296, "step": 1017 }, { "epoch": 0.29345632747189393, "grad_norm": 13.811124874886177, "learning_rate": 0.0001, "loss": 0.0447, "num_input_tokens_seen": 150103088, "step": 1018 }, { "epoch": 0.29345632747189393, "loss": 0.054243750870227814, "loss_ce": 0.013930030167102814, "loss_xval": 0.040283203125, "num_input_tokens_seen": 150103088, "step": 1018 }, { "epoch": 0.2937445949841453, "grad_norm": 8.931755673306386, "learning_rate": 0.0001, "loss": 0.0189, "num_input_tokens_seen": 150238168, "step": 1019 }, { "epoch": 0.2937445949841453, "loss": 0.020072992891073227, "loss_ce": 0.0018311117310076952, "loss_xval": 0.0181884765625, "num_input_tokens_seen": 150238168, "step": 1019 }, { "epoch": 0.29403286249639665, "grad_norm": 13.364194602767919, "learning_rate": 0.0001, "loss": 0.0416, "num_input_tokens_seen": 150410816, "step": 1020 }, { "epoch": 0.29403286249639665, "loss": 0.04038884490728378, "loss_ce": 0.009215136058628559, "loss_xval": 0.0311279296875, "num_input_tokens_seen": 150410816, "step": 1020 }, { "epoch": 0.294321130008648, "grad_norm": 8.488080883654083, "learning_rate": 0.0001, "loss": 0.0252, "num_input_tokens_seen": 150545608, "step": 1021 }, { "epoch": 0.294321130008648, "loss": 0.026289835572242737, "loss_ce": 0.014265909790992737, "loss_xval": 0.01202392578125, "num_input_tokens_seen": 150545608, "step": 1021 }, { "epoch": 0.2946093975208994, "grad_norm": 13.343837433926826, "learning_rate": 0.0001, "loss": 0.0378, "num_input_tokens_seen": 150680792, "step": 1022 }, { "epoch": 0.2946093975208994, "loss": 0.0328974649310112, "loss_ce": 0.0027918717823922634, "loss_xval": 0.0301513671875, "num_input_tokens_seen": 150680792, "step": 1022 }, { "epoch": 0.2948976650331508, "grad_norm": 7.013524864159005, "learning_rate": 0.0001, "loss": 0.0203, "num_input_tokens_seen": 150853376, "step": 1023 }, { "epoch": 0.2948976650331508, "loss": 0.013356918469071388, "loss_ce": 0.004079573787748814, "loss_xval": 0.00927734375, "num_input_tokens_seen": 150853376, "step": 1023 }, { "epoch": 0.29518593254540215, "grad_norm": 13.120764246368939, "learning_rate": 0.0001, "loss": 0.0397, "num_input_tokens_seen": 150988160, "step": 1024 }, { "epoch": 0.29518593254540215, "loss": 0.043137479573488235, "loss_ce": 0.012879302725195885, "loss_xval": 0.0302734375, "num_input_tokens_seen": 150988160, "step": 1024 }, { "epoch": 0.2954742000576535, "grad_norm": 5.383806173983863, "learning_rate": 0.0001, "loss": 0.0108, "num_input_tokens_seen": 151123136, "step": 1025 }, { "epoch": 0.2954742000576535, "loss": 0.010275648906826973, "loss_ce": 0.0012500756420195103, "loss_xval": 0.009033203125, "num_input_tokens_seen": 151123136, "step": 1025 }, { "epoch": 0.2957624675699049, "grad_norm": 13.615844972319005, "learning_rate": 0.0001, "loss": 0.0424, "num_input_tokens_seen": 151295736, "step": 1026 }, { "epoch": 0.2957624675699049, "loss": 0.035312261432409286, "loss_ce": 0.005817023105919361, "loss_xval": 0.029541015625, "num_input_tokens_seen": 151295736, "step": 1026 }, { "epoch": 0.29605073508215624, "grad_norm": 3.120567186484494, "learning_rate": 0.0001, "loss": 0.0121, "num_input_tokens_seen": 151430624, "step": 1027 }, { "epoch": 0.29605073508215624, "loss": 0.019799476489424706, "loss_ce": 0.01299024187028408, "loss_xval": 0.006805419921875, "num_input_tokens_seen": 151430624, "step": 1027 }, { "epoch": 0.2963390025944076, "grad_norm": 12.13040539309611, "learning_rate": 0.0001, "loss": 0.0291, "num_input_tokens_seen": 151565696, "step": 1028 }, { "epoch": 0.2963390025944076, "loss": 0.029170066118240356, "loss_ce": 0.001063377596437931, "loss_xval": 0.028076171875, "num_input_tokens_seen": 151565696, "step": 1028 }, { "epoch": 0.29662727010665896, "grad_norm": 1.6662810240924648, "learning_rate": 0.0001, "loss": 0.0107, "num_input_tokens_seen": 151738232, "step": 1029 }, { "epoch": 0.29662727010665896, "loss": 0.005300058051943779, "loss_ce": 0.0031552445143461227, "loss_xval": 0.0021514892578125, "num_input_tokens_seen": 151738232, "step": 1029 }, { "epoch": 0.2969155376189104, "grad_norm": 11.118915433420447, "learning_rate": 0.0001, "loss": 0.0318, "num_input_tokens_seen": 151873024, "step": 1030 }, { "epoch": 0.2969155376189104, "loss": 0.03775542229413986, "loss_ce": 0.012639455497264862, "loss_xval": 0.025146484375, "num_input_tokens_seen": 151873024, "step": 1030 }, { "epoch": 0.29720380513116174, "grad_norm": 0.7622954876728699, "learning_rate": 0.0001, "loss": 0.0041, "num_input_tokens_seen": 152008160, "step": 1031 }, { "epoch": 0.29720380513116174, "loss": 0.004262763075530529, "loss_ce": 0.0008714973228052258, "loss_xval": 0.003387451171875, "num_input_tokens_seen": 152008160, "step": 1031 }, { "epoch": 0.2974920726434131, "grad_norm": 11.233492609765298, "learning_rate": 0.0001, "loss": 0.0283, "num_input_tokens_seen": 152180696, "step": 1032 }, { "epoch": 0.2974920726434131, "loss": 0.022938229143619537, "loss_ce": 0.0020642043091356754, "loss_xval": 0.0208740234375, "num_input_tokens_seen": 152180696, "step": 1032 }, { "epoch": 0.29778034015566446, "grad_norm": 1.8711093823146188, "learning_rate": 0.0001, "loss": 0.0078, "num_input_tokens_seen": 152315464, "step": 1033 }, { "epoch": 0.29778034015566446, "loss": 0.011964991688728333, "loss_ce": 0.009271815419197083, "loss_xval": 0.002685546875, "num_input_tokens_seen": 152315464, "step": 1033 }, { "epoch": 0.2980686076679158, "grad_norm": 9.643338816839094, "learning_rate": 0.0001, "loss": 0.0176, "num_input_tokens_seen": 152450504, "step": 1034 }, { "epoch": 0.2980686076679158, "loss": 0.017920546233654022, "loss_ce": 0.0008764795493334532, "loss_xval": 0.01708984375, "num_input_tokens_seen": 152450504, "step": 1034 }, { "epoch": 0.2983568751801672, "grad_norm": 3.7999975124459855, "learning_rate": 0.0001, "loss": 0.0101, "num_input_tokens_seen": 152622880, "step": 1035 }, { "epoch": 0.2983568751801672, "loss": 0.0050286888144910336, "loss_ce": 0.0016564966645091772, "loss_xval": 0.0033721923828125, "num_input_tokens_seen": 152622880, "step": 1035 }, { "epoch": 0.29864514269241854, "grad_norm": 7.570198961949153, "learning_rate": 0.0001, "loss": 0.016, "num_input_tokens_seen": 152757704, "step": 1036 }, { "epoch": 0.29864514269241854, "loss": 0.019874121993780136, "loss_ce": 0.008231667801737785, "loss_xval": 0.01165771484375, "num_input_tokens_seen": 152757704, "step": 1036 }, { "epoch": 0.29893341020466996, "grad_norm": 5.035619336789141, "learning_rate": 0.0001, "loss": 0.007, "num_input_tokens_seen": 152892736, "step": 1037 }, { "epoch": 0.29893341020466996, "loss": 0.006385215558111668, "loss_ce": 0.0007489998824894428, "loss_xval": 0.005645751953125, "num_input_tokens_seen": 152892736, "step": 1037 }, { "epoch": 0.2992216777169213, "grad_norm": 6.429868675290196, "learning_rate": 0.0001, "loss": 0.0139, "num_input_tokens_seen": 153065136, "step": 1038 }, { "epoch": 0.2992216777169213, "loss": 0.009618373587727547, "loss_ce": 0.0013137776404619217, "loss_xval": 0.00830078125, "num_input_tokens_seen": 153065136, "step": 1038 }, { "epoch": 0.2995099452291727, "grad_norm": 6.432639398582564, "learning_rate": 0.0001, "loss": 0.0133, "num_input_tokens_seen": 153199952, "step": 1039 }, { "epoch": 0.2995099452291727, "loss": 0.01743425987660885, "loss_ce": 0.009648462757468224, "loss_xval": 0.007781982421875, "num_input_tokens_seen": 153199952, "step": 1039 }, { "epoch": 0.29979821274142404, "grad_norm": 3.7247511285128807, "learning_rate": 0.0001, "loss": 0.0049, "num_input_tokens_seen": 153335216, "step": 1040 }, { "epoch": 0.29979821274142404, "loss": 0.003518037497997284, "loss_ce": 0.0005702301859855652, "loss_xval": 0.0029449462890625, "num_input_tokens_seen": 153335216, "step": 1040 }, { "epoch": 0.3000864802536754, "grad_norm": 7.104556921570073, "learning_rate": 0.0001, "loss": 0.016, "num_input_tokens_seen": 153507568, "step": 1041 }, { "epoch": 0.3000864802536754, "loss": 0.009859236888587475, "loss_ce": 0.0010854334104806185, "loss_xval": 0.0087890625, "num_input_tokens_seen": 153507568, "step": 1041 }, { "epoch": 0.30037474776592676, "grad_norm": 1.2721608266605768, "learning_rate": 0.0001, "loss": 0.0082, "num_input_tokens_seen": 153642344, "step": 1042 }, { "epoch": 0.30037474776592676, "loss": 0.010791032575070858, "loss_ce": 0.00926181674003601, "loss_xval": 0.00152587890625, "num_input_tokens_seen": 153642344, "step": 1042 }, { "epoch": 0.3006630152781781, "grad_norm": 7.144400955958297, "learning_rate": 0.0001, "loss": 0.0098, "num_input_tokens_seen": 153777424, "step": 1043 }, { "epoch": 0.3006630152781781, "loss": 0.00973152182996273, "loss_ce": 0.00044654967496171594, "loss_xval": 0.00927734375, "num_input_tokens_seen": 153777424, "step": 1043 }, { "epoch": 0.30095128279042954, "grad_norm": 1.5694669073126917, "learning_rate": 0.0001, "loss": 0.0085, "num_input_tokens_seen": 153949848, "step": 1044 }, { "epoch": 0.30095128279042954, "loss": 0.0023211517836898565, "loss_ce": 0.0010503807570785284, "loss_xval": 0.00127410888671875, "num_input_tokens_seen": 153949848, "step": 1044 }, { "epoch": 0.3012395503026809, "grad_norm": 6.43614642477153, "learning_rate": 0.0001, "loss": 0.0131, "num_input_tokens_seen": 154084664, "step": 1045 }, { "epoch": 0.3012395503026809, "loss": 0.017825186252593994, "loss_ce": 0.01016527321189642, "loss_xval": 0.007659912109375, "num_input_tokens_seen": 154084664, "step": 1045 }, { "epoch": 0.30152781781493226, "grad_norm": 4.690194926699786, "learning_rate": 0.0001, "loss": 0.0049, "num_input_tokens_seen": 154219688, "step": 1046 }, { "epoch": 0.30152781781493226, "loss": 0.004704832099378109, "loss_ce": 0.00046861107693985105, "loss_xval": 0.004241943359375, "num_input_tokens_seen": 154219688, "step": 1046 }, { "epoch": 0.3018160853271836, "grad_norm": 3.929857549293112, "learning_rate": 0.0001, "loss": 0.0106, "num_input_tokens_seen": 154392072, "step": 1047 }, { "epoch": 0.3018160853271836, "loss": 0.0042261965572834015, "loss_ce": 0.0011534581426531076, "loss_xval": 0.0030670166015625, "num_input_tokens_seen": 154392072, "step": 1047 }, { "epoch": 0.302104352839435, "grad_norm": 6.606956071895859, "learning_rate": 0.0001, "loss": 0.012, "num_input_tokens_seen": 154526880, "step": 1048 }, { "epoch": 0.302104352839435, "loss": 0.01597825065255165, "loss_ce": 0.00860062800347805, "loss_xval": 0.00738525390625, "num_input_tokens_seen": 154526880, "step": 1048 }, { "epoch": 0.30239262035168635, "grad_norm": 0.3879556290843241, "learning_rate": 0.0001, "loss": 0.0018, "num_input_tokens_seen": 154661976, "step": 1049 }, { "epoch": 0.30239262035168635, "loss": 0.000958413234911859, "loss_ce": 0.00039574544643983245, "loss_xval": 0.0005645751953125, "num_input_tokens_seen": 154661976, "step": 1049 }, { "epoch": 0.3026808878639377, "grad_norm": 5.967744172554036, "learning_rate": 0.0001, "loss": 0.0107, "num_input_tokens_seen": 154834640, "step": 1050 }, { "epoch": 0.3026808878639377, "loss": 0.007467512972652912, "loss_ce": 0.000837569241411984, "loss_xval": 0.006622314453125, "num_input_tokens_seen": 154834640, "step": 1050 }, { "epoch": 0.3029691553761891, "grad_norm": 3.674411800824582, "learning_rate": 0.0001, "loss": 0.0071, "num_input_tokens_seen": 154969320, "step": 1051 }, { "epoch": 0.3029691553761891, "loss": 0.010981084778904915, "loss_ce": 0.008468152955174446, "loss_xval": 0.0025177001953125, "num_input_tokens_seen": 154969320, "step": 1051 }, { "epoch": 0.3032574228884405, "grad_norm": 3.6628554067126227, "learning_rate": 0.0001, "loss": 0.0033, "num_input_tokens_seen": 155104376, "step": 1052 }, { "epoch": 0.3032574228884405, "loss": 0.0028882953338325024, "loss_ce": 0.0003457994607742876, "loss_xval": 0.0025482177734375, "num_input_tokens_seen": 155104376, "step": 1052 }, { "epoch": 0.30354569040069185, "grad_norm": 6.352602041437757, "learning_rate": 0.0001, "loss": 0.0121, "num_input_tokens_seen": 155276944, "step": 1053 }, { "epoch": 0.30354569040069185, "loss": 0.006636136211454868, "loss_ce": 0.0006050996598787606, "loss_xval": 0.00604248046875, "num_input_tokens_seen": 155276944, "step": 1053 }, { "epoch": 0.3038339579129432, "grad_norm": 1.16276976605343, "learning_rate": 0.0001, "loss": 0.0074, "num_input_tokens_seen": 155411768, "step": 1054 }, { "epoch": 0.3038339579129432, "loss": 0.013181800954043865, "loss_ce": 0.012291069142520428, "loss_xval": 0.00089263916015625, "num_input_tokens_seen": 155411768, "step": 1054 }, { "epoch": 0.30412222542519457, "grad_norm": 5.495397592642692, "learning_rate": 0.0001, "loss": 0.0059, "num_input_tokens_seen": 155546872, "step": 1055 }, { "epoch": 0.30412222542519457, "loss": 0.005437557119876146, "loss_ce": 0.0003220479120500386, "loss_xval": 0.005126953125, "num_input_tokens_seen": 155546872, "step": 1055 }, { "epoch": 0.30441049293744593, "grad_norm": 5.63390137909816, "learning_rate": 0.0001, "loss": 0.0119, "num_input_tokens_seen": 155719568, "step": 1056 }, { "epoch": 0.30441049293744593, "loss": 0.005506637506186962, "loss_ce": 0.00046742259291931987, "loss_xval": 0.005035400390625, "num_input_tokens_seen": 155719568, "step": 1056 }, { "epoch": 0.3046987604496973, "grad_norm": 1.0159675221600082, "learning_rate": 0.0001, "loss": 0.0058, "num_input_tokens_seen": 155854352, "step": 1057 }, { "epoch": 0.3046987604496973, "loss": 0.010783128440380096, "loss_ce": 0.010038308799266815, "loss_xval": 0.000743865966796875, "num_input_tokens_seen": 155854352, "step": 1057 }, { "epoch": 0.3049870279619487, "grad_norm": 6.196608356578114, "learning_rate": 0.0001, "loss": 0.0081, "num_input_tokens_seen": 155989472, "step": 1058 }, { "epoch": 0.3049870279619487, "loss": 0.006915869191288948, "loss_ce": 0.0006788394530303776, "loss_xval": 0.0062255859375, "num_input_tokens_seen": 155989472, "step": 1058 }, { "epoch": 0.30527529547420007, "grad_norm": 3.793207536774932, "learning_rate": 0.0001, "loss": 0.0097, "num_input_tokens_seen": 156161928, "step": 1059 }, { "epoch": 0.30527529547420007, "loss": 0.0034458579029887915, "loss_ce": 0.0010359229054301977, "loss_xval": 0.002410888671875, "num_input_tokens_seen": 156161928, "step": 1059 }, { "epoch": 0.30556356298645143, "grad_norm": 2.9765471186640684, "learning_rate": 0.0001, "loss": 0.0073, "num_input_tokens_seen": 156296776, "step": 1060 }, { "epoch": 0.30556356298645143, "loss": 0.012287916615605354, "loss_ce": 0.010464491322636604, "loss_xval": 0.00182342529296875, "num_input_tokens_seen": 156296776, "step": 1060 }, { "epoch": 0.3058518304987028, "grad_norm": 6.3222162275498555, "learning_rate": 0.0001, "loss": 0.0075, "num_input_tokens_seen": 156431832, "step": 1061 }, { "epoch": 0.3058518304987028, "loss": 0.00641957763582468, "loss_ce": 0.000300803454592824, "loss_xval": 0.006103515625, "num_input_tokens_seen": 156431832, "step": 1061 }, { "epoch": 0.30614009801095415, "grad_norm": 2.708296585407212, "learning_rate": 0.0001, "loss": 0.0072, "num_input_tokens_seen": 156604280, "step": 1062 }, { "epoch": 0.30614009801095415, "loss": 0.0020502936094999313, "loss_ce": 0.0005210767267271876, "loss_xval": 0.00152587890625, "num_input_tokens_seen": 156604280, "step": 1062 }, { "epoch": 0.3064283655232055, "grad_norm": 3.646756298160841, "learning_rate": 0.0001, "loss": 0.0079, "num_input_tokens_seen": 156739048, "step": 1063 }, { "epoch": 0.3064283655232055, "loss": 0.012669045478105545, "loss_ce": 0.010387856513261795, "loss_xval": 0.002288818359375, "num_input_tokens_seen": 156739048, "step": 1063 }, { "epoch": 0.30671663303545693, "grad_norm": 5.864763562119953, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 156874232, "step": 1064 }, { "epoch": 0.30671663303545693, "loss": 0.00592215359210968, "loss_ce": 0.0003298069932498038, "loss_xval": 0.005584716796875, "num_input_tokens_seen": 156874232, "step": 1064 }, { "epoch": 0.3070049005477083, "grad_norm": 1.621707410019325, "learning_rate": 0.0001, "loss": 0.0065, "num_input_tokens_seen": 157046744, "step": 1065 }, { "epoch": 0.3070049005477083, "loss": 0.0015599572798237205, "loss_ce": 0.0005266511579975486, "loss_xval": 0.00102996826171875, "num_input_tokens_seen": 157046744, "step": 1065 }, { "epoch": 0.30729316805995965, "grad_norm": 4.474351846510715, "learning_rate": 0.0001, "loss": 0.0077, "num_input_tokens_seen": 157181640, "step": 1066 }, { "epoch": 0.30729316805995965, "loss": 0.008225830271840096, "loss_ce": 0.005221756175160408, "loss_xval": 0.0030059814453125, "num_input_tokens_seen": 157181640, "step": 1066 }, { "epoch": 0.307581435572211, "grad_norm": 5.802771539488246, "learning_rate": 0.0001, "loss": 0.0077, "num_input_tokens_seen": 157316736, "step": 1067 }, { "epoch": 0.307581435572211, "loss": 0.00557685736566782, "loss_ce": 0.00023628136841580272, "loss_xval": 0.005340576171875, "num_input_tokens_seen": 157316736, "step": 1067 }, { "epoch": 0.3078697030844624, "grad_norm": 0.6427301743774646, "learning_rate": 0.0001, "loss": 0.0047, "num_input_tokens_seen": 157489248, "step": 1068 }, { "epoch": 0.3078697030844624, "loss": 0.0010960090439766645, "loss_ce": 0.0004949558642692864, "loss_xval": 0.00060272216796875, "num_input_tokens_seen": 157489248, "step": 1068 }, { "epoch": 0.30815797059671374, "grad_norm": 5.649654271338128, "learning_rate": 0.0001, "loss": 0.012, "num_input_tokens_seen": 157624120, "step": 1069 }, { "epoch": 0.30815797059671374, "loss": 0.0163407139480114, "loss_ce": 0.012455443851649761, "loss_xval": 0.0038909912109375, "num_input_tokens_seen": 157624120, "step": 1069 }, { "epoch": 0.3084462381089651, "grad_norm": 5.799699774111246, "learning_rate": 0.0001, "loss": 0.0087, "num_input_tokens_seen": 157759240, "step": 1070 }, { "epoch": 0.3084462381089651, "loss": 0.010901054367423058, "loss_ce": 0.005217155907303095, "loss_xval": 0.00567626953125, "num_input_tokens_seen": 157759240, "step": 1070 }, { "epoch": 0.3087345056212165, "grad_norm": 1.1247419801395206, "learning_rate": 0.0001, "loss": 0.0054, "num_input_tokens_seen": 157931680, "step": 1071 }, { "epoch": 0.3087345056212165, "loss": 0.0013144547119736671, "loss_ce": 0.0008340412168763578, "loss_xval": 0.00048065185546875, "num_input_tokens_seen": 157931680, "step": 1071 }, { "epoch": 0.3090227731334679, "grad_norm": 7.909503645706065, "learning_rate": 0.0001, "loss": 0.0155, "num_input_tokens_seen": 158066368, "step": 1072 }, { "epoch": 0.3090227731334679, "loss": 0.01710927113890648, "loss_ce": 0.008610125631093979, "loss_xval": 0.00848388671875, "num_input_tokens_seen": 158066368, "step": 1072 }, { "epoch": 0.30931104064571924, "grad_norm": 7.299799078860211, "learning_rate": 0.0001, "loss": 0.0113, "num_input_tokens_seen": 158201368, "step": 1073 }, { "epoch": 0.30931104064571924, "loss": 0.012196368537843227, "loss_ce": 0.0033806031569838524, "loss_xval": 0.0087890625, "num_input_tokens_seen": 158201368, "step": 1073 }, { "epoch": 0.3095993081579706, "grad_norm": 1.2005303324186114, "learning_rate": 0.0001, "loss": 0.008, "num_input_tokens_seen": 158373776, "step": 1074 }, { "epoch": 0.3095993081579706, "loss": 0.0010331561788916588, "loss_ce": 0.0005875517963431776, "loss_xval": 0.000446319580078125, "num_input_tokens_seen": 158373776, "step": 1074 }, { "epoch": 0.30988757567022196, "grad_norm": 9.154735585892116, "learning_rate": 0.0001, "loss": 0.019, "num_input_tokens_seen": 158508480, "step": 1075 }, { "epoch": 0.30988757567022196, "loss": 0.019368529319763184, "loss_ce": 0.008641598746180534, "loss_xval": 0.0107421875, "num_input_tokens_seen": 158508480, "step": 1075 }, { "epoch": 0.3101758431824733, "grad_norm": 8.412756668445324, "learning_rate": 0.0001, "loss": 0.0126, "num_input_tokens_seen": 158643488, "step": 1076 }, { "epoch": 0.3101758431824733, "loss": 0.012810402549803257, "loss_ce": 0.000229530967772007, "loss_xval": 0.0125732421875, "num_input_tokens_seen": 158643488, "step": 1076 }, { "epoch": 0.3104641106947247, "grad_norm": 0.9466170936867265, "learning_rate": 0.0001, "loss": 0.0062, "num_input_tokens_seen": 158816016, "step": 1077 }, { "epoch": 0.3104641106947247, "loss": 0.0014774627052247524, "loss_ce": 0.0010189838940277696, "loss_xval": 0.000457763671875, "num_input_tokens_seen": 158816016, "step": 1077 }, { "epoch": 0.3107523782069761, "grad_norm": 10.21379456640987, "learning_rate": 0.0001, "loss": 0.023, "num_input_tokens_seen": 158950880, "step": 1078 }, { "epoch": 0.3107523782069761, "loss": 0.02332136780023575, "loss_ce": 0.009924151003360748, "loss_xval": 0.013427734375, "num_input_tokens_seen": 158950880, "step": 1078 }, { "epoch": 0.31104064571922746, "grad_norm": 9.987037711665085, "learning_rate": 0.0001, "loss": 0.019, "num_input_tokens_seen": 159085888, "step": 1079 }, { "epoch": 0.31104064571922746, "loss": 0.018559180200099945, "loss_ce": 0.00021811538317706436, "loss_xval": 0.018310546875, "num_input_tokens_seen": 159085888, "step": 1079 }, { "epoch": 0.3113289132314788, "grad_norm": 0.20392553303226543, "learning_rate": 0.0001, "loss": 0.0068, "num_input_tokens_seen": 159258384, "step": 1080 }, { "epoch": 0.3113289132314788, "loss": 0.0019423088524490595, "loss_ce": 0.0007568916189484298, "loss_xval": 0.00118255615234375, "num_input_tokens_seen": 159258384, "step": 1080 }, { "epoch": 0.3116171807437302, "grad_norm": 9.791986473139499, "learning_rate": 0.0001, "loss": 0.0221, "num_input_tokens_seen": 159393072, "step": 1081 }, { "epoch": 0.3116171807437302, "loss": 0.021431514993309975, "loss_ce": 0.009758541360497475, "loss_xval": 0.01165771484375, "num_input_tokens_seen": 159393072, "step": 1081 }, { "epoch": 0.31190544825598154, "grad_norm": 9.607565793359004, "learning_rate": 0.0001, "loss": 0.0166, "num_input_tokens_seen": 159528224, "step": 1082 }, { "epoch": 0.31190544825598154, "loss": 0.01773345097899437, "loss_ce": 0.000231619254918769, "loss_xval": 0.0174560546875, "num_input_tokens_seen": 159528224, "step": 1082 }, { "epoch": 0.3121937157682329, "grad_norm": 0.8754551559488629, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 159700648, "step": 1083 }, { "epoch": 0.3121937157682329, "loss": 0.002320789499208331, "loss_ce": 0.0010037652682512999, "loss_xval": 0.00131988525390625, "num_input_tokens_seen": 159700648, "step": 1083 }, { "epoch": 0.31248198328048427, "grad_norm": 10.787525641949458, "learning_rate": 0.0001, "loss": 0.0255, "num_input_tokens_seen": 159835472, "step": 1084 }, { "epoch": 0.31248198328048427, "loss": 0.022629978135228157, "loss_ce": 0.008164646103978157, "loss_xval": 0.01446533203125, "num_input_tokens_seen": 159835472, "step": 1084 }, { "epoch": 0.3127702507927357, "grad_norm": 10.646109982293996, "learning_rate": 0.0001, "loss": 0.0204, "num_input_tokens_seen": 159970448, "step": 1085 }, { "epoch": 0.3127702507927357, "loss": 0.02264305204153061, "loss_ce": 0.00024314915935974568, "loss_xval": 0.0224609375, "num_input_tokens_seen": 159970448, "step": 1085 }, { "epoch": 0.31305851830498704, "grad_norm": 0.27325773479874504, "learning_rate": 0.0001, "loss": 0.008, "num_input_tokens_seen": 160142976, "step": 1086 }, { "epoch": 0.31305851830498704, "loss": 0.0023754406720399857, "loss_ce": 0.0006254484760574996, "loss_xval": 0.00174713134765625, "num_input_tokens_seen": 160142976, "step": 1086 }, { "epoch": 0.3133467858172384, "grad_norm": 11.219846459935127, "learning_rate": 0.0001, "loss": 0.0282, "num_input_tokens_seen": 160277728, "step": 1087 }, { "epoch": 0.3133467858172384, "loss": 0.025487396866083145, "loss_ce": 0.011311981827020645, "loss_xval": 0.01416015625, "num_input_tokens_seen": 160277728, "step": 1087 }, { "epoch": 0.31363505332948977, "grad_norm": 11.484214546937917, "learning_rate": 0.0001, "loss": 0.024, "num_input_tokens_seen": 160412976, "step": 1088 }, { "epoch": 0.31363505332948977, "loss": 0.027778614312410355, "loss_ce": 0.0002670185058377683, "loss_xval": 0.0274658203125, "num_input_tokens_seen": 160412976, "step": 1088 }, { "epoch": 0.3139233208417411, "grad_norm": 0.5123509703035186, "learning_rate": 0.0001, "loss": 0.0082, "num_input_tokens_seen": 160585480, "step": 1089 }, { "epoch": 0.3139233208417411, "loss": 0.0037535782903432846, "loss_ce": 0.0007361529860645533, "loss_xval": 0.003021240234375, "num_input_tokens_seen": 160585480, "step": 1089 }, { "epoch": 0.3142115883539925, "grad_norm": 10.85298039866953, "learning_rate": 0.0001, "loss": 0.0258, "num_input_tokens_seen": 160720264, "step": 1090 }, { "epoch": 0.3142115883539925, "loss": 0.020125051960349083, "loss_ce": 0.00652947137132287, "loss_xval": 0.01361083984375, "num_input_tokens_seen": 160720264, "step": 1090 }, { "epoch": 0.31449985586624385, "grad_norm": 10.865952405702181, "learning_rate": 0.0001, "loss": 0.0218, "num_input_tokens_seen": 160855376, "step": 1091 }, { "epoch": 0.31449985586624385, "loss": 0.025808095932006836, "loss_ce": 0.00029540024115704, "loss_xval": 0.0255126953125, "num_input_tokens_seen": 160855376, "step": 1091 }, { "epoch": 0.31478812337849527, "grad_norm": 0.6638405627439249, "learning_rate": 0.0001, "loss": 0.0089, "num_input_tokens_seen": 161027808, "step": 1092 }, { "epoch": 0.31478812337849527, "loss": 0.0030655530281364918, "loss_ce": 0.0007996229687705636, "loss_xval": 0.00225830078125, "num_input_tokens_seen": 161027808, "step": 1092 }, { "epoch": 0.3150763908907466, "grad_norm": 11.225511097099034, "learning_rate": 0.0001, "loss": 0.0288, "num_input_tokens_seen": 161162600, "step": 1093 }, { "epoch": 0.3150763908907466, "loss": 0.022820211946964264, "loss_ce": 0.009468772448599339, "loss_xval": 0.01336669921875, "num_input_tokens_seen": 161162600, "step": 1093 }, { "epoch": 0.315364658402998, "grad_norm": 9.31150135276372, "learning_rate": 0.0001, "loss": 0.0167, "num_input_tokens_seen": 161297704, "step": 1094 }, { "epoch": 0.315364658402998, "loss": 0.021114954724907875, "loss_ce": 0.0003630008432082832, "loss_xval": 0.020751953125, "num_input_tokens_seen": 161297704, "step": 1094 }, { "epoch": 0.31565292591524935, "grad_norm": 3.7042286765060615, "learning_rate": 0.0001, "loss": 0.0121, "num_input_tokens_seen": 161470192, "step": 1095 }, { "epoch": 0.31565292591524935, "loss": 0.001126689836382866, "loss_ce": 0.00049559585750103, "loss_xval": 0.000629425048828125, "num_input_tokens_seen": 161470192, "step": 1095 }, { "epoch": 0.3159411934275007, "grad_norm": 14.440635946622175, "learning_rate": 0.0001, "loss": 0.0437, "num_input_tokens_seen": 161604920, "step": 1096 }, { "epoch": 0.3159411934275007, "loss": 0.03371185064315796, "loss_ce": 0.009496153332293034, "loss_xval": 0.024169921875, "num_input_tokens_seen": 161604920, "step": 1096 }, { "epoch": 0.31622946093975207, "grad_norm": 10.8120636880313, "learning_rate": 0.0001, "loss": 0.0224, "num_input_tokens_seen": 161739992, "step": 1097 }, { "epoch": 0.31622946093975207, "loss": 0.028470251709222794, "loss_ce": 0.00036356275086291134, "loss_xval": 0.028076171875, "num_input_tokens_seen": 161739992, "step": 1097 }, { "epoch": 0.31651772845200343, "grad_norm": 4.845232306751907, "learning_rate": 0.0001, "loss": 0.0134, "num_input_tokens_seen": 161912496, "step": 1098 }, { "epoch": 0.31651772845200343, "loss": 0.0014338723849505186, "loss_ce": 0.0007434121216647327, "loss_xval": 0.000690460205078125, "num_input_tokens_seen": 161912496, "step": 1098 }, { "epoch": 0.31680599596425485, "grad_norm": 16.83238346653988, "learning_rate": 0.0001, "loss": 0.0573, "num_input_tokens_seen": 162047344, "step": 1099 }, { "epoch": 0.31680599596425485, "loss": 0.042024776339530945, "loss_ce": 0.007814568467438221, "loss_xval": 0.0341796875, "num_input_tokens_seen": 162047344, "step": 1099 }, { "epoch": 0.3170942634765062, "grad_norm": 11.875846180074877, "learning_rate": 0.0001, "loss": 0.0276, "num_input_tokens_seen": 162182456, "step": 1100 }, { "epoch": 0.3170942634765062, "loss": 0.03627585619688034, "loss_ce": 0.0006008032942190766, "loss_xval": 0.03564453125, "num_input_tokens_seen": 162182456, "step": 1100 }, { "epoch": 0.31738253098875757, "grad_norm": 6.8405890813471935, "learning_rate": 0.0001, "loss": 0.0213, "num_input_tokens_seen": 162354808, "step": 1101 }, { "epoch": 0.31738253098875757, "loss": 0.005464246030896902, "loss_ce": 0.004270245786756277, "loss_xval": 0.001190185546875, "num_input_tokens_seen": 162354808, "step": 1101 }, { "epoch": 0.31767079850100893, "grad_norm": 20.75383958252492, "learning_rate": 0.0001, "loss": 0.0855, "num_input_tokens_seen": 162489592, "step": 1102 }, { "epoch": 0.31767079850100893, "loss": 0.06375418603420258, "loss_ce": 0.009616006165742874, "loss_xval": 0.05419921875, "num_input_tokens_seen": 162489592, "step": 1102 }, { "epoch": 0.3179590660132603, "grad_norm": 14.355246236937758, "learning_rate": 0.0001, "loss": 0.0399, "num_input_tokens_seen": 162624624, "step": 1103 }, { "epoch": 0.3179590660132603, "loss": 0.05262349545955658, "loss_ce": 0.0004384391359053552, "loss_xval": 0.05224609375, "num_input_tokens_seen": 162624624, "step": 1103 }, { "epoch": 0.31824733352551166, "grad_norm": 7.739226969232351, "learning_rate": 0.0001, "loss": 0.0272, "num_input_tokens_seen": 162797120, "step": 1104 }, { "epoch": 0.31824733352551166, "loss": 0.00172033766284585, "loss_ce": 0.0007585571147501469, "loss_xval": 0.0009613037109375, "num_input_tokens_seen": 162797120, "step": 1104 }, { "epoch": 0.318535601037763, "grad_norm": 22.913991416793884, "learning_rate": 0.0001, "loss": 0.1058, "num_input_tokens_seen": 162931936, "step": 1105 }, { "epoch": 0.318535601037763, "loss": 0.08138446509838104, "loss_ce": 0.010888861492276192, "loss_xval": 0.0703125, "num_input_tokens_seen": 162931936, "step": 1105 }, { "epoch": 0.31882386855001443, "grad_norm": 13.233226662544276, "learning_rate": 0.0001, "loss": 0.0363, "num_input_tokens_seen": 163067024, "step": 1106 }, { "epoch": 0.31882386855001443, "loss": 0.05254264920949936, "loss_ce": 0.000540698878467083, "loss_xval": 0.052001953125, "num_input_tokens_seen": 163067024, "step": 1106 }, { "epoch": 0.3191121360622658, "grad_norm": 13.283548229086884, "learning_rate": 0.0001, "loss": 0.0508, "num_input_tokens_seen": 163239560, "step": 1107 }, { "epoch": 0.3191121360622658, "loss": 0.008430368266999722, "loss_ce": 0.0013655491638928652, "loss_xval": 0.007080078125, "num_input_tokens_seen": 163239560, "step": 1107 }, { "epoch": 0.31940040357451716, "grad_norm": 26.958071158147696, "learning_rate": 0.0001, "loss": 0.1463, "num_input_tokens_seen": 163374336, "step": 1108 }, { "epoch": 0.31940040357451716, "loss": 0.11349773406982422, "loss_ce": 0.008700372651219368, "loss_xval": 0.10498046875, "num_input_tokens_seen": 163374336, "step": 1108 }, { "epoch": 0.3196886710867685, "grad_norm": 8.341302719853255, "learning_rate": 0.0001, "loss": 0.0185, "num_input_tokens_seen": 163509360, "step": 1109 }, { "epoch": 0.3196886710867685, "loss": 0.032069962471723557, "loss_ce": 0.0006063411710783839, "loss_xval": 0.031494140625, "num_input_tokens_seen": 163509360, "step": 1109 }, { "epoch": 0.3199769385990199, "grad_norm": 25.057511538159552, "learning_rate": 0.0001, "loss": 0.1433, "num_input_tokens_seen": 163681776, "step": 1110 }, { "epoch": 0.3199769385990199, "loss": 0.05327451229095459, "loss_ce": 0.0007842776831239462, "loss_xval": 0.052490234375, "num_input_tokens_seen": 163681776, "step": 1110 }, { "epoch": 0.32026520611127124, "grad_norm": 31.97887247106539, "learning_rate": 0.0001, "loss": 0.2089, "num_input_tokens_seen": 163816592, "step": 1111 }, { "epoch": 0.32026520611127124, "loss": 0.1736195832490921, "loss_ce": 0.014439894817769527, "loss_xval": 0.1591796875, "num_input_tokens_seen": 163816592, "step": 1111 }, { "epoch": 0.32055347362352266, "grad_norm": 3.7657790177530446, "learning_rate": 0.0001, "loss": 0.0122, "num_input_tokens_seen": 163951688, "step": 1112 }, { "epoch": 0.32055347362352266, "loss": 0.0032639564014971256, "loss_ce": 0.0006890358054079115, "loss_xval": 0.0025787353515625, "num_input_tokens_seen": 163951688, "step": 1112 }, { "epoch": 0.320841741135774, "grad_norm": 41.76094920123107, "learning_rate": 0.0001, "loss": 0.3675, "num_input_tokens_seen": 164124288, "step": 1113 }, { "epoch": 0.320841741135774, "loss": 0.22158637642860413, "loss_ce": 0.0008832494495436549, "loss_xval": 0.220703125, "num_input_tokens_seen": 164124288, "step": 1113 }, { "epoch": 0.3211300086480254, "grad_norm": 23.535877158829695, "learning_rate": 0.0001, "loss": 0.1256, "num_input_tokens_seen": 164259104, "step": 1114 }, { "epoch": 0.3211300086480254, "loss": 0.11367707699537277, "loss_ce": 0.008574537932872772, "loss_xval": 0.10498046875, "num_input_tokens_seen": 164259104, "step": 1114 }, { "epoch": 0.32141827616027674, "grad_norm": 36.60361659058581, "learning_rate": 0.0001, "loss": 0.281, "num_input_tokens_seen": 164394192, "step": 1115 }, { "epoch": 0.32141827616027674, "loss": 0.1536085605621338, "loss_ce": 0.0010206589940935373, "loss_xval": 0.15234375, "num_input_tokens_seen": 164394192, "step": 1115 }, { "epoch": 0.3217065436725281, "grad_norm": 49.90161265894237, "learning_rate": 0.0001, "loss": 0.5425, "num_input_tokens_seen": 164566656, "step": 1116 }, { "epoch": 0.3217065436725281, "loss": 0.412850946187973, "loss_ce": 0.003915389999747276, "loss_xval": 0.408203125, "num_input_tokens_seen": 164566656, "step": 1116 }, { "epoch": 0.32199481118477946, "grad_norm": 9.574391739728737, "learning_rate": 0.0001, "loss": 0.046, "num_input_tokens_seen": 164701456, "step": 1117 }, { "epoch": 0.32199481118477946, "loss": 0.05413966625928879, "loss_ce": 0.012193256989121437, "loss_xval": 0.0419921875, "num_input_tokens_seen": 164701456, "step": 1117 }, { "epoch": 0.3222830786970308, "grad_norm": 54.59205629061163, "learning_rate": 0.0001, "loss": 0.6666, "num_input_tokens_seen": 164836512, "step": 1118 }, { "epoch": 0.3222830786970308, "loss": 0.6489585638046265, "loss_ce": 0.0024742158129811287, "loss_xval": 0.6484375, "num_input_tokens_seen": 164836512, "step": 1118 }, { "epoch": 0.32257134620928224, "grad_norm": 11.693643308618372, "learning_rate": 0.0001, "loss": 0.046, "num_input_tokens_seen": 165008952, "step": 1119 }, { "epoch": 0.32257134620928224, "loss": 0.01354275457561016, "loss_ce": 0.00258694333024323, "loss_xval": 0.010986328125, "num_input_tokens_seen": 165008952, "step": 1119 }, { "epoch": 0.3228596137215336, "grad_norm": 31.216037525281035, "learning_rate": 0.0001, "loss": 0.204, "num_input_tokens_seen": 165143728, "step": 1120 }, { "epoch": 0.3228596137215336, "loss": 0.19186297059059143, "loss_ce": 0.009123703464865685, "loss_xval": 0.1826171875, "num_input_tokens_seen": 165143728, "step": 1120 }, { "epoch": 0.32314788123378496, "grad_norm": 146.70057585480131, "learning_rate": 0.0001, "loss": 2.2466, "num_input_tokens_seen": 165278776, "step": 1121 }, { "epoch": 0.32314788123378496, "loss": 2.0149452686309814, "loss_ce": 0.0032265656627714634, "loss_xval": 2.015625, "num_input_tokens_seen": 165278776, "step": 1121 }, { "epoch": 0.3234361487460363, "grad_norm": 128.1209425151165, "learning_rate": 0.0001, "loss": 3.5429, "num_input_tokens_seen": 165451272, "step": 1122 }, { "epoch": 0.3234361487460363, "loss": 3.4982962608337402, "loss_ce": 0.006108833011239767, "loss_xval": 3.5, "num_input_tokens_seen": 165451272, "step": 1122 }, { "epoch": 0.3237244162582877, "grad_norm": 23.896542536561327, "learning_rate": 0.0001, "loss": 0.1602, "num_input_tokens_seen": 165586072, "step": 1123 }, { "epoch": 0.3237244162582877, "loss": 0.17959138751029968, "loss_ce": 0.03127596154808998, "loss_xval": 0.1484375, "num_input_tokens_seen": 165586072, "step": 1123 }, { "epoch": 0.32401268377053905, "grad_norm": 225.41097447165146, "learning_rate": 0.0001, "loss": 7.3387, "num_input_tokens_seen": 165721048, "step": 1124 }, { "epoch": 0.32401268377053905, "loss": 7.041110038757324, "loss_ce": 0.08407886326313019, "loss_xval": 6.96875, "num_input_tokens_seen": 165721048, "step": 1124 }, { "epoch": 0.3243009512827904, "grad_norm": 27.324685193473073, "learning_rate": 0.0001, "loss": 0.3048, "num_input_tokens_seen": 165893552, "step": 1125 }, { "epoch": 0.3243009512827904, "loss": 0.26803261041641235, "loss_ce": 0.10726602375507355, "loss_xval": 0.1611328125, "num_input_tokens_seen": 165893552, "step": 1125 }, { "epoch": 0.3245892187950418, "grad_norm": 83.20874980582771, "learning_rate": 0.0001, "loss": 1.505, "num_input_tokens_seen": 166028344, "step": 1126 }, { "epoch": 0.3245892187950418, "loss": 1.5246477127075195, "loss_ce": 0.06371021270751953, "loss_xval": 1.4609375, "num_input_tokens_seen": 166028344, "step": 1126 }, { "epoch": 0.3248774863072932, "grad_norm": 117.91214459350986, "learning_rate": 0.0001, "loss": 1.802, "num_input_tokens_seen": 166163440, "step": 1127 }, { "epoch": 0.3248774863072932, "loss": 1.7308454513549805, "loss_ce": 0.021861031651496887, "loss_xval": 1.7109375, "num_input_tokens_seen": 166163440, "step": 1127 }, { "epoch": 0.32516575381954455, "grad_norm": 21.969901497136338, "learning_rate": 0.0001, "loss": 0.1225, "num_input_tokens_seen": 166335888, "step": 1128 }, { "epoch": 0.32516575381954455, "loss": 0.14037910103797913, "loss_ce": 0.017393263056874275, "loss_xval": 0.123046875, "num_input_tokens_seen": 166335888, "step": 1128 }, { "epoch": 0.3254540213317959, "grad_norm": 70.85543800871753, "learning_rate": 0.0001, "loss": 0.8779, "num_input_tokens_seen": 166470640, "step": 1129 }, { "epoch": 0.3254540213317959, "loss": 0.817480206489563, "loss_ce": 0.014257535338401794, "loss_xval": 0.8046875, "num_input_tokens_seen": 166470640, "step": 1129 }, { "epoch": 0.32574228884404727, "grad_norm": 10.650929566560954, "learning_rate": 0.0001, "loss": 0.0421, "num_input_tokens_seen": 166605624, "step": 1130 }, { "epoch": 0.32574228884404727, "loss": 0.036035455763339996, "loss_ce": 0.0022219824604690075, "loss_xval": 0.03369140625, "num_input_tokens_seen": 166605624, "step": 1130 }, { "epoch": 0.32603055635629863, "grad_norm": 88.29133006200985, "learning_rate": 0.0001, "loss": 1.0776, "num_input_tokens_seen": 166778096, "step": 1131 }, { "epoch": 0.32603055635629863, "loss": 0.9238502979278564, "loss_ce": 0.021994847804307938, "loss_xval": 0.90234375, "num_input_tokens_seen": 166778096, "step": 1131 }, { "epoch": 0.32631882386855, "grad_norm": 36956.58996931595, "learning_rate": 0.0001, "loss": 148.2867, "num_input_tokens_seen": 166912928, "step": 1132 }, { "epoch": 0.32631882386855, "loss": 155.36557006835938, "loss_ce": 20.115570068359375, "loss_xval": 135.0, "num_input_tokens_seen": 166912928, "step": 1132 }, { "epoch": 0.3266070913808014, "grad_norm": 37411.494503156595, "learning_rate": 0.0001, "loss": 961.649, "num_input_tokens_seen": 167047896, "step": 1133 }, { "epoch": 0.3266070913808014, "loss": 1922.889404296875, "loss_ce": 9.014402389526367, "loss_xval": 1912.0, "num_input_tokens_seen": 167047896, "step": 1133 }, { "epoch": 0.32689535889305277, "grad_norm": 35611.01625548505, "learning_rate": 0.0001, "loss": 1535.11, "num_input_tokens_seen": 167220624, "step": 1134 }, { "epoch": 0.32689535889305277, "loss": 0.5700771808624268, "loss_ce": 0.013924892991781235, "loss_xval": 0.5546875, "num_input_tokens_seen": 167220624, "step": 1134 }, { "epoch": 0.32718362640530413, "grad_norm": 3903.1403432917123, "learning_rate": 0.0001, "loss": 64.9901, "num_input_tokens_seen": 167355400, "step": 1135 }, { "epoch": 0.32718362640530413, "loss": 102.77651977539062, "loss_ce": 5.159328937530518, "loss_xval": 97.5, "num_input_tokens_seen": 167355400, "step": 1135 }, { "epoch": 0.3274718939175555, "grad_norm": 14367.867932467441, "learning_rate": 0.0001, "loss": 52.0497, "num_input_tokens_seen": 167490400, "step": 1136 }, { "epoch": 0.3274718939175555, "loss": 103.61935424804688, "loss_ce": 1.8678855895996094, "loss_xval": 102.0, "num_input_tokens_seen": 167490400, "step": 1136 }, { "epoch": 0.32776016142980685, "grad_norm": 128.34438076454498, "learning_rate": 0.0001, "loss": 0.487, "num_input_tokens_seen": 167662864, "step": 1137 }, { "epoch": 0.32776016142980685, "loss": 0.5655533075332642, "loss_ce": 0.23571929335594177, "loss_xval": 0.330078125, "num_input_tokens_seen": 167662864, "step": 1137 }, { "epoch": 0.3280484289420582, "grad_norm": 186.8314494397255, "learning_rate": 0.0001, "loss": 1.0293, "num_input_tokens_seen": 167797576, "step": 1138 }, { "epoch": 0.3280484289420582, "loss": 1.259447693824768, "loss_ce": 0.9324213266372681, "loss_xval": 0.326171875, "num_input_tokens_seen": 167797576, "step": 1138 }, { "epoch": 0.3283366964543096, "grad_norm": 124.40677251025926, "learning_rate": 0.0001, "loss": 2.4235, "num_input_tokens_seen": 167932648, "step": 1139 }, { "epoch": 0.3283366964543096, "loss": 1.974762201309204, "loss_ce": 1.721283197402954, "loss_xval": 0.25390625, "num_input_tokens_seen": 167932648, "step": 1139 }, { "epoch": 0.328624963966561, "grad_norm": 111.30133258078932, "learning_rate": 0.0001, "loss": 3.6092, "num_input_tokens_seen": 168105168, "step": 1140 }, { "epoch": 0.328624963966561, "loss": 3.867354393005371, "loss_ce": 3.805678367614746, "loss_xval": 0.061767578125, "num_input_tokens_seen": 168105168, "step": 1140 }, { "epoch": 0.32891323147881235, "grad_norm": 121.31372025179807, "learning_rate": 0.0001, "loss": 3.274, "num_input_tokens_seen": 168239888, "step": 1141 }, { "epoch": 0.32891323147881235, "loss": 3.6110503673553467, "loss_ce": 3.0675933361053467, "loss_xval": 0.54296875, "num_input_tokens_seen": 168239888, "step": 1141 }, { "epoch": 0.3292014989910637, "grad_norm": 397.8345944363598, "learning_rate": 0.0001, "loss": 5.6374, "num_input_tokens_seen": 168374952, "step": 1142 }, { "epoch": 0.3292014989910637, "loss": 4.382977485656738, "loss_ce": 2.9259462356567383, "loss_xval": 1.453125, "num_input_tokens_seen": 168374952, "step": 1142 }, { "epoch": 0.3294897665033151, "grad_norm": 12260.97294175282, "learning_rate": 0.0001, "loss": 120.2437, "num_input_tokens_seen": 168547464, "step": 1143 }, { "epoch": 0.3294897665033151, "loss": 110.66117858886719, "loss_ce": 4.442431926727295, "loss_xval": 106.0, "num_input_tokens_seen": 168547464, "step": 1143 }, { "epoch": 0.32977803401556643, "grad_norm": 16729.315621184192, "learning_rate": 0.0001, "loss": 437.4442, "num_input_tokens_seen": 168682168, "step": 1144 }, { "epoch": 0.32977803401556643, "loss": 455.2748107910156, "loss_ce": 8.024798393249512, "loss_xval": 448.0, "num_input_tokens_seen": 168682168, "step": 1144 }, { "epoch": 0.3300663015278178, "grad_norm": 38370.55638467301, "learning_rate": 0.0001, "loss": 1914.62, "num_input_tokens_seen": 168817304, "step": 1145 }, { "epoch": 0.3300663015278178, "loss": 1790.501220703125, "loss_ce": 7.501255035400391, "loss_xval": 1784.0, "num_input_tokens_seen": 168817304, "step": 1145 }, { "epoch": 0.33035456904006916, "grad_norm": 575.5757979659641, "learning_rate": 0.0001, "loss": 19.334, "num_input_tokens_seen": 168989928, "step": 1146 }, { "epoch": 0.33035456904006916, "loss": 19.11969757080078, "loss_ce": 5.244698524475098, "loss_xval": 13.875, "num_input_tokens_seen": 168989928, "step": 1146 }, { "epoch": 0.3306428365523206, "grad_norm": 363.34565051839263, "learning_rate": 0.0001, "loss": 17.0378, "num_input_tokens_seen": 169124768, "step": 1147 }, { "epoch": 0.3306428365523206, "loss": 17.8129825592041, "loss_ce": 5.852045059204102, "loss_xval": 11.9375, "num_input_tokens_seen": 169124768, "step": 1147 }, { "epoch": 0.33093110406457193, "grad_norm": 260.9191068247743, "learning_rate": 0.0001, "loss": 13.149, "num_input_tokens_seen": 169259872, "step": 1148 }, { "epoch": 0.33093110406457193, "loss": 12.404302597045898, "loss_ce": 6.541021347045898, "loss_xval": 5.875, "num_input_tokens_seen": 169259872, "step": 1148 }, { "epoch": 0.3312193715768233, "grad_norm": 209.5617449763566, "learning_rate": 0.0001, "loss": 10.899, "num_input_tokens_seen": 169432424, "step": 1149 }, { "epoch": 0.3312193715768233, "loss": 11.033249855041504, "loss_ce": 6.748093605041504, "loss_xval": 4.28125, "num_input_tokens_seen": 169432424, "step": 1149 }, { "epoch": 0.33150763908907466, "grad_norm": 123.85290589512873, "learning_rate": 0.0001, "loss": 8.0386, "num_input_tokens_seen": 169567184, "step": 1150 }, { "epoch": 0.33150763908907466, "loss": 8.336545944213867, "loss_ce": 6.424436569213867, "loss_xval": 1.9140625, "num_input_tokens_seen": 169567184, "step": 1150 }, { "epoch": 0.331795906601326, "grad_norm": 57.909997902104216, "learning_rate": 0.0001, "loss": 6.8854, "num_input_tokens_seen": 169700752, "step": 1151 }, { "epoch": 0.331795906601326, "loss": 6.737767219543457, "loss_ce": 6.636631965637207, "loss_xval": 0.10107421875, "num_input_tokens_seen": 169700752, "step": 1151 }, { "epoch": 0.3320841741135774, "grad_norm": 58.06386206153396, "learning_rate": 0.0001, "loss": 6.633, "num_input_tokens_seen": 169873256, "step": 1152 }, { "epoch": 0.3320841741135774, "loss": 6.838903427124023, "loss_ce": 6.601720809936523, "loss_xval": 0.2373046875, "num_input_tokens_seen": 169873256, "step": 1152 }, { "epoch": 0.3323724416258288, "grad_norm": 101.93742036056541, "learning_rate": 0.0001, "loss": 7.0618, "num_input_tokens_seen": 170008032, "step": 1153 }, { "epoch": 0.3323724416258288, "loss": 6.780313491821289, "loss_ce": 6.040079116821289, "loss_xval": 0.7421875, "num_input_tokens_seen": 170008032, "step": 1153 }, { "epoch": 0.33266070913808016, "grad_norm": 139.2700863542851, "learning_rate": 0.0001, "loss": 7.8894, "num_input_tokens_seen": 170143120, "step": 1154 }, { "epoch": 0.33266070913808016, "loss": 7.91797399520874, "loss_ce": 6.046880722045898, "loss_xval": 1.875, "num_input_tokens_seen": 170143120, "step": 1154 }, { "epoch": 0.3329489766503315, "grad_norm": 149.41658480780492, "learning_rate": 0.0001, "loss": 7.7876, "num_input_tokens_seen": 170315632, "step": 1155 }, { "epoch": 0.3329489766503315, "loss": 8.109418869018555, "loss_ce": 5.894575119018555, "loss_xval": 2.21875, "num_input_tokens_seen": 170315632, "step": 1155 }, { "epoch": 0.3332372441625829, "grad_norm": 156.32637537503285, "learning_rate": 0.0001, "loss": 7.672, "num_input_tokens_seen": 170450384, "step": 1156 }, { "epoch": 0.3332372441625829, "loss": 7.428304672241211, "loss_ce": 5.239828109741211, "loss_xval": 2.1875, "num_input_tokens_seen": 170450384, "step": 1156 }, { "epoch": 0.33352551167483424, "grad_norm": 146.9594323082445, "learning_rate": 0.0001, "loss": 7.1932, "num_input_tokens_seen": 170585472, "step": 1157 }, { "epoch": 0.33352551167483424, "loss": 7.230058670043945, "loss_ce": 5.124589920043945, "loss_xval": 2.109375, "num_input_tokens_seen": 170585472, "step": 1157 }, { "epoch": 0.3338137791870856, "grad_norm": 107.99167328343886, "learning_rate": 0.0001, "loss": 5.884, "num_input_tokens_seen": 170758080, "step": 1158 }, { "epoch": 0.3338137791870856, "loss": 6.086777687072754, "loss_ce": 4.977890968322754, "loss_xval": 1.109375, "num_input_tokens_seen": 170758080, "step": 1158 }, { "epoch": 0.33410204669933696, "grad_norm": 67.67594647471446, "learning_rate": 0.0001, "loss": 4.9364, "num_input_tokens_seen": 170892832, "step": 1159 }, { "epoch": 0.33410204669933696, "loss": 4.73914098739624, "loss_ce": 4.37781286239624, "loss_xval": 0.361328125, "num_input_tokens_seen": 170892832, "step": 1159 }, { "epoch": 0.3343903142115884, "grad_norm": 27.192686899599124, "learning_rate": 0.0001, "loss": 4.5496, "num_input_tokens_seen": 171027808, "step": 1160 }, { "epoch": 0.3343903142115884, "loss": 4.496192932128906, "loss_ce": 4.365577697753906, "loss_xval": 0.130859375, "num_input_tokens_seen": 171027808, "step": 1160 }, { "epoch": 0.33467858172383974, "grad_norm": 38.851402590144424, "learning_rate": 0.0001, "loss": 4.4712, "num_input_tokens_seen": 171200368, "step": 1161 }, { "epoch": 0.33467858172383974, "loss": 4.607762336730957, "loss_ce": 4.399876594543457, "loss_xval": 0.2080078125, "num_input_tokens_seen": 171200368, "step": 1161 }, { "epoch": 0.3349668492360911, "grad_norm": 76.11601509063742, "learning_rate": 0.0001, "loss": 4.634, "num_input_tokens_seen": 171335200, "step": 1162 }, { "epoch": 0.3349668492360911, "loss": 4.723811149597168, "loss_ce": 3.919123888015747, "loss_xval": 0.8046875, "num_input_tokens_seen": 171335200, "step": 1162 }, { "epoch": 0.33525511674834246, "grad_norm": 99.2778880784972, "learning_rate": 0.0001, "loss": 5.0097, "num_input_tokens_seen": 171470312, "step": 1163 }, { "epoch": 0.33525511674834246, "loss": 4.773837089538574, "loss_ce": 4.016513347625732, "loss_xval": 0.7578125, "num_input_tokens_seen": 171470312, "step": 1163 }, { "epoch": 0.3355433842605938, "grad_norm": 123.6317341440415, "learning_rate": 0.0001, "loss": 5.297, "num_input_tokens_seen": 171642888, "step": 1164 }, { "epoch": 0.3355433842605938, "loss": 5.352497577667236, "loss_ce": 4.102497577667236, "loss_xval": 1.25, "num_input_tokens_seen": 171642888, "step": 1164 }, { "epoch": 0.3358316517728452, "grad_norm": 104.57501260677738, "learning_rate": 0.0001, "loss": 4.8253, "num_input_tokens_seen": 171777616, "step": 1165 }, { "epoch": 0.3358316517728452, "loss": 4.950669765472412, "loss_ce": 3.735337734222412, "loss_xval": 1.21875, "num_input_tokens_seen": 171777616, "step": 1165 }, { "epoch": 0.33611991928509655, "grad_norm": 81.33891217763653, "learning_rate": 0.0001, "loss": 4.5942, "num_input_tokens_seen": 171912712, "step": 1166 }, { "epoch": 0.33611991928509655, "loss": 4.381400108337402, "loss_ce": 3.8672401905059814, "loss_xval": 0.515625, "num_input_tokens_seen": 171912712, "step": 1166 }, { "epoch": 0.33640818679734796, "grad_norm": 55.303487211926544, "learning_rate": 0.0001, "loss": 4.1956, "num_input_tokens_seen": 172085144, "step": 1167 }, { "epoch": 0.33640818679734796, "loss": 4.305488109588623, "loss_ce": 4.013373851776123, "loss_xval": 0.29296875, "num_input_tokens_seen": 172085144, "step": 1167 }, { "epoch": 0.3366964543095993, "grad_norm": 16.54202519736511, "learning_rate": 0.0001, "loss": 3.8247, "num_input_tokens_seen": 172219936, "step": 1168 }, { "epoch": 0.3366964543095993, "loss": 3.7913873195648193, "loss_ce": 3.6247613430023193, "loss_xval": 0.1669921875, "num_input_tokens_seen": 172219936, "step": 1168 }, { "epoch": 0.3369847218218507, "grad_norm": 26.200108410757043, "learning_rate": 0.0001, "loss": 3.9935, "num_input_tokens_seen": 172354984, "step": 1169 }, { "epoch": 0.3369847218218507, "loss": 3.902951717376709, "loss_ce": 3.763547420501709, "loss_xval": 0.1396484375, "num_input_tokens_seen": 172354984, "step": 1169 }, { "epoch": 0.33727298933410205, "grad_norm": 47.584733452244784, "learning_rate": 0.0001, "loss": 4.0044, "num_input_tokens_seen": 172527480, "step": 1170 }, { "epoch": 0.33727298933410205, "loss": 4.209903717041016, "loss_ce": 3.900089740753174, "loss_xval": 0.310546875, "num_input_tokens_seen": 172527480, "step": 1170 }, { "epoch": 0.3375612568463534, "grad_norm": 67.07075172810855, "learning_rate": 0.0001, "loss": 4.0503, "num_input_tokens_seen": 172662256, "step": 1171 }, { "epoch": 0.3375612568463534, "loss": 3.90846586227417, "loss_ce": 3.51808500289917, "loss_xval": 0.390625, "num_input_tokens_seen": 172662256, "step": 1171 }, { "epoch": 0.33784952435860477, "grad_norm": 78.6025130293129, "learning_rate": 0.0001, "loss": 4.3344, "num_input_tokens_seen": 172797288, "step": 1172 }, { "epoch": 0.33784952435860477, "loss": 4.278440475463867, "loss_ce": 3.666623592376709, "loss_xval": 0.61328125, "num_input_tokens_seen": 172797288, "step": 1172 }, { "epoch": 0.33813779187085613, "grad_norm": 65.58126499126392, "learning_rate": 0.0001, "loss": 4.0663, "num_input_tokens_seen": 172969768, "step": 1173 }, { "epoch": 0.33813779187085613, "loss": 4.310020446777344, "loss_ce": 3.809532642364502, "loss_xval": 0.5, "num_input_tokens_seen": 172969768, "step": 1173 }, { "epoch": 0.33842605938310755, "grad_norm": 53.08952868260317, "learning_rate": 0.0001, "loss": 3.8061, "num_input_tokens_seen": 173104648, "step": 1174 }, { "epoch": 0.33842605938310755, "loss": 3.6648547649383545, "loss_ce": 3.4049670696258545, "loss_xval": 0.259765625, "num_input_tokens_seen": 173104648, "step": 1174 }, { "epoch": 0.3387143268953589, "grad_norm": 40.470371532877415, "learning_rate": 0.0001, "loss": 3.8175, "num_input_tokens_seen": 173239736, "step": 1175 }, { "epoch": 0.3387143268953589, "loss": 3.7364258766174316, "loss_ce": 3.5298829078674316, "loss_xval": 0.20703125, "num_input_tokens_seen": 173239736, "step": 1175 }, { "epoch": 0.33900259440761027, "grad_norm": 9.107618648325966, "learning_rate": 0.0001, "loss": 3.5961, "num_input_tokens_seen": 173412152, "step": 1176 }, { "epoch": 0.33900259440761027, "loss": 3.7630152702331543, "loss_ce": 3.6769556999206543, "loss_xval": 0.0859375, "num_input_tokens_seen": 173412152, "step": 1176 }, { "epoch": 0.33929086191986163, "grad_norm": 16.62523082092502, "learning_rate": 0.0001, "loss": 3.4821, "num_input_tokens_seen": 173546992, "step": 1177 }, { "epoch": 0.33929086191986163, "loss": 3.445162296295166, "loss_ce": 3.302217960357666, "loss_xval": 0.142578125, "num_input_tokens_seen": 173546992, "step": 1177 }, { "epoch": 0.339579129432113, "grad_norm": 32.240317086567416, "learning_rate": 0.0001, "loss": 3.693, "num_input_tokens_seen": 173681984, "step": 1178 }, { "epoch": 0.339579129432113, "loss": 3.5491960048675537, "loss_ce": 3.4296891689300537, "loss_xval": 0.11962890625, "num_input_tokens_seen": 173681984, "step": 1178 }, { "epoch": 0.33986739694436435, "grad_norm": 52.75172198229805, "learning_rate": 0.0001, "loss": 3.7589, "num_input_tokens_seen": 173854440, "step": 1179 }, { "epoch": 0.33986739694436435, "loss": 3.916414976119995, "loss_ce": 3.636019468307495, "loss_xval": 0.28125, "num_input_tokens_seen": 173854440, "step": 1179 }, { "epoch": 0.3401556644566157, "grad_norm": 50.607619311064305, "learning_rate": 0.0001, "loss": 3.5949, "num_input_tokens_seen": 173989424, "step": 1180 }, { "epoch": 0.3401556644566157, "loss": 3.5826573371887207, "loss_ce": 3.2337803840637207, "loss_xval": 0.349609375, "num_input_tokens_seen": 173989424, "step": 1180 }, { "epoch": 0.34044393196886713, "grad_norm": 40.62064593668211, "learning_rate": 0.0001, "loss": 3.6989, "num_input_tokens_seen": 174124608, "step": 1181 }, { "epoch": 0.34044393196886713, "loss": 3.5342178344726562, "loss_ce": 3.3720474243164062, "loss_xval": 0.162109375, "num_input_tokens_seen": 174124608, "step": 1181 }, { "epoch": 0.3407321994811185, "grad_norm": 35.836911748651296, "learning_rate": 0.0001, "loss": 3.5721, "num_input_tokens_seen": 174297080, "step": 1182 }, { "epoch": 0.3407321994811185, "loss": 3.7330305576324463, "loss_ce": 3.5919783115386963, "loss_xval": 0.140625, "num_input_tokens_seen": 174297080, "step": 1182 }, { "epoch": 0.34102046699336985, "grad_norm": 16.492342672168352, "learning_rate": 0.0001, "loss": 3.3894, "num_input_tokens_seen": 174432032, "step": 1183 }, { "epoch": 0.34102046699336985, "loss": 3.316561222076416, "loss_ce": 3.188875675201416, "loss_xval": 0.1279296875, "num_input_tokens_seen": 174432032, "step": 1183 }, { "epoch": 0.3413087345056212, "grad_norm": 5.432967850334228, "learning_rate": 0.0001, "loss": 3.5067, "num_input_tokens_seen": 174567080, "step": 1184 }, { "epoch": 0.3413087345056212, "loss": 3.3840579986572266, "loss_ce": 3.3247928619384766, "loss_xval": 0.059326171875, "num_input_tokens_seen": 174567080, "step": 1184 }, { "epoch": 0.3415970020178726, "grad_norm": 11.636727885745076, "learning_rate": 0.0001, "loss": 3.456, "num_input_tokens_seen": 174739592, "step": 1185 }, { "epoch": 0.3415970020178726, "loss": 3.6579012870788574, "loss_ce": 3.5756258964538574, "loss_xval": 0.08203125, "num_input_tokens_seen": 174739592, "step": 1185 }, { "epoch": 0.34188526953012394, "grad_norm": 28.073243632677382, "learning_rate": 0.0001, "loss": 3.3726, "num_input_tokens_seen": 174874352, "step": 1186 }, { "epoch": 0.34188526953012394, "loss": 3.2818756103515625, "loss_ce": 3.1490631103515625, "loss_xval": 0.1328125, "num_input_tokens_seen": 174874352, "step": 1186 }, { "epoch": 0.3421735370423753, "grad_norm": 37.38558090166038, "learning_rate": 0.0001, "loss": 3.6097, "num_input_tokens_seen": 175009344, "step": 1187 }, { "epoch": 0.3421735370423753, "loss": 3.4909286499023438, "loss_ce": 3.3041610717773438, "loss_xval": 0.1865234375, "num_input_tokens_seen": 175009344, "step": 1187 }, { "epoch": 0.3424618045546267, "grad_norm": 32.136012294484296, "learning_rate": 0.0001, "loss": 3.5214, "num_input_tokens_seen": 175181992, "step": 1188 }, { "epoch": 0.3424618045546267, "loss": 3.7463483810424805, "loss_ce": 3.5837507247924805, "loss_xval": 0.162109375, "num_input_tokens_seen": 175181992, "step": 1188 }, { "epoch": 0.3427500720668781, "grad_norm": 30.42733750158407, "learning_rate": 0.0001, "loss": 3.3589, "num_input_tokens_seen": 175316728, "step": 1189 }, { "epoch": 0.3427500720668781, "loss": 3.259042739868164, "loss_ce": 3.126535415649414, "loss_xval": 0.1328125, "num_input_tokens_seen": 175316728, "step": 1189 }, { "epoch": 0.34303833957912944, "grad_norm": 21.403150656984053, "learning_rate": 0.0001, "loss": 3.5281, "num_input_tokens_seen": 175451896, "step": 1190 }, { "epoch": 0.34303833957912944, "loss": 3.385063409805298, "loss_ce": 3.290947198867798, "loss_xval": 0.09423828125, "num_input_tokens_seen": 175451896, "step": 1190 }, { "epoch": 0.3433266070913808, "grad_norm": 3.7029830708447267, "learning_rate": 0.0001, "loss": 3.406, "num_input_tokens_seen": 175624320, "step": 1191 }, { "epoch": 0.3433266070913808, "loss": 3.61179780960083, "loss_ce": 3.54966402053833, "loss_xval": 0.06201171875, "num_input_tokens_seen": 175624320, "step": 1191 }, { "epoch": 0.34361487460363216, "grad_norm": 9.347038645733567, "learning_rate": 0.0001, "loss": 3.2743, "num_input_tokens_seen": 175759136, "step": 1192 }, { "epoch": 0.34361487460363216, "loss": 3.2146201133728027, "loss_ce": 3.1162314414978027, "loss_xval": 0.0986328125, "num_input_tokens_seen": 175759136, "step": 1192 }, { "epoch": 0.3439031421158835, "grad_norm": 13.15268747761076, "learning_rate": 0.0001, "loss": 3.4675, "num_input_tokens_seen": 175894248, "step": 1193 }, { "epoch": 0.3439031421158835, "loss": 3.3171091079711914, "loss_ce": 3.268967628479004, "loss_xval": 0.048095703125, "num_input_tokens_seen": 175894248, "step": 1193 }, { "epoch": 0.34419140962813494, "grad_norm": 24.383472160045933, "learning_rate": 0.0001, "loss": 3.45, "num_input_tokens_seen": 176066792, "step": 1194 }, { "epoch": 0.34419140962813494, "loss": 3.663198947906494, "loss_ce": 3.574148654937744, "loss_xval": 0.0888671875, "num_input_tokens_seen": 176066792, "step": 1194 }, { "epoch": 0.3444796771403863, "grad_norm": 21.76778370017577, "learning_rate": 0.0001, "loss": 3.281, "num_input_tokens_seen": 176201608, "step": 1195 }, { "epoch": 0.3444796771403863, "loss": 3.234495162963867, "loss_ce": 3.100400924682617, "loss_xval": 0.1337890625, "num_input_tokens_seen": 176201608, "step": 1195 }, { "epoch": 0.34476794465263766, "grad_norm": 16.369405491156957, "learning_rate": 0.0001, "loss": 3.4813, "num_input_tokens_seen": 176336784, "step": 1196 }, { "epoch": 0.34476794465263766, "loss": 3.335388660430908, "loss_ce": 3.2846226692199707, "loss_xval": 0.05078125, "num_input_tokens_seen": 176336784, "step": 1196 }, { "epoch": 0.345056212164889, "grad_norm": 16.78593163365355, "learning_rate": 0.0001, "loss": 3.3888, "num_input_tokens_seen": 176509320, "step": 1197 }, { "epoch": 0.345056212164889, "loss": 3.5923142433166504, "loss_ce": 3.5209641456604004, "loss_xval": 0.0712890625, "num_input_tokens_seen": 176509320, "step": 1197 }, { "epoch": 0.3453444796771404, "grad_norm": 6.970010570859095, "learning_rate": 0.0001, "loss": 3.2365, "num_input_tokens_seen": 176644096, "step": 1198 }, { "epoch": 0.3453444796771404, "loss": 3.164623975753784, "loss_ce": 3.082195997238159, "loss_xval": 0.08251953125, "num_input_tokens_seen": 176644096, "step": 1198 }, { "epoch": 0.34563274718939174, "grad_norm": 5.43104937026321, "learning_rate": 0.0001, "loss": 3.4491, "num_input_tokens_seen": 176779144, "step": 1199 }, { "epoch": 0.34563274718939174, "loss": 3.300656318664551, "loss_ce": 3.257901191711426, "loss_xval": 0.042724609375, "num_input_tokens_seen": 176779144, "step": 1199 }, { "epoch": 0.3459210147016431, "grad_norm": 6.256442475593534, "learning_rate": 0.0001, "loss": 3.3549, "num_input_tokens_seen": 176951656, "step": 1200 }, { "epoch": 0.3459210147016431, "loss": 3.5743367671966553, "loss_ce": 3.5145528316497803, "loss_xval": 0.059814453125, "num_input_tokens_seen": 176951656, "step": 1200 }, { "epoch": 0.3462092822138945, "grad_norm": 12.289159214842655, "learning_rate": 0.0001, "loss": 3.2364, "num_input_tokens_seen": 177086424, "step": 1201 }, { "epoch": 0.3462092822138945, "loss": 3.1313276290893555, "loss_ce": 3.0714216232299805, "loss_xval": 0.059814453125, "num_input_tokens_seen": 177086424, "step": 1201 }, { "epoch": 0.3464975497261459, "grad_norm": 16.61564645292739, "learning_rate": 0.0001, "loss": 3.4868, "num_input_tokens_seen": 177221504, "step": 1202 }, { "epoch": 0.3464975497261459, "loss": 3.34700083732605, "loss_ce": 3.2785804271698, "loss_xval": 0.068359375, "num_input_tokens_seen": 177221504, "step": 1202 }, { "epoch": 0.34678581723839724, "grad_norm": 12.253516525379556, "learning_rate": 0.0001, "loss": 3.3667, "num_input_tokens_seen": 177394048, "step": 1203 }, { "epoch": 0.34678581723839724, "loss": 3.6046361923217773, "loss_ce": 3.5355443954467773, "loss_xval": 0.0693359375, "num_input_tokens_seen": 177394048, "step": 1203 }, { "epoch": 0.3470740847506486, "grad_norm": 12.680372469910816, "learning_rate": 0.0001, "loss": 3.2299, "num_input_tokens_seen": 177528816, "step": 1204 }, { "epoch": 0.3470740847506486, "loss": 3.1183619499206543, "loss_ce": 3.0578150749206543, "loss_xval": 0.060546875, "num_input_tokens_seen": 177528816, "step": 1204 }, { "epoch": 0.34736235226289996, "grad_norm": 9.269347803679004, "learning_rate": 0.0001, "loss": 3.4058, "num_input_tokens_seen": 177663768, "step": 1205 }, { "epoch": 0.34736235226289996, "loss": 3.276123523712158, "loss_ce": 3.232819080352783, "loss_xval": 0.043212890625, "num_input_tokens_seen": 177663768, "step": 1205 }, { "epoch": 0.3476506197751513, "grad_norm": 4.73661481131897, "learning_rate": 0.0001, "loss": 3.3351, "num_input_tokens_seen": 177836392, "step": 1206 }, { "epoch": 0.3476506197751513, "loss": 3.5520756244659424, "loss_ce": 3.5076420307159424, "loss_xval": 0.04443359375, "num_input_tokens_seen": 177836392, "step": 1206 }, { "epoch": 0.3479388872874027, "grad_norm": 6.329114181074099, "learning_rate": 0.0001, "loss": 3.2034, "num_input_tokens_seen": 177971168, "step": 1207 }, { "epoch": 0.3479388872874027, "loss": 3.1209933757781982, "loss_ce": 3.0564181804656982, "loss_xval": 0.064453125, "num_input_tokens_seen": 177971168, "step": 1207 }, { "epoch": 0.3482271547996541, "grad_norm": 6.65566027578529, "learning_rate": 0.0001, "loss": 3.41, "num_input_tokens_seen": 178106168, "step": 1208 }, { "epoch": 0.3482271547996541, "loss": 3.28511381149292, "loss_ce": 3.2503085136413574, "loss_xval": 0.034912109375, "num_input_tokens_seen": 178106168, "step": 1208 }, { "epoch": 0.34851542231190547, "grad_norm": 13.210336902177081, "learning_rate": 0.0001, "loss": 3.3444, "num_input_tokens_seen": 178278728, "step": 1209 }, { "epoch": 0.34851542231190547, "loss": 3.5691750049591064, "loss_ce": 3.5233376026153564, "loss_xval": 0.0458984375, "num_input_tokens_seen": 178278728, "step": 1209 }, { "epoch": 0.3488036898241568, "grad_norm": 11.57224749405127, "learning_rate": 0.0001, "loss": 3.2052, "num_input_tokens_seen": 178413576, "step": 1210 }, { "epoch": 0.3488036898241568, "loss": 3.1245784759521484, "loss_ce": 3.0383968353271484, "loss_xval": 0.0859375, "num_input_tokens_seen": 178413576, "step": 1210 }, { "epoch": 0.3490919573364082, "grad_norm": 6.020088437621785, "learning_rate": 0.0001, "loss": 3.4461, "num_input_tokens_seen": 178548736, "step": 1211 }, { "epoch": 0.3490919573364082, "loss": 3.2839789390563965, "loss_ce": 3.255948543548584, "loss_xval": 0.028076171875, "num_input_tokens_seen": 178548736, "step": 1211 }, { "epoch": 0.34938022484865955, "grad_norm": 6.669576843869337, "learning_rate": 0.0001, "loss": 3.3146, "num_input_tokens_seen": 178721200, "step": 1212 }, { "epoch": 0.34938022484865955, "loss": 3.528001308441162, "loss_ce": 3.485276699066162, "loss_xval": 0.042724609375, "num_input_tokens_seen": 178721200, "step": 1212 }, { "epoch": 0.3496684923609109, "grad_norm": 1.913119357143956, "learning_rate": 0.0001, "loss": 3.1879, "num_input_tokens_seen": 178856032, "step": 1213 }, { "epoch": 0.3496684923609109, "loss": 3.0951290130615234, "loss_ce": 3.0421199798583984, "loss_xval": 0.052978515625, "num_input_tokens_seen": 178856032, "step": 1213 }, { "epoch": 0.34995675987316227, "grad_norm": 7.057834925247301, "learning_rate": 0.0001, "loss": 3.4293, "num_input_tokens_seen": 178991144, "step": 1214 }, { "epoch": 0.34995675987316227, "loss": 3.283670663833618, "loss_ce": 3.247324228286743, "loss_xval": 0.036376953125, "num_input_tokens_seen": 178991144, "step": 1214 }, { "epoch": 0.3502450273854137, "grad_norm": 6.700408570182067, "learning_rate": 0.0001, "loss": 3.3265, "num_input_tokens_seen": 179163648, "step": 1215 }, { "epoch": 0.3502450273854137, "loss": 3.5651307106018066, "loss_ce": 3.5232300758361816, "loss_xval": 0.0419921875, "num_input_tokens_seen": 179163648, "step": 1215 }, { "epoch": 0.35053329489766505, "grad_norm": 8.093485597628083, "learning_rate": 0.0001, "loss": 3.1862, "num_input_tokens_seen": 179298472, "step": 1216 }, { "epoch": 0.35053329489766505, "loss": 3.0838124752044678, "loss_ce": 3.0416371822357178, "loss_xval": 0.042236328125, "num_input_tokens_seen": 179298472, "step": 1216 }, { "epoch": 0.3508215624099164, "grad_norm": 10.871592735786908, "learning_rate": 0.0001, "loss": 3.3771, "num_input_tokens_seen": 179433416, "step": 1217 }, { "epoch": 0.3508215624099164, "loss": 3.2459707260131836, "loss_ce": 3.212386131286621, "loss_xval": 0.03369140625, "num_input_tokens_seen": 179433416, "step": 1217 }, { "epoch": 0.35110982992216777, "grad_norm": 2.6312062789379613, "learning_rate": 0.0001, "loss": 3.3024, "num_input_tokens_seen": 179605904, "step": 1218 }, { "epoch": 0.35110982992216777, "loss": 3.5236358642578125, "loss_ce": 3.48828125, "loss_xval": 0.035400390625, "num_input_tokens_seen": 179605904, "step": 1218 }, { "epoch": 0.35139809743441913, "grad_norm": 1.5401386473978997, "learning_rate": 0.0001, "loss": 3.1598, "num_input_tokens_seen": 179740720, "step": 1219 }, { "epoch": 0.35139809743441913, "loss": 3.079009532928467, "loss_ce": 3.033355236053467, "loss_xval": 0.045654296875, "num_input_tokens_seen": 179740720, "step": 1219 }, { "epoch": 0.3516863649466705, "grad_norm": 1.6760672752235724, "learning_rate": 0.0001, "loss": 3.3668, "num_input_tokens_seen": 179875752, "step": 1220 }, { "epoch": 0.3516863649466705, "loss": 3.232072353363037, "loss_ce": 3.211595058441162, "loss_xval": 0.0205078125, "num_input_tokens_seen": 179875752, "step": 1220 }, { "epoch": 0.35197463245892185, "grad_norm": 8.250299885867918, "learning_rate": 0.0001, "loss": 3.3206, "num_input_tokens_seen": 180048296, "step": 1221 }, { "epoch": 0.35197463245892185, "loss": 3.5465445518493652, "loss_ce": 3.519040584564209, "loss_xval": 0.0274658203125, "num_input_tokens_seen": 180048296, "step": 1221 }, { "epoch": 0.35226289997117327, "grad_norm": 7.823759219188541, "learning_rate": 0.0001, "loss": 3.1555, "num_input_tokens_seen": 180183080, "step": 1222 }, { "epoch": 0.35226289997117327, "loss": 3.0791444778442383, "loss_ce": 3.0272035598754883, "loss_xval": 0.052001953125, "num_input_tokens_seen": 180183080, "step": 1222 }, { "epoch": 0.35255116748342463, "grad_norm": 3.5751367712534434, "learning_rate": 0.0001, "loss": 3.3744, "num_input_tokens_seen": 180318208, "step": 1223 }, { "epoch": 0.35255116748342463, "loss": 3.250316619873047, "loss_ce": 3.2265663146972656, "loss_xval": 0.0238037109375, "num_input_tokens_seen": 180318208, "step": 1223 }, { "epoch": 0.352839434995676, "grad_norm": 8.228880590132833, "learning_rate": 0.0001, "loss": 3.3038, "num_input_tokens_seen": 180490680, "step": 1224 }, { "epoch": 0.352839434995676, "loss": 3.5290775299072266, "loss_ce": 3.498727798461914, "loss_xval": 0.0303955078125, "num_input_tokens_seen": 180490680, "step": 1224 }, { "epoch": 0.35312770250792735, "grad_norm": 2.959232776381592, "learning_rate": 0.0001, "loss": 3.1494, "num_input_tokens_seen": 180625504, "step": 1225 }, { "epoch": 0.35312770250792735, "loss": 3.0728955268859863, "loss_ce": 3.0282177925109863, "loss_xval": 0.044677734375, "num_input_tokens_seen": 180625504, "step": 1225 }, { "epoch": 0.3534159700201787, "grad_norm": 4.857446287211869, "learning_rate": 0.0001, "loss": 3.3712, "num_input_tokens_seen": 180760592, "step": 1226 }, { "epoch": 0.3534159700201787, "loss": 3.2360877990722656, "loss_ce": 3.214008331298828, "loss_xval": 0.0220947265625, "num_input_tokens_seen": 180760592, "step": 1226 }, { "epoch": 0.3537042375324301, "grad_norm": 1.6530887063049307, "learning_rate": 0.0001, "loss": 3.3088, "num_input_tokens_seen": 180933232, "step": 1227 }, { "epoch": 0.3537042375324301, "loss": 3.561347484588623, "loss_ce": 3.5332255363464355, "loss_xval": 0.028076171875, "num_input_tokens_seen": 180933232, "step": 1227 }, { "epoch": 0.35399250504468144, "grad_norm": 5.015458929984309, "learning_rate": 0.0001, "loss": 3.1558, "num_input_tokens_seen": 181068040, "step": 1228 }, { "epoch": 0.35399250504468144, "loss": 3.0620033740997314, "loss_ce": 3.0263283252716064, "loss_xval": 0.03564453125, "num_input_tokens_seen": 181068040, "step": 1228 }, { "epoch": 0.35428077255693285, "grad_norm": 8.938959384444903, "learning_rate": 0.0001, "loss": 3.3659, "num_input_tokens_seen": 181203056, "step": 1229 }, { "epoch": 0.35428077255693285, "loss": 3.2330641746520996, "loss_ce": 3.206925868988037, "loss_xval": 0.026123046875, "num_input_tokens_seen": 181203056, "step": 1229 }, { "epoch": 0.3545690400691842, "grad_norm": 2.5622534586346046, "learning_rate": 0.0001, "loss": 3.2811, "num_input_tokens_seen": 181375560, "step": 1230 }, { "epoch": 0.3545690400691842, "loss": 3.504213333129883, "loss_ce": 3.477785110473633, "loss_xval": 0.0263671875, "num_input_tokens_seen": 181375560, "step": 1230 }, { "epoch": 0.3548573075814356, "grad_norm": 2.349014538352901, "learning_rate": 0.0001, "loss": 3.1492, "num_input_tokens_seen": 181510392, "step": 1231 }, { "epoch": 0.3548573075814356, "loss": 3.057544231414795, "loss_ce": 3.024768352508545, "loss_xval": 0.03271484375, "num_input_tokens_seen": 181510392, "step": 1231 }, { "epoch": 0.35514557509368694, "grad_norm": 2.760313775015949, "learning_rate": 0.0001, "loss": 3.3364, "num_input_tokens_seen": 181645400, "step": 1232 }, { "epoch": 0.35514557509368694, "loss": 3.192826271057129, "loss_ce": 3.1752023696899414, "loss_xval": 0.017578125, "num_input_tokens_seen": 181645400, "step": 1232 }, { "epoch": 0.3554338426059383, "grad_norm": 3.7620834491475583, "learning_rate": 0.0001, "loss": 3.2669, "num_input_tokens_seen": 181817928, "step": 1233 }, { "epoch": 0.3554338426059383, "loss": 3.4794349670410156, "loss_ce": 3.4616966247558594, "loss_xval": 0.0177001953125, "num_input_tokens_seen": 181817928, "step": 1233 }, { "epoch": 0.35572211011818966, "grad_norm": 6.472062853720295, "learning_rate": 0.0001, "loss": 3.1369, "num_input_tokens_seen": 181952752, "step": 1234 }, { "epoch": 0.35572211011818966, "loss": 3.054111957550049, "loss_ce": 3.013737201690674, "loss_xval": 0.040283203125, "num_input_tokens_seen": 181952752, "step": 1234 }, { "epoch": 0.356010377630441, "grad_norm": 2.8472546072170744, "learning_rate": 0.0001, "loss": 3.3564, "num_input_tokens_seen": 182087864, "step": 1235 }, { "epoch": 0.356010377630441, "loss": 3.2165331840515137, "loss_ce": 3.2055697441101074, "loss_xval": 0.010986328125, "num_input_tokens_seen": 182087864, "step": 1235 }, { "epoch": 0.35629864514269244, "grad_norm": 6.785687454321519, "learning_rate": 0.0001, "loss": 3.2471, "num_input_tokens_seen": 182260296, "step": 1236 }, { "epoch": 0.35629864514269244, "loss": 3.4405596256256104, "loss_ce": 3.4217989444732666, "loss_xval": 0.018798828125, "num_input_tokens_seen": 182260296, "step": 1236 }, { "epoch": 0.3565869126549438, "grad_norm": 2.146040129543714, "learning_rate": 0.0001, "loss": 3.1209, "num_input_tokens_seen": 182395104, "step": 1237 }, { "epoch": 0.3565869126549438, "loss": 3.0413930416107178, "loss_ce": 3.0080831050872803, "loss_xval": 0.033203125, "num_input_tokens_seen": 182395104, "step": 1237 }, { "epoch": 0.35687518016719516, "grad_norm": 2.974408010039056, "learning_rate": 0.0001, "loss": 3.3152, "num_input_tokens_seen": 182530136, "step": 1238 }, { "epoch": 0.35687518016719516, "loss": 3.1985116004943848, "loss_ce": 3.1820931434631348, "loss_xval": 0.016357421875, "num_input_tokens_seen": 182530136, "step": 1238 }, { "epoch": 0.3571634476794465, "grad_norm": 1.4409787530889788, "learning_rate": 0.0001, "loss": 3.243, "num_input_tokens_seen": 182702624, "step": 1239 }, { "epoch": 0.3571634476794465, "loss": 3.4570600986480713, "loss_ce": 3.43831467628479, "loss_xval": 0.018798828125, "num_input_tokens_seen": 182702624, "step": 1239 }, { "epoch": 0.3574517151916979, "grad_norm": 3.507385210087722, "learning_rate": 0.0001, "loss": 3.1193, "num_input_tokens_seen": 182837392, "step": 1240 }, { "epoch": 0.3574517151916979, "loss": 3.025404453277588, "loss_ce": 3.001112461090088, "loss_xval": 0.0242919921875, "num_input_tokens_seen": 182837392, "step": 1240 }, { "epoch": 0.35773998270394924, "grad_norm": 7.001759358957166, "learning_rate": 0.0001, "loss": 3.3462, "num_input_tokens_seen": 182972408, "step": 1241 }, { "epoch": 0.35773998270394924, "loss": 3.2213611602783203, "loss_ce": 3.2040576934814453, "loss_xval": 0.017333984375, "num_input_tokens_seen": 182972408, "step": 1241 }, { "epoch": 0.35802825021620066, "grad_norm": 2.538978744474648, "learning_rate": 0.0001, "loss": 3.2317, "num_input_tokens_seen": 183144792, "step": 1242 }, { "epoch": 0.35802825021620066, "loss": 3.4401183128356934, "loss_ce": 3.4241347312927246, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 183144792, "step": 1242 }, { "epoch": 0.358316517728452, "grad_norm": 1.4479057315326487, "learning_rate": 0.0001, "loss": 3.1175, "num_input_tokens_seen": 183279616, "step": 1243 }, { "epoch": 0.358316517728452, "loss": 3.026390314102173, "loss_ce": 2.9969866275787354, "loss_xval": 0.0294189453125, "num_input_tokens_seen": 183279616, "step": 1243 }, { "epoch": 0.3586047852407034, "grad_norm": 2.4414483373827145, "learning_rate": 0.0001, "loss": 3.3614, "num_input_tokens_seen": 183414816, "step": 1244 }, { "epoch": 0.3586047852407034, "loss": 3.198207378387451, "loss_ce": 3.186450481414795, "loss_xval": 0.01177978515625, "num_input_tokens_seen": 183414816, "step": 1244 }, { "epoch": 0.35889305275295474, "grad_norm": 5.1675116274784205, "learning_rate": 0.0001, "loss": 3.2394, "num_input_tokens_seen": 183587312, "step": 1245 }, { "epoch": 0.35889305275295474, "loss": 3.4583864212036133, "loss_ce": 3.444554328918457, "loss_xval": 0.01385498046875, "num_input_tokens_seen": 183587312, "step": 1245 }, { "epoch": 0.3591813202652061, "grad_norm": 3.0400243211955966, "learning_rate": 0.0001, "loss": 3.1146, "num_input_tokens_seen": 183722120, "step": 1246 }, { "epoch": 0.3591813202652061, "loss": 3.0098531246185303, "loss_ce": 2.9842259883880615, "loss_xval": 0.025634765625, "num_input_tokens_seen": 183722120, "step": 1246 }, { "epoch": 0.35946958777745747, "grad_norm": 1.5009990234436026, "learning_rate": 0.0001, "loss": 3.2998, "num_input_tokens_seen": 183857160, "step": 1247 }, { "epoch": 0.35946958777745747, "loss": 3.1848437786102295, "loss_ce": 3.1744449138641357, "loss_xval": 0.0103759765625, "num_input_tokens_seen": 183857160, "step": 1247 }, { "epoch": 0.35975785528970883, "grad_norm": 4.303761457456079, "learning_rate": 0.0001, "loss": 3.2222, "num_input_tokens_seen": 184029680, "step": 1248 }, { "epoch": 0.35975785528970883, "loss": 3.4391422271728516, "loss_ce": 3.4264888763427734, "loss_xval": 0.01263427734375, "num_input_tokens_seen": 184029680, "step": 1248 }, { "epoch": 0.36004612280196024, "grad_norm": 1.402284473926413, "learning_rate": 0.0001, "loss": 3.0968, "num_input_tokens_seen": 184164528, "step": 1249 }, { "epoch": 0.36004612280196024, "loss": 3.003244638442993, "loss_ce": 2.981996774673462, "loss_xval": 0.021240234375, "num_input_tokens_seen": 184164528, "step": 1249 }, { "epoch": 0.3603343903142116, "grad_norm": 4.291548001512167, "learning_rate": 0.0001, "loss": 3.3356, "num_input_tokens_seen": 184299688, "step": 1250 }, { "epoch": 0.3603343903142116, "eval_websight_new_IoU": 0.06377699971199036, "eval_websight_new_MAE_x": 0.11152733862400055, "eval_websight_new_MAE_y": 0.070404052734375, "eval_websight_new_NUM_probability": 0.10237714275717735, "eval_websight_new_inside_bbox": 0.07465277798473835, "eval_websight_new_loss": 3.1641979217529297, "eval_websight_new_loss_ce": 3.14769971370697, "eval_websight_new_loss_xval": 0.011007308959960938, "eval_websight_new_runtime": 35.8519, "eval_websight_new_samples_per_second": 1.395, "eval_websight_new_steps_per_second": 0.056, "num_input_tokens_seen": 184299688, "step": 1250 }, { "epoch": 0.3603343903142116, "eval_seeclick_IoU": 0.0957968607544899, "eval_seeclick_MAE_x": 0.12743081152439117, "eval_seeclick_MAE_y": 0.1018507219851017, "eval_seeclick_NUM_probability": 0.09817633777856827, "eval_seeclick_inside_bbox": 0.2725694477558136, "eval_seeclick_loss": 2.995253324508667, "eval_seeclick_loss_ce": 2.976414203643799, "eval_seeclick_loss_xval": 0.021217823028564453, "eval_seeclick_runtime": 62.8692, "eval_seeclick_samples_per_second": 0.795, "eval_seeclick_steps_per_second": 0.032, "num_input_tokens_seen": 184299688, "step": 1250 }, { "epoch": 0.3603343903142116, "eval_icons_IoU": 0.017014755867421627, "eval_icons_MAE_x": 0.09833865612745285, "eval_icons_MAE_y": 0.08148108422756195, "eval_icons_NUM_probability": 0.09484974667429924, "eval_icons_inside_bbox": 0.02777777798473835, "eval_icons_loss": 3.481070041656494, "eval_icons_loss_ce": 3.4514541625976562, "eval_icons_loss_xval": 0.010906219482421875, "eval_icons_runtime": 65.653, "eval_icons_samples_per_second": 0.762, "eval_icons_steps_per_second": 0.03, "num_input_tokens_seen": 184299688, "step": 1250 }, { "epoch": 0.3603343903142116, "loss": 3.4562795162200928, "loss_ce": 3.4461476802825928, "loss_xval": 0.0101318359375, "num_input_tokens_seen": 184299688, "step": 1250 }, { "epoch": 0.36062265782646297, "grad_norm": 1.935079332583963, "learning_rate": 0.0001, "loss": 3.2156, "num_input_tokens_seen": 184472272, "step": 1251 }, { "epoch": 0.36062265782646297, "loss": 3.437589406967163, "loss_ce": 3.425779104232788, "loss_xval": 0.0118408203125, "num_input_tokens_seen": 184472272, "step": 1251 }, { "epoch": 0.36091092533871433, "grad_norm": 3.5075152116035633, "learning_rate": 0.0001, "loss": 3.0755, "num_input_tokens_seen": 184606984, "step": 1252 }, { "epoch": 0.36091092533871433, "loss": 2.988539695739746, "loss_ce": 2.9702444076538086, "loss_xval": 0.018310546875, "num_input_tokens_seen": 184606984, "step": 1252 }, { "epoch": 0.3611991928509657, "grad_norm": 4.050070963936819, "learning_rate": 0.0001, "loss": 3.2879, "num_input_tokens_seen": 184741984, "step": 1253 }, { "epoch": 0.3611991928509657, "loss": 3.1794817447662354, "loss_ce": 3.167839288711548, "loss_xval": 0.01165771484375, "num_input_tokens_seen": 184741984, "step": 1253 }, { "epoch": 0.36148746036321705, "grad_norm": 1.6716885708857125, "learning_rate": 0.0001, "loss": 3.1998, "num_input_tokens_seen": 184914544, "step": 1254 }, { "epoch": 0.36148746036321705, "loss": 3.411571979522705, "loss_ce": 3.401855945587158, "loss_xval": 0.00970458984375, "num_input_tokens_seen": 184914544, "step": 1254 }, { "epoch": 0.3617757278754684, "grad_norm": 2.3067478578370966, "learning_rate": 0.0001, "loss": 3.0845, "num_input_tokens_seen": 185049448, "step": 1255 }, { "epoch": 0.3617757278754684, "loss": 2.986309289932251, "loss_ce": 2.9643900394439697, "loss_xval": 0.02197265625, "num_input_tokens_seen": 185049448, "step": 1255 }, { "epoch": 0.36206399538771983, "grad_norm": 1.6005172720100542, "learning_rate": 0.0001, "loss": 3.2921, "num_input_tokens_seen": 185184592, "step": 1256 }, { "epoch": 0.36206399538771983, "loss": 3.1446337699890137, "loss_ce": 3.137214183807373, "loss_xval": 0.007415771484375, "num_input_tokens_seen": 185184592, "step": 1256 }, { "epoch": 0.3623522628999712, "grad_norm": 5.222734644437509, "learning_rate": 0.0001, "loss": 3.2062, "num_input_tokens_seen": 185357176, "step": 1257 }, { "epoch": 0.3623522628999712, "loss": 3.431488513946533, "loss_ce": 3.4212613105773926, "loss_xval": 0.01025390625, "num_input_tokens_seen": 185357176, "step": 1257 }, { "epoch": 0.36264053041222255, "grad_norm": 1.6688855379400882, "learning_rate": 0.0001, "loss": 3.0652, "num_input_tokens_seen": 185492032, "step": 1258 }, { "epoch": 0.36264053041222255, "loss": 2.9728851318359375, "loss_ce": 2.950164794921875, "loss_xval": 0.022705078125, "num_input_tokens_seen": 185492032, "step": 1258 }, { "epoch": 0.3629287979244739, "grad_norm": 2.497505358631419, "learning_rate": 0.0001, "loss": 3.2925, "num_input_tokens_seen": 185627128, "step": 1259 }, { "epoch": 0.3629287979244739, "loss": 3.153623580932617, "loss_ce": 3.142393112182617, "loss_xval": 0.01123046875, "num_input_tokens_seen": 185627128, "step": 1259 }, { "epoch": 0.3632170654367253, "grad_norm": 1.3269461584050222, "learning_rate": 0.0001, "loss": 3.2288, "num_input_tokens_seen": 185799800, "step": 1260 }, { "epoch": 0.3632170654367253, "loss": 3.494309186935425, "loss_ce": 3.483990430831909, "loss_xval": 0.01031494140625, "num_input_tokens_seen": 185799800, "step": 1260 }, { "epoch": 0.36350533294897663, "grad_norm": 2.999073864405773, "learning_rate": 0.0001, "loss": 3.0764, "num_input_tokens_seen": 185934760, "step": 1261 }, { "epoch": 0.36350533294897663, "loss": 2.960078477859497, "loss_ce": 2.9436981678009033, "loss_xval": 0.016357421875, "num_input_tokens_seen": 185934760, "step": 1261 }, { "epoch": 0.363793600461228, "grad_norm": 4.427508635851775, "learning_rate": 0.0001, "loss": 3.2545, "num_input_tokens_seen": 186069728, "step": 1262 }, { "epoch": 0.363793600461228, "loss": 3.137115240097046, "loss_ce": 3.1245877742767334, "loss_xval": 0.01251220703125, "num_input_tokens_seen": 186069728, "step": 1262 }, { "epoch": 0.3640818679734794, "grad_norm": 1.3243023539960694, "learning_rate": 0.0001, "loss": 3.2047, "num_input_tokens_seen": 186242096, "step": 1263 }, { "epoch": 0.3640818679734794, "loss": 3.4566519260406494, "loss_ce": 3.4479429721832275, "loss_xval": 0.00872802734375, "num_input_tokens_seen": 186242096, "step": 1263 }, { "epoch": 0.3643701354857308, "grad_norm": 1.492897020777446, "learning_rate": 0.0001, "loss": 3.0452, "num_input_tokens_seen": 186376880, "step": 1264 }, { "epoch": 0.3643701354857308, "loss": 2.953927993774414, "loss_ce": 2.9349613189697266, "loss_xval": 0.0189208984375, "num_input_tokens_seen": 186376880, "step": 1264 }, { "epoch": 0.36465840299798213, "grad_norm": 1.450417138161176, "learning_rate": 0.0001, "loss": 3.2671, "num_input_tokens_seen": 186511944, "step": 1265 }, { "epoch": 0.36465840299798213, "loss": 3.1252617835998535, "loss_ce": 3.1166253089904785, "loss_xval": 0.0086669921875, "num_input_tokens_seen": 186511944, "step": 1265 }, { "epoch": 0.3649466705102335, "grad_norm": 3.524096074421607, "learning_rate": 0.0001, "loss": 3.1648, "num_input_tokens_seen": 186684480, "step": 1266 }, { "epoch": 0.3649466705102335, "loss": 3.383352041244507, "loss_ce": 3.3738229274749756, "loss_xval": 0.009521484375, "num_input_tokens_seen": 186684480, "step": 1266 }, { "epoch": 0.36523493802248486, "grad_norm": 1.8573647817225514, "learning_rate": 0.0001, "loss": 3.0348, "num_input_tokens_seen": 186819320, "step": 1267 }, { "epoch": 0.36523493802248486, "loss": 2.93626070022583, "loss_ce": 2.91801118850708, "loss_xval": 0.018310546875, "num_input_tokens_seen": 186819320, "step": 1267 }, { "epoch": 0.3655232055347362, "grad_norm": 1.6620050972238471, "learning_rate": 0.0001, "loss": 3.2388, "num_input_tokens_seen": 186954360, "step": 1268 }, { "epoch": 0.3655232055347362, "loss": 3.094966173171997, "loss_ce": 3.0861923694610596, "loss_xval": 0.0087890625, "num_input_tokens_seen": 186954360, "step": 1268 }, { "epoch": 0.3658114730469876, "grad_norm": 1.6554413422711625, "learning_rate": 0.0001, "loss": 3.1737, "num_input_tokens_seen": 187126824, "step": 1269 }, { "epoch": 0.3658114730469876, "loss": 3.4225668907165527, "loss_ce": 3.4151244163513184, "loss_xval": 0.0074462890625, "num_input_tokens_seen": 187126824, "step": 1269 }, { "epoch": 0.366099740559239, "grad_norm": 1.7051155693016407, "learning_rate": 0.0001, "loss": 3.0186, "num_input_tokens_seen": 187261608, "step": 1270 }, { "epoch": 0.366099740559239, "loss": 2.921408176422119, "loss_ce": 2.9070725440979004, "loss_xval": 0.01434326171875, "num_input_tokens_seen": 187261608, "step": 1270 }, { "epoch": 0.36638800807149036, "grad_norm": 4.265203473661172, "learning_rate": 0.0001, "loss": 3.236, "num_input_tokens_seen": 187396552, "step": 1271 }, { "epoch": 0.36638800807149036, "loss": 3.108557939529419, "loss_ce": 3.0978996753692627, "loss_xval": 0.01068115234375, "num_input_tokens_seen": 187396552, "step": 1271 }, { "epoch": 0.3666762755837417, "grad_norm": 1.495532032734321, "learning_rate": 0.0001, "loss": 3.1714, "num_input_tokens_seen": 187569184, "step": 1272 }, { "epoch": 0.3666762755837417, "loss": 3.429234504699707, "loss_ce": 3.420933723449707, "loss_xval": 0.00830078125, "num_input_tokens_seen": 187569184, "step": 1272 }, { "epoch": 0.3669645430959931, "grad_norm": 1.5397754658188914, "learning_rate": 0.0001, "loss": 2.9948, "num_input_tokens_seen": 187703888, "step": 1273 }, { "epoch": 0.3669645430959931, "loss": 2.907548427581787, "loss_ce": 2.8901076316833496, "loss_xval": 0.0174560546875, "num_input_tokens_seen": 187703888, "step": 1273 }, { "epoch": 0.36725281060824444, "grad_norm": 2.2236850531763146, "learning_rate": 0.0001, "loss": 3.2323, "num_input_tokens_seen": 187838936, "step": 1274 }, { "epoch": 0.36725281060824444, "loss": 3.095729351043701, "loss_ce": 3.0877223014831543, "loss_xval": 0.00799560546875, "num_input_tokens_seen": 187838936, "step": 1274 }, { "epoch": 0.3675410781204958, "grad_norm": 2.909090515749406, "learning_rate": 0.0001, "loss": 3.1036, "num_input_tokens_seen": 188011592, "step": 1275 }, { "epoch": 0.3675410781204958, "loss": 3.313185214996338, "loss_ce": 3.3051209449768066, "loss_xval": 0.008056640625, "num_input_tokens_seen": 188011592, "step": 1275 }, { "epoch": 0.36782934563274716, "grad_norm": 2.5094342235680953, "learning_rate": 0.0001, "loss": 2.9943, "num_input_tokens_seen": 188146424, "step": 1276 }, { "epoch": 0.36782934563274716, "loss": 2.892660617828369, "loss_ce": 2.876692295074463, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 188146424, "step": 1276 }, { "epoch": 0.3681176131449986, "grad_norm": 2.8488290702724988, "learning_rate": 0.0001, "loss": 3.2013, "num_input_tokens_seen": 188281472, "step": 1277 }, { "epoch": 0.3681176131449986, "loss": 3.077254295349121, "loss_ce": 3.068915367126465, "loss_xval": 0.00836181640625, "num_input_tokens_seen": 188281472, "step": 1277 }, { "epoch": 0.36840588065724994, "grad_norm": 1.8587305330114212, "learning_rate": 0.0001, "loss": 3.1322, "num_input_tokens_seen": 188454168, "step": 1278 }, { "epoch": 0.36840588065724994, "loss": 3.390192985534668, "loss_ce": 3.3831357955932617, "loss_xval": 0.007049560546875, "num_input_tokens_seen": 188454168, "step": 1278 }, { "epoch": 0.3686941481695013, "grad_norm": 1.6701222144616605, "learning_rate": 0.0001, "loss": 2.961, "num_input_tokens_seen": 188588904, "step": 1279 }, { "epoch": 0.3686941481695013, "loss": 2.8679046630859375, "loss_ce": 2.8522682189941406, "loss_xval": 0.015625, "num_input_tokens_seen": 188588904, "step": 1279 }, { "epoch": 0.36898241568175266, "grad_norm": 3.397635660389669, "learning_rate": 0.0001, "loss": 3.1997, "num_input_tokens_seen": 188724008, "step": 1280 }, { "epoch": 0.36898241568175266, "loss": 3.0433454513549805, "loss_ce": 3.0348386764526367, "loss_xval": 0.00848388671875, "num_input_tokens_seen": 188724008, "step": 1280 }, { "epoch": 0.369270683194004, "grad_norm": 1.7129980805898721, "learning_rate": 0.0001, "loss": 3.0605, "num_input_tokens_seen": 188896536, "step": 1281 }, { "epoch": 0.369270683194004, "loss": 3.2636823654174805, "loss_ce": 3.25540828704834, "loss_xval": 0.00830078125, "num_input_tokens_seen": 188896536, "step": 1281 }, { "epoch": 0.3695589507062554, "grad_norm": 1.674071501036216, "learning_rate": 0.0001, "loss": 2.9341, "num_input_tokens_seen": 189031424, "step": 1282 }, { "epoch": 0.3695589507062554, "loss": 2.8433218002319336, "loss_ce": 2.828688621520996, "loss_xval": 0.0146484375, "num_input_tokens_seen": 189031424, "step": 1282 }, { "epoch": 0.3698472182185068, "grad_norm": 2.632451663005747, "learning_rate": 0.0001, "loss": 3.1643, "num_input_tokens_seen": 189166496, "step": 1283 }, { "epoch": 0.3698472182185068, "loss": 3.038170337677002, "loss_ce": 3.0290989875793457, "loss_xval": 0.00909423828125, "num_input_tokens_seen": 189166496, "step": 1283 }, { "epoch": 0.37013548573075816, "grad_norm": 2.0903033276496874, "learning_rate": 0.0001, "loss": 3.0753, "num_input_tokens_seen": 189338952, "step": 1284 }, { "epoch": 0.37013548573075816, "loss": 3.3279080390930176, "loss_ce": 3.3198323249816895, "loss_xval": 0.008056640625, "num_input_tokens_seen": 189338952, "step": 1284 }, { "epoch": 0.3704237532430095, "grad_norm": 1.8716433509508976, "learning_rate": 0.0001, "loss": 2.9242, "num_input_tokens_seen": 189473744, "step": 1285 }, { "epoch": 0.3704237532430095, "loss": 2.816052198410034, "loss_ce": 2.800999402999878, "loss_xval": 0.01507568359375, "num_input_tokens_seen": 189473744, "step": 1285 }, { "epoch": 0.3707120207552609, "grad_norm": 2.9900748452227637, "learning_rate": 0.0001, "loss": 3.1373, "num_input_tokens_seen": 189608856, "step": 1286 }, { "epoch": 0.3707120207552609, "loss": 3.0003795623779297, "loss_ce": 2.991331100463867, "loss_xval": 0.009033203125, "num_input_tokens_seen": 189608856, "step": 1286 }, { "epoch": 0.37100028826751225, "grad_norm": 2.0213465984800454, "learning_rate": 0.0001, "loss": 3.0273, "num_input_tokens_seen": 189781312, "step": 1287 }, { "epoch": 0.37100028826751225, "loss": 3.2566895484924316, "loss_ce": 3.2470154762268066, "loss_xval": 0.0096435546875, "num_input_tokens_seen": 189781312, "step": 1287 }, { "epoch": 0.3712885557797636, "grad_norm": 2.0433003449427587, "learning_rate": 0.0001, "loss": 2.874, "num_input_tokens_seen": 189916048, "step": 1288 }, { "epoch": 0.3712885557797636, "loss": 2.785282611846924, "loss_ce": 2.7689785957336426, "loss_xval": 0.016357421875, "num_input_tokens_seen": 189916048, "step": 1288 }, { "epoch": 0.37157682329201497, "grad_norm": 3.2920199908713124, "learning_rate": 0.0001, "loss": 3.0956, "num_input_tokens_seen": 190051064, "step": 1289 }, { "epoch": 0.37157682329201497, "loss": 2.984703302383423, "loss_ce": 2.971916437149048, "loss_xval": 0.0128173828125, "num_input_tokens_seen": 190051064, "step": 1289 }, { "epoch": 0.3718650908042664, "grad_norm": 2.2241646450849935, "learning_rate": 0.0001, "loss": 3.0093, "num_input_tokens_seen": 190223664, "step": 1290 }, { "epoch": 0.3718650908042664, "loss": 3.2635810375213623, "loss_ce": 3.2557456493377686, "loss_xval": 0.0078125, "num_input_tokens_seen": 190223664, "step": 1290 }, { "epoch": 0.37215335831651775, "grad_norm": 2.3558950213229597, "learning_rate": 0.0001, "loss": 2.8469, "num_input_tokens_seen": 190358544, "step": 1291 }, { "epoch": 0.37215335831651775, "loss": 2.741013526916504, "loss_ce": 2.72432804107666, "loss_xval": 0.0167236328125, "num_input_tokens_seen": 190358544, "step": 1291 }, { "epoch": 0.3724416258287691, "grad_norm": 3.155953361472079, "learning_rate": 0.0001, "loss": 3.0072, "num_input_tokens_seen": 190493376, "step": 1292 }, { "epoch": 0.3724416258287691, "loss": 2.8824682235717773, "loss_ce": 2.8714895248413086, "loss_xval": 0.010986328125, "num_input_tokens_seen": 190493376, "step": 1292 }, { "epoch": 0.37272989334102047, "grad_norm": 2.677295979464333, "learning_rate": 0.0001, "loss": 2.9604, "num_input_tokens_seen": 190665872, "step": 1293 }, { "epoch": 0.37272989334102047, "loss": 3.2144699096679688, "loss_ce": 3.202442169189453, "loss_xval": 0.01202392578125, "num_input_tokens_seen": 190665872, "step": 1293 }, { "epoch": 0.37301816085327183, "grad_norm": 2.6246963289673144, "learning_rate": 0.0001, "loss": 2.7918, "num_input_tokens_seen": 190800688, "step": 1294 }, { "epoch": 0.37301816085327183, "loss": 2.69120717048645, "loss_ce": 2.6758530139923096, "loss_xval": 0.015380859375, "num_input_tokens_seen": 190800688, "step": 1294 }, { "epoch": 0.3733064283655232, "grad_norm": 3.4790256525746, "learning_rate": 0.0001, "loss": 2.9933, "num_input_tokens_seen": 190935744, "step": 1295 }, { "epoch": 0.3733064283655232, "loss": 2.8375144004821777, "loss_ce": 2.8264365196228027, "loss_xval": 0.0111083984375, "num_input_tokens_seen": 190935744, "step": 1295 }, { "epoch": 0.37359469587777455, "grad_norm": 2.8185221481052762, "learning_rate": 0.0001, "loss": 2.8728, "num_input_tokens_seen": 191108200, "step": 1296 }, { "epoch": 0.37359469587777455, "loss": 3.109882354736328, "loss_ce": 3.100902557373047, "loss_xval": 0.00897216796875, "num_input_tokens_seen": 191108200, "step": 1296 }, { "epoch": 0.37388296339002597, "grad_norm": 3.0080030797377493, "learning_rate": 0.0001, "loss": 2.7236, "num_input_tokens_seen": 191242960, "step": 1297 }, { "epoch": 0.37388296339002597, "loss": 2.617522954940796, "loss_ce": 2.5978238582611084, "loss_xval": 0.0196533203125, "num_input_tokens_seen": 191242960, "step": 1297 }, { "epoch": 0.37417123090227733, "grad_norm": 3.4015052487127733, "learning_rate": 0.0001, "loss": 2.9454, "num_input_tokens_seen": 191378048, "step": 1298 }, { "epoch": 0.37417123090227733, "loss": 2.8113441467285156, "loss_ce": 2.799163818359375, "loss_xval": 0.01220703125, "num_input_tokens_seen": 191378048, "step": 1298 }, { "epoch": 0.3744594984145287, "grad_norm": 3.1268663171814013, "learning_rate": 0.0001, "loss": 2.7806, "num_input_tokens_seen": 191550472, "step": 1299 }, { "epoch": 0.3744594984145287, "loss": 2.998398780822754, "loss_ce": 2.9863977432250977, "loss_xval": 0.01202392578125, "num_input_tokens_seen": 191550472, "step": 1299 }, { "epoch": 0.37474776592678005, "grad_norm": 3.4915024725359314, "learning_rate": 0.0001, "loss": 2.6396, "num_input_tokens_seen": 191685248, "step": 1300 }, { "epoch": 0.37474776592678005, "loss": 2.528428554534912, "loss_ce": 2.5081725120544434, "loss_xval": 0.020263671875, "num_input_tokens_seen": 191685248, "step": 1300 }, { "epoch": 0.3750360334390314, "grad_norm": 4.385977856328783, "learning_rate": 0.0001, "loss": 2.8341, "num_input_tokens_seen": 191820368, "step": 1301 }, { "epoch": 0.3750360334390314, "loss": 2.688603401184082, "loss_ce": 2.677090644836426, "loss_xval": 0.01153564453125, "num_input_tokens_seen": 191820368, "step": 1301 }, { "epoch": 0.3753243009512828, "grad_norm": 3.9234660899501947, "learning_rate": 0.0001, "loss": 2.6939, "num_input_tokens_seen": 191992848, "step": 1302 }, { "epoch": 0.3753243009512828, "loss": 2.939253330230713, "loss_ce": 2.9202942848205566, "loss_xval": 0.0189208984375, "num_input_tokens_seen": 191992848, "step": 1302 }, { "epoch": 0.37561256846353414, "grad_norm": 4.531139506615138, "learning_rate": 0.0001, "loss": 2.5161, "num_input_tokens_seen": 192127608, "step": 1303 }, { "epoch": 0.37561256846353414, "loss": 2.4107353687286377, "loss_ce": 2.385352373123169, "loss_xval": 0.025390625, "num_input_tokens_seen": 192127608, "step": 1303 }, { "epoch": 0.37590083597578555, "grad_norm": 4.966347398089012, "learning_rate": 0.0001, "loss": 2.7311, "num_input_tokens_seen": 192262696, "step": 1304 }, { "epoch": 0.37590083597578555, "loss": 2.579951763153076, "loss_ce": 2.568812847137451, "loss_xval": 0.0111083984375, "num_input_tokens_seen": 192262696, "step": 1304 }, { "epoch": 0.3761891034880369, "grad_norm": 4.887690213966296, "learning_rate": 0.0001, "loss": 2.5636, "num_input_tokens_seen": 192435160, "step": 1305 }, { "epoch": 0.3761891034880369, "loss": 2.807976007461548, "loss_ce": 2.7880098819732666, "loss_xval": 0.02001953125, "num_input_tokens_seen": 192435160, "step": 1305 }, { "epoch": 0.3764773710002883, "grad_norm": 5.596325586836034, "learning_rate": 0.0001, "loss": 2.3777, "num_input_tokens_seen": 192569928, "step": 1306 }, { "epoch": 0.3764773710002883, "loss": 2.276872158050537, "loss_ce": 2.245103359222412, "loss_xval": 0.03173828125, "num_input_tokens_seen": 192569928, "step": 1306 }, { "epoch": 0.37676563851253964, "grad_norm": 5.342816876559327, "learning_rate": 0.0001, "loss": 2.5818, "num_input_tokens_seen": 192705032, "step": 1307 }, { "epoch": 0.37676563851253964, "loss": 2.4396731853485107, "loss_ce": 2.425520658493042, "loss_xval": 0.01416015625, "num_input_tokens_seen": 192705032, "step": 1307 }, { "epoch": 0.377053906024791, "grad_norm": 5.187116600816184, "learning_rate": 0.0001, "loss": 2.4064, "num_input_tokens_seen": 192877544, "step": 1308 }, { "epoch": 0.377053906024791, "loss": 2.6360042095184326, "loss_ce": 2.61273455619812, "loss_xval": 0.0233154296875, "num_input_tokens_seen": 192877544, "step": 1308 }, { "epoch": 0.37734217353704236, "grad_norm": 5.392912569319807, "learning_rate": 0.0001, "loss": 2.2167, "num_input_tokens_seen": 193012368, "step": 1309 }, { "epoch": 0.37734217353704236, "loss": 2.115727663040161, "loss_ce": 2.085911989212036, "loss_xval": 0.02978515625, "num_input_tokens_seen": 193012368, "step": 1309 }, { "epoch": 0.3776304410492937, "grad_norm": 5.981915610875332, "learning_rate": 0.0001, "loss": 2.4108, "num_input_tokens_seen": 193147368, "step": 1310 }, { "epoch": 0.3776304410492937, "loss": 2.273207664489746, "loss_ce": 2.25192928314209, "loss_xval": 0.021240234375, "num_input_tokens_seen": 193147368, "step": 1310 }, { "epoch": 0.37791870856154514, "grad_norm": 5.340813411172702, "learning_rate": 0.0001, "loss": 2.2597, "num_input_tokens_seen": 193319824, "step": 1311 }, { "epoch": 0.37791870856154514, "loss": 2.519282579421997, "loss_ce": 2.4933578968048096, "loss_xval": 0.02587890625, "num_input_tokens_seen": 193319824, "step": 1311 }, { "epoch": 0.3782069760737965, "grad_norm": 5.5534880340477635, "learning_rate": 0.0001, "loss": 2.0486, "num_input_tokens_seen": 193454552, "step": 1312 }, { "epoch": 0.3782069760737965, "loss": 1.9361722469329834, "loss_ce": 1.902007818222046, "loss_xval": 0.0341796875, "num_input_tokens_seen": 193454552, "step": 1312 }, { "epoch": 0.37849524358604786, "grad_norm": 5.382293657982696, "learning_rate": 0.0001, "loss": 2.2029, "num_input_tokens_seen": 193589712, "step": 1313 }, { "epoch": 0.37849524358604786, "loss": 2.060214042663574, "loss_ce": 2.0438222885131836, "loss_xval": 0.016357421875, "num_input_tokens_seen": 193589712, "step": 1313 }, { "epoch": 0.3787835110982992, "grad_norm": 5.398565780604443, "learning_rate": 0.0001, "loss": 2.0614, "num_input_tokens_seen": 193762352, "step": 1314 }, { "epoch": 0.3787835110982992, "loss": 2.323631763458252, "loss_ce": 2.3039631843566895, "loss_xval": 0.0196533203125, "num_input_tokens_seen": 193762352, "step": 1314 }, { "epoch": 0.3790717786105506, "grad_norm": 6.089319320163971, "learning_rate": 0.0001, "loss": 1.8249, "num_input_tokens_seen": 193897064, "step": 1315 }, { "epoch": 0.3790717786105506, "loss": 1.7183966636657715, "loss_ce": 1.693814754486084, "loss_xval": 0.0245361328125, "num_input_tokens_seen": 193897064, "step": 1315 }, { "epoch": 0.37936004612280194, "grad_norm": 6.00686223988536, "learning_rate": 0.0001, "loss": 1.9911, "num_input_tokens_seen": 194032176, "step": 1316 }, { "epoch": 0.37936004612280194, "loss": 1.8674993515014648, "loss_ce": 1.8527288436889648, "loss_xval": 0.0147705078125, "num_input_tokens_seen": 194032176, "step": 1316 }, { "epoch": 0.3796483136350533, "grad_norm": 6.384847108031611, "learning_rate": 0.0001, "loss": 1.7857, "num_input_tokens_seen": 194204600, "step": 1317 }, { "epoch": 0.3796483136350533, "loss": 2.01257586479187, "loss_ce": 1.9936702251434326, "loss_xval": 0.0189208984375, "num_input_tokens_seen": 194204600, "step": 1317 }, { "epoch": 0.3799365811473047, "grad_norm": 7.057178007836694, "learning_rate": 0.0001, "loss": 1.5645, "num_input_tokens_seen": 194339368, "step": 1318 }, { "epoch": 0.3799365811473047, "loss": 1.46513032913208, "loss_ce": 1.43845796585083, "loss_xval": 0.026611328125, "num_input_tokens_seen": 194339368, "step": 1318 }, { "epoch": 0.3802248486595561, "grad_norm": 7.503319455739542, "learning_rate": 0.0001, "loss": 1.6781, "num_input_tokens_seen": 194474432, "step": 1319 }, { "epoch": 0.3802248486595561, "loss": 1.5578892230987549, "loss_ce": 1.5376484394073486, "loss_xval": 0.020263671875, "num_input_tokens_seen": 194474432, "step": 1319 }, { "epoch": 0.38051311617180744, "grad_norm": 7.2656151061317304, "learning_rate": 0.0001, "loss": 1.5075, "num_input_tokens_seen": 194647032, "step": 1320 }, { "epoch": 0.38051311617180744, "loss": 1.741824984550476, "loss_ce": 1.7225226163864136, "loss_xval": 0.019287109375, "num_input_tokens_seen": 194647032, "step": 1320 }, { "epoch": 0.3808013836840588, "grad_norm": 7.579715480943731, "learning_rate": 0.0001, "loss": 1.2582, "num_input_tokens_seen": 194781840, "step": 1321 }, { "epoch": 0.3808013836840588, "loss": 1.1731231212615967, "loss_ce": 1.1493346691131592, "loss_xval": 0.0238037109375, "num_input_tokens_seen": 194781840, "step": 1321 }, { "epoch": 0.38108965119631016, "grad_norm": 7.840261357937563, "learning_rate": 0.0001, "loss": 1.3399, "num_input_tokens_seen": 194916848, "step": 1322 }, { "epoch": 0.38108965119631016, "loss": 1.1873873472213745, "loss_ce": 1.1677416563034058, "loss_xval": 0.0196533203125, "num_input_tokens_seen": 194916848, "step": 1322 }, { "epoch": 0.3813779187085615, "grad_norm": 8.101935906238767, "learning_rate": 0.0001, "loss": 1.1675, "num_input_tokens_seen": 195089424, "step": 1323 }, { "epoch": 0.3813779187085615, "loss": 1.3871322870254517, "loss_ce": 1.367089867591858, "loss_xval": 0.02001953125, "num_input_tokens_seen": 195089424, "step": 1323 }, { "epoch": 0.38166618622081294, "grad_norm": 10.133729748812186, "learning_rate": 0.0001, "loss": 0.9085, "num_input_tokens_seen": 195224168, "step": 1324 }, { "epoch": 0.38166618622081294, "loss": 0.8485641479492188, "loss_ce": 0.8170852661132812, "loss_xval": 0.031494140625, "num_input_tokens_seen": 195224168, "step": 1324 }, { "epoch": 0.3819544537330643, "grad_norm": 9.184673489968239, "learning_rate": 0.0001, "loss": 0.9749, "num_input_tokens_seen": 195359168, "step": 1325 }, { "epoch": 0.3819544537330643, "loss": 0.8387224674224854, "loss_ce": 0.8197863101959229, "loss_xval": 0.0189208984375, "num_input_tokens_seen": 195359168, "step": 1325 }, { "epoch": 0.38224272124531566, "grad_norm": 8.462154836119543, "learning_rate": 0.0001, "loss": 0.8089, "num_input_tokens_seen": 195531720, "step": 1326 }, { "epoch": 0.38224272124531566, "loss": 1.000867247581482, "loss_ce": 0.9825414419174194, "loss_xval": 0.018310546875, "num_input_tokens_seen": 195531720, "step": 1326 }, { "epoch": 0.382530988757567, "grad_norm": 8.262590257922607, "learning_rate": 0.0001, "loss": 0.5586, "num_input_tokens_seen": 195666576, "step": 1327 }, { "epoch": 0.382530988757567, "loss": 0.5303316116333008, "loss_ce": 0.507390022277832, "loss_xval": 0.02294921875, "num_input_tokens_seen": 195666576, "step": 1327 }, { "epoch": 0.3828192562698184, "grad_norm": 8.508081307007473, "learning_rate": 0.0001, "loss": 0.6148, "num_input_tokens_seen": 195801680, "step": 1328 }, { "epoch": 0.3828192562698184, "loss": 0.4951069951057434, "loss_ce": 0.4788067936897278, "loss_xval": 0.016357421875, "num_input_tokens_seen": 195801680, "step": 1328 }, { "epoch": 0.38310752378206975, "grad_norm": 7.359277842901975, "learning_rate": 0.0001, "loss": 0.4927, "num_input_tokens_seen": 195974144, "step": 1329 }, { "epoch": 0.38310752378206975, "loss": 0.6365176439285278, "loss_ce": 0.6132174730300903, "loss_xval": 0.0233154296875, "num_input_tokens_seen": 195974144, "step": 1329 }, { "epoch": 0.3833957912943211, "grad_norm": 5.939757783528577, "learning_rate": 0.0001, "loss": 0.2844, "num_input_tokens_seen": 196108816, "step": 1330 }, { "epoch": 0.3833957912943211, "loss": 0.29377344250679016, "loss_ce": 0.26371362805366516, "loss_xval": 0.030029296875, "num_input_tokens_seen": 196108816, "step": 1330 }, { "epoch": 0.3836840588065725, "grad_norm": 6.159986104423088, "learning_rate": 0.0001, "loss": 0.3409, "num_input_tokens_seen": 196244112, "step": 1331 }, { "epoch": 0.3836840588065725, "loss": 0.23033326864242554, "loss_ce": 0.20579713582992554, "loss_xval": 0.0245361328125, "num_input_tokens_seen": 196244112, "step": 1331 }, { "epoch": 0.3839723263188239, "grad_norm": 4.77282512062471, "learning_rate": 0.0001, "loss": 0.2807, "num_input_tokens_seen": 196416592, "step": 1332 }, { "epoch": 0.3839723263188239, "loss": 0.35939767956733704, "loss_ce": 0.3379667103290558, "loss_xval": 0.021484375, "num_input_tokens_seen": 196416592, "step": 1332 }, { "epoch": 0.38426059383107525, "grad_norm": 4.051100825473909, "learning_rate": 0.0001, "loss": 0.1492, "num_input_tokens_seen": 196551408, "step": 1333 }, { "epoch": 0.38426059383107525, "loss": 0.1672614961862564, "loss_ce": 0.14396895468235016, "loss_xval": 0.0233154296875, "num_input_tokens_seen": 196551408, "step": 1333 }, { "epoch": 0.3845488613433266, "grad_norm": 4.06293877344607, "learning_rate": 0.0001, "loss": 0.2038, "num_input_tokens_seen": 196686448, "step": 1334 }, { "epoch": 0.3845488613433266, "loss": 0.1264897584915161, "loss_ce": 0.10885824263095856, "loss_xval": 0.017578125, "num_input_tokens_seen": 196686448, "step": 1334 }, { "epoch": 0.38483712885557797, "grad_norm": 3.037738848199535, "learning_rate": 0.0001, "loss": 0.1679, "num_input_tokens_seen": 196858936, "step": 1335 }, { "epoch": 0.38483712885557797, "loss": 0.2194724977016449, "loss_ce": 0.19611892104148865, "loss_xval": 0.0233154296875, "num_input_tokens_seen": 196858936, "step": 1335 }, { "epoch": 0.38512539636782933, "grad_norm": 3.9874713827981862, "learning_rate": 0.0001, "loss": 0.0949, "num_input_tokens_seen": 196993680, "step": 1336 }, { "epoch": 0.38512539636782933, "loss": 0.09953860938549042, "loss_ce": 0.07335452735424042, "loss_xval": 0.026123046875, "num_input_tokens_seen": 196993680, "step": 1336 }, { "epoch": 0.3854136638800807, "grad_norm": 3.842858227911105, "learning_rate": 0.0001, "loss": 0.1263, "num_input_tokens_seen": 197128720, "step": 1337 }, { "epoch": 0.3854136638800807, "loss": 0.07819326221942902, "loss_ce": 0.061484888195991516, "loss_xval": 0.0167236328125, "num_input_tokens_seen": 197128720, "step": 1337 }, { "epoch": 0.3857019313923321, "grad_norm": 3.7661320103755913, "learning_rate": 0.0001, "loss": 0.1149, "num_input_tokens_seen": 197301192, "step": 1338 }, { "epoch": 0.3857019313923321, "loss": 0.1548667848110199, "loss_ce": 0.1375785768032074, "loss_xval": 0.017333984375, "num_input_tokens_seen": 197301192, "step": 1338 }, { "epoch": 0.38599019890458347, "grad_norm": 2.0935235814016324, "learning_rate": 0.0001, "loss": 0.063, "num_input_tokens_seen": 197436008, "step": 1339 }, { "epoch": 0.38599019890458347, "loss": 0.06575188040733337, "loss_ce": 0.04433616250753403, "loss_xval": 0.0213623046875, "num_input_tokens_seen": 197436008, "step": 1339 }, { "epoch": 0.38627846641683483, "grad_norm": 9.864120567956032, "learning_rate": 0.0001, "loss": 0.0988, "num_input_tokens_seen": 197571040, "step": 1340 }, { "epoch": 0.38627846641683483, "loss": 0.04486007243394852, "loss_ce": 0.030917353928089142, "loss_xval": 0.013916015625, "num_input_tokens_seen": 197571040, "step": 1340 }, { "epoch": 0.3865667339290862, "grad_norm": 2.030815217945069, "learning_rate": 0.0001, "loss": 0.0922, "num_input_tokens_seen": 197743672, "step": 1341 }, { "epoch": 0.3865667339290862, "loss": 0.1213720515370369, "loss_ce": 0.1082647442817688, "loss_xval": 0.01312255859375, "num_input_tokens_seen": 197743672, "step": 1341 }, { "epoch": 0.38685500144133755, "grad_norm": 1.9671558855419173, "learning_rate": 0.0001, "loss": 0.052, "num_input_tokens_seen": 197878456, "step": 1342 }, { "epoch": 0.38685500144133755, "loss": 0.05662340298295021, "loss_ce": 0.03942674770951271, "loss_xval": 0.0172119140625, "num_input_tokens_seen": 197878456, "step": 1342 }, { "epoch": 0.3871432689535889, "grad_norm": 2.4283101718006312, "learning_rate": 0.0001, "loss": 0.0894, "num_input_tokens_seen": 198013456, "step": 1343 }, { "epoch": 0.3871432689535889, "loss": 0.04897967353463173, "loss_ce": 0.03391161933541298, "loss_xval": 0.01507568359375, "num_input_tokens_seen": 198013456, "step": 1343 }, { "epoch": 0.3874315364658403, "grad_norm": 2.7356786477473936, "learning_rate": 0.0001, "loss": 0.0838, "num_input_tokens_seen": 198186000, "step": 1344 }, { "epoch": 0.3874315364658403, "loss": 0.10481792688369751, "loss_ce": 0.08823162317276001, "loss_xval": 0.0166015625, "num_input_tokens_seen": 198186000, "step": 1344 }, { "epoch": 0.3877198039780917, "grad_norm": 1.3907189763308183, "learning_rate": 0.0001, "loss": 0.0507, "num_input_tokens_seen": 198320848, "step": 1345 }, { "epoch": 0.3877198039780917, "loss": 0.05430437996983528, "loss_ce": 0.03217913582921028, "loss_xval": 0.0220947265625, "num_input_tokens_seen": 198320848, "step": 1345 }, { "epoch": 0.38800807149034305, "grad_norm": 4.385028674819962, "learning_rate": 0.0001, "loss": 0.0654, "num_input_tokens_seen": 198456080, "step": 1346 }, { "epoch": 0.38800807149034305, "loss": 0.037943094968795776, "loss_ce": 0.020853251218795776, "loss_xval": 0.01708984375, "num_input_tokens_seen": 198456080, "step": 1346 }, { "epoch": 0.3882963390025944, "grad_norm": 2.0849586519173657, "learning_rate": 0.0001, "loss": 0.0545, "num_input_tokens_seen": 198628672, "step": 1347 }, { "epoch": 0.3882963390025944, "loss": 0.06738258898258209, "loss_ce": 0.051727067679166794, "loss_xval": 0.015625, "num_input_tokens_seen": 198628672, "step": 1347 }, { "epoch": 0.3885846065148458, "grad_norm": 1.3551925654534938, "learning_rate": 0.0001, "loss": 0.0343, "num_input_tokens_seen": 198763384, "step": 1348 }, { "epoch": 0.3885846065148458, "loss": 0.045364461839199066, "loss_ce": 0.029098594561219215, "loss_xval": 0.0162353515625, "num_input_tokens_seen": 198763384, "step": 1348 }, { "epoch": 0.38887287402709714, "grad_norm": 1.2264368422957106, "learning_rate": 0.0001, "loss": 0.0338, "num_input_tokens_seen": 198898480, "step": 1349 }, { "epoch": 0.38887287402709714, "loss": 0.025431422516703606, "loss_ce": 0.009157923981547356, "loss_xval": 0.0162353515625, "num_input_tokens_seen": 198898480, "step": 1349 }, { "epoch": 0.3891611415393485, "grad_norm": 3.0852120648983212, "learning_rate": 0.0001, "loss": 0.049, "num_input_tokens_seen": 199070928, "step": 1350 }, { "epoch": 0.3891611415393485, "loss": 0.04758167266845703, "loss_ce": 0.03284931182861328, "loss_xval": 0.01470947265625, "num_input_tokens_seen": 199070928, "step": 1350 }, { "epoch": 0.38944940905159986, "grad_norm": 4.074322664880206, "learning_rate": 0.0001, "loss": 0.0351, "num_input_tokens_seen": 199205680, "step": 1351 }, { "epoch": 0.38944940905159986, "loss": 0.043620556592941284, "loss_ce": 0.027080029249191284, "loss_xval": 0.0166015625, "num_input_tokens_seen": 199205680, "step": 1351 }, { "epoch": 0.3897376765638513, "grad_norm": 1.149605580995087, "learning_rate": 0.0001, "loss": 0.0348, "num_input_tokens_seen": 199340672, "step": 1352 }, { "epoch": 0.3897376765638513, "loss": 0.021780086681246758, "loss_ce": 0.008981777355074883, "loss_xval": 0.0128173828125, "num_input_tokens_seen": 199340672, "step": 1352 }, { "epoch": 0.39002594407610264, "grad_norm": 1.139692322477526, "learning_rate": 0.0001, "loss": 0.0338, "num_input_tokens_seen": 199513160, "step": 1353 }, { "epoch": 0.39002594407610264, "loss": 0.041254542768001556, "loss_ce": 0.03088238276541233, "loss_xval": 0.0103759765625, "num_input_tokens_seen": 199513160, "step": 1353 }, { "epoch": 0.390314211588354, "grad_norm": 0.806504861954892, "learning_rate": 0.0001, "loss": 0.0278, "num_input_tokens_seen": 199647992, "step": 1354 }, { "epoch": 0.390314211588354, "loss": 0.034722212702035904, "loss_ce": 0.023095015436410904, "loss_xval": 0.0115966796875, "num_input_tokens_seen": 199647992, "step": 1354 }, { "epoch": 0.39060247910060536, "grad_norm": 2.061867339897939, "learning_rate": 0.0001, "loss": 0.0331, "num_input_tokens_seen": 199782968, "step": 1355 }, { "epoch": 0.39060247910060536, "loss": 0.01884021796286106, "loss_ce": 0.009284400381147861, "loss_xval": 0.00958251953125, "num_input_tokens_seen": 199782968, "step": 1355 }, { "epoch": 0.3908907466128567, "grad_norm": 1.4719134776992004, "learning_rate": 0.0001, "loss": 0.0379, "num_input_tokens_seen": 199955464, "step": 1356 }, { "epoch": 0.3908907466128567, "loss": 0.044432446360588074, "loss_ce": 0.0333278626203537, "loss_xval": 0.0111083984375, "num_input_tokens_seen": 199955464, "step": 1356 }, { "epoch": 0.3911790141251081, "grad_norm": 1.8078027247986106, "learning_rate": 0.0001, "loss": 0.0257, "num_input_tokens_seen": 200090192, "step": 1357 }, { "epoch": 0.3911790141251081, "loss": 0.031802982091903687, "loss_ce": 0.019832465797662735, "loss_xval": 0.011962890625, "num_input_tokens_seen": 200090192, "step": 1357 }, { "epoch": 0.39146728163735944, "grad_norm": 1.1044953565112376, "learning_rate": 0.0001, "loss": 0.0228, "num_input_tokens_seen": 200225288, "step": 1358 }, { "epoch": 0.39146728163735944, "loss": 0.016087032854557037, "loss_ce": 0.009964443743228912, "loss_xval": 0.006134033203125, "num_input_tokens_seen": 200225288, "step": 1358 }, { "epoch": 0.39175554914961086, "grad_norm": 0.9945927909285693, "learning_rate": 0.0001, "loss": 0.0254, "num_input_tokens_seen": 200397888, "step": 1359 }, { "epoch": 0.39175554914961086, "loss": 0.02666654624044895, "loss_ce": 0.018461132422089577, "loss_xval": 0.0081787109375, "num_input_tokens_seen": 200397888, "step": 1359 }, { "epoch": 0.3920438166618622, "grad_norm": 0.494338140152747, "learning_rate": 0.0001, "loss": 0.0188, "num_input_tokens_seen": 200532680, "step": 1360 }, { "epoch": 0.3920438166618622, "loss": 0.02553568407893181, "loss_ce": 0.013740640133619308, "loss_xval": 0.01177978515625, "num_input_tokens_seen": 200532680, "step": 1360 }, { "epoch": 0.3923320841741136, "grad_norm": 1.3856157113105607, "learning_rate": 0.0001, "loss": 0.0176, "num_input_tokens_seen": 200667592, "step": 1361 }, { "epoch": 0.3923320841741136, "loss": 0.011884558014571667, "loss_ce": 0.0035341857001185417, "loss_xval": 0.00836181640625, "num_input_tokens_seen": 200667592, "step": 1361 }, { "epoch": 0.39262035168636494, "grad_norm": 0.7369679296640576, "learning_rate": 0.0001, "loss": 0.0224, "num_input_tokens_seen": 200839960, "step": 1362 }, { "epoch": 0.39262035168636494, "loss": 0.02030220441520214, "loss_ce": 0.01264992170035839, "loss_xval": 0.007659912109375, "num_input_tokens_seen": 200839960, "step": 1362 }, { "epoch": 0.3929086191986163, "grad_norm": 1.2784878807936497, "learning_rate": 0.0001, "loss": 0.018, "num_input_tokens_seen": 200974784, "step": 1363 }, { "epoch": 0.3929086191986163, "loss": 0.027085553854703903, "loss_ce": 0.019082318991422653, "loss_xval": 0.00799560546875, "num_input_tokens_seen": 200974784, "step": 1363 }, { "epoch": 0.39319688671086767, "grad_norm": 0.973016613201439, "learning_rate": 0.0001, "loss": 0.0135, "num_input_tokens_seen": 201109784, "step": 1364 }, { "epoch": 0.39319688671086767, "loss": 0.008179396390914917, "loss_ce": 0.0029017634224146605, "loss_xval": 0.005279541015625, "num_input_tokens_seen": 201109784, "step": 1364 }, { "epoch": 0.393485154223119, "grad_norm": 0.7713254196549865, "learning_rate": 0.0001, "loss": 0.0226, "num_input_tokens_seen": 201282384, "step": 1365 }, { "epoch": 0.393485154223119, "loss": 0.015684355050325394, "loss_ce": 0.010984647087752819, "loss_xval": 0.00469970703125, "num_input_tokens_seen": 201282384, "step": 1365 }, { "epoch": 0.39377342173537044, "grad_norm": 0.9308851890759041, "learning_rate": 0.0001, "loss": 0.0155, "num_input_tokens_seen": 201417136, "step": 1366 }, { "epoch": 0.39377342173537044, "loss": 0.023319490253925323, "loss_ce": 0.015459306538105011, "loss_xval": 0.00787353515625, "num_input_tokens_seen": 201417136, "step": 1366 }, { "epoch": 0.3940616892476218, "grad_norm": 0.6404491910207054, "learning_rate": 0.0001, "loss": 0.0118, "num_input_tokens_seen": 201552240, "step": 1367 }, { "epoch": 0.3940616892476218, "loss": 0.008579766377806664, "loss_ce": 0.0033879633992910385, "loss_xval": 0.00518798828125, "num_input_tokens_seen": 201552240, "step": 1367 }, { "epoch": 0.39434995675987317, "grad_norm": 0.6986686066295519, "learning_rate": 0.0001, "loss": 0.0211, "num_input_tokens_seen": 201724872, "step": 1368 }, { "epoch": 0.39434995675987317, "loss": 0.02358327805995941, "loss_ce": 0.018536433577537537, "loss_xval": 0.005035400390625, "num_input_tokens_seen": 201724872, "step": 1368 }, { "epoch": 0.3946382242721245, "grad_norm": 0.8428508345107775, "learning_rate": 0.0001, "loss": 0.0121, "num_input_tokens_seen": 201859616, "step": 1369 }, { "epoch": 0.3946382242721245, "loss": 0.017313163727521896, "loss_ce": 0.01113144587725401, "loss_xval": 0.006195068359375, "num_input_tokens_seen": 201859616, "step": 1369 }, { "epoch": 0.3949264917843759, "grad_norm": 0.8350447749155997, "learning_rate": 0.0001, "loss": 0.0129, "num_input_tokens_seen": 201994760, "step": 1370 }, { "epoch": 0.3949264917843759, "loss": 0.007084501441568136, "loss_ce": 0.0034090406261384487, "loss_xval": 0.0036773681640625, "num_input_tokens_seen": 201994760, "step": 1370 }, { "epoch": 0.39521475929662725, "grad_norm": 0.9544725638554615, "learning_rate": 0.0001, "loss": 0.0193, "num_input_tokens_seen": 202167240, "step": 1371 }, { "epoch": 0.39521475929662725, "loss": 0.014551658183336258, "loss_ce": 0.011322516947984695, "loss_xval": 0.00323486328125, "num_input_tokens_seen": 202167240, "step": 1371 }, { "epoch": 0.39550302680887867, "grad_norm": 0.6628523590881815, "learning_rate": 0.0001, "loss": 0.0116, "num_input_tokens_seen": 202302224, "step": 1372 }, { "epoch": 0.39550302680887867, "loss": 0.01799912378191948, "loss_ce": 0.011701060459017754, "loss_xval": 0.00628662109375, "num_input_tokens_seen": 202302224, "step": 1372 }, { "epoch": 0.39579129432113, "grad_norm": 0.46876947007397934, "learning_rate": 0.0001, "loss": 0.0115, "num_input_tokens_seen": 202437312, "step": 1373 }, { "epoch": 0.39579129432113, "loss": 0.009276005439460278, "loss_ce": 0.00552234286442399, "loss_xval": 0.003753662109375, "num_input_tokens_seen": 202437312, "step": 1373 }, { "epoch": 0.3960795618333814, "grad_norm": 1.8683003497946353, "learning_rate": 0.0001, "loss": 0.0165, "num_input_tokens_seen": 202609792, "step": 1374 }, { "epoch": 0.3960795618333814, "loss": 0.01582440733909607, "loss_ce": 0.011563390493392944, "loss_xval": 0.0042724609375, "num_input_tokens_seen": 202609792, "step": 1374 }, { "epoch": 0.39636782934563275, "grad_norm": 0.2811121747222105, "learning_rate": 0.0001, "loss": 0.0111, "num_input_tokens_seen": 202744600, "step": 1375 }, { "epoch": 0.39636782934563275, "loss": 0.01710539683699608, "loss_ce": 0.012304600328207016, "loss_xval": 0.004791259765625, "num_input_tokens_seen": 202744600, "step": 1375 }, { "epoch": 0.3966560968578841, "grad_norm": 1.500880723672373, "learning_rate": 0.0001, "loss": 0.0083, "num_input_tokens_seen": 202879656, "step": 1376 }, { "epoch": 0.3966560968578841, "loss": 0.004621902015060186, "loss_ce": 0.002170005114749074, "loss_xval": 0.0024566650390625, "num_input_tokens_seen": 202879656, "step": 1376 }, { "epoch": 0.39694436437013547, "grad_norm": 1.098144830738785, "learning_rate": 0.0001, "loss": 0.0174, "num_input_tokens_seen": 203052192, "step": 1377 }, { "epoch": 0.39694436437013547, "loss": 0.01708078756928444, "loss_ce": 0.012887481600046158, "loss_xval": 0.004180908203125, "num_input_tokens_seen": 203052192, "step": 1377 }, { "epoch": 0.39723263188238683, "grad_norm": 0.46400715903117085, "learning_rate": 0.0001, "loss": 0.011, "num_input_tokens_seen": 203186880, "step": 1378 }, { "epoch": 0.39723263188238683, "loss": 0.017731178551912308, "loss_ce": 0.013500679284334183, "loss_xval": 0.004241943359375, "num_input_tokens_seen": 203186880, "step": 1378 }, { "epoch": 0.39752089939463825, "grad_norm": 1.944436649986528, "learning_rate": 0.0001, "loss": 0.0073, "num_input_tokens_seen": 203321928, "step": 1379 }, { "epoch": 0.39752089939463825, "loss": 0.004524157382547855, "loss_ce": 0.002213404979556799, "loss_xval": 0.0023040771484375, "num_input_tokens_seen": 203321928, "step": 1379 }, { "epoch": 0.3978091669068896, "grad_norm": 1.8472827415052435, "learning_rate": 0.0001, "loss": 0.0133, "num_input_tokens_seen": 203494472, "step": 1380 }, { "epoch": 0.3978091669068896, "loss": 0.00947756040841341, "loss_ce": 0.006767218001186848, "loss_xval": 0.002716064453125, "num_input_tokens_seen": 203494472, "step": 1380 }, { "epoch": 0.39809743441914097, "grad_norm": 1.0137558806292657, "learning_rate": 0.0001, "loss": 0.0122, "num_input_tokens_seen": 203629256, "step": 1381 }, { "epoch": 0.39809743441914097, "loss": 0.014530917629599571, "loss_ce": 0.010992785915732384, "loss_xval": 0.0035400390625, "num_input_tokens_seen": 203629256, "step": 1381 }, { "epoch": 0.39838570193139233, "grad_norm": 1.4902002985671599, "learning_rate": 0.0001, "loss": 0.0074, "num_input_tokens_seen": 203764232, "step": 1382 }, { "epoch": 0.39838570193139233, "loss": 0.005032642744481564, "loss_ce": 0.002303226850926876, "loss_xval": 0.0027313232421875, "num_input_tokens_seen": 203764232, "step": 1382 }, { "epoch": 0.3986739694436437, "grad_norm": 1.9965416544832317, "learning_rate": 0.0001, "loss": 0.0115, "num_input_tokens_seen": 203936776, "step": 1383 }, { "epoch": 0.3986739694436437, "loss": 0.010603894479572773, "loss_ce": 0.00803088117390871, "loss_xval": 0.0025787353515625, "num_input_tokens_seen": 203936776, "step": 1383 }, { "epoch": 0.39896223695589506, "grad_norm": 0.7761101964481655, "learning_rate": 0.0001, "loss": 0.0121, "num_input_tokens_seen": 204071632, "step": 1384 }, { "epoch": 0.39896223695589506, "loss": 0.016848795115947723, "loss_ce": 0.013341179117560387, "loss_xval": 0.003509521484375, "num_input_tokens_seen": 204071632, "step": 1384 }, { "epoch": 0.3992505044681464, "grad_norm": 1.9679208914351352, "learning_rate": 0.0001, "loss": 0.0089, "num_input_tokens_seen": 204206592, "step": 1385 }, { "epoch": 0.3992505044681464, "loss": 0.004215900786221027, "loss_ce": 0.001496975775808096, "loss_xval": 0.002716064453125, "num_input_tokens_seen": 204206592, "step": 1385 }, { "epoch": 0.39953877198039783, "grad_norm": 1.9443220159955805, "learning_rate": 0.0001, "loss": 0.0117, "num_input_tokens_seen": 204378992, "step": 1386 }, { "epoch": 0.39953877198039783, "loss": 0.007329082116484642, "loss_ce": 0.0047522541135549545, "loss_xval": 0.0025787353515625, "num_input_tokens_seen": 204378992, "step": 1386 }, { "epoch": 0.3998270394926492, "grad_norm": 0.9898315702902115, "learning_rate": 0.0001, "loss": 0.0142, "num_input_tokens_seen": 204513824, "step": 1387 }, { "epoch": 0.3998270394926492, "loss": 0.016465310007333755, "loss_ce": 0.013494614511728287, "loss_xval": 0.0029754638671875, "num_input_tokens_seen": 204513824, "step": 1387 }, { "epoch": 0.40011530700490056, "grad_norm": 1.7837581013050972, "learning_rate": 0.0001, "loss": 0.0045, "num_input_tokens_seen": 204648864, "step": 1388 }, { "epoch": 0.40011530700490056, "loss": 0.0035922147799283266, "loss_ce": 0.001431188895367086, "loss_xval": 0.002166748046875, "num_input_tokens_seen": 204648864, "step": 1388 }, { "epoch": 0.4004035745171519, "grad_norm": 1.231230547855382, "learning_rate": 0.0001, "loss": 0.0146, "num_input_tokens_seen": 204821408, "step": 1389 }, { "epoch": 0.4004035745171519, "loss": 0.006246610544621944, "loss_ce": 0.004241033457219601, "loss_xval": 0.0019989013671875, "num_input_tokens_seen": 204821408, "step": 1389 }, { "epoch": 0.4006918420294033, "grad_norm": 0.45237187857054606, "learning_rate": 0.0001, "loss": 0.0091, "num_input_tokens_seen": 204956256, "step": 1390 }, { "epoch": 0.4006918420294033, "loss": 0.014893912710249424, "loss_ce": 0.011824988760054111, "loss_xval": 0.0030670166015625, "num_input_tokens_seen": 204956256, "step": 1390 }, { "epoch": 0.40098010954165464, "grad_norm": 2.2686371794323406, "learning_rate": 0.0001, "loss": 0.0088, "num_input_tokens_seen": 205091240, "step": 1391 }, { "epoch": 0.40098010954165464, "loss": 0.008735474199056625, "loss_ce": 0.006970223039388657, "loss_xval": 0.00176239013671875, "num_input_tokens_seen": 205091240, "step": 1391 }, { "epoch": 0.401268377053906, "grad_norm": 1.7761112950576003, "learning_rate": 0.0001, "loss": 0.0103, "num_input_tokens_seen": 205263688, "step": 1392 }, { "epoch": 0.401268377053906, "loss": 0.00662598293274641, "loss_ce": 0.004417273215949535, "loss_xval": 0.0022125244140625, "num_input_tokens_seen": 205263688, "step": 1392 }, { "epoch": 0.4015566445661574, "grad_norm": 0.8047908874304497, "learning_rate": 0.0001, "loss": 0.0094, "num_input_tokens_seen": 205398456, "step": 1393 }, { "epoch": 0.4015566445661574, "loss": 0.015318496152758598, "loss_ce": 0.01345787849277258, "loss_xval": 0.001861572265625, "num_input_tokens_seen": 205398456, "step": 1393 }, { "epoch": 0.4018449120784088, "grad_norm": 2.1236601899090837, "learning_rate": 0.0001, "loss": 0.0088, "num_input_tokens_seen": 205533424, "step": 1394 }, { "epoch": 0.4018449120784088, "loss": 0.0035114334896206856, "loss_ce": 0.0014324233634397388, "loss_xval": 0.0020751953125, "num_input_tokens_seen": 205533424, "step": 1394 }, { "epoch": 0.40213317959066014, "grad_norm": 1.465889493088798, "learning_rate": 0.0001, "loss": 0.0106, "num_input_tokens_seen": 205705968, "step": 1395 }, { "epoch": 0.40213317959066014, "loss": 0.007632310502231121, "loss_ce": 0.006197507493197918, "loss_xval": 0.001434326171875, "num_input_tokens_seen": 205705968, "step": 1395 }, { "epoch": 0.4024214471029115, "grad_norm": 1.1457608913972734, "learning_rate": 0.0001, "loss": 0.0083, "num_input_tokens_seen": 205840784, "step": 1396 }, { "epoch": 0.4024214471029115, "loss": 0.013797684572637081, "loss_ce": 0.011010094545781612, "loss_xval": 0.0027923583984375, "num_input_tokens_seen": 205840784, "step": 1396 }, { "epoch": 0.40270971461516286, "grad_norm": 2.24633480417486, "learning_rate": 0.0001, "loss": 0.0047, "num_input_tokens_seen": 205975904, "step": 1397 }, { "epoch": 0.40270971461516286, "loss": 0.0036817528307437897, "loss_ce": 0.0019260382978245616, "loss_xval": 0.0017547607421875, "num_input_tokens_seen": 205975904, "step": 1397 }, { "epoch": 0.4029979821274142, "grad_norm": 0.4192766702455911, "learning_rate": 0.0001, "loss": 0.0077, "num_input_tokens_seen": 206148408, "step": 1398 }, { "epoch": 0.4029979821274142, "loss": 0.005562697537243366, "loss_ce": 0.004066859371960163, "loss_xval": 0.001495361328125, "num_input_tokens_seen": 206148408, "step": 1398 }, { "epoch": 0.4032862496396656, "grad_norm": 2.0131614169700516, "learning_rate": 0.0001, "loss": 0.0104, "num_input_tokens_seen": 206283240, "step": 1399 }, { "epoch": 0.4032862496396656, "loss": 0.018222136422991753, "loss_ce": 0.015307707712054253, "loss_xval": 0.0029144287109375, "num_input_tokens_seen": 206283240, "step": 1399 }, { "epoch": 0.403574517151917, "grad_norm": 0.8860071239442986, "learning_rate": 0.0001, "loss": 0.0041, "num_input_tokens_seen": 206418328, "step": 1400 }, { "epoch": 0.403574517151917, "loss": 0.00250605377368629, "loss_ce": 0.0011155966203659773, "loss_xval": 0.0013885498046875, "num_input_tokens_seen": 206418328, "step": 1400 }, { "epoch": 0.40386278466416836, "grad_norm": 0.5472273941624404, "learning_rate": 0.0001, "loss": 0.0105, "num_input_tokens_seen": 206590920, "step": 1401 }, { "epoch": 0.40386278466416836, "loss": 0.005478480365127325, "loss_ce": 0.0040613203309476376, "loss_xval": 0.0014190673828125, "num_input_tokens_seen": 206590920, "step": 1401 }, { "epoch": 0.4041510521764197, "grad_norm": 0.9079608553207296, "learning_rate": 0.0001, "loss": 0.0092, "num_input_tokens_seen": 206725768, "step": 1402 }, { "epoch": 0.4041510521764197, "loss": 0.015894869342446327, "loss_ce": 0.014095762744545937, "loss_xval": 0.001800537109375, "num_input_tokens_seen": 206725768, "step": 1402 }, { "epoch": 0.4044393196886711, "grad_norm": 1.020068142575505, "learning_rate": 0.0001, "loss": 0.0039, "num_input_tokens_seen": 206860864, "step": 1403 }, { "epoch": 0.4044393196886711, "loss": 0.0025417383294552565, "loss_ce": 0.0014025743585079908, "loss_xval": 0.00113677978515625, "num_input_tokens_seen": 206860864, "step": 1403 }, { "epoch": 0.40472758720092245, "grad_norm": 0.7036622702056449, "learning_rate": 0.0001, "loss": 0.0082, "num_input_tokens_seen": 207033376, "step": 1404 }, { "epoch": 0.40472758720092245, "loss": 0.003258179873228073, "loss_ce": 0.002056073397397995, "loss_xval": 0.0012054443359375, "num_input_tokens_seen": 207033376, "step": 1404 }, { "epoch": 0.4050158547131738, "grad_norm": 1.2717500529898216, "learning_rate": 0.0001, "loss": 0.0081, "num_input_tokens_seen": 207168192, "step": 1405 }, { "epoch": 0.4050158547131738, "loss": 0.013880310580134392, "loss_ce": 0.011514244601130486, "loss_xval": 0.0023651123046875, "num_input_tokens_seen": 207168192, "step": 1405 }, { "epoch": 0.40530412222542517, "grad_norm": 0.3145447559978116, "learning_rate": 0.0001, "loss": 0.0038, "num_input_tokens_seen": 207303128, "step": 1406 }, { "epoch": 0.40530412222542517, "loss": 0.0033487228211015463, "loss_ce": 0.002387657528743148, "loss_xval": 0.0009613037109375, "num_input_tokens_seen": 207303128, "step": 1406 }, { "epoch": 0.4055923897376766, "grad_norm": 0.596831291599684, "learning_rate": 0.0001, "loss": 0.0078, "num_input_tokens_seen": 207475568, "step": 1407 }, { "epoch": 0.4055923897376766, "loss": 0.004142426885664463, "loss_ce": 0.0028172959573566914, "loss_xval": 0.0013275146484375, "num_input_tokens_seen": 207475568, "step": 1407 }, { "epoch": 0.40588065724992795, "grad_norm": 0.20424467166333724, "learning_rate": 0.0001, "loss": 0.0071, "num_input_tokens_seen": 207610408, "step": 1408 }, { "epoch": 0.40588065724992795, "loss": 0.012081284075975418, "loss_ce": 0.010298389941453934, "loss_xval": 0.0017852783203125, "num_input_tokens_seen": 207610408, "step": 1408 }, { "epoch": 0.4061689247621793, "grad_norm": 0.38475049889262486, "learning_rate": 0.0001, "loss": 0.0034, "num_input_tokens_seen": 207745656, "step": 1409 }, { "epoch": 0.4061689247621793, "loss": 0.0022318486589938402, "loss_ce": 0.001303923549130559, "loss_xval": 0.000926971435546875, "num_input_tokens_seen": 207745656, "step": 1409 }, { "epoch": 0.40645719227443067, "grad_norm": 0.44183879593684616, "learning_rate": 0.0001, "loss": 0.0087, "num_input_tokens_seen": 207918072, "step": 1410 }, { "epoch": 0.40645719227443067, "loss": 0.003880149219185114, "loss_ce": 0.0025850594975054264, "loss_xval": 0.0012969970703125, "num_input_tokens_seen": 207918072, "step": 1410 }, { "epoch": 0.40674545978668203, "grad_norm": 0.5407026287377048, "learning_rate": 0.0001, "loss": 0.0079, "num_input_tokens_seen": 208052832, "step": 1411 }, { "epoch": 0.40674545978668203, "loss": 0.013677089475095272, "loss_ce": 0.011812656186521053, "loss_xval": 0.001861572265625, "num_input_tokens_seen": 208052832, "step": 1411 }, { "epoch": 0.4070337272989334, "grad_norm": 0.32728890027750474, "learning_rate": 0.0001, "loss": 0.0029, "num_input_tokens_seen": 208187952, "step": 1412 }, { "epoch": 0.4070337272989334, "loss": 0.0026520793326199055, "loss_ce": 0.0017924610292539, "loss_xval": 0.000858306884765625, "num_input_tokens_seen": 208187952, "step": 1412 }, { "epoch": 0.4073219948111848, "grad_norm": 0.42571153146403357, "learning_rate": 0.0001, "loss": 0.0078, "num_input_tokens_seen": 208360488, "step": 1413 }, { "epoch": 0.4073219948111848, "loss": 0.005467297974973917, "loss_ce": 0.004301669541746378, "loss_xval": 0.00116729736328125, "num_input_tokens_seen": 208360488, "step": 1413 }, { "epoch": 0.40761026232343617, "grad_norm": 0.17038287039479713, "learning_rate": 0.0001, "loss": 0.0062, "num_input_tokens_seen": 208495232, "step": 1414 }, { "epoch": 0.40761026232343617, "loss": 0.010554078966379166, "loss_ce": 0.009341483004391193, "loss_xval": 0.00121307373046875, "num_input_tokens_seen": 208495232, "step": 1414 }, { "epoch": 0.40789852983568753, "grad_norm": 0.19734809280784743, "learning_rate": 0.0001, "loss": 0.0029, "num_input_tokens_seen": 208630288, "step": 1415 }, { "epoch": 0.40789852983568753, "loss": 0.002154544461518526, "loss_ce": 0.0012824092991650105, "loss_xval": 0.000873565673828125, "num_input_tokens_seen": 208630288, "step": 1415 }, { "epoch": 0.4081867973479389, "grad_norm": 0.31871405754082566, "learning_rate": 0.0001, "loss": 0.0078, "num_input_tokens_seen": 208802840, "step": 1416 }, { "epoch": 0.4081867973479389, "loss": 0.00468908017501235, "loss_ce": 0.0037377900443971157, "loss_xval": 0.000949859619140625, "num_input_tokens_seen": 208802840, "step": 1416 }, { "epoch": 0.40847506486019025, "grad_norm": 0.5962579211692774, "learning_rate": 0.0001, "loss": 0.0099, "num_input_tokens_seen": 208937632, "step": 1417 }, { "epoch": 0.40847506486019025, "loss": 0.017369378358125687, "loss_ce": 0.015037644654512405, "loss_xval": 0.0023345947265625, "num_input_tokens_seen": 208937632, "step": 1417 }, { "epoch": 0.4087633323724416, "grad_norm": 0.9337263657247876, "learning_rate": 0.0001, "loss": 0.0054, "num_input_tokens_seen": 209072952, "step": 1418 }, { "epoch": 0.4087633323724416, "loss": 0.0018941645976155996, "loss_ce": 0.0009700541850179434, "loss_xval": 0.00092315673828125, "num_input_tokens_seen": 209072952, "step": 1418 }, { "epoch": 0.409051599884693, "grad_norm": 0.4026671847572958, "learning_rate": 0.0001, "loss": 0.0076, "num_input_tokens_seen": 209245536, "step": 1419 }, { "epoch": 0.409051599884693, "loss": 0.006634651683270931, "loss_ce": 0.005855022929608822, "loss_xval": 0.0007781982421875, "num_input_tokens_seen": 209245536, "step": 1419 }, { "epoch": 0.4093398673969444, "grad_norm": 0.5314251752133781, "learning_rate": 0.0001, "loss": 0.0056, "num_input_tokens_seen": 209380256, "step": 1420 }, { "epoch": 0.4093398673969444, "loss": 0.009849036112427711, "loss_ce": 0.008433545008301735, "loss_xval": 0.0014190673828125, "num_input_tokens_seen": 209380256, "step": 1420 }, { "epoch": 0.40962813490919575, "grad_norm": 0.20906559044152748, "learning_rate": 0.0001, "loss": 0.0049, "num_input_tokens_seen": 209515304, "step": 1421 }, { "epoch": 0.40962813490919575, "loss": 0.002209738129749894, "loss_ce": 0.0014544283039867878, "loss_xval": 0.00075531005859375, "num_input_tokens_seen": 209515304, "step": 1421 }, { "epoch": 0.4099164024214471, "grad_norm": 0.26415178813757945, "learning_rate": 0.0001, "loss": 0.0073, "num_input_tokens_seen": 209687688, "step": 1422 }, { "epoch": 0.4099164024214471, "loss": 0.002843210706487298, "loss_ce": 0.0018246864201501012, "loss_xval": 0.0010223388671875, "num_input_tokens_seen": 209687688, "step": 1422 }, { "epoch": 0.4102046699336985, "grad_norm": 0.5415690781467778, "learning_rate": 0.0001, "loss": 0.0071, "num_input_tokens_seen": 209822464, "step": 1423 }, { "epoch": 0.4102046699336985, "loss": 0.012525908648967743, "loss_ce": 0.011467330157756805, "loss_xval": 0.00106048583984375, "num_input_tokens_seen": 209822464, "step": 1423 }, { "epoch": 0.41049293744594983, "grad_norm": 0.8888751647930228, "learning_rate": 0.0001, "loss": 0.0034, "num_input_tokens_seen": 209957616, "step": 1424 }, { "epoch": 0.41049293744594983, "loss": 0.0023013115860521793, "loss_ce": 0.0016084673115983605, "loss_xval": 0.00069427490234375, "num_input_tokens_seen": 209957616, "step": 1424 }, { "epoch": 0.4107812049582012, "grad_norm": 0.3638501786792088, "learning_rate": 0.0001, "loss": 0.0073, "num_input_tokens_seen": 210130112, "step": 1425 }, { "epoch": 0.4107812049582012, "loss": 0.0025470005348324776, "loss_ce": 0.0015520796878263354, "loss_xval": 0.0009918212890625, "num_input_tokens_seen": 210130112, "step": 1425 }, { "epoch": 0.41106947247045256, "grad_norm": 0.39490131021782965, "learning_rate": 0.0001, "loss": 0.007, "num_input_tokens_seen": 210264872, "step": 1426 }, { "epoch": 0.41106947247045256, "loss": 0.011294020339846611, "loss_ce": 0.010136259719729424, "loss_xval": 0.00115966796875, "num_input_tokens_seen": 210264872, "step": 1426 }, { "epoch": 0.411357739982704, "grad_norm": 0.5864661523256884, "learning_rate": 0.0001, "loss": 0.0033, "num_input_tokens_seen": 210399880, "step": 1427 }, { "epoch": 0.411357739982704, "loss": 0.0016113725723698735, "loss_ce": 0.0009144750656560063, "loss_xval": 0.000698089599609375, "num_input_tokens_seen": 210399880, "step": 1427 }, { "epoch": 0.41164600749495533, "grad_norm": 0.34916345557654377, "learning_rate": 0.0001, "loss": 0.01, "num_input_tokens_seen": 210572416, "step": 1428 }, { "epoch": 0.41164600749495533, "loss": 0.0048339469358325005, "loss_ce": 0.0039680106565356255, "loss_xval": 0.000865936279296875, "num_input_tokens_seen": 210572416, "step": 1428 }, { "epoch": 0.4119342750072067, "grad_norm": 0.29858260038323814, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 210707192, "step": 1429 }, { "epoch": 0.4119342750072067, "loss": 0.011549213901162148, "loss_ce": 0.010494926944375038, "loss_xval": 0.0010528564453125, "num_input_tokens_seen": 210707192, "step": 1429 }, { "epoch": 0.41222254251945806, "grad_norm": 0.33206058421520657, "learning_rate": 0.0001, "loss": 0.0019, "num_input_tokens_seen": 210842240, "step": 1430 }, { "epoch": 0.41222254251945806, "loss": 0.001508349203504622, "loss_ce": 0.0008450687164440751, "loss_xval": 0.00066375732421875, "num_input_tokens_seen": 210842240, "step": 1430 }, { "epoch": 0.4125108100317094, "grad_norm": 0.40896687569605294, "learning_rate": 0.0001, "loss": 0.0063, "num_input_tokens_seen": 211014728, "step": 1431 }, { "epoch": 0.4125108100317094, "loss": 0.003282485995441675, "loss_ce": 0.002321182284504175, "loss_xval": 0.0009613037109375, "num_input_tokens_seen": 211014728, "step": 1431 }, { "epoch": 0.4127990775439608, "grad_norm": 0.3321492134803992, "learning_rate": 0.0001, "loss": 0.0075, "num_input_tokens_seen": 211149584, "step": 1432 }, { "epoch": 0.4127990775439608, "loss": 0.013707758858799934, "loss_ce": 0.012455584481358528, "loss_xval": 0.001251220703125, "num_input_tokens_seen": 211149584, "step": 1432 }, { "epoch": 0.41308734505621214, "grad_norm": 0.3826896427237596, "learning_rate": 0.0001, "loss": 0.003, "num_input_tokens_seen": 211284688, "step": 1433 }, { "epoch": 0.41308734505621214, "loss": 0.0016002252232283354, "loss_ce": 0.0010268285404890776, "loss_xval": 0.00057220458984375, "num_input_tokens_seen": 211284688, "step": 1433 }, { "epoch": 0.41337561256846356, "grad_norm": 0.3051905722859315, "learning_rate": 0.0001, "loss": 0.0061, "num_input_tokens_seen": 211457040, "step": 1434 }, { "epoch": 0.41337561256846356, "loss": 0.002180540468543768, "loss_ce": 0.0013804077170789242, "loss_xval": 0.00080108642578125, "num_input_tokens_seen": 211457040, "step": 1434 }, { "epoch": 0.4136638800807149, "grad_norm": 0.6753060604545037, "learning_rate": 0.0001, "loss": 0.0062, "num_input_tokens_seen": 211591744, "step": 1435 }, { "epoch": 0.4136638800807149, "loss": 0.011045737192034721, "loss_ce": 0.00985841266810894, "loss_xval": 0.001190185546875, "num_input_tokens_seen": 211591744, "step": 1435 }, { "epoch": 0.4139521475929663, "grad_norm": 0.47751356907696696, "learning_rate": 0.0001, "loss": 0.004, "num_input_tokens_seen": 211726792, "step": 1436 }, { "epoch": 0.4139521475929663, "loss": 0.0013247502502053976, "loss_ce": 0.0006855500396341085, "loss_xval": 0.000640869140625, "num_input_tokens_seen": 211726792, "step": 1436 }, { "epoch": 0.41424041510521764, "grad_norm": 0.48960108155618903, "learning_rate": 0.0001, "loss": 0.0104, "num_input_tokens_seen": 211899272, "step": 1437 }, { "epoch": 0.41424041510521764, "loss": 0.008046119473874569, "loss_ce": 0.0073160817846655846, "loss_xval": 0.000728607177734375, "num_input_tokens_seen": 211899272, "step": 1437 }, { "epoch": 0.414528682617469, "grad_norm": 0.712602502926373, "learning_rate": 0.0001, "loss": 0.0066, "num_input_tokens_seen": 212034000, "step": 1438 }, { "epoch": 0.414528682617469, "loss": 0.01144714467227459, "loss_ce": 0.010413838550448418, "loss_xval": 0.00102996826171875, "num_input_tokens_seen": 212034000, "step": 1438 }, { "epoch": 0.41481695012972036, "grad_norm": 0.4055228692809266, "learning_rate": 0.0001, "loss": 0.0039, "num_input_tokens_seen": 212169088, "step": 1439 }, { "epoch": 0.41481695012972036, "loss": 0.003060694085434079, "loss_ce": 0.0025781351141631603, "loss_xval": 0.0004825592041015625, "num_input_tokens_seen": 212169088, "step": 1439 }, { "epoch": 0.4151052176419717, "grad_norm": 0.4669629111386831, "learning_rate": 0.0001, "loss": 0.0062, "num_input_tokens_seen": 212341600, "step": 1440 }, { "epoch": 0.4151052176419717, "loss": 0.0033066237810999155, "loss_ce": 0.0024750197771936655, "loss_xval": 0.00083160400390625, "num_input_tokens_seen": 212341600, "step": 1440 }, { "epoch": 0.41539348515422314, "grad_norm": 0.272947734067498, "learning_rate": 0.0001, "loss": 0.0048, "num_input_tokens_seen": 212476392, "step": 1441 }, { "epoch": 0.41539348515422314, "loss": 0.008491845801472664, "loss_ce": 0.007533402647823095, "loss_xval": 0.000957489013671875, "num_input_tokens_seen": 212476392, "step": 1441 }, { "epoch": 0.4156817526664745, "grad_norm": 0.6574337302620793, "learning_rate": 0.0001, "loss": 0.0023, "num_input_tokens_seen": 212611504, "step": 1442 }, { "epoch": 0.4156817526664745, "loss": 0.001446253852918744, "loss_ce": 0.0008158752461895347, "loss_xval": 0.000629425048828125, "num_input_tokens_seen": 212611504, "step": 1442 }, { "epoch": 0.41597002017872586, "grad_norm": 0.5084837390614846, "learning_rate": 0.0001, "loss": 0.0058, "num_input_tokens_seen": 212784000, "step": 1443 }, { "epoch": 0.41597002017872586, "loss": 0.001965146278962493, "loss_ce": 0.001111130928620696, "loss_xval": 0.0008544921875, "num_input_tokens_seen": 212784000, "step": 1443 }, { "epoch": 0.4162582876909772, "grad_norm": 0.10623955238908554, "learning_rate": 0.0001, "loss": 0.0057, "num_input_tokens_seen": 212918800, "step": 1444 }, { "epoch": 0.4162582876909772, "loss": 0.009658853523433208, "loss_ce": 0.00887636374682188, "loss_xval": 0.000782012939453125, "num_input_tokens_seen": 212918800, "step": 1444 }, { "epoch": 0.4165465552032286, "grad_norm": 0.26211817991715775, "learning_rate": 0.0001, "loss": 0.0018, "num_input_tokens_seen": 213053888, "step": 1445 }, { "epoch": 0.4165465552032286, "loss": 0.001269388129003346, "loss_ce": 0.0007265089661814272, "loss_xval": 0.00054168701171875, "num_input_tokens_seen": 213053888, "step": 1445 }, { "epoch": 0.41683482271547995, "grad_norm": 0.3816847147583729, "learning_rate": 0.0001, "loss": 0.0081, "num_input_tokens_seen": 213226456, "step": 1446 }, { "epoch": 0.41683482271547995, "loss": 0.0040206024423241615, "loss_ce": 0.003081233473494649, "loss_xval": 0.00093841552734375, "num_input_tokens_seen": 213226456, "step": 1446 }, { "epoch": 0.4171230902277313, "grad_norm": 0.826796537032878, "learning_rate": 0.0001, "loss": 0.0071, "num_input_tokens_seen": 213361392, "step": 1447 }, { "epoch": 0.4171230902277313, "loss": 0.01285298727452755, "loss_ce": 0.011970838531851768, "loss_xval": 0.000881195068359375, "num_input_tokens_seen": 213361392, "step": 1447 }, { "epoch": 0.4174113577399827, "grad_norm": 0.7368095171873357, "learning_rate": 0.0001, "loss": 0.0042, "num_input_tokens_seen": 213496472, "step": 1448 }, { "epoch": 0.4174113577399827, "loss": 0.001192914554849267, "loss_ce": 0.0007071368163451552, "loss_xval": 0.0004863739013671875, "num_input_tokens_seen": 213496472, "step": 1448 }, { "epoch": 0.4176996252522341, "grad_norm": 0.7264266905359824, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 213669032, "step": 1449 }, { "epoch": 0.4176996252522341, "loss": 0.00256285909563303, "loss_ce": 0.00180325738620013, "loss_xval": 0.000759124755859375, "num_input_tokens_seen": 213669032, "step": 1449 }, { "epoch": 0.41798789276448545, "grad_norm": 0.8756888162528366, "learning_rate": 0.0001, "loss": 0.0076, "num_input_tokens_seen": 213803896, "step": 1450 }, { "epoch": 0.41798789276448545, "loss": 0.011397944763302803, "loss_ce": 0.010598527267575264, "loss_xval": 0.00080108642578125, "num_input_tokens_seen": 213803896, "step": 1450 }, { "epoch": 0.4182761602767368, "grad_norm": 0.6703661093710185, "learning_rate": 0.0001, "loss": 0.0019, "num_input_tokens_seen": 213938976, "step": 1451 }, { "epoch": 0.4182761602767368, "loss": 0.0011391295120120049, "loss_ce": 0.0006146086379885674, "loss_xval": 0.00052642822265625, "num_input_tokens_seen": 213938976, "step": 1451 }, { "epoch": 0.41856442778898817, "grad_norm": 1.1802014879345701, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 214111544, "step": 1452 }, { "epoch": 0.41856442778898817, "loss": 0.002023919951170683, "loss_ce": 0.0014066543662920594, "loss_xval": 0.00061798095703125, "num_input_tokens_seen": 214111544, "step": 1452 }, { "epoch": 0.41885269530123953, "grad_norm": 0.40651186118730853, "learning_rate": 0.0001, "loss": 0.0049, "num_input_tokens_seen": 214246264, "step": 1453 }, { "epoch": 0.41885269530123953, "loss": 0.008859092369675636, "loss_ce": 0.008055144920945168, "loss_xval": 0.000804901123046875, "num_input_tokens_seen": 214246264, "step": 1453 }, { "epoch": 0.41914096281349095, "grad_norm": 1.2407763603561572, "learning_rate": 0.0001, "loss": 0.0017, "num_input_tokens_seen": 214381288, "step": 1454 }, { "epoch": 0.41914096281349095, "loss": 0.0013691552449017763, "loss_ce": 0.0006729729357175529, "loss_xval": 0.00069427490234375, "num_input_tokens_seen": 214381288, "step": 1454 }, { "epoch": 0.4194292303257423, "grad_norm": 0.2690818929859582, "learning_rate": 0.0001, "loss": 0.0063, "num_input_tokens_seen": 214553744, "step": 1455 }, { "epoch": 0.4194292303257423, "loss": 0.002206414006650448, "loss_ce": 0.0016423157649114728, "loss_xval": 0.0005645751953125, "num_input_tokens_seen": 214553744, "step": 1455 }, { "epoch": 0.41971749783799367, "grad_norm": 1.0529441379573192, "learning_rate": 0.0001, "loss": 0.0052, "num_input_tokens_seen": 214688496, "step": 1456 }, { "epoch": 0.41971749783799367, "loss": 0.008993783965706825, "loss_ce": 0.008180061355233192, "loss_xval": 0.000812530517578125, "num_input_tokens_seen": 214688496, "step": 1456 }, { "epoch": 0.42000576535024503, "grad_norm": 0.8358229366317473, "learning_rate": 0.0001, "loss": 0.0014, "num_input_tokens_seen": 214823544, "step": 1457 }, { "epoch": 0.42000576535024503, "loss": 0.0011128491023555398, "loss_ce": 0.000553042278625071, "loss_xval": 0.000560760498046875, "num_input_tokens_seen": 214823544, "step": 1457 }, { "epoch": 0.4202940328624964, "grad_norm": 0.665178161614472, "learning_rate": 0.0001, "loss": 0.0076, "num_input_tokens_seen": 214996008, "step": 1458 }, { "epoch": 0.4202940328624964, "loss": 0.0021955184638500214, "loss_ce": 0.001614253968000412, "loss_xval": 0.000579833984375, "num_input_tokens_seen": 214996008, "step": 1458 }, { "epoch": 0.42058230037474775, "grad_norm": 0.9507247443250465, "learning_rate": 0.0001, "loss": 0.0044, "num_input_tokens_seen": 215130744, "step": 1459 }, { "epoch": 0.42058230037474775, "loss": 0.00750365573912859, "loss_ce": 0.006683019455522299, "loss_xval": 0.000820159912109375, "num_input_tokens_seen": 215130744, "step": 1459 }, { "epoch": 0.4208705678869991, "grad_norm": 0.35005294419163846, "learning_rate": 0.0001, "loss": 0.0019, "num_input_tokens_seen": 215265888, "step": 1460 }, { "epoch": 0.4208705678869991, "loss": 0.002211757702752948, "loss_ce": 0.001746245427057147, "loss_xval": 0.00046539306640625, "num_input_tokens_seen": 215265888, "step": 1460 }, { "epoch": 0.42115883539925053, "grad_norm": 0.8883463319709012, "learning_rate": 0.0001, "loss": 0.0061, "num_input_tokens_seen": 215438312, "step": 1461 }, { "epoch": 0.42115883539925053, "loss": 0.0020642499439418316, "loss_ce": 0.0014379244530573487, "loss_xval": 0.0006256103515625, "num_input_tokens_seen": 215438312, "step": 1461 }, { "epoch": 0.4214471029115019, "grad_norm": 0.38416254140929673, "learning_rate": 0.0001, "loss": 0.0063, "num_input_tokens_seen": 215573144, "step": 1462 }, { "epoch": 0.4214471029115019, "loss": 0.009878046810626984, "loss_ce": 0.009044773876667023, "loss_xval": 0.00083160400390625, "num_input_tokens_seen": 215573144, "step": 1462 }, { "epoch": 0.42173537042375325, "grad_norm": 1.170583224481744, "learning_rate": 0.0001, "loss": 0.0018, "num_input_tokens_seen": 215708112, "step": 1463 }, { "epoch": 0.42173537042375325, "loss": 0.0011670843232423067, "loss_ce": 0.0005476729129441082, "loss_xval": 0.00061798095703125, "num_input_tokens_seen": 215708112, "step": 1463 }, { "epoch": 0.4220236379360046, "grad_norm": 0.5614263522034938, "learning_rate": 0.0001, "loss": 0.0062, "num_input_tokens_seen": 215880648, "step": 1464 }, { "epoch": 0.4220236379360046, "loss": 0.0018251438159495592, "loss_ce": 0.0012097854632884264, "loss_xval": 0.000614166259765625, "num_input_tokens_seen": 215880648, "step": 1464 }, { "epoch": 0.422311905448256, "grad_norm": 0.4011228468575239, "learning_rate": 0.0001, "loss": 0.004, "num_input_tokens_seen": 216015600, "step": 1465 }, { "epoch": 0.422311905448256, "loss": 0.006949270609766245, "loss_ce": 0.006290758494287729, "loss_xval": 0.000659942626953125, "num_input_tokens_seen": 216015600, "step": 1465 }, { "epoch": 0.42260017296050734, "grad_norm": 0.15231738215848292, "learning_rate": 0.0001, "loss": 0.0018, "num_input_tokens_seen": 216150536, "step": 1466 }, { "epoch": 0.42260017296050734, "loss": 0.0014463100815191865, "loss_ce": 0.000987592851743102, "loss_xval": 0.000457763671875, "num_input_tokens_seen": 216150536, "step": 1466 }, { "epoch": 0.4228884404727587, "grad_norm": 0.9513563965505365, "learning_rate": 0.0001, "loss": 0.0072, "num_input_tokens_seen": 216323024, "step": 1467 }, { "epoch": 0.4228884404727587, "loss": 0.0019695989321917295, "loss_ce": 0.0012667409610003233, "loss_xval": 0.000701904296875, "num_input_tokens_seen": 216323024, "step": 1467 }, { "epoch": 0.4231767079850101, "grad_norm": 0.5957654342033346, "learning_rate": 0.0001, "loss": 0.0058, "num_input_tokens_seen": 216457800, "step": 1468 }, { "epoch": 0.4231767079850101, "loss": 0.009507661685347557, "loss_ce": 0.008915429934859276, "loss_xval": 0.000591278076171875, "num_input_tokens_seen": 216457800, "step": 1468 }, { "epoch": 0.4234649754972615, "grad_norm": 0.7614670433113088, "learning_rate": 0.0001, "loss": 0.0018, "num_input_tokens_seen": 216592960, "step": 1469 }, { "epoch": 0.4234649754972615, "loss": 0.0016541981603950262, "loss_ce": 0.0011575722601264715, "loss_xval": 0.00049591064453125, "num_input_tokens_seen": 216592960, "step": 1469 }, { "epoch": 0.42375324300951284, "grad_norm": 1.270700724520404, "learning_rate": 0.0001, "loss": 0.005, "num_input_tokens_seen": 216765584, "step": 1470 }, { "epoch": 0.42375324300951284, "loss": 0.0014716139994561672, "loss_ce": 0.0008128635818138719, "loss_xval": 0.000659942626953125, "num_input_tokens_seen": 216765584, "step": 1470 }, { "epoch": 0.4240415105217642, "grad_norm": 0.5875145747840825, "learning_rate": 0.0001, "loss": 0.0055, "num_input_tokens_seen": 216900376, "step": 1471 }, { "epoch": 0.4240415105217642, "loss": 0.009993959218263626, "loss_ce": 0.009399700909852982, "loss_xval": 0.0005950927734375, "num_input_tokens_seen": 216900376, "step": 1471 }, { "epoch": 0.42432977803401556, "grad_norm": 0.3045933975449817, "learning_rate": 0.0001, "loss": 0.0036, "num_input_tokens_seen": 217035464, "step": 1472 }, { "epoch": 0.42432977803401556, "loss": 0.000996191636659205, "loss_ce": 0.00048502214485779405, "loss_xval": 0.00051116943359375, "num_input_tokens_seen": 217035464, "step": 1472 }, { "epoch": 0.4246180455462669, "grad_norm": 0.5158276363401353, "learning_rate": 0.0001, "loss": 0.0046, "num_input_tokens_seen": 217207904, "step": 1473 }, { "epoch": 0.4246180455462669, "loss": 0.001819569617509842, "loss_ce": 0.0012697763741016388, "loss_xval": 0.00054931640625, "num_input_tokens_seen": 217207904, "step": 1473 }, { "epoch": 0.4249063130585183, "grad_norm": 0.28704228820610606, "learning_rate": 0.0001, "loss": 0.0058, "num_input_tokens_seen": 217342744, "step": 1474 }, { "epoch": 0.4249063130585183, "loss": 0.010570655576884747, "loss_ce": 0.009992490522563457, "loss_xval": 0.000579833984375, "num_input_tokens_seen": 217342744, "step": 1474 }, { "epoch": 0.4251945805707697, "grad_norm": 0.10951631417043917, "learning_rate": 0.0001, "loss": 0.0017, "num_input_tokens_seen": 217477784, "step": 1475 }, { "epoch": 0.4251945805707697, "loss": 0.0009299661032855511, "loss_ce": 0.0004757787100970745, "loss_xval": 0.000453948974609375, "num_input_tokens_seen": 217477784, "step": 1475 }, { "epoch": 0.42548284808302106, "grad_norm": 0.1979496940686989, "learning_rate": 0.0001, "loss": 0.0054, "num_input_tokens_seen": 217650280, "step": 1476 }, { "epoch": 0.42548284808302106, "loss": 0.0023711256217211485, "loss_ce": 0.0018344454001635313, "loss_xval": 0.000537872314453125, "num_input_tokens_seen": 217650280, "step": 1476 }, { "epoch": 0.4257711155952724, "grad_norm": 0.22928083269061086, "learning_rate": 0.0001, "loss": 0.0058, "num_input_tokens_seen": 217785024, "step": 1477 }, { "epoch": 0.4257711155952724, "loss": 0.010517632588744164, "loss_ce": 0.009880578145384789, "loss_xval": 0.000637054443359375, "num_input_tokens_seen": 217785024, "step": 1477 }, { "epoch": 0.4260593831075238, "grad_norm": 0.29270217498181766, "learning_rate": 0.0001, "loss": 0.0032, "num_input_tokens_seen": 217920096, "step": 1478 }, { "epoch": 0.4260593831075238, "loss": 0.0040700966492295265, "loss_ce": 0.0037148527335375547, "loss_xval": 0.000354766845703125, "num_input_tokens_seen": 217920096, "step": 1478 }, { "epoch": 0.42634765061977514, "grad_norm": 0.2624178326681669, "learning_rate": 0.0001, "loss": 0.0054, "num_input_tokens_seen": 218092560, "step": 1479 }, { "epoch": 0.42634765061977514, "loss": 0.0016003092750906944, "loss_ce": 0.0011647185310721397, "loss_xval": 0.00043487548828125, "num_input_tokens_seen": 218092560, "step": 1479 }, { "epoch": 0.4266359181320265, "grad_norm": 0.6343513122065932, "learning_rate": 0.0001, "loss": 0.0057, "num_input_tokens_seen": 218227344, "step": 1480 }, { "epoch": 0.4266359181320265, "loss": 0.010287400335073471, "loss_ce": 0.009708281606435776, "loss_xval": 0.000579833984375, "num_input_tokens_seen": 218227344, "step": 1480 }, { "epoch": 0.42692418564427786, "grad_norm": 0.39304353113000834, "learning_rate": 0.0001, "loss": 0.0011, "num_input_tokens_seen": 218362440, "step": 1481 }, { "epoch": 0.42692418564427786, "loss": 0.0008061963599175215, "loss_ce": 0.00044070067815482616, "loss_xval": 0.0003662109375, "num_input_tokens_seen": 218362440, "step": 1481 }, { "epoch": 0.4272124531565293, "grad_norm": 12.747643337776775, "learning_rate": 0.0001, "loss": 0.0642, "num_input_tokens_seen": 218535016, "step": 1482 }, { "epoch": 0.4272124531565293, "loss": 0.11798842996358871, "loss_ce": 0.0037561291828751564, "loss_xval": 0.1142578125, "num_input_tokens_seen": 218535016, "step": 1482 }, { "epoch": 0.42750072066878064, "grad_norm": 4.612439740601446, "learning_rate": 0.0001, "loss": 0.0095, "num_input_tokens_seen": 218669808, "step": 1483 }, { "epoch": 0.42750072066878064, "loss": 0.01200609840452671, "loss_ce": 0.009215647354722023, "loss_xval": 0.0027923583984375, "num_input_tokens_seen": 218669808, "step": 1483 }, { "epoch": 0.427788988181032, "grad_norm": 3.9401909327583904, "learning_rate": 0.0001, "loss": 0.0102, "num_input_tokens_seen": 218804864, "step": 1484 }, { "epoch": 0.427788988181032, "loss": 0.003662645351141691, "loss_ce": 0.0008264179341495037, "loss_xval": 0.002838134765625, "num_input_tokens_seen": 218804864, "step": 1484 }, { "epoch": 0.42807725569328337, "grad_norm": 2.515274796217096, "learning_rate": 0.0001, "loss": 0.0179, "num_input_tokens_seen": 218977456, "step": 1485 }, { "epoch": 0.42807725569328337, "loss": 0.01962754875421524, "loss_ce": 0.01210115011781454, "loss_xval": 0.007537841796875, "num_input_tokens_seen": 218977456, "step": 1485 }, { "epoch": 0.4283655232055347, "grad_norm": 5.907577211244805, "learning_rate": 0.0001, "loss": 0.0162, "num_input_tokens_seen": 219112264, "step": 1486 }, { "epoch": 0.4283655232055347, "loss": 0.021406885236501694, "loss_ce": 0.011324640363454819, "loss_xval": 0.01007080078125, "num_input_tokens_seen": 219112264, "step": 1486 }, { "epoch": 0.4286537907177861, "grad_norm": 1.8264295246632904, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 219247248, "step": 1487 }, { "epoch": 0.4286537907177861, "loss": 0.0038554510101675987, "loss_ce": 0.0008065543370321393, "loss_xval": 0.0030517578125, "num_input_tokens_seen": 219247248, "step": 1487 }, { "epoch": 0.42894205823003745, "grad_norm": 5.810858391504198, "learning_rate": 0.0001, "loss": 0.0167, "num_input_tokens_seen": 219419848, "step": 1488 }, { "epoch": 0.42894205823003745, "loss": 0.013821925967931747, "loss_ce": 0.0071347616612911224, "loss_xval": 0.006683349609375, "num_input_tokens_seen": 219419848, "step": 1488 }, { "epoch": 0.42923032574228887, "grad_norm": 6.5915972029136105, "learning_rate": 0.0001, "loss": 0.0214, "num_input_tokens_seen": 219554760, "step": 1489 }, { "epoch": 0.42923032574228887, "loss": 0.02457248792052269, "loss_ce": 0.01850711926817894, "loss_xval": 0.006072998046875, "num_input_tokens_seen": 219554760, "step": 1489 }, { "epoch": 0.4295185932545402, "grad_norm": 3.073420111985128, "learning_rate": 0.0001, "loss": 0.0097, "num_input_tokens_seen": 219689760, "step": 1490 }, { "epoch": 0.4295185932545402, "loss": 0.012378985062241554, "loss_ce": 0.005218798760324717, "loss_xval": 0.007171630859375, "num_input_tokens_seen": 219689760, "step": 1490 }, { "epoch": 0.4298068607667916, "grad_norm": 7.785497592641905, "learning_rate": 0.0001, "loss": 0.0176, "num_input_tokens_seen": 219862240, "step": 1491 }, { "epoch": 0.4298068607667916, "loss": 0.00931491144001484, "loss_ce": 0.0023263858165591955, "loss_xval": 0.006988525390625, "num_input_tokens_seen": 219862240, "step": 1491 }, { "epoch": 0.43009512827904295, "grad_norm": 2.140618993640047, "learning_rate": 0.0001, "loss": 0.0091, "num_input_tokens_seen": 219997032, "step": 1492 }, { "epoch": 0.43009512827904295, "loss": 0.013707047328352928, "loss_ce": 0.011228448711335659, "loss_xval": 0.002471923828125, "num_input_tokens_seen": 219997032, "step": 1492 }, { "epoch": 0.4303833957912943, "grad_norm": 7.906417162325639, "learning_rate": 0.0001, "loss": 0.0109, "num_input_tokens_seen": 220132048, "step": 1493 }, { "epoch": 0.4303833957912943, "loss": 0.009202736429870129, "loss_ce": 0.0014970483025535941, "loss_xval": 0.0076904296875, "num_input_tokens_seen": 220132048, "step": 1493 }, { "epoch": 0.43067166330354567, "grad_norm": 8.165985635154161, "learning_rate": 0.0001, "loss": 0.0164, "num_input_tokens_seen": 220304544, "step": 1494 }, { "epoch": 0.43067166330354567, "loss": 0.010727154091000557, "loss_ce": 0.0025369995273649693, "loss_xval": 0.0081787109375, "num_input_tokens_seen": 220304544, "step": 1494 }, { "epoch": 0.43095993081579703, "grad_norm": 2.5659378266589496, "learning_rate": 0.0001, "loss": 0.0086, "num_input_tokens_seen": 220439288, "step": 1495 }, { "epoch": 0.43095993081579703, "loss": 0.012238608673214912, "loss_ce": 0.008729087188839912, "loss_xval": 0.003509521484375, "num_input_tokens_seen": 220439288, "step": 1495 }, { "epoch": 0.43124819832804845, "grad_norm": 9.235219599695903, "learning_rate": 0.0001, "loss": 0.0149, "num_input_tokens_seen": 220574336, "step": 1496 }, { "epoch": 0.43124819832804845, "loss": 0.009838768281042576, "loss_ce": 0.000660606543533504, "loss_xval": 0.0091552734375, "num_input_tokens_seen": 220574336, "step": 1496 }, { "epoch": 0.4315364658402998, "grad_norm": 2.91361378442123, "learning_rate": 0.0001, "loss": 0.0098, "num_input_tokens_seen": 220746832, "step": 1497 }, { "epoch": 0.4315364658402998, "loss": 0.007760161068290472, "loss_ce": 0.004582518711686134, "loss_xval": 0.003173828125, "num_input_tokens_seen": 220746832, "step": 1497 }, { "epoch": 0.43182473335255117, "grad_norm": 8.514799900684112, "learning_rate": 0.0001, "loss": 0.016, "num_input_tokens_seen": 220881584, "step": 1498 }, { "epoch": 0.43182473335255117, "loss": 0.017550978809595108, "loss_ce": 0.010741745121777058, "loss_xval": 0.006805419921875, "num_input_tokens_seen": 220881584, "step": 1498 }, { "epoch": 0.43211300086480253, "grad_norm": 8.262073445905802, "learning_rate": 0.0001, "loss": 0.0115, "num_input_tokens_seen": 221016736, "step": 1499 }, { "epoch": 0.43211300086480253, "loss": 0.008337888866662979, "loss_ce": 0.0006856058025732636, "loss_xval": 0.007659912109375, "num_input_tokens_seen": 221016736, "step": 1499 }, { "epoch": 0.4324012683770539, "grad_norm": 4.097932252545542, "learning_rate": 0.0001, "loss": 0.0091, "num_input_tokens_seen": 221189264, "step": 1500 }, { "epoch": 0.4324012683770539, "eval_websight_new_IoU": 0.0, "eval_websight_new_MAE_x": 0.11273882165551186, "eval_websight_new_MAE_y": 0.13310284167528152, "eval_websight_new_NUM_probability": 0.9946940243244171, "eval_websight_new_inside_bbox": 0.0, "eval_websight_new_loss": 0.024503586813807487, "eval_websight_new_loss_ce": 0.0006861111614853144, "eval_websight_new_loss_xval": 0.02381134033203125, "eval_websight_new_runtime": 35.3106, "eval_websight_new_samples_per_second": 1.416, "eval_websight_new_steps_per_second": 0.057, "num_input_tokens_seen": 221189264, "step": 1500 }, { "epoch": 0.4324012683770539, "eval_seeclick_IoU": 0.002761509735137224, "eval_seeclick_MAE_x": 0.08998997882008553, "eval_seeclick_MAE_y": 0.09225813671946526, "eval_seeclick_NUM_probability": 0.9951084852218628, "eval_seeclick_inside_bbox": 0.1493055559694767, "eval_seeclick_loss": 0.024002188816666603, "eval_seeclick_loss_ce": 0.010312775615602732, "eval_seeclick_loss_xval": 0.014301300048828125, "eval_seeclick_runtime": 65.2763, "eval_seeclick_samples_per_second": 0.766, "eval_seeclick_steps_per_second": 0.031, "num_input_tokens_seen": 221189264, "step": 1500 }, { "epoch": 0.4324012683770539, "eval_icons_IoU": 0.0, "eval_icons_MAE_x": 0.11441490426659584, "eval_icons_MAE_y": 0.14393816888332367, "eval_icons_NUM_probability": 0.9947239756584167, "eval_icons_inside_bbox": 0.0, "eval_icons_loss": 0.03313892334699631, "eval_icons_loss_ce": 0.013403720688074827, "eval_icons_loss_xval": 0.019145965576171875, "eval_icons_runtime": 66.7805, "eval_icons_samples_per_second": 0.749, "eval_icons_steps_per_second": 0.03, "num_input_tokens_seen": 221189264, "step": 1500 }, { "epoch": 0.4324012683770539, "loss": 0.03301580622792244, "loss_ce": 0.013606625609099865, "loss_xval": 0.0194091796875, "num_input_tokens_seen": 221189264, "step": 1500 }, { "epoch": 0.43268953588930525, "grad_norm": 11.926736727618014, "learning_rate": 0.0001, "loss": 0.0258, "num_input_tokens_seen": 221324040, "step": 1501 }, { "epoch": 0.43268953588930525, "loss": 0.0268569216132164, "loss_ce": 0.0121627077460289, "loss_xval": 0.01470947265625, "num_input_tokens_seen": 221324040, "step": 1501 }, { "epoch": 0.43297780340155667, "grad_norm": 3.8871982189529573, "learning_rate": 0.0001, "loss": 0.0043, "num_input_tokens_seen": 221459064, "step": 1502 }, { "epoch": 0.43297780340155667, "loss": 0.002975265495479107, "loss_ce": 0.0007198259700089693, "loss_xval": 0.00225830078125, "num_input_tokens_seen": 221459064, "step": 1502 }, { "epoch": 0.43326607091380803, "grad_norm": 10.091104014332721, "learning_rate": 0.0001, "loss": 0.0186, "num_input_tokens_seen": 221631456, "step": 1503 }, { "epoch": 0.43326607091380803, "loss": 0.01266709715127945, "loss_ce": 0.0017875813646242023, "loss_xval": 0.0108642578125, "num_input_tokens_seen": 221631456, "step": 1503 }, { "epoch": 0.4335543384260594, "grad_norm": 12.153293017387893, "learning_rate": 0.0001, "loss": 0.0266, "num_input_tokens_seen": 221766224, "step": 1504 }, { "epoch": 0.4335543384260594, "loss": 0.02504376508295536, "loss_ce": 0.00867108441889286, "loss_xval": 0.016357421875, "num_input_tokens_seen": 221766224, "step": 1504 }, { "epoch": 0.43384260593831075, "grad_norm": 0.4609686671533385, "learning_rate": 0.0001, "loss": 0.0024, "num_input_tokens_seen": 221901320, "step": 1505 }, { "epoch": 0.43384260593831075, "loss": 0.0018655576277524233, "loss_ce": 0.0009276189375668764, "loss_xval": 0.00093841552734375, "num_input_tokens_seen": 221901320, "step": 1505 }, { "epoch": 0.4341308734505621, "grad_norm": 11.897519898174188, "learning_rate": 0.0001, "loss": 0.0252, "num_input_tokens_seen": 222073768, "step": 1506 }, { "epoch": 0.4341308734505621, "loss": 0.017838824540376663, "loss_ce": 0.0018323541153222322, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 222073768, "step": 1506 }, { "epoch": 0.4344191409628135, "grad_norm": 6.977404050056601, "learning_rate": 0.0001, "loss": 0.0152, "num_input_tokens_seen": 222208576, "step": 1507 }, { "epoch": 0.4344191409628135, "loss": 0.016943227499723434, "loss_ce": 0.010710012167692184, "loss_xval": 0.0062255859375, "num_input_tokens_seen": 222208576, "step": 1507 }, { "epoch": 0.43470740847506484, "grad_norm": 8.024467376062539, "learning_rate": 0.0001, "loss": 0.0111, "num_input_tokens_seen": 222343576, "step": 1508 }, { "epoch": 0.43470740847506484, "loss": 0.011738698929548264, "loss_ce": 0.0030793354380875826, "loss_xval": 0.0086669921875, "num_input_tokens_seen": 222343576, "step": 1508 }, { "epoch": 0.43499567598731625, "grad_norm": 12.988579617987343, "learning_rate": 0.0001, "loss": 0.0281, "num_input_tokens_seen": 222516032, "step": 1509 }, { "epoch": 0.43499567598731625, "loss": 0.01961071789264679, "loss_ce": 0.0027116083074361086, "loss_xval": 0.016845703125, "num_input_tokens_seen": 222516032, "step": 1509 }, { "epoch": 0.4352839434995676, "grad_norm": 0.8972896129622868, "learning_rate": 0.0001, "loss": 0.0082, "num_input_tokens_seen": 222650880, "step": 1510 }, { "epoch": 0.4352839434995676, "loss": 0.014146210625767708, "loss_ce": 0.012663723900914192, "loss_xval": 0.0014801025390625, "num_input_tokens_seen": 222650880, "step": 1510 }, { "epoch": 0.435572211011819, "grad_norm": 12.577305655701169, "learning_rate": 0.0001, "loss": 0.0239, "num_input_tokens_seen": 222785984, "step": 1511 }, { "epoch": 0.435572211011819, "loss": 0.019875552505254745, "loss_ce": 0.0024881642311811447, "loss_xval": 0.017333984375, "num_input_tokens_seen": 222785984, "step": 1511 }, { "epoch": 0.43586047852407034, "grad_norm": 8.79078606333596, "learning_rate": 0.0001, "loss": 0.017, "num_input_tokens_seen": 222958576, "step": 1512 }, { "epoch": 0.43586047852407034, "loss": 0.009240476414561272, "loss_ce": 0.0009969149250537157, "loss_xval": 0.00823974609375, "num_input_tokens_seen": 222958576, "step": 1512 }, { "epoch": 0.4361487460363217, "grad_norm": 6.959399811417409, "learning_rate": 0.0001, "loss": 0.0128, "num_input_tokens_seen": 223093344, "step": 1513 }, { "epoch": 0.4361487460363217, "loss": 0.017352435737848282, "loss_ce": 0.010268543846905231, "loss_xval": 0.007080078125, "num_input_tokens_seen": 223093344, "step": 1513 }, { "epoch": 0.43643701354857306, "grad_norm": 13.354758671346826, "learning_rate": 0.0001, "loss": 0.0229, "num_input_tokens_seen": 223228440, "step": 1514 }, { "epoch": 0.43643701354857306, "loss": 0.019258903339505196, "loss_ce": 0.0004600748070515692, "loss_xval": 0.018798828125, "num_input_tokens_seen": 223228440, "step": 1514 }, { "epoch": 0.4367252810608244, "grad_norm": 1.494140077747622, "learning_rate": 0.0001, "loss": 0.0071, "num_input_tokens_seen": 223400944, "step": 1515 }, { "epoch": 0.4367252810608244, "loss": 0.0038938431534916162, "loss_ce": 0.0023775009904056787, "loss_xval": 0.00151824951171875, "num_input_tokens_seen": 223400944, "step": 1515 }, { "epoch": 0.43701354857307584, "grad_norm": 13.336987482759294, "learning_rate": 0.0001, "loss": 0.0262, "num_input_tokens_seen": 223535744, "step": 1516 }, { "epoch": 0.43701354857307584, "loss": 0.026453927159309387, "loss_ce": 0.0064954305998981, "loss_xval": 0.02001953125, "num_input_tokens_seen": 223535744, "step": 1516 }, { "epoch": 0.4373018160853272, "grad_norm": 10.647586085016922, "learning_rate": 0.0001, "loss": 0.0155, "num_input_tokens_seen": 223670856, "step": 1517 }, { "epoch": 0.4373018160853272, "loss": 0.012804286554455757, "loss_ce": 0.0009253188036382198, "loss_xval": 0.01190185546875, "num_input_tokens_seen": 223670856, "step": 1517 }, { "epoch": 0.43759008359757856, "grad_norm": 6.0494325716405015, "learning_rate": 0.0001, "loss": 0.0106, "num_input_tokens_seen": 223843392, "step": 1518 }, { "epoch": 0.43759008359757856, "loss": 0.00721039017662406, "loss_ce": 0.0020567341707646847, "loss_xval": 0.005157470703125, "num_input_tokens_seen": 223843392, "step": 1518 }, { "epoch": 0.4378783511098299, "grad_norm": 13.82175508494315, "learning_rate": 0.0001, "loss": 0.0286, "num_input_tokens_seen": 223978120, "step": 1519 }, { "epoch": 0.4378783511098299, "loss": 0.027493847534060478, "loss_ce": 0.0073064700700342655, "loss_xval": 0.0201416015625, "num_input_tokens_seen": 223978120, "step": 1519 }, { "epoch": 0.4381666186220813, "grad_norm": 1.9713786580914048, "learning_rate": 0.0001, "loss": 0.0022, "num_input_tokens_seen": 224113256, "step": 1520 }, { "epoch": 0.4381666186220813, "loss": 0.0016766273183748126, "loss_ce": 0.000655718962661922, "loss_xval": 0.0010223388671875, "num_input_tokens_seen": 224113256, "step": 1520 }, { "epoch": 0.43845488613433264, "grad_norm": 12.724023038845006, "learning_rate": 0.0001, "loss": 0.0266, "num_input_tokens_seen": 224285712, "step": 1521 }, { "epoch": 0.43845488613433264, "loss": 0.01906008832156658, "loss_ce": 0.0006045823683962226, "loss_xval": 0.0184326171875, "num_input_tokens_seen": 224285712, "step": 1521 }, { "epoch": 0.438743153646584, "grad_norm": 9.66225346832843, "learning_rate": 0.0001, "loss": 0.017, "num_input_tokens_seen": 224420576, "step": 1522 }, { "epoch": 0.438743153646584, "loss": 0.017540834844112396, "loss_ce": 0.007927797734737396, "loss_xval": 0.0096435546875, "num_input_tokens_seen": 224420576, "step": 1522 }, { "epoch": 0.4390314211588354, "grad_norm": 7.291506408736569, "learning_rate": 0.0001, "loss": 0.0086, "num_input_tokens_seen": 224555656, "step": 1523 }, { "epoch": 0.4390314211588354, "loss": 0.009626722894608974, "loss_ce": 0.0011428362922742963, "loss_xval": 0.00848388671875, "num_input_tokens_seen": 224555656, "step": 1523 }, { "epoch": 0.4393196886710868, "grad_norm": 15.069879681871626, "learning_rate": 0.0001, "loss": 0.0341, "num_input_tokens_seen": 224728192, "step": 1524 }, { "epoch": 0.4393196886710868, "loss": 0.027095580473542213, "loss_ce": 0.001460814499296248, "loss_xval": 0.025634765625, "num_input_tokens_seen": 224728192, "step": 1524 }, { "epoch": 0.43960795618333814, "grad_norm": 2.966582613154629, "learning_rate": 0.0001, "loss": 0.0056, "num_input_tokens_seen": 224862904, "step": 1525 }, { "epoch": 0.43960795618333814, "loss": 0.007811859715729952, "loss_ce": 0.006425216794013977, "loss_xval": 0.0013885498046875, "num_input_tokens_seen": 224862904, "step": 1525 }, { "epoch": 0.4398962236955895, "grad_norm": 12.893278036449145, "learning_rate": 0.0001, "loss": 0.0228, "num_input_tokens_seen": 224997968, "step": 1526 }, { "epoch": 0.4398962236955895, "loss": 0.02297937497496605, "loss_ce": 0.0010067173279821873, "loss_xval": 0.02197265625, "num_input_tokens_seen": 224997968, "step": 1526 }, { "epoch": 0.44018449120784087, "grad_norm": 11.262248783259581, "learning_rate": 0.0001, "loss": 0.0216, "num_input_tokens_seen": 225170504, "step": 1527 }, { "epoch": 0.44018449120784087, "loss": 0.015866370871663094, "loss_ce": 0.0014544444857165217, "loss_xval": 0.014404296875, "num_input_tokens_seen": 225170504, "step": 1527 }, { "epoch": 0.44047275872009223, "grad_norm": 4.933947175820202, "learning_rate": 0.0001, "loss": 0.009, "num_input_tokens_seen": 225305240, "step": 1528 }, { "epoch": 0.44047275872009223, "loss": 0.014280106872320175, "loss_ce": 0.0090959332883358, "loss_xval": 0.00518798828125, "num_input_tokens_seen": 225305240, "step": 1528 }, { "epoch": 0.4407610262323436, "grad_norm": 13.410309682135633, "learning_rate": 0.0001, "loss": 0.0244, "num_input_tokens_seen": 225440432, "step": 1529 }, { "epoch": 0.4407610262323436, "loss": 0.023734629154205322, "loss_ce": 0.0011058447416871786, "loss_xval": 0.0225830078125, "num_input_tokens_seen": 225440432, "step": 1529 }, { "epoch": 0.441049293744595, "grad_norm": 3.972873618108575, "learning_rate": 0.0001, "loss": 0.0068, "num_input_tokens_seen": 225612808, "step": 1530 }, { "epoch": 0.441049293744595, "loss": 0.0027857262175530195, "loss_ce": 0.0006599860498681664, "loss_xval": 0.0021209716796875, "num_input_tokens_seen": 225612808, "step": 1530 }, { "epoch": 0.44133756125684637, "grad_norm": 9.76112441466362, "learning_rate": 0.0001, "loss": 0.0197, "num_input_tokens_seen": 225747616, "step": 1531 }, { "epoch": 0.44133756125684637, "loss": 0.02597951330244541, "loss_ce": 0.012353414669632912, "loss_xval": 0.01361083984375, "num_input_tokens_seen": 225747616, "step": 1531 }, { "epoch": 0.44162582876909773, "grad_norm": 9.468773076230084, "learning_rate": 0.0001, "loss": 0.0132, "num_input_tokens_seen": 225882616, "step": 1532 }, { "epoch": 0.44162582876909773, "loss": 0.011748703196644783, "loss_ce": 0.0003885347687173635, "loss_xval": 0.0113525390625, "num_input_tokens_seen": 225882616, "step": 1532 }, { "epoch": 0.4419140962813491, "grad_norm": 3.5672797055456202, "learning_rate": 0.0001, "loss": 0.0091, "num_input_tokens_seen": 226055320, "step": 1533 }, { "epoch": 0.4419140962813491, "loss": 0.008645853027701378, "loss_ce": 0.006130059715360403, "loss_xval": 0.0025177001953125, "num_input_tokens_seen": 226055320, "step": 1533 }, { "epoch": 0.44220236379360045, "grad_norm": 10.752583877190121, "learning_rate": 0.0001, "loss": 0.0222, "num_input_tokens_seen": 226190064, "step": 1534 }, { "epoch": 0.44220236379360045, "loss": 0.028693024069070816, "loss_ce": 0.012366121634840965, "loss_xval": 0.016357421875, "num_input_tokens_seen": 226190064, "step": 1534 }, { "epoch": 0.4424906313058518, "grad_norm": 3.465595947976591, "learning_rate": 0.0001, "loss": 0.0033, "num_input_tokens_seen": 226325024, "step": 1535 }, { "epoch": 0.4424906313058518, "loss": 0.0033873319625854492, "loss_ce": 0.0011738539906218648, "loss_xval": 0.0022125244140625, "num_input_tokens_seen": 226325024, "step": 1535 }, { "epoch": 0.4427788988181032, "grad_norm": 7.28989255271461, "learning_rate": 0.0001, "loss": 0.0141, "num_input_tokens_seen": 226497560, "step": 1536 }, { "epoch": 0.4427788988181032, "loss": 0.010725769214332104, "loss_ce": 0.0028865656349807978, "loss_xval": 0.0078125, "num_input_tokens_seen": 226497560, "step": 1536 }, { "epoch": 0.4430671663303546, "grad_norm": 6.376705732830883, "learning_rate": 0.0001, "loss": 0.0109, "num_input_tokens_seen": 226632384, "step": 1537 }, { "epoch": 0.4430671663303546, "loss": 0.014991188421845436, "loss_ce": 0.009280585683882236, "loss_xval": 0.005706787109375, "num_input_tokens_seen": 226632384, "step": 1537 }, { "epoch": 0.44335543384260595, "grad_norm": 3.6645045269522893, "learning_rate": 0.0001, "loss": 0.0037, "num_input_tokens_seen": 226767464, "step": 1538 }, { "epoch": 0.44335543384260595, "loss": 0.00352754769846797, "loss_ce": 0.0010041255736723542, "loss_xval": 0.0025177001953125, "num_input_tokens_seen": 226767464, "step": 1538 }, { "epoch": 0.4436437013548573, "grad_norm": 8.060341189612492, "learning_rate": 0.0001, "loss": 0.0144, "num_input_tokens_seen": 226939952, "step": 1539 }, { "epoch": 0.4436437013548573, "loss": 0.010525913909077644, "loss_ce": 0.0016758155543357134, "loss_xval": 0.00885009765625, "num_input_tokens_seen": 226939952, "step": 1539 }, { "epoch": 0.4439319688671087, "grad_norm": 1.1441313267565794, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 227074760, "step": 1540 }, { "epoch": 0.4439319688671087, "loss": 0.011146601289510727, "loss_ce": 0.01025300845503807, "loss_xval": 0.00089263916015625, "num_input_tokens_seen": 227074760, "step": 1540 }, { "epoch": 0.44422023637936003, "grad_norm": 7.097086335065152, "learning_rate": 0.0001, "loss": 0.0095, "num_input_tokens_seen": 227209952, "step": 1541 }, { "epoch": 0.44422023637936003, "loss": 0.010442393831908703, "loss_ce": 0.0031830251682549715, "loss_xval": 0.00726318359375, "num_input_tokens_seen": 227209952, "step": 1541 }, { "epoch": 0.4445085038916114, "grad_norm": 5.103558104826032, "learning_rate": 0.0001, "loss": 0.0101, "num_input_tokens_seen": 227382392, "step": 1542 }, { "epoch": 0.4445085038916114, "loss": 0.004657773766666651, "loss_ce": 0.0005798626225441694, "loss_xval": 0.00408935546875, "num_input_tokens_seen": 227382392, "step": 1542 }, { "epoch": 0.4447967714038628, "grad_norm": 4.064031752568589, "learning_rate": 0.0001, "loss": 0.0082, "num_input_tokens_seen": 227517256, "step": 1543 }, { "epoch": 0.4447967714038628, "loss": 0.012953763827681541, "loss_ce": 0.010111814364790916, "loss_xval": 0.002838134765625, "num_input_tokens_seen": 227517256, "step": 1543 }, { "epoch": 0.4450850389161142, "grad_norm": 7.784847830840979, "learning_rate": 0.0001, "loss": 0.0092, "num_input_tokens_seen": 227652296, "step": 1544 }, { "epoch": 0.4450850389161142, "loss": 0.00782286748290062, "loss_ce": 0.0004109111032448709, "loss_xval": 0.007415771484375, "num_input_tokens_seen": 227652296, "step": 1544 }, { "epoch": 0.44537330642836553, "grad_norm": 1.442024922252899, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 227824768, "step": 1545 }, { "epoch": 0.44537330642836553, "loss": 0.0016156777273863554, "loss_ce": 0.0008408173453062773, "loss_xval": 0.000774383544921875, "num_input_tokens_seen": 227824768, "step": 1545 }, { "epoch": 0.4456615739406169, "grad_norm": 6.247328802780809, "learning_rate": 0.0001, "loss": 0.0112, "num_input_tokens_seen": 227959496, "step": 1546 }, { "epoch": 0.4456615739406169, "loss": 0.01608903706073761, "loss_ce": 0.01050813589245081, "loss_xval": 0.005584716796875, "num_input_tokens_seen": 227959496, "step": 1546 }, { "epoch": 0.44594984145286826, "grad_norm": 5.746839659032812, "learning_rate": 0.0001, "loss": 0.0055, "num_input_tokens_seen": 228094560, "step": 1547 }, { "epoch": 0.44594984145286826, "loss": 0.004420662298798561, "loss_ce": 0.0004896164173260331, "loss_xval": 0.003936767578125, "num_input_tokens_seen": 228094560, "step": 1547 }, { "epoch": 0.4462381089651196, "grad_norm": 1.546779408913841, "learning_rate": 0.0001, "loss": 0.0063, "num_input_tokens_seen": 228266960, "step": 1548 }, { "epoch": 0.4462381089651196, "loss": 0.003417445346713066, "loss_ce": 0.0025643836706876755, "loss_xval": 0.0008544921875, "num_input_tokens_seen": 228266960, "step": 1548 }, { "epoch": 0.446526376477371, "grad_norm": 6.600943517960566, "learning_rate": 0.0001, "loss": 0.0104, "num_input_tokens_seen": 228401736, "step": 1549 }, { "epoch": 0.446526376477371, "loss": 0.014280472882091999, "loss_ce": 0.008085404522716999, "loss_xval": 0.006195068359375, "num_input_tokens_seen": 228401736, "step": 1549 }, { "epoch": 0.4468146439896224, "grad_norm": 3.7195384151417747, "learning_rate": 0.0001, "loss": 0.0027, "num_input_tokens_seen": 228536792, "step": 1550 }, { "epoch": 0.4468146439896224, "loss": 0.002413647249341011, "loss_ce": 0.0004652906209230423, "loss_xval": 0.00194549560546875, "num_input_tokens_seen": 228536792, "step": 1550 }, { "epoch": 0.44710291150187376, "grad_norm": 3.112689711889062, "learning_rate": 0.0001, "loss": 0.0061, "num_input_tokens_seen": 228709232, "step": 1551 }, { "epoch": 0.44710291150187376, "loss": 0.0021983864717185497, "loss_ce": 0.00048463381244800985, "loss_xval": 0.00171661376953125, "num_input_tokens_seen": 228709232, "step": 1551 }, { "epoch": 0.4473911790141251, "grad_norm": 5.5390335008117, "learning_rate": 0.0001, "loss": 0.0089, "num_input_tokens_seen": 228843976, "step": 1552 }, { "epoch": 0.4473911790141251, "loss": 0.012496663257479668, "loss_ce": 0.008344365283846855, "loss_xval": 0.004150390625, "num_input_tokens_seen": 228843976, "step": 1552 }, { "epoch": 0.4476794465263765, "grad_norm": 1.0931289083866806, "learning_rate": 0.0001, "loss": 0.001, "num_input_tokens_seen": 228979048, "step": 1553 }, { "epoch": 0.4476794465263765, "loss": 0.0007365739438682795, "loss_ce": 0.0003422295849304646, "loss_xval": 0.0003948211669921875, "num_input_tokens_seen": 228979048, "step": 1553 }, { "epoch": 0.44796771403862784, "grad_norm": 4.597918279702102, "learning_rate": 0.0001, "loss": 0.0079, "num_input_tokens_seen": 229151512, "step": 1554 }, { "epoch": 0.44796771403862784, "loss": 0.00347797479480505, "loss_ce": 0.00044147565495222807, "loss_xval": 0.0030364990234375, "num_input_tokens_seen": 229151512, "step": 1554 }, { "epoch": 0.4482559815508792, "grad_norm": 4.97960024521966, "learning_rate": 0.0001, "loss": 0.0078, "num_input_tokens_seen": 229286432, "step": 1555 }, { "epoch": 0.4482559815508792, "loss": 0.011004405096173286, "loss_ce": 0.007620769552886486, "loss_xval": 0.003387451171875, "num_input_tokens_seen": 229286432, "step": 1555 }, { "epoch": 0.44854424906313056, "grad_norm": 0.2455556335516732, "learning_rate": 0.0001, "loss": 0.0008, "num_input_tokens_seen": 229421552, "step": 1556 }, { "epoch": 0.44854424906313056, "loss": 0.0007927416008897126, "loss_ce": 0.00031506994855590165, "loss_xval": 0.000476837158203125, "num_input_tokens_seen": 229421552, "step": 1556 }, { "epoch": 0.448832516575382, "grad_norm": 5.320435855878716, "learning_rate": 0.0001, "loss": 0.0086, "num_input_tokens_seen": 229594152, "step": 1557 }, { "epoch": 0.448832516575382, "loss": 0.004450153559446335, "loss_ce": 0.0004847753734793514, "loss_xval": 0.00396728515625, "num_input_tokens_seen": 229594152, "step": 1557 }, { "epoch": 0.44912078408763334, "grad_norm": 4.645618704016488, "learning_rate": 0.0001, "loss": 0.0067, "num_input_tokens_seen": 229728976, "step": 1558 }, { "epoch": 0.44912078408763334, "loss": 0.00939270481467247, "loss_ce": 0.006438221782445908, "loss_xval": 0.002960205078125, "num_input_tokens_seen": 229728976, "step": 1558 }, { "epoch": 0.4494090515998847, "grad_norm": 1.4425698536358162, "learning_rate": 0.0001, "loss": 0.0018, "num_input_tokens_seen": 229864032, "step": 1559 }, { "epoch": 0.4494090515998847, "loss": 0.0012257457710802555, "loss_ce": 0.0005226494395174086, "loss_xval": 0.000701904296875, "num_input_tokens_seen": 229864032, "step": 1559 }, { "epoch": 0.44969731911213606, "grad_norm": 6.097482104716617, "learning_rate": 0.0001, "loss": 0.01, "num_input_tokens_seen": 230036728, "step": 1560 }, { "epoch": 0.44969731911213606, "loss": 0.006726293824613094, "loss_ce": 0.0020990660414099693, "loss_xval": 0.004638671875, "num_input_tokens_seen": 230036728, "step": 1560 }, { "epoch": 0.4499855866243874, "grad_norm": 4.215032668485646, "learning_rate": 0.0001, "loss": 0.008, "num_input_tokens_seen": 230171520, "step": 1561 }, { "epoch": 0.4499855866243874, "loss": 0.012901779264211655, "loss_ce": 0.009932037442922592, "loss_xval": 0.0029754638671875, "num_input_tokens_seen": 230171520, "step": 1561 }, { "epoch": 0.4502738541366388, "grad_norm": 2.535015434197968, "learning_rate": 0.0001, "loss": 0.0017, "num_input_tokens_seen": 230306656, "step": 1562 }, { "epoch": 0.4502738541366388, "loss": 0.0014059317763894796, "loss_ce": 0.0003254187176935375, "loss_xval": 0.0010833740234375, "num_input_tokens_seen": 230306656, "step": 1562 }, { "epoch": 0.45056212164889015, "grad_norm": 6.983310804340718, "learning_rate": 0.0001, "loss": 0.0116, "num_input_tokens_seen": 230479136, "step": 1563 }, { "epoch": 0.45056212164889015, "loss": 0.007696349173784256, "loss_ce": 0.0015699451323598623, "loss_xval": 0.006134033203125, "num_input_tokens_seen": 230479136, "step": 1563 }, { "epoch": 0.45085038916114156, "grad_norm": 3.982794410049864, "learning_rate": 0.0001, "loss": 0.0073, "num_input_tokens_seen": 230613944, "step": 1564 }, { "epoch": 0.45085038916114156, "loss": 0.011464044451713562, "loss_ce": 0.008942529559135437, "loss_xval": 0.0025177001953125, "num_input_tokens_seen": 230613944, "step": 1564 }, { "epoch": 0.4511386566733929, "grad_norm": 3.6182417808098513, "learning_rate": 0.0001, "loss": 0.0026, "num_input_tokens_seen": 230748992, "step": 1565 }, { "epoch": 0.4511386566733929, "loss": 0.0019969851709902287, "loss_ce": 0.00036429459578357637, "loss_xval": 0.0016326904296875, "num_input_tokens_seen": 230748992, "step": 1565 }, { "epoch": 0.4514269241856443, "grad_norm": 7.749495228699737, "learning_rate": 0.0001, "loss": 0.0132, "num_input_tokens_seen": 230921360, "step": 1566 }, { "epoch": 0.4514269241856443, "loss": 0.00788209494203329, "loss_ce": 0.0005044708959758282, "loss_xval": 0.00738525390625, "num_input_tokens_seen": 230921360, "step": 1566 }, { "epoch": 0.45171519169789565, "grad_norm": 3.8659449472484058, "learning_rate": 0.0001, "loss": 0.0062, "num_input_tokens_seen": 231056128, "step": 1567 }, { "epoch": 0.45171519169789565, "loss": 0.0096367122605443, "loss_ce": 0.00724871177226305, "loss_xval": 0.00238037109375, "num_input_tokens_seen": 231056128, "step": 1567 }, { "epoch": 0.452003459210147, "grad_norm": 4.199783172254735, "learning_rate": 0.0001, "loss": 0.0058, "num_input_tokens_seen": 231191184, "step": 1568 }, { "epoch": 0.452003459210147, "loss": 0.007501209620386362, "loss_ce": 0.005417431704699993, "loss_xval": 0.0020904541015625, "num_input_tokens_seen": 231191184, "step": 1568 }, { "epoch": 0.45229172672239837, "grad_norm": 8.37342705145472, "learning_rate": 0.0001, "loss": 0.0156, "num_input_tokens_seen": 231363672, "step": 1569 }, { "epoch": 0.45229172672239837, "loss": 0.010011442005634308, "loss_ce": 0.0010507181286811829, "loss_xval": 0.00897216796875, "num_input_tokens_seen": 231363672, "step": 1569 }, { "epoch": 0.45257999423464973, "grad_norm": 4.643924955363012, "learning_rate": 0.0001, "loss": 0.008, "num_input_tokens_seen": 231498432, "step": 1570 }, { "epoch": 0.45257999423464973, "loss": 0.012639187276363373, "loss_ce": 0.008965633809566498, "loss_xval": 0.0036773681640625, "num_input_tokens_seen": 231498432, "step": 1570 }, { "epoch": 0.45286826174690115, "grad_norm": 3.286366249617848, "learning_rate": 0.0001, "loss": 0.0025, "num_input_tokens_seen": 231633632, "step": 1571 }, { "epoch": 0.45286826174690115, "loss": 0.0012818737886846066, "loss_ce": 0.0002752706059254706, "loss_xval": 0.001007080078125, "num_input_tokens_seen": 231633632, "step": 1571 }, { "epoch": 0.4531565292591525, "grad_norm": 7.813638068618443, "learning_rate": 0.0001, "loss": 0.0125, "num_input_tokens_seen": 231806216, "step": 1572 }, { "epoch": 0.4531565292591525, "loss": 0.00804380513727665, "loss_ce": 0.0005097782704979181, "loss_xval": 0.007537841796875, "num_input_tokens_seen": 231806216, "step": 1572 }, { "epoch": 0.45344479677140387, "grad_norm": 4.395192620509415, "learning_rate": 0.0001, "loss": 0.008, "num_input_tokens_seen": 231940920, "step": 1573 }, { "epoch": 0.45344479677140387, "loss": 0.01320941187441349, "loss_ce": 0.009152481332421303, "loss_xval": 0.004058837890625, "num_input_tokens_seen": 231940920, "step": 1573 }, { "epoch": 0.45373306428365523, "grad_norm": 3.5827965481902977, "learning_rate": 0.0001, "loss": 0.0031, "num_input_tokens_seen": 232075888, "step": 1574 }, { "epoch": 0.45373306428365523, "loss": 0.0021838760003447533, "loss_ce": 0.0009717558859847486, "loss_xval": 0.00121307373046875, "num_input_tokens_seen": 232075888, "step": 1574 }, { "epoch": 0.4540213317959066, "grad_norm": 7.978160524452476, "learning_rate": 0.0001, "loss": 0.0139, "num_input_tokens_seen": 232248472, "step": 1575 }, { "epoch": 0.4540213317959066, "loss": 0.00975605845451355, "loss_ce": 0.0014400180662050843, "loss_xval": 0.00830078125, "num_input_tokens_seen": 232248472, "step": 1575 }, { "epoch": 0.45430959930815795, "grad_norm": 4.626883789668973, "learning_rate": 0.0001, "loss": 0.0086, "num_input_tokens_seen": 232383320, "step": 1576 }, { "epoch": 0.45430959930815795, "loss": 0.014307888224720955, "loss_ce": 0.009922893717885017, "loss_xval": 0.00439453125, "num_input_tokens_seen": 232383320, "step": 1576 }, { "epoch": 0.4545978668204093, "grad_norm": 2.864667933219195, "learning_rate": 0.0001, "loss": 0.0034, "num_input_tokens_seen": 232518432, "step": 1577 }, { "epoch": 0.4545978668204093, "loss": 0.0009719881927594543, "loss_ce": 0.00024481158470734954, "loss_xval": 0.000728607177734375, "num_input_tokens_seen": 232518432, "step": 1577 }, { "epoch": 0.45488613433266073, "grad_norm": 7.476888287506308, "learning_rate": 0.0001, "loss": 0.0128, "num_input_tokens_seen": 232690968, "step": 1578 }, { "epoch": 0.45488613433266073, "loss": 0.008270667865872383, "loss_ce": 0.0009350047912448645, "loss_xval": 0.00732421875, "num_input_tokens_seen": 232690968, "step": 1578 }, { "epoch": 0.4551744018449121, "grad_norm": 5.18413115965893, "learning_rate": 0.0001, "loss": 0.0087, "num_input_tokens_seen": 232825776, "step": 1579 }, { "epoch": 0.4551744018449121, "loss": 0.014096993952989578, "loss_ce": 0.008470315486192703, "loss_xval": 0.005615234375, "num_input_tokens_seen": 232825776, "step": 1579 }, { "epoch": 0.45546266935716345, "grad_norm": 1.2851043654472376, "learning_rate": 0.0001, "loss": 0.0013, "num_input_tokens_seen": 232960856, "step": 1580 }, { "epoch": 0.45546266935716345, "loss": 0.0006986982771195471, "loss_ce": 0.00029136016382835805, "loss_xval": 0.000408172607421875, "num_input_tokens_seen": 232960856, "step": 1580 }, { "epoch": 0.4557509368694148, "grad_norm": 5.719385694562323, "learning_rate": 0.0001, "loss": 0.0088, "num_input_tokens_seen": 233133512, "step": 1581 }, { "epoch": 0.4557509368694148, "loss": 0.0053933728486299515, "loss_ce": 0.000987397157587111, "loss_xval": 0.00439453125, "num_input_tokens_seen": 233133512, "step": 1581 }, { "epoch": 0.4560392043816662, "grad_norm": 4.430066093964202, "learning_rate": 0.0001, "loss": 0.0078, "num_input_tokens_seen": 233268232, "step": 1582 }, { "epoch": 0.4560392043816662, "loss": 0.013365356251597404, "loss_ce": 0.008581725880503654, "loss_xval": 0.004791259765625, "num_input_tokens_seen": 233268232, "step": 1582 }, { "epoch": 0.45632747189391754, "grad_norm": 0.44684700881540795, "learning_rate": 0.0001, "loss": 0.001, "num_input_tokens_seen": 233403200, "step": 1583 }, { "epoch": 0.45632747189391754, "loss": 0.0007035763701424003, "loss_ce": 0.00021958664001431316, "loss_xval": 0.000484466552734375, "num_input_tokens_seen": 233403200, "step": 1583 }, { "epoch": 0.45661573940616895, "grad_norm": 3.695879823804059, "learning_rate": 0.0001, "loss": 0.0041, "num_input_tokens_seen": 233575744, "step": 1584 }, { "epoch": 0.45661573940616895, "loss": 0.0021955205593258142, "loss_ce": 0.0003329944738652557, "loss_xval": 0.001861572265625, "num_input_tokens_seen": 233575744, "step": 1584 }, { "epoch": 0.4569040069184203, "grad_norm": 2.263990778477124, "learning_rate": 0.0001, "loss": 0.0069, "num_input_tokens_seen": 233710616, "step": 1585 }, { "epoch": 0.4569040069184203, "loss": 0.01318388245999813, "loss_ce": 0.011165907606482506, "loss_xval": 0.00201416015625, "num_input_tokens_seen": 233710616, "step": 1585 }, { "epoch": 0.4571922744306717, "grad_norm": 2.0982288678549565, "learning_rate": 0.0001, "loss": 0.0021, "num_input_tokens_seen": 233845624, "step": 1586 }, { "epoch": 0.4571922744306717, "loss": 0.0013415648136287928, "loss_ce": 0.000979049364104867, "loss_xval": 0.000362396240234375, "num_input_tokens_seen": 233845624, "step": 1586 }, { "epoch": 0.45748054194292304, "grad_norm": 4.895005433909848, "learning_rate": 0.0001, "loss": 0.0084, "num_input_tokens_seen": 234018112, "step": 1587 }, { "epoch": 0.45748054194292304, "loss": 0.004058973863720894, "loss_ce": 0.0008851455640979111, "loss_xval": 0.003173828125, "num_input_tokens_seen": 234018112, "step": 1587 }, { "epoch": 0.4577688094551744, "grad_norm": 3.559117590151945, "learning_rate": 0.0001, "loss": 0.0059, "num_input_tokens_seen": 234152976, "step": 1588 }, { "epoch": 0.4577688094551744, "loss": 0.009978879243135452, "loss_ce": 0.006557096727192402, "loss_xval": 0.00341796875, "num_input_tokens_seen": 234152976, "step": 1588 }, { "epoch": 0.45805707696742576, "grad_norm": 0.49625808451973086, "learning_rate": 0.0001, "loss": 0.0013, "num_input_tokens_seen": 234288176, "step": 1589 }, { "epoch": 0.45805707696742576, "loss": 0.0014246056089177728, "loss_ce": 0.0008972237701527774, "loss_xval": 0.00052642822265625, "num_input_tokens_seen": 234288176, "step": 1589 }, { "epoch": 0.4583453444796771, "grad_norm": 3.467829765242924, "learning_rate": 0.0001, "loss": 0.0079, "num_input_tokens_seen": 234460856, "step": 1590 }, { "epoch": 0.4583453444796771, "loss": 0.002010901691392064, "loss_ce": 0.00042875594226643443, "loss_xval": 0.00157928466796875, "num_input_tokens_seen": 234460856, "step": 1590 }, { "epoch": 0.45863361199192854, "grad_norm": 3.129702725598525, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 234595656, "step": 1591 }, { "epoch": 0.45863361199192854, "loss": 0.011616665869951248, "loss_ce": 0.00884147360920906, "loss_xval": 0.002777099609375, "num_input_tokens_seen": 234595656, "step": 1591 }, { "epoch": 0.4589218795041799, "grad_norm": 0.33646227501719334, "learning_rate": 0.0001, "loss": 0.0017, "num_input_tokens_seen": 234730832, "step": 1592 }, { "epoch": 0.4589218795041799, "loss": 0.0010132191237062216, "loss_ce": 0.0002991555375047028, "loss_xval": 0.000713348388671875, "num_input_tokens_seen": 234730832, "step": 1592 }, { "epoch": 0.45921014701643126, "grad_norm": 1.961299073545222, "learning_rate": 0.0001, "loss": 0.0045, "num_input_tokens_seen": 234903184, "step": 1593 }, { "epoch": 0.45921014701643126, "loss": 0.001006225822493434, "loss_ce": 0.000321726081892848, "loss_xval": 0.000682830810546875, "num_input_tokens_seen": 234903184, "step": 1593 }, { "epoch": 0.4594984145286826, "grad_norm": 1.721840997569964, "learning_rate": 0.0001, "loss": 0.0051, "num_input_tokens_seen": 235038056, "step": 1594 }, { "epoch": 0.4594984145286826, "loss": 0.009549458511173725, "loss_ce": 0.008188565261662006, "loss_xval": 0.0013580322265625, "num_input_tokens_seen": 235038056, "step": 1594 }, { "epoch": 0.459786682040934, "grad_norm": 0.17062183891564095, "learning_rate": 0.0001, "loss": 0.0008, "num_input_tokens_seen": 235173176, "step": 1595 }, { "epoch": 0.459786682040934, "loss": 0.0006244688993319869, "loss_ce": 0.00021963415201753378, "loss_xval": 0.00040435791015625, "num_input_tokens_seen": 235173176, "step": 1595 }, { "epoch": 0.46007494955318534, "grad_norm": 1.483910756864113, "learning_rate": 0.0001, "loss": 0.0063, "num_input_tokens_seen": 235345632, "step": 1596 }, { "epoch": 0.46007494955318534, "loss": 0.0008567938930355012, "loss_ce": 0.00030771593446843326, "loss_xval": 0.00054931640625, "num_input_tokens_seen": 235345632, "step": 1596 }, { "epoch": 0.4603632170654367, "grad_norm": 0.7808873756743552, "learning_rate": 0.0001, "loss": 0.0061, "num_input_tokens_seen": 235480456, "step": 1597 }, { "epoch": 0.4603632170654367, "loss": 0.011714190244674683, "loss_ce": 0.011104553937911987, "loss_xval": 0.0006103515625, "num_input_tokens_seen": 235480456, "step": 1597 }, { "epoch": 0.4606514845776881, "grad_norm": 1.2412921623326705, "learning_rate": 0.0001, "loss": 0.001, "num_input_tokens_seen": 235615480, "step": 1598 }, { "epoch": 0.4606514845776881, "loss": 0.0004532962338998914, "loss_ce": 0.00021571211982518435, "loss_xval": 0.00023746490478515625, "num_input_tokens_seen": 235615480, "step": 1598 }, { "epoch": 0.4609397520899395, "grad_norm": 2.2899244520261948, "learning_rate": 0.0001, "loss": 0.0046, "num_input_tokens_seen": 235788088, "step": 1599 }, { "epoch": 0.4609397520899395, "loss": 0.001188235473819077, "loss_ce": 0.0003819038684014231, "loss_xval": 0.000804901123046875, "num_input_tokens_seen": 235788088, "step": 1599 }, { "epoch": 0.46122801960219084, "grad_norm": 1.2176522588627314, "learning_rate": 0.0001, "loss": 0.0049, "num_input_tokens_seen": 235922920, "step": 1600 }, { "epoch": 0.46122801960219084, "loss": 0.00936118047684431, "loss_ce": 0.008414182811975479, "loss_xval": 0.000946044921875, "num_input_tokens_seen": 235922920, "step": 1600 }, { "epoch": 0.4615162871144422, "grad_norm": 0.9907528345995467, "learning_rate": 0.0001, "loss": 0.001, "num_input_tokens_seen": 236058200, "step": 1601 }, { "epoch": 0.4615162871144422, "loss": 0.0004800662863999605, "loss_ce": 0.00026364182122051716, "loss_xval": 0.00021648406982421875, "num_input_tokens_seen": 236058200, "step": 1601 }, { "epoch": 0.46180455462669356, "grad_norm": 2.525567127516242, "learning_rate": 0.0001, "loss": 0.0058, "num_input_tokens_seen": 236230672, "step": 1602 }, { "epoch": 0.46180455462669356, "loss": 0.0014512970810756087, "loss_ce": 0.000483317649923265, "loss_xval": 0.00096893310546875, "num_input_tokens_seen": 236230672, "step": 1602 }, { "epoch": 0.4620928221389449, "grad_norm": 2.373056990094241, "learning_rate": 0.0001, "loss": 0.0044, "num_input_tokens_seen": 236365440, "step": 1603 }, { "epoch": 0.4620928221389449, "loss": 0.007948949001729488, "loss_ce": 0.006201818119734526, "loss_xval": 0.00174713134765625, "num_input_tokens_seen": 236365440, "step": 1603 }, { "epoch": 0.4623810896511963, "grad_norm": 1.3101942198197403, "learning_rate": 0.0001, "loss": 0.0018, "num_input_tokens_seen": 236500472, "step": 1604 }, { "epoch": 0.4623810896511963, "loss": 0.001278056064620614, "loss_ce": 0.00020278830197639763, "loss_xval": 0.00107574462890625, "num_input_tokens_seen": 236500472, "step": 1604 }, { "epoch": 0.4626693571634477, "grad_norm": 1.1630350641608882, "learning_rate": 0.0001, "loss": 0.0039, "num_input_tokens_seen": 236672968, "step": 1605 }, { "epoch": 0.4626693571634477, "loss": 0.0008471074397675693, "loss_ce": 0.00029254582477733493, "loss_xval": 0.000553131103515625, "num_input_tokens_seen": 236672968, "step": 1605 }, { "epoch": 0.46295762467569906, "grad_norm": 2.9590444115680916, "learning_rate": 0.0001, "loss": 0.0066, "num_input_tokens_seen": 236807832, "step": 1606 }, { "epoch": 0.46295762467569906, "loss": 0.010720960795879364, "loss_ce": 0.009732477366924286, "loss_xval": 0.0009918212890625, "num_input_tokens_seen": 236807832, "step": 1606 }, { "epoch": 0.4632458921879504, "grad_norm": 6.278245676438151, "learning_rate": 0.0001, "loss": 0.0063, "num_input_tokens_seen": 236942912, "step": 1607 }, { "epoch": 0.4632458921879504, "loss": 0.004414130933582783, "loss_ce": 0.00018935362459160388, "loss_xval": 0.00421142578125, "num_input_tokens_seen": 236942912, "step": 1607 }, { "epoch": 0.4635341597002018, "grad_norm": 10.105785581159694, "learning_rate": 0.0001, "loss": 0.0192, "num_input_tokens_seen": 237115424, "step": 1608 }, { "epoch": 0.4635341597002018, "loss": 0.014623086899518967, "loss_ce": 0.0003790073096752167, "loss_xval": 0.01422119140625, "num_input_tokens_seen": 237115424, "step": 1608 }, { "epoch": 0.46382242721245315, "grad_norm": 14.24392109551276, "learning_rate": 0.0001, "loss": 0.0344, "num_input_tokens_seen": 237250240, "step": 1609 }, { "epoch": 0.46382242721245315, "loss": 0.04370329529047012, "loss_ce": 0.011598804034292698, "loss_xval": 0.0322265625, "num_input_tokens_seen": 237250240, "step": 1609 }, { "epoch": 0.4641106947247045, "grad_norm": 19.783437624937637, "learning_rate": 0.0001, "loss": 0.056, "num_input_tokens_seen": 237385408, "step": 1610 }, { "epoch": 0.4641106947247045, "loss": 0.062287457287311554, "loss_ce": 0.00021470512729138136, "loss_xval": 0.06201171875, "num_input_tokens_seen": 237385408, "step": 1610 }, { "epoch": 0.46439896223695587, "grad_norm": 27.140839690327642, "learning_rate": 0.0001, "loss": 0.1082, "num_input_tokens_seen": 237557888, "step": 1611 }, { "epoch": 0.46439896223695587, "loss": 0.10900463163852692, "loss_ce": 0.0005451533943414688, "loss_xval": 0.1083984375, "num_input_tokens_seen": 237557888, "step": 1611 }, { "epoch": 0.4646872297492073, "grad_norm": 31.234951537714885, "learning_rate": 0.0001, "loss": 0.1447, "num_input_tokens_seen": 237692728, "step": 1612 }, { "epoch": 0.4646872297492073, "loss": 0.14542171359062195, "loss_ce": 0.008214689791202545, "loss_xval": 0.13671875, "num_input_tokens_seen": 237692728, "step": 1612 }, { "epoch": 0.46497549726145865, "grad_norm": 23.55866477887437, "learning_rate": 0.0001, "loss": 0.083, "num_input_tokens_seen": 237827928, "step": 1613 }, { "epoch": 0.46497549726145865, "loss": 0.07829268276691437, "loss_ce": 0.0005338959745131433, "loss_xval": 0.07763671875, "num_input_tokens_seen": 237827928, "step": 1613 }, { "epoch": 0.46526376477371, "grad_norm": 3.971405211022664, "learning_rate": 0.0001, "loss": 0.0088, "num_input_tokens_seen": 238000440, "step": 1614 }, { "epoch": 0.46526376477371, "loss": 0.0032971161417663097, "loss_ce": 0.0005486266454681754, "loss_xval": 0.00274658203125, "num_input_tokens_seen": 238000440, "step": 1614 }, { "epoch": 0.46555203228596137, "grad_norm": 16.527875696120834, "learning_rate": 0.0001, "loss": 0.0465, "num_input_tokens_seen": 238135312, "step": 1615 }, { "epoch": 0.46555203228596137, "loss": 0.051238447427749634, "loss_ce": 0.007415205705910921, "loss_xval": 0.0439453125, "num_input_tokens_seen": 238135312, "step": 1615 }, { "epoch": 0.46584029979821273, "grad_norm": 22.9706387506555, "learning_rate": 0.0001, "loss": 0.0801, "num_input_tokens_seen": 238270408, "step": 1616 }, { "epoch": 0.46584029979821273, "loss": 0.07985322177410126, "loss_ce": 0.0002633807889651507, "loss_xval": 0.07958984375, "num_input_tokens_seen": 238270408, "step": 1616 }, { "epoch": 0.4661285673104641, "grad_norm": 10.81122800922098, "learning_rate": 0.0001, "loss": 0.0243, "num_input_tokens_seen": 238442976, "step": 1617 }, { "epoch": 0.4661285673104641, "loss": 0.021433737128973007, "loss_ce": 0.002375510986894369, "loss_xval": 0.01904296875, "num_input_tokens_seen": 238442976, "step": 1617 }, { "epoch": 0.46641683482271545, "grad_norm": 9.549291981727233, "learning_rate": 0.0001, "loss": 0.022, "num_input_tokens_seen": 238577672, "step": 1618 }, { "epoch": 0.46641683482271545, "loss": 0.023982558399438858, "loss_ce": 0.010119949467480183, "loss_xval": 0.01385498046875, "num_input_tokens_seen": 238577672, "step": 1618 }, { "epoch": 0.46670510233496687, "grad_norm": 18.65749974230963, "learning_rate": 0.0001, "loss": 0.055, "num_input_tokens_seen": 238712600, "step": 1619 }, { "epoch": 0.46670510233496687, "loss": 0.05401700735092163, "loss_ce": 0.0002755502355284989, "loss_xval": 0.0537109375, "num_input_tokens_seen": 238712600, "step": 1619 }, { "epoch": 0.46699336984721823, "grad_norm": 7.856487019792692, "learning_rate": 0.0001, "loss": 0.0168, "num_input_tokens_seen": 238885080, "step": 1620 }, { "epoch": 0.46699336984721823, "loss": 0.012627718970179558, "loss_ce": 0.000374911876861006, "loss_xval": 0.01226806640625, "num_input_tokens_seen": 238885080, "step": 1620 }, { "epoch": 0.4672816373594696, "grad_norm": 11.188000000786419, "learning_rate": 0.0001, "loss": 0.0247, "num_input_tokens_seen": 239019832, "step": 1621 }, { "epoch": 0.4672816373594696, "loss": 0.025063853710889816, "loss_ce": 0.007562023587524891, "loss_xval": 0.0174560546875, "num_input_tokens_seen": 239019832, "step": 1621 }, { "epoch": 0.46756990487172095, "grad_norm": 16.834899974492362, "learning_rate": 0.0001, "loss": 0.0455, "num_input_tokens_seen": 239154848, "step": 1622 }, { "epoch": 0.46756990487172095, "loss": 0.045431505888700485, "loss_ce": 0.0006011828663758934, "loss_xval": 0.044921875, "num_input_tokens_seen": 239154848, "step": 1622 }, { "epoch": 0.4678581723839723, "grad_norm": 2.7871025358915906, "learning_rate": 0.0001, "loss": 0.0092, "num_input_tokens_seen": 239327320, "step": 1623 }, { "epoch": 0.4678581723839723, "loss": 0.008208648301661015, "loss_ce": 0.00468577491119504, "loss_xval": 0.0035247802734375, "num_input_tokens_seen": 239327320, "step": 1623 }, { "epoch": 0.4681464398962237, "grad_norm": 13.562232424207437, "learning_rate": 0.0001, "loss": 0.0361, "num_input_tokens_seen": 239462112, "step": 1624 }, { "epoch": 0.4681464398962237, "loss": 0.033323727548122406, "loss_ce": 0.010069331154227257, "loss_xval": 0.023193359375, "num_input_tokens_seen": 239462112, "step": 1624 }, { "epoch": 0.46843470740847504, "grad_norm": 11.587959949993882, "learning_rate": 0.0001, "loss": 0.0237, "num_input_tokens_seen": 239597232, "step": 1625 }, { "epoch": 0.46843470740847504, "loss": 0.02408481389284134, "loss_ce": 0.00032687990460544825, "loss_xval": 0.0238037109375, "num_input_tokens_seen": 239597232, "step": 1625 }, { "epoch": 0.46872297492072645, "grad_norm": 5.998804309422722, "learning_rate": 0.0001, "loss": 0.015, "num_input_tokens_seen": 239769608, "step": 1626 }, { "epoch": 0.46872297492072645, "loss": 0.003973469138145447, "loss_ce": 0.0004830213147215545, "loss_xval": 0.0034942626953125, "num_input_tokens_seen": 239769608, "step": 1626 }, { "epoch": 0.4690112424329778, "grad_norm": 16.26213066494406, "learning_rate": 0.0001, "loss": 0.05, "num_input_tokens_seen": 239904400, "step": 1627 }, { "epoch": 0.4690112424329778, "loss": 0.044366300106048584, "loss_ce": 0.00798934418708086, "loss_xval": 0.036376953125, "num_input_tokens_seen": 239904400, "step": 1627 }, { "epoch": 0.4692995099452292, "grad_norm": 4.206662652807161, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 240039488, "step": 1628 }, { "epoch": 0.4692995099452292, "loss": 0.005551893264055252, "loss_ce": 0.00031812855741009116, "loss_xval": 0.0052490234375, "num_input_tokens_seen": 240039488, "step": 1628 }, { "epoch": 0.46958777745748054, "grad_norm": 16.1538880969112, "learning_rate": 0.0001, "loss": 0.0509, "num_input_tokens_seen": 240211976, "step": 1629 }, { "epoch": 0.46958777745748054, "loss": 0.029739946126937866, "loss_ce": 0.0005498826503753662, "loss_xval": 0.0291748046875, "num_input_tokens_seen": 240211976, "step": 1629 }, { "epoch": 0.4698760449697319, "grad_norm": 17.30655354392214, "learning_rate": 0.0001, "loss": 0.0562, "num_input_tokens_seen": 240346792, "step": 1630 }, { "epoch": 0.4698760449697319, "loss": 0.051143795251846313, "loss_ce": 0.007106929086148739, "loss_xval": 0.0439453125, "num_input_tokens_seen": 240346792, "step": 1630 }, { "epoch": 0.47016431248198326, "grad_norm": 4.871710309339203, "learning_rate": 0.0001, "loss": 0.0058, "num_input_tokens_seen": 240481968, "step": 1631 }, { "epoch": 0.47016431248198326, "loss": 0.0038508400321006775, "loss_ce": 0.0004004462971352041, "loss_xval": 0.003448486328125, "num_input_tokens_seen": 240481968, "step": 1631 }, { "epoch": 0.4704525799942347, "grad_norm": 22.22578398457069, "learning_rate": 0.0001, "loss": 0.0886, "num_input_tokens_seen": 240654584, "step": 1632 }, { "epoch": 0.4704525799942347, "loss": 0.06573119014501572, "loss_ce": 0.00039305369136855006, "loss_xval": 0.0654296875, "num_input_tokens_seen": 240654584, "step": 1632 }, { "epoch": 0.47074084750648604, "grad_norm": 10.295121901232179, "learning_rate": 0.0001, "loss": 0.0256, "num_input_tokens_seen": 240789440, "step": 1633 }, { "epoch": 0.47074084750648604, "loss": 0.02956472337245941, "loss_ce": 0.01014028675854206, "loss_xval": 0.0194091796875, "num_input_tokens_seen": 240789440, "step": 1633 }, { "epoch": 0.4710291150187374, "grad_norm": 14.965752387695096, "learning_rate": 0.0001, "loss": 0.0421, "num_input_tokens_seen": 240924440, "step": 1634 }, { "epoch": 0.4710291150187374, "loss": 0.03500206023454666, "loss_ce": 0.00042564034811221063, "loss_xval": 0.03466796875, "num_input_tokens_seen": 240924440, "step": 1634 }, { "epoch": 0.47131738253098876, "grad_norm": 17.858622605015228, "learning_rate": 0.0001, "loss": 0.0651, "num_input_tokens_seen": 241096856, "step": 1635 }, { "epoch": 0.47131738253098876, "loss": 0.05295751243829727, "loss_ce": 0.00037572276778519154, "loss_xval": 0.052490234375, "num_input_tokens_seen": 241096856, "step": 1635 }, { "epoch": 0.4716056500432401, "grad_norm": 4.614423817877169, "learning_rate": 0.0001, "loss": 0.0114, "num_input_tokens_seen": 241231656, "step": 1636 }, { "epoch": 0.4716056500432401, "loss": 0.014617221429944038, "loss_ce": 0.008631961420178413, "loss_xval": 0.0059814453125, "num_input_tokens_seen": 241231656, "step": 1636 }, { "epoch": 0.4718939175554915, "grad_norm": 19.190589867557332, "learning_rate": 0.0001, "loss": 0.0701, "num_input_tokens_seen": 241366888, "step": 1637 }, { "epoch": 0.4718939175554915, "loss": 0.06205695495009422, "loss_ce": 0.0008692118572071195, "loss_xval": 0.061279296875, "num_input_tokens_seen": 241366888, "step": 1637 }, { "epoch": 0.47218218506774284, "grad_norm": 4.293632420630124, "learning_rate": 0.0001, "loss": 0.0078, "num_input_tokens_seen": 241539280, "step": 1638 }, { "epoch": 0.47218218506774284, "loss": 0.005224054679274559, "loss_ce": 0.0013445076765492558, "loss_xval": 0.003875732421875, "num_input_tokens_seen": 241539280, "step": 1638 }, { "epoch": 0.47247045257999426, "grad_norm": 17.120904008588155, "learning_rate": 0.0001, "loss": 0.0587, "num_input_tokens_seen": 241674192, "step": 1639 }, { "epoch": 0.47247045257999426, "loss": 0.058812811970710754, "loss_ce": 0.007817939855158329, "loss_xval": 0.051025390625, "num_input_tokens_seen": 241674192, "step": 1639 }, { "epoch": 0.4727587200922456, "grad_norm": 12.678106685232084, "learning_rate": 0.0001, "loss": 0.0325, "num_input_tokens_seen": 241809352, "step": 1640 }, { "epoch": 0.4727587200922456, "loss": 0.02766924723982811, "loss_ce": 0.00029497960349544883, "loss_xval": 0.02734375, "num_input_tokens_seen": 241809352, "step": 1640 }, { "epoch": 0.473046987604497, "grad_norm": 10.072616136148826, "learning_rate": 0.0001, "loss": 0.0232, "num_input_tokens_seen": 241981848, "step": 1641 }, { "epoch": 0.473046987604497, "loss": 0.01985846646130085, "loss_ce": 0.0006323920097202063, "loss_xval": 0.019287109375, "num_input_tokens_seen": 241981848, "step": 1641 }, { "epoch": 0.47333525511674834, "grad_norm": 16.424813937686693, "learning_rate": 0.0001, "loss": 0.056, "num_input_tokens_seen": 242116624, "step": 1642 }, { "epoch": 0.47333525511674834, "loss": 0.05718439444899559, "loss_ce": 0.00887506827712059, "loss_xval": 0.04833984375, "num_input_tokens_seen": 242116624, "step": 1642 }, { "epoch": 0.4736235226289997, "grad_norm": 1.9623067562963932, "learning_rate": 0.0001, "loss": 0.002, "num_input_tokens_seen": 242251720, "step": 1643 }, { "epoch": 0.4736235226289997, "loss": 0.002029548864811659, "loss_ce": 0.0002747882972471416, "loss_xval": 0.0017547607421875, "num_input_tokens_seen": 242251720, "step": 1643 }, { "epoch": 0.47391179014125107, "grad_norm": 15.945841150302297, "learning_rate": 0.0001, "loss": 0.0542, "num_input_tokens_seen": 242424312, "step": 1644 }, { "epoch": 0.47391179014125107, "loss": 0.04716937243938446, "loss_ce": 0.0005079961847513914, "loss_xval": 0.046630859375, "num_input_tokens_seen": 242424312, "step": 1644 }, { "epoch": 0.4742000576535024, "grad_norm": 4.53648202392857, "learning_rate": 0.0001, "loss": 0.0089, "num_input_tokens_seen": 242559168, "step": 1645 }, { "epoch": 0.4742000576535024, "loss": 0.01110774278640747, "loss_ce": 0.006743729114532471, "loss_xval": 0.004364013671875, "num_input_tokens_seen": 242559168, "step": 1645 }, { "epoch": 0.47448832516575384, "grad_norm": 13.462705008051305, "learning_rate": 0.0001, "loss": 0.0368, "num_input_tokens_seen": 242694168, "step": 1646 }, { "epoch": 0.47448832516575384, "loss": 0.03658619150519371, "loss_ce": 0.00027027411852031946, "loss_xval": 0.036376953125, "num_input_tokens_seen": 242694168, "step": 1646 }, { "epoch": 0.4747765926780052, "grad_norm": 10.000124083236205, "learning_rate": 0.0001, "loss": 0.0252, "num_input_tokens_seen": 242866584, "step": 1647 }, { "epoch": 0.4747765926780052, "loss": 0.018924858421087265, "loss_ce": 0.0005990548525005579, "loss_xval": 0.018310546875, "num_input_tokens_seen": 242866584, "step": 1647 }, { "epoch": 0.47506486019025657, "grad_norm": 8.050537914125309, "learning_rate": 0.0001, "loss": 0.0166, "num_input_tokens_seen": 243001280, "step": 1648 }, { "epoch": 0.47506486019025657, "loss": 0.020638510584831238, "loss_ce": 0.005127951968461275, "loss_xval": 0.0155029296875, "num_input_tokens_seen": 243001280, "step": 1648 }, { "epoch": 0.4753531277025079, "grad_norm": 12.371986608475902, "learning_rate": 0.0001, "loss": 0.0323, "num_input_tokens_seen": 243136352, "step": 1649 }, { "epoch": 0.4753531277025079, "loss": 0.03136796876788139, "loss_ce": 0.0009114270796999335, "loss_xval": 0.030517578125, "num_input_tokens_seen": 243136352, "step": 1649 }, { "epoch": 0.4756413952147593, "grad_norm": 1.410721305972707, "learning_rate": 0.0001, "loss": 0.0055, "num_input_tokens_seen": 243308768, "step": 1650 }, { "epoch": 0.4756413952147593, "loss": 0.0017137866234406829, "loss_ce": 0.0004358630394563079, "loss_xval": 0.00128173828125, "num_input_tokens_seen": 243308768, "step": 1650 }, { "epoch": 0.47592966272701065, "grad_norm": 10.94040115444206, "learning_rate": 0.0001, "loss": 0.0276, "num_input_tokens_seen": 243443504, "step": 1651 }, { "epoch": 0.47592966272701065, "loss": 0.030689680948853493, "loss_ce": 0.005024399608373642, "loss_xval": 0.025634765625, "num_input_tokens_seen": 243443504, "step": 1651 }, { "epoch": 0.476217930239262, "grad_norm": 2.230035570569663, "learning_rate": 0.0001, "loss": 0.0018, "num_input_tokens_seen": 243578616, "step": 1652 }, { "epoch": 0.476217930239262, "loss": 0.001764374552294612, "loss_ce": 0.00029666972113773227, "loss_xval": 0.00146484375, "num_input_tokens_seen": 243578616, "step": 1652 }, { "epoch": 0.4765061977515134, "grad_norm": 9.022289083119624, "learning_rate": 0.0001, "loss": 0.0204, "num_input_tokens_seen": 243751184, "step": 1653 }, { "epoch": 0.4765061977515134, "loss": 0.018094517290592194, "loss_ce": 0.0011114849476143718, "loss_xval": 0.0169677734375, "num_input_tokens_seen": 243751184, "step": 1653 }, { "epoch": 0.4767944652637648, "grad_norm": 4.739831753216289, "learning_rate": 0.0001, "loss": 0.0092, "num_input_tokens_seen": 243886016, "step": 1654 }, { "epoch": 0.4767944652637648, "loss": 0.012720847502350807, "loss_ce": 0.007132316008210182, "loss_xval": 0.005584716796875, "num_input_tokens_seen": 243886016, "step": 1654 }, { "epoch": 0.47708273277601615, "grad_norm": 6.834353804413808, "learning_rate": 0.0001, "loss": 0.0119, "num_input_tokens_seen": 244021200, "step": 1655 }, { "epoch": 0.47708273277601615, "loss": 0.009534979239106178, "loss_ce": 0.0003110412508249283, "loss_xval": 0.00921630859375, "num_input_tokens_seen": 244021200, "step": 1655 }, { "epoch": 0.4773710002882675, "grad_norm": 6.508606064768901, "learning_rate": 0.0001, "loss": 0.0154, "num_input_tokens_seen": 244193648, "step": 1656 }, { "epoch": 0.4773710002882675, "loss": 0.008853845298290253, "loss_ce": 0.0007209099130704999, "loss_xval": 0.00811767578125, "num_input_tokens_seen": 244193648, "step": 1656 }, { "epoch": 0.47765926780051887, "grad_norm": 4.791951822258596, "learning_rate": 0.0001, "loss": 0.011, "num_input_tokens_seen": 244328336, "step": 1657 }, { "epoch": 0.47765926780051887, "loss": 0.015914246439933777, "loss_ce": 0.010794922709465027, "loss_xval": 0.005126953125, "num_input_tokens_seen": 244328336, "step": 1657 }, { "epoch": 0.47794753531277023, "grad_norm": 8.518485511752413, "learning_rate": 0.0001, "loss": 0.017, "num_input_tokens_seen": 244463576, "step": 1658 }, { "epoch": 0.47794753531277023, "loss": 0.013599589467048645, "loss_ce": 0.00042362435488030314, "loss_xval": 0.01318359375, "num_input_tokens_seen": 244463576, "step": 1658 }, { "epoch": 0.4782358028250216, "grad_norm": 1.213327411200081, "learning_rate": 0.0001, "loss": 0.0059, "num_input_tokens_seen": 244636112, "step": 1659 }, { "epoch": 0.4782358028250216, "loss": 0.002218852750957012, "loss_ce": 0.000748763675801456, "loss_xval": 0.00147247314453125, "num_input_tokens_seen": 244636112, "step": 1659 }, { "epoch": 0.478524070337273, "grad_norm": 9.181071206460544, "learning_rate": 0.0001, "loss": 0.0221, "num_input_tokens_seen": 244770920, "step": 1660 }, { "epoch": 0.478524070337273, "loss": 0.02432107925415039, "loss_ce": 0.007727145683020353, "loss_xval": 0.0166015625, "num_input_tokens_seen": 244770920, "step": 1660 }, { "epoch": 0.47881233784952437, "grad_norm": 3.4061375786237496, "learning_rate": 0.0001, "loss": 0.0037, "num_input_tokens_seen": 244906072, "step": 1661 }, { "epoch": 0.47881233784952437, "loss": 0.002656823955476284, "loss_ce": 0.000368005596101284, "loss_xval": 0.002288818359375, "num_input_tokens_seen": 244906072, "step": 1661 }, { "epoch": 0.47910060536177573, "grad_norm": 6.917202408403764, "learning_rate": 0.0001, "loss": 0.0162, "num_input_tokens_seen": 245078544, "step": 1662 }, { "epoch": 0.47910060536177573, "loss": 0.012157600373029709, "loss_ce": 0.000423592166043818, "loss_xval": 0.01171875, "num_input_tokens_seen": 245078544, "step": 1662 }, { "epoch": 0.4793888728740271, "grad_norm": 6.442072131706248, "learning_rate": 0.0001, "loss": 0.0151, "num_input_tokens_seen": 245213312, "step": 1663 }, { "epoch": 0.4793888728740271, "loss": 0.01970660500228405, "loss_ce": 0.01142108254134655, "loss_xval": 0.00830078125, "num_input_tokens_seen": 245213312, "step": 1663 }, { "epoch": 0.47967714038627846, "grad_norm": 3.332300474485623, "learning_rate": 0.0001, "loss": 0.0032, "num_input_tokens_seen": 245348296, "step": 1664 }, { "epoch": 0.47967714038627846, "loss": 0.0035281481686979532, "loss_ce": 0.0002932848874479532, "loss_xval": 0.00323486328125, "num_input_tokens_seen": 245348296, "step": 1664 }, { "epoch": 0.4799654078985298, "grad_norm": 7.695169704124381, "learning_rate": 0.0001, "loss": 0.0185, "num_input_tokens_seen": 245520760, "step": 1665 }, { "epoch": 0.4799654078985298, "loss": 0.013785162940621376, "loss_ce": 0.0004032054857816547, "loss_xval": 0.01336669921875, "num_input_tokens_seen": 245520760, "step": 1665 }, { "epoch": 0.4802536754107812, "grad_norm": 0.8855467303934494, "learning_rate": 0.0001, "loss": 0.0051, "num_input_tokens_seen": 245655544, "step": 1666 }, { "epoch": 0.4802536754107812, "loss": 0.009379335679113865, "loss_ce": 0.008762546814978123, "loss_xval": 0.00061798095703125, "num_input_tokens_seen": 245655544, "step": 1666 }, { "epoch": 0.4805419429230326, "grad_norm": 6.393784109798928, "learning_rate": 0.0001, "loss": 0.0094, "num_input_tokens_seen": 245790752, "step": 1667 }, { "epoch": 0.4805419429230326, "loss": 0.009521054103970528, "loss_ce": 0.0002742270880844444, "loss_xval": 0.00927734375, "num_input_tokens_seen": 245790752, "step": 1667 }, { "epoch": 0.48083021043528396, "grad_norm": 3.936996774178376, "learning_rate": 0.0001, "loss": 0.0077, "num_input_tokens_seen": 245963240, "step": 1668 }, { "epoch": 0.48083021043528396, "loss": 0.004050452262163162, "loss_ce": 0.00040741637349128723, "loss_xval": 0.0036468505859375, "num_input_tokens_seen": 245963240, "step": 1668 }, { "epoch": 0.4811184779475353, "grad_norm": 3.686948208785727, "learning_rate": 0.0001, "loss": 0.0074, "num_input_tokens_seen": 246098016, "step": 1669 }, { "epoch": 0.4811184779475353, "loss": 0.011370341293513775, "loss_ce": 0.007866541855037212, "loss_xval": 0.003509521484375, "num_input_tokens_seen": 246098016, "step": 1669 }, { "epoch": 0.4814067454597867, "grad_norm": 5.263022503025478, "learning_rate": 0.0001, "loss": 0.0066, "num_input_tokens_seen": 246232992, "step": 1670 }, { "epoch": 0.4814067454597867, "loss": 0.006171601824462414, "loss_ce": 0.00024737673811614513, "loss_xval": 0.00592041015625, "num_input_tokens_seen": 246232992, "step": 1670 }, { "epoch": 0.48169501297203804, "grad_norm": 0.8220510945207464, "learning_rate": 0.0001, "loss": 0.0085, "num_input_tokens_seen": 246405480, "step": 1671 }, { "epoch": 0.48169501297203804, "loss": 0.0007558593060821295, "loss_ce": 0.00026328652165830135, "loss_xval": 0.000492095947265625, "num_input_tokens_seen": 246405480, "step": 1671 }, { "epoch": 0.4819832804842894, "grad_norm": 5.489817625132127, "learning_rate": 0.0001, "loss": 0.0109, "num_input_tokens_seen": 246540264, "step": 1672 }, { "epoch": 0.4819832804842894, "loss": 0.014926630072295666, "loss_ce": 0.007903773337602615, "loss_xval": 0.00701904296875, "num_input_tokens_seen": 246540264, "step": 1672 }, { "epoch": 0.4822715479965408, "grad_norm": 2.518344153341866, "learning_rate": 0.0001, "loss": 0.0043, "num_input_tokens_seen": 246675400, "step": 1673 }, { "epoch": 0.4822715479965408, "loss": 0.005512160249054432, "loss_ce": 0.004024427849799395, "loss_xval": 0.00148773193359375, "num_input_tokens_seen": 246675400, "step": 1673 }, { "epoch": 0.4825598155087922, "grad_norm": 3.545674469895499, "learning_rate": 0.0001, "loss": 0.0084, "num_input_tokens_seen": 246847800, "step": 1674 }, { "epoch": 0.4825598155087922, "loss": 0.003581702709197998, "loss_ce": 0.00027626747032627463, "loss_xval": 0.0033111572265625, "num_input_tokens_seen": 246847800, "step": 1674 }, { "epoch": 0.48284808302104354, "grad_norm": 4.677097197848902, "learning_rate": 0.0001, "loss": 0.0101, "num_input_tokens_seen": 246982656, "step": 1675 }, { "epoch": 0.48284808302104354, "loss": 0.014951646327972412, "loss_ce": 0.009744584560394287, "loss_xval": 0.005218505859375, "num_input_tokens_seen": 246982656, "step": 1675 }, { "epoch": 0.4831363505332949, "grad_norm": 0.06732599670808967, "learning_rate": 0.0001, "loss": 0.0007, "num_input_tokens_seen": 247117624, "step": 1676 }, { "epoch": 0.4831363505332949, "loss": 0.0005512872594408691, "loss_ce": 0.00023967419110704213, "loss_xval": 0.0003108978271484375, "num_input_tokens_seen": 247117624, "step": 1676 }, { "epoch": 0.48342461804554626, "grad_norm": 4.186459031367488, "learning_rate": 0.0001, "loss": 0.0084, "num_input_tokens_seen": 247290104, "step": 1677 }, { "epoch": 0.48342461804554626, "loss": 0.004762859083712101, "loss_ce": 0.0006620598142035306, "loss_xval": 0.00408935546875, "num_input_tokens_seen": 247290104, "step": 1677 }, { "epoch": 0.4837128855577976, "grad_norm": 2.814617904179985, "learning_rate": 0.0001, "loss": 0.0069, "num_input_tokens_seen": 247424832, "step": 1678 }, { "epoch": 0.4837128855577976, "loss": 0.011508513242006302, "loss_ce": 0.00955061987042427, "loss_xval": 0.001953125, "num_input_tokens_seen": 247424832, "step": 1678 }, { "epoch": 0.484001153070049, "grad_norm": 1.9229211822494456, "learning_rate": 0.0001, "loss": 0.0013, "num_input_tokens_seen": 247559832, "step": 1679 }, { "epoch": 0.484001153070049, "loss": 0.0012820811243727803, "loss_ce": 0.00022779422579333186, "loss_xval": 0.0010528564453125, "num_input_tokens_seen": 247559832, "step": 1679 }, { "epoch": 0.4842894205823004, "grad_norm": 3.973230759058265, "learning_rate": 0.0001, "loss": 0.0074, "num_input_tokens_seen": 247732560, "step": 1680 }, { "epoch": 0.4842894205823004, "loss": 0.004148183390498161, "loss_ce": 0.00030106125632300973, "loss_xval": 0.00384521484375, "num_input_tokens_seen": 247732560, "step": 1680 }, { "epoch": 0.48457768809455176, "grad_norm": 1.1343054790113372, "learning_rate": 0.0001, "loss": 0.0054, "num_input_tokens_seen": 247867400, "step": 1681 }, { "epoch": 0.48457768809455176, "loss": 0.0073390984907746315, "loss_ce": 0.00671324972063303, "loss_xval": 0.0006256103515625, "num_input_tokens_seen": 247867400, "step": 1681 }, { "epoch": 0.4848659556068031, "grad_norm": 2.8233753979799636, "learning_rate": 0.0001, "loss": 0.0032, "num_input_tokens_seen": 248002544, "step": 1682 }, { "epoch": 0.4848659556068031, "loss": 0.0029423628002405167, "loss_ce": 0.0011876022908836603, "loss_xval": 0.0017547607421875, "num_input_tokens_seen": 248002544, "step": 1682 }, { "epoch": 0.4851542231190545, "grad_norm": 3.1356041359079727, "learning_rate": 0.0001, "loss": 0.0081, "num_input_tokens_seen": 248174976, "step": 1683 }, { "epoch": 0.4851542231190545, "loss": 0.0028833728283643723, "loss_ce": 0.0003389697812963277, "loss_xval": 0.0025482177734375, "num_input_tokens_seen": 248174976, "step": 1683 }, { "epoch": 0.48544249063130585, "grad_norm": 0.16825660364549808, "learning_rate": 0.0001, "loss": 0.0031, "num_input_tokens_seen": 248309784, "step": 1684 }, { "epoch": 0.48544249063130585, "loss": 0.005761802662163973, "loss_ce": 0.005357802379876375, "loss_xval": 0.00040435791015625, "num_input_tokens_seen": 248309784, "step": 1684 }, { "epoch": 0.4857307581435572, "grad_norm": 2.9574485021450614, "learning_rate": 0.0001, "loss": 0.0025, "num_input_tokens_seen": 248444784, "step": 1685 }, { "epoch": 0.4857307581435572, "loss": 0.0023077609948813915, "loss_ce": 0.00037084840005263686, "loss_xval": 0.0019378662109375, "num_input_tokens_seen": 248444784, "step": 1685 }, { "epoch": 0.48601902565580857, "grad_norm": 1.9710236987670244, "learning_rate": 0.0001, "loss": 0.0041, "num_input_tokens_seen": 248617416, "step": 1686 }, { "epoch": 0.48601902565580857, "loss": 0.001532458234578371, "loss_ce": 0.00034704094287008047, "loss_xval": 0.00118255615234375, "num_input_tokens_seen": 248617416, "step": 1686 }, { "epoch": 0.48630729316806, "grad_norm": 1.6574027342722102, "learning_rate": 0.0001, "loss": 0.0044, "num_input_tokens_seen": 248752320, "step": 1687 }, { "epoch": 0.48630729316806, "loss": 0.007482764311134815, "loss_ce": 0.006698128767311573, "loss_xval": 0.00078582763671875, "num_input_tokens_seen": 248752320, "step": 1687 }, { "epoch": 0.48659556068031135, "grad_norm": 3.402714718010406, "learning_rate": 0.0001, "loss": 0.003, "num_input_tokens_seen": 248887352, "step": 1688 }, { "epoch": 0.48659556068031135, "loss": 0.002607089001685381, "loss_ce": 0.00019810753292404115, "loss_xval": 0.002410888671875, "num_input_tokens_seen": 248887352, "step": 1688 }, { "epoch": 0.4868838281925627, "grad_norm": 1.0612269469325708, "learning_rate": 0.0001, "loss": 0.0037, "num_input_tokens_seen": 249060080, "step": 1689 }, { "epoch": 0.4868838281925627, "loss": 0.000750762177631259, "loss_ce": 0.00023482434335164726, "loss_xval": 0.000514984130859375, "num_input_tokens_seen": 249060080, "step": 1689 }, { "epoch": 0.48717209570481407, "grad_norm": 2.6202481583055857, "learning_rate": 0.0001, "loss": 0.0065, "num_input_tokens_seen": 249194848, "step": 1690 }, { "epoch": 0.48717209570481407, "loss": 0.010731605812907219, "loss_ce": 0.009190468117594719, "loss_xval": 0.0015411376953125, "num_input_tokens_seen": 249194848, "step": 1690 }, { "epoch": 0.48746036321706543, "grad_norm": 3.391612477312743, "learning_rate": 0.0001, "loss": 0.003, "num_input_tokens_seen": 249329816, "step": 1691 }, { "epoch": 0.48746036321706543, "loss": 0.002606350462883711, "loss_ce": 0.00022121102665551007, "loss_xval": 0.00238037109375, "num_input_tokens_seen": 249329816, "step": 1691 }, { "epoch": 0.4877486307293168, "grad_norm": 0.29936391697277404, "learning_rate": 0.0001, "loss": 0.0049, "num_input_tokens_seen": 249502304, "step": 1692 }, { "epoch": 0.4877486307293168, "loss": 0.001809053821489215, "loss_ce": 0.0015478662680834532, "loss_xval": 0.0002613067626953125, "num_input_tokens_seen": 249502304, "step": 1692 }, { "epoch": 0.48803689824156815, "grad_norm": 3.5497603899955794, "learning_rate": 0.0001, "loss": 0.007, "num_input_tokens_seen": 249637096, "step": 1693 }, { "epoch": 0.48803689824156815, "loss": 0.01052839308977127, "loss_ce": 0.007900066673755646, "loss_xval": 0.00262451171875, "num_input_tokens_seen": 249637096, "step": 1693 }, { "epoch": 0.48832516575381957, "grad_norm": 4.189153427151052, "learning_rate": 0.0001, "loss": 0.0048, "num_input_tokens_seen": 249772064, "step": 1694 }, { "epoch": 0.48832516575381957, "loss": 0.0034670508466660976, "loss_ce": 0.00020929938182234764, "loss_xval": 0.003265380859375, "num_input_tokens_seen": 249772064, "step": 1694 }, { "epoch": 0.48861343326607093, "grad_norm": 0.6277208093933458, "learning_rate": 0.0001, "loss": 0.0045, "num_input_tokens_seen": 249944584, "step": 1695 }, { "epoch": 0.48861343326607093, "loss": 0.0007549943984486163, "loss_ce": 0.00036637208540923893, "loss_xval": 0.00038909912109375, "num_input_tokens_seen": 249944584, "step": 1695 }, { "epoch": 0.4889017007783223, "grad_norm": 3.3679481066575474, "learning_rate": 0.0001, "loss": 0.006, "num_input_tokens_seen": 250079368, "step": 1696 }, { "epoch": 0.4889017007783223, "loss": 0.008834033273160458, "loss_ce": 0.006375460885465145, "loss_xval": 0.0024566650390625, "num_input_tokens_seen": 250079368, "step": 1696 }, { "epoch": 0.48918996829057365, "grad_norm": 4.179445419185496, "learning_rate": 0.0001, "loss": 0.0043, "num_input_tokens_seen": 250214560, "step": 1697 }, { "epoch": 0.48918996829057365, "loss": 0.004044243134558201, "loss_ce": 0.00017232507525477558, "loss_xval": 0.003875732421875, "num_input_tokens_seen": 250214560, "step": 1697 }, { "epoch": 0.489478235802825, "grad_norm": 1.138036604988876, "learning_rate": 0.0001, "loss": 0.0051, "num_input_tokens_seen": 250387136, "step": 1698 }, { "epoch": 0.489478235802825, "loss": 0.0008948410395532846, "loss_ce": 0.00018316156638320535, "loss_xval": 0.000713348388671875, "num_input_tokens_seen": 250387136, "step": 1698 }, { "epoch": 0.4897665033150764, "grad_norm": 2.9359114211323214, "learning_rate": 0.0001, "loss": 0.0066, "num_input_tokens_seen": 250521960, "step": 1699 }, { "epoch": 0.4897665033150764, "loss": 0.008544711396098137, "loss_ce": 0.006434229668229818, "loss_xval": 0.002105712890625, "num_input_tokens_seen": 250521960, "step": 1699 }, { "epoch": 0.49005477082732773, "grad_norm": 4.246204134764685, "learning_rate": 0.0001, "loss": 0.0044, "num_input_tokens_seen": 250657000, "step": 1700 }, { "epoch": 0.49005477082732773, "loss": 0.004051597788929939, "loss_ce": 0.000160606752615422, "loss_xval": 0.0038909912109375, "num_input_tokens_seen": 250657000, "step": 1700 }, { "epoch": 0.49034303833957915, "grad_norm": 1.6708110961924532, "learning_rate": 0.0001, "loss": 0.0059, "num_input_tokens_seen": 250829520, "step": 1701 }, { "epoch": 0.49034303833957915, "loss": 0.0034429319202899933, "loss_ce": 0.0023800618946552277, "loss_xval": 0.00106048583984375, "num_input_tokens_seen": 250829520, "step": 1701 }, { "epoch": 0.4906313058518305, "grad_norm": 2.29495962909665, "learning_rate": 0.0001, "loss": 0.006, "num_input_tokens_seen": 250964360, "step": 1702 }, { "epoch": 0.4906313058518305, "loss": 0.009584905579686165, "loss_ce": 0.008198263123631477, "loss_xval": 0.0013885498046875, "num_input_tokens_seen": 250964360, "step": 1702 }, { "epoch": 0.4909195733640819, "grad_norm": 4.051342750901919, "learning_rate": 0.0001, "loss": 0.004, "num_input_tokens_seen": 251099480, "step": 1703 }, { "epoch": 0.4909195733640819, "loss": 0.0038174032233655453, "loss_ce": 0.0001457572216168046, "loss_xval": 0.0036773681640625, "num_input_tokens_seen": 251099480, "step": 1703 }, { "epoch": 0.49120784087633323, "grad_norm": 2.4280188175113104, "learning_rate": 0.0001, "loss": 0.0067, "num_input_tokens_seen": 251271904, "step": 1704 }, { "epoch": 0.49120784087633323, "loss": 0.002196403918787837, "loss_ce": 0.0004912342992611229, "loss_xval": 0.001708984375, "num_input_tokens_seen": 251271904, "step": 1704 }, { "epoch": 0.4914961083885846, "grad_norm": 0.9184818784150366, "learning_rate": 0.0001, "loss": 0.0049, "num_input_tokens_seen": 251406736, "step": 1705 }, { "epoch": 0.4914961083885846, "loss": 0.009084072895348072, "loss_ce": 0.008687344379723072, "loss_xval": 0.000396728515625, "num_input_tokens_seen": 251406736, "step": 1705 }, { "epoch": 0.49178437590083596, "grad_norm": 3.2573763940603477, "learning_rate": 0.0001, "loss": 0.0028, "num_input_tokens_seen": 251541728, "step": 1706 }, { "epoch": 0.49178437590083596, "loss": 0.00256602605804801, "loss_ce": 0.00013606389984488487, "loss_xval": 0.0024261474609375, "num_input_tokens_seen": 251541728, "step": 1706 }, { "epoch": 0.4920726434130873, "grad_norm": 2.9257386010126085, "learning_rate": 0.0001, "loss": 0.0066, "num_input_tokens_seen": 251714192, "step": 1707 }, { "epoch": 0.4920726434130873, "loss": 0.0038506370037794113, "loss_ce": 0.001319585251621902, "loss_xval": 0.002532958984375, "num_input_tokens_seen": 251714192, "step": 1707 }, { "epoch": 0.49236091092533874, "grad_norm": 0.546968323447548, "learning_rate": 0.0001, "loss": 0.0041, "num_input_tokens_seen": 251848992, "step": 1708 }, { "epoch": 0.49236091092533874, "loss": 0.007951841689646244, "loss_ce": 0.007509575225412846, "loss_xval": 0.0004425048828125, "num_input_tokens_seen": 251848992, "step": 1708 }, { "epoch": 0.4926491784375901, "grad_norm": 1.8227164441424433, "learning_rate": 0.0001, "loss": 0.0013, "num_input_tokens_seen": 251984240, "step": 1709 }, { "epoch": 0.4926491784375901, "loss": 0.0008962653810158372, "loss_ce": 0.00012998809688724577, "loss_xval": 0.000766754150390625, "num_input_tokens_seen": 251984240, "step": 1709 }, { "epoch": 0.49293744594984146, "grad_norm": 2.5433669083958867, "learning_rate": 0.0001, "loss": 0.0068, "num_input_tokens_seen": 252156808, "step": 1710 }, { "epoch": 0.49293744594984146, "loss": 0.002464288379997015, "loss_ce": 0.0006427703192457557, "loss_xval": 0.00182342529296875, "num_input_tokens_seen": 252156808, "step": 1710 }, { "epoch": 0.4932257134620928, "grad_norm": 1.8610536126480004, "learning_rate": 0.0001, "loss": 0.0043, "num_input_tokens_seen": 252291600, "step": 1711 }, { "epoch": 0.4932257134620928, "loss": 0.007783045060932636, "loss_ce": 0.0064679281786084175, "loss_xval": 0.001312255859375, "num_input_tokens_seen": 252291600, "step": 1711 }, { "epoch": 0.4935139809743442, "grad_norm": 0.8857956722653375, "learning_rate": 0.0001, "loss": 0.0016, "num_input_tokens_seen": 252426768, "step": 1712 }, { "epoch": 0.4935139809743442, "loss": 0.0005321372882463038, "loss_ce": 0.00012110365059925243, "loss_xval": 0.0004119873046875, "num_input_tokens_seen": 252426768, "step": 1712 }, { "epoch": 0.49380224848659554, "grad_norm": 0.5796544010149521, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 252599320, "step": 1713 }, { "epoch": 0.49380224848659554, "loss": 0.0036989161744713783, "loss_ce": 0.0034384438768029213, "loss_xval": 0.0002613067626953125, "num_input_tokens_seen": 252599320, "step": 1713 }, { "epoch": 0.49409051599884696, "grad_norm": 0.7655430660339378, "learning_rate": 0.0001, "loss": 0.0038, "num_input_tokens_seen": 252734120, "step": 1714 }, { "epoch": 0.49409051599884696, "loss": 0.006788511760532856, "loss_ce": 0.006498952396214008, "loss_xval": 0.0002899169921875, "num_input_tokens_seen": 252734120, "step": 1714 }, { "epoch": 0.4943787835110983, "grad_norm": 0.7202521182045087, "learning_rate": 0.0001, "loss": 0.0006, "num_input_tokens_seen": 252869224, "step": 1715 }, { "epoch": 0.4943787835110983, "loss": 0.00029843367519788444, "loss_ce": 0.00011461296526249498, "loss_xval": 0.00018405914306640625, "num_input_tokens_seen": 252869224, "step": 1715 }, { "epoch": 0.4946670510233497, "grad_norm": 0.20232642834434011, "learning_rate": 0.0001, "loss": 0.0053, "num_input_tokens_seen": 253041680, "step": 1716 }, { "epoch": 0.4946670510233497, "loss": 0.0009578837198205292, "loss_ce": 0.0007630957406945527, "loss_xval": 0.000194549560546875, "num_input_tokens_seen": 253041680, "step": 1716 }, { "epoch": 0.49495531853560104, "grad_norm": 1.1670590573363684, "learning_rate": 0.0001, "loss": 0.0048, "num_input_tokens_seen": 253176552, "step": 1717 }, { "epoch": 0.49495531853560104, "loss": 0.008777554146945477, "loss_ce": 0.008417423814535141, "loss_xval": 0.0003604888916015625, "num_input_tokens_seen": 253176552, "step": 1717 }, { "epoch": 0.4952435860478524, "grad_norm": 1.4988461928806927, "learning_rate": 0.0001, "loss": 0.0008, "num_input_tokens_seen": 253311680, "step": 1718 }, { "epoch": 0.4952435860478524, "loss": 0.0006728050066158175, "loss_ce": 0.00010918345651589334, "loss_xval": 0.0005645751953125, "num_input_tokens_seen": 253311680, "step": 1718 }, { "epoch": 0.49553185356010376, "grad_norm": 0.8959117241888527, "learning_rate": 0.0001, "loss": 0.0043, "num_input_tokens_seen": 253484192, "step": 1719 }, { "epoch": 0.49553185356010376, "loss": 0.0006553968414664268, "loss_ce": 0.00014923422713764012, "loss_xval": 0.000507354736328125, "num_input_tokens_seen": 253484192, "step": 1719 }, { "epoch": 0.4958201210723551, "grad_norm": 0.19446805773449719, "learning_rate": 0.0001, "loss": 0.0034, "num_input_tokens_seen": 253618976, "step": 1720 }, { "epoch": 0.4958201210723551, "loss": 0.0064426204189658165, "loss_ce": 0.006185128353536129, "loss_xval": 0.0002574920654296875, "num_input_tokens_seen": 253618976, "step": 1720 }, { "epoch": 0.49610838858460654, "grad_norm": 0.5419645813642442, "learning_rate": 0.0001, "loss": 0.0004, "num_input_tokens_seen": 253754168, "step": 1721 }, { "epoch": 0.49610838858460654, "loss": 0.00029128711321391165, "loss_ce": 0.0001080028378055431, "loss_xval": 0.00018310546875, "num_input_tokens_seen": 253754168, "step": 1721 }, { "epoch": 0.4963966560968579, "grad_norm": 0.16764965953239006, "learning_rate": 0.0001, "loss": 0.0031, "num_input_tokens_seen": 253926776, "step": 1722 }, { "epoch": 0.4963966560968579, "loss": 0.00037144176894798875, "loss_ce": 0.00013326163752935827, "loss_xval": 0.0002384185791015625, "num_input_tokens_seen": 253926776, "step": 1722 }, { "epoch": 0.49668492360910926, "grad_norm": 0.5143012980626315, "learning_rate": 0.0001, "loss": 0.006, "num_input_tokens_seen": 254061584, "step": 1723 }, { "epoch": 0.49668492360910926, "loss": 0.011035285890102386, "loss_ce": 0.010848961770534515, "loss_xval": 0.00018596649169921875, "num_input_tokens_seen": 254061584, "step": 1723 }, { "epoch": 0.4969731911213606, "grad_norm": 0.6497143775508163, "learning_rate": 0.0001, "loss": 0.0004, "num_input_tokens_seen": 254196720, "step": 1724 }, { "epoch": 0.4969731911213606, "loss": 0.0003499590093269944, "loss_ce": 0.00016345609037671238, "loss_xval": 0.000186920166015625, "num_input_tokens_seen": 254196720, "step": 1724 }, { "epoch": 0.497261458633612, "grad_norm": 0.3602351152941275, "learning_rate": 0.0001, "loss": 0.0066, "num_input_tokens_seen": 254369112, "step": 1725 }, { "epoch": 0.497261458633612, "loss": 0.0003184653469361365, "loss_ce": 0.00011902822006959468, "loss_xval": 0.00019931793212890625, "num_input_tokens_seen": 254369112, "step": 1725 }, { "epoch": 0.49754972614586335, "grad_norm": 0.9732779173841355, "learning_rate": 0.0001, "loss": 0.0049, "num_input_tokens_seen": 254503904, "step": 1726 }, { "epoch": 0.49754972614586335, "loss": 0.009241588413715363, "loss_ce": 0.00895286351442337, "loss_xval": 0.0002880096435546875, "num_input_tokens_seen": 254503904, "step": 1726 }, { "epoch": 0.4978379936581147, "grad_norm": 1.8909439477260013, "learning_rate": 0.0001, "loss": 0.0012, "num_input_tokens_seen": 254638912, "step": 1727 }, { "epoch": 0.4978379936581147, "loss": 0.0012038424611091614, "loss_ce": 0.00030452755163423717, "loss_xval": 0.0009002685546875, "num_input_tokens_seen": 254638912, "step": 1727 }, { "epoch": 0.4981262611703661, "grad_norm": 2.5298356679902243, "learning_rate": 0.0001, "loss": 0.0055, "num_input_tokens_seen": 254811496, "step": 1728 }, { "epoch": 0.4981262611703661, "loss": 0.0020236079581081867, "loss_ce": 0.0001296107075177133, "loss_xval": 0.00189208984375, "num_input_tokens_seen": 254811496, "step": 1728 }, { "epoch": 0.4984145286826175, "grad_norm": 3.4742859942049025, "learning_rate": 0.0001, "loss": 0.0072, "num_input_tokens_seen": 254946296, "step": 1729 }, { "epoch": 0.4984145286826175, "loss": 0.012024440802633762, "loss_ce": 0.008614100515842438, "loss_xval": 0.00341796875, "num_input_tokens_seen": 254946296, "step": 1729 }, { "epoch": 0.49870279619486885, "grad_norm": 5.673154601826852, "learning_rate": 0.0001, "loss": 0.0074, "num_input_tokens_seen": 255081392, "step": 1730 }, { "epoch": 0.49870279619486885, "loss": 0.007622344419360161, "loss_ce": 0.00011502046254463494, "loss_xval": 0.00750732421875, "num_input_tokens_seen": 255081392, "step": 1730 }, { "epoch": 0.4989910637071202, "grad_norm": 10.11347194036202, "learning_rate": 0.0001, "loss": 0.027, "num_input_tokens_seen": 255253968, "step": 1731 }, { "epoch": 0.4989910637071202, "loss": 0.021371876820921898, "loss_ce": 0.00011638353316811845, "loss_xval": 0.021240234375, "num_input_tokens_seen": 255253968, "step": 1731 }, { "epoch": 0.49927933121937157, "grad_norm": 17.873211376225438, "learning_rate": 0.0001, "loss": 0.074, "num_input_tokens_seen": 255388760, "step": 1732 }, { "epoch": 0.49927933121937157, "loss": 0.07567343860864639, "loss_ce": 0.007741309702396393, "loss_xval": 0.06787109375, "num_input_tokens_seen": 255388760, "step": 1732 }, { "epoch": 0.49956759873162293, "grad_norm": 28.20506840621217, "learning_rate": 0.0001, "loss": 0.1821, "num_input_tokens_seen": 255523872, "step": 1733 }, { "epoch": 0.49956759873162293, "loss": 0.18528181314468384, "loss_ce": 0.00010114896576851606, "loss_xval": 0.185546875, "num_input_tokens_seen": 255523872, "step": 1733 }, { "epoch": 0.4998558662438743, "grad_norm": 32.107443205850224, "learning_rate": 0.0001, "loss": 0.2388, "num_input_tokens_seen": 255696352, "step": 1734 }, { "epoch": 0.4998558662438743, "loss": 0.24280518293380737, "loss_ce": 0.00012939998123329133, "loss_xval": 0.2421875, "num_input_tokens_seen": 255696352, "step": 1734 }, { "epoch": 0.5001441337561257, "grad_norm": 20.192170135400502, "learning_rate": 0.0001, "loss": 0.1051, "num_input_tokens_seen": 255831144, "step": 1735 }, { "epoch": 0.5001441337561257, "loss": 0.11830128729343414, "loss_ce": 0.012649435549974442, "loss_xval": 0.10546875, "num_input_tokens_seen": 255831144, "step": 1735 }, { "epoch": 0.5004324012683771, "grad_norm": 4.138208841567372, "learning_rate": 0.0001, "loss": 0.0099, "num_input_tokens_seen": 255966200, "step": 1736 }, { "epoch": 0.5004324012683771, "loss": 0.009793087840080261, "loss_ce": 0.00044708000496029854, "loss_xval": 0.00933837890625, "num_input_tokens_seen": 255966200, "step": 1736 }, { "epoch": 0.5007206687806284, "grad_norm": 20.913269450739836, "learning_rate": 0.0001, "loss": 0.1158, "num_input_tokens_seen": 256138808, "step": 1737 }, { "epoch": 0.5007206687806284, "loss": 0.11857728660106659, "loss_ce": 0.00023012000019662082, "loss_xval": 0.1181640625, "num_input_tokens_seen": 256138808, "step": 1737 }, { "epoch": 0.5010089362928798, "grad_norm": 16.722405346702054, "learning_rate": 0.0001, "loss": 0.0794, "num_input_tokens_seen": 256273608, "step": 1738 }, { "epoch": 0.5010089362928798, "loss": 0.09203828871250153, "loss_ce": 0.006344921886920929, "loss_xval": 0.0859375, "num_input_tokens_seen": 256273608, "step": 1738 }, { "epoch": 0.5012972038051312, "grad_norm": 2.1893892288404886, "learning_rate": 0.0001, "loss": 0.0049, "num_input_tokens_seen": 256408632, "step": 1739 }, { "epoch": 0.5012972038051312, "loss": 0.0053595867939293385, "loss_ce": 0.0009078349103219807, "loss_xval": 0.00445556640625, "num_input_tokens_seen": 256408632, "step": 1739 }, { "epoch": 0.5015854713173825, "grad_norm": 14.686392840406226, "learning_rate": 0.0001, "loss": 0.0608, "num_input_tokens_seen": 256581120, "step": 1740 }, { "epoch": 0.5015854713173825, "loss": 0.058756496757268906, "loss_ce": 0.0004679245757870376, "loss_xval": 0.058349609375, "num_input_tokens_seen": 256581120, "step": 1740 }, { "epoch": 0.5018737388296339, "grad_norm": 11.310540214461815, "learning_rate": 0.0001, "loss": 0.0392, "num_input_tokens_seen": 256715976, "step": 1741 }, { "epoch": 0.5018737388296339, "loss": 0.04806603491306305, "loss_ce": 0.008728873915970325, "loss_xval": 0.039306640625, "num_input_tokens_seen": 256715976, "step": 1741 }, { "epoch": 0.5021620063418852, "grad_norm": 1.33807983469355, "learning_rate": 0.0001, "loss": 0.0034, "num_input_tokens_seen": 256851024, "step": 1742 }, { "epoch": 0.5021620063418852, "loss": 0.003706869902089238, "loss_ce": 0.0002393101021880284, "loss_xval": 0.0034637451171875, "num_input_tokens_seen": 256851024, "step": 1742 }, { "epoch": 0.5024502738541367, "grad_norm": 10.614122739332574, "learning_rate": 0.0001, "loss": 0.0328, "num_input_tokens_seen": 257023632, "step": 1743 }, { "epoch": 0.5024502738541367, "loss": 0.027926770970225334, "loss_ce": 0.00026258552679792047, "loss_xval": 0.0277099609375, "num_input_tokens_seen": 257023632, "step": 1743 }, { "epoch": 0.502738541366388, "grad_norm": 7.657015223009477, "learning_rate": 0.0001, "loss": 0.0198, "num_input_tokens_seen": 257158440, "step": 1744 }, { "epoch": 0.502738541366388, "loss": 0.02629696950316429, "loss_ce": 0.007711765356361866, "loss_xval": 0.0185546875, "num_input_tokens_seen": 257158440, "step": 1744 }, { "epoch": 0.5030268088786394, "grad_norm": 2.2808369659423535, "learning_rate": 0.0001, "loss": 0.0023, "num_input_tokens_seen": 257293408, "step": 1745 }, { "epoch": 0.5030268088786394, "loss": 0.0011452080216258764, "loss_ce": 0.0002578140702098608, "loss_xval": 0.000888824462890625, "num_input_tokens_seen": 257293408, "step": 1745 }, { "epoch": 0.5033150763908908, "grad_norm": 8.013615070959606, "learning_rate": 0.0001, "loss": 0.0228, "num_input_tokens_seen": 257465864, "step": 1746 }, { "epoch": 0.5033150763908908, "loss": 0.01814066618680954, "loss_ce": 0.00030314087052829564, "loss_xval": 0.017822265625, "num_input_tokens_seen": 257465864, "step": 1746 }, { "epoch": 0.5036033439031421, "grad_norm": 3.457585227141799, "learning_rate": 0.0001, "loss": 0.0069, "num_input_tokens_seen": 257600728, "step": 1747 }, { "epoch": 0.5036033439031421, "loss": 0.011352559551596642, "loss_ce": 0.005506536923348904, "loss_xval": 0.005859375, "num_input_tokens_seen": 257600728, "step": 1747 }, { "epoch": 0.5038916114153935, "grad_norm": 5.38481600143461, "learning_rate": 0.0001, "loss": 0.0087, "num_input_tokens_seen": 257735848, "step": 1748 }, { "epoch": 0.5038916114153935, "loss": 0.005389847327023745, "loss_ce": 0.00027815281646326184, "loss_xval": 0.005126953125, "num_input_tokens_seen": 257735848, "step": 1748 }, { "epoch": 0.5041798789276448, "grad_norm": 6.642525855981442, "learning_rate": 0.0001, "loss": 0.0169, "num_input_tokens_seen": 257908416, "step": 1749 }, { "epoch": 0.5041798789276448, "loss": 0.01205461099743843, "loss_ce": 0.00032060168450698256, "loss_xval": 0.01171875, "num_input_tokens_seen": 257908416, "step": 1749 }, { "epoch": 0.5044681464398962, "grad_norm": 1.1926943126740923, "learning_rate": 0.0001, "loss": 0.0057, "num_input_tokens_seen": 258043136, "step": 1750 }, { "epoch": 0.5044681464398962, "eval_websight_new_IoU": 0.0, "eval_websight_new_MAE_x": 0.08808813989162445, "eval_websight_new_MAE_y": 0.08523305132985115, "eval_websight_new_NUM_probability": 0.99669548869133, "eval_websight_new_inside_bbox": 0.0, "eval_websight_new_loss": 0.013562072068452835, "eval_websight_new_loss_ce": 0.00028378186107147485, "eval_websight_new_loss_xval": 0.01325225830078125, "eval_websight_new_runtime": 35.8548, "eval_websight_new_samples_per_second": 1.395, "eval_websight_new_steps_per_second": 0.056, "num_input_tokens_seen": 258043136, "step": 1750 }, { "epoch": 0.5044681464398962, "eval_seeclick_IoU": 0.006532402941957116, "eval_seeclick_MAE_x": 0.13058312609791756, "eval_seeclick_MAE_y": 0.1318240687251091, "eval_seeclick_NUM_probability": 0.9967434406280518, "eval_seeclick_inside_bbox": 0.0711805559694767, "eval_seeclick_loss": 0.03064095973968506, "eval_seeclick_loss_ce": 0.00829253252595663, "eval_seeclick_loss_xval": 0.02297210693359375, "eval_seeclick_runtime": 63.4085, "eval_seeclick_samples_per_second": 0.789, "eval_seeclick_steps_per_second": 0.032, "num_input_tokens_seen": 258043136, "step": 1750 }, { "epoch": 0.5044681464398962, "eval_icons_IoU": 0.0, "eval_icons_MAE_x": 0.11214055120944977, "eval_icons_MAE_y": 0.1266588643193245, "eval_icons_NUM_probability": 0.9963801205158234, "eval_icons_inside_bbox": 0.0, "eval_icons_loss": 0.028072558343410492, "eval_icons_loss_ce": 0.004345909343101084, "eval_icons_loss_xval": 0.02297210693359375, "eval_icons_runtime": 66.7092, "eval_icons_samples_per_second": 0.75, "eval_icons_steps_per_second": 0.03, "num_input_tokens_seen": 258043136, "step": 1750 }, { "epoch": 0.5044681464398962, "loss": 0.02850811555981636, "loss_ce": 0.005391049198806286, "loss_xval": 0.0230712890625, "num_input_tokens_seen": 258043136, "step": 1750 }, { "epoch": 0.5047564139521475, "grad_norm": 7.831924408909281, "learning_rate": 0.0001, "loss": 0.0184, "num_input_tokens_seen": 258178240, "step": 1751 }, { "epoch": 0.5047564139521475, "loss": 0.013527311384677887, "loss_ce": 0.0002903116401284933, "loss_xval": 0.01324462890625, "num_input_tokens_seen": 258178240, "step": 1751 }, { "epoch": 0.505044681464399, "grad_norm": 4.176178068633609, "learning_rate": 0.0001, "loss": 0.0089, "num_input_tokens_seen": 258350680, "step": 1752 }, { "epoch": 0.505044681464399, "loss": 0.006143480539321899, "loss_ce": 0.0004366936336737126, "loss_xval": 0.005706787109375, "num_input_tokens_seen": 258350680, "step": 1752 }, { "epoch": 0.5053329489766504, "grad_norm": 5.643783467526188, "learning_rate": 0.0001, "loss": 0.014, "num_input_tokens_seen": 258485472, "step": 1753 }, { "epoch": 0.5053329489766504, "loss": 0.014654072932898998, "loss_ce": 0.008191976696252823, "loss_xval": 0.0064697265625, "num_input_tokens_seen": 258485472, "step": 1753 }, { "epoch": 0.5056212164889017, "grad_norm": 8.477634537026859, "learning_rate": 0.0001, "loss": 0.0205, "num_input_tokens_seen": 258620400, "step": 1754 }, { "epoch": 0.5056212164889017, "loss": 0.01704748347401619, "loss_ce": 0.00027044457965530455, "loss_xval": 0.0167236328125, "num_input_tokens_seen": 258620400, "step": 1754 }, { "epoch": 0.5059094840011531, "grad_norm": 0.35876812800575075, "learning_rate": 0.0001, "loss": 0.0048, "num_input_tokens_seen": 258792904, "step": 1755 }, { "epoch": 0.5059094840011531, "loss": 0.0015499168075621128, "loss_ce": 0.00030537188285961747, "loss_xval": 0.00124359130859375, "num_input_tokens_seen": 258792904, "step": 1755 }, { "epoch": 0.5061977515134044, "grad_norm": 9.14907245606204, "learning_rate": 0.0001, "loss": 0.0295, "num_input_tokens_seen": 258927752, "step": 1756 }, { "epoch": 0.5061977515134044, "loss": 0.02945183776319027, "loss_ce": 0.01081322692334652, "loss_xval": 0.0186767578125, "num_input_tokens_seen": 258927752, "step": 1756 }, { "epoch": 0.5064860190256558, "grad_norm": 6.475913983913407, "learning_rate": 0.0001, "loss": 0.0124, "num_input_tokens_seen": 259063000, "step": 1757 }, { "epoch": 0.5064860190256558, "loss": 0.010992877185344696, "loss_ce": 0.00026594821247272193, "loss_xval": 0.0107421875, "num_input_tokens_seen": 259063000, "step": 1757 }, { "epoch": 0.5067742865379071, "grad_norm": 5.060122557936152, "learning_rate": 0.0001, "loss": 0.0132, "num_input_tokens_seen": 259235464, "step": 1758 }, { "epoch": 0.5067742865379071, "loss": 0.0065870825201272964, "loss_ce": 0.00031190545996651053, "loss_xval": 0.00628662109375, "num_input_tokens_seen": 259235464, "step": 1758 }, { "epoch": 0.5070625540501585, "grad_norm": 10.671722543873086, "learning_rate": 0.0001, "loss": 0.0356, "num_input_tokens_seen": 259370384, "step": 1759 }, { "epoch": 0.5070625540501585, "loss": 0.033719681203365326, "loss_ce": 0.0073677548207342625, "loss_xval": 0.0263671875, "num_input_tokens_seen": 259370384, "step": 1759 }, { "epoch": 0.50735082156241, "grad_norm": 3.14861007606756, "learning_rate": 0.0001, "loss": 0.0038, "num_input_tokens_seen": 259505544, "step": 1760 }, { "epoch": 0.50735082156241, "loss": 0.003241258906200528, "loss_ce": 0.0002581655571702868, "loss_xval": 0.00299072265625, "num_input_tokens_seen": 259505544, "step": 1760 }, { "epoch": 0.5076390890746613, "grad_norm": 8.30325635822721, "learning_rate": 0.0001, "loss": 0.0248, "num_input_tokens_seen": 259677936, "step": 1761 }, { "epoch": 0.5076390890746613, "loss": 0.016355641186237335, "loss_ce": 0.0003110232937615365, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 259677936, "step": 1761 }, { "epoch": 0.5079273565869127, "grad_norm": 9.637160112658565, "learning_rate": 0.0001, "loss": 0.03, "num_input_tokens_seen": 259812696, "step": 1762 }, { "epoch": 0.5079273565869127, "loss": 0.028667747974395752, "loss_ce": 0.006618797779083252, "loss_xval": 0.0220947265625, "num_input_tokens_seen": 259812696, "step": 1762 }, { "epoch": 0.508215624099164, "grad_norm": 0.7828280927300808, "learning_rate": 0.0001, "loss": 0.0013, "num_input_tokens_seen": 259947800, "step": 1763 }, { "epoch": 0.508215624099164, "loss": 0.0010191877372562885, "loss_ce": 0.0002243001654278487, "loss_xval": 0.00079345703125, "num_input_tokens_seen": 259947800, "step": 1763 }, { "epoch": 0.5085038916114154, "grad_norm": 10.565618258218029, "learning_rate": 0.0001, "loss": 0.036, "num_input_tokens_seen": 260120256, "step": 1764 }, { "epoch": 0.5085038916114154, "loss": 0.027929434552788734, "loss_ce": 0.000417837843997404, "loss_xval": 0.0274658203125, "num_input_tokens_seen": 260120256, "step": 1764 }, { "epoch": 0.5087921591236667, "grad_norm": 7.7345371447851194, "learning_rate": 0.0001, "loss": 0.0213, "num_input_tokens_seen": 260255000, "step": 1765 }, { "epoch": 0.5087921591236667, "loss": 0.0220201276242733, "loss_ce": 0.0070436252281069756, "loss_xval": 0.01495361328125, "num_input_tokens_seen": 260255000, "step": 1765 }, { "epoch": 0.5090804266359181, "grad_norm": 4.785958281539146, "learning_rate": 0.0001, "loss": 0.0075, "num_input_tokens_seen": 260389984, "step": 1766 }, { "epoch": 0.5090804266359181, "loss": 0.006561644375324249, "loss_ce": 0.00021780251699965447, "loss_xval": 0.00634765625, "num_input_tokens_seen": 260389984, "step": 1766 }, { "epoch": 0.5093686941481695, "grad_norm": 12.14625787907636, "learning_rate": 0.0001, "loss": 0.0455, "num_input_tokens_seen": 260562448, "step": 1767 }, { "epoch": 0.5093686941481695, "loss": 0.03634582459926605, "loss_ce": 0.0002740452764555812, "loss_xval": 0.0361328125, "num_input_tokens_seen": 260562448, "step": 1767 }, { "epoch": 0.5096569616604208, "grad_norm": 4.580556944591601, "learning_rate": 0.0001, "loss": 0.0113, "num_input_tokens_seen": 260697216, "step": 1768 }, { "epoch": 0.5096569616604208, "loss": 0.013978520408272743, "loss_ce": 0.008794346824288368, "loss_xval": 0.00518798828125, "num_input_tokens_seen": 260697216, "step": 1768 }, { "epoch": 0.5099452291726723, "grad_norm": 9.088690506902278, "learning_rate": 0.0001, "loss": 0.0248, "num_input_tokens_seen": 260832144, "step": 1769 }, { "epoch": 0.5099452291726723, "loss": 0.022012032568454742, "loss_ce": 0.00028351714718155563, "loss_xval": 0.021728515625, "num_input_tokens_seen": 260832144, "step": 1769 }, { "epoch": 0.5102334966849236, "grad_norm": 11.86962561681491, "learning_rate": 0.0001, "loss": 0.0466, "num_input_tokens_seen": 261004736, "step": 1770 }, { "epoch": 0.5102334966849236, "loss": 0.036182355135679245, "loss_ce": 0.000751446234062314, "loss_xval": 0.035400390625, "num_input_tokens_seen": 261004736, "step": 1770 }, { "epoch": 0.510521764197175, "grad_norm": 0.4202323598180629, "learning_rate": 0.0001, "loss": 0.0042, "num_input_tokens_seen": 261139512, "step": 1771 }, { "epoch": 0.510521764197175, "loss": 0.00725182332098484, "loss_ce": 0.006140792742371559, "loss_xval": 0.0011138916015625, "num_input_tokens_seen": 261139512, "step": 1771 }, { "epoch": 0.5108100317094263, "grad_norm": 12.340846024432985, "learning_rate": 0.0001, "loss": 0.0462, "num_input_tokens_seen": 261274520, "step": 1772 }, { "epoch": 0.5108100317094263, "loss": 0.04091305285692215, "loss_ce": 0.00023312387929763645, "loss_xval": 0.040771484375, "num_input_tokens_seen": 261274520, "step": 1772 }, { "epoch": 0.5110982992216777, "grad_norm": 9.083396449002233, "learning_rate": 0.0001, "loss": 0.0313, "num_input_tokens_seen": 261446968, "step": 1773 }, { "epoch": 0.5110982992216777, "loss": 0.020383324474096298, "loss_ce": 0.00030275824246928096, "loss_xval": 0.02001953125, "num_input_tokens_seen": 261446968, "step": 1773 }, { "epoch": 0.5113865667339291, "grad_norm": 5.6260253773587685, "learning_rate": 0.0001, "loss": 0.013, "num_input_tokens_seen": 261581808, "step": 1774 }, { "epoch": 0.5113865667339291, "loss": 0.01707657054066658, "loss_ce": 0.005991059355437756, "loss_xval": 0.0111083984375, "num_input_tokens_seen": 261581808, "step": 1774 }, { "epoch": 0.5116748342461804, "grad_norm": 13.279502917108802, "learning_rate": 0.0001, "loss": 0.0533, "num_input_tokens_seen": 261716872, "step": 1775 }, { "epoch": 0.5116748342461804, "loss": 0.04808903858065605, "loss_ce": 0.0002374750911258161, "loss_xval": 0.0478515625, "num_input_tokens_seen": 261716872, "step": 1775 }, { "epoch": 0.5119631017584318, "grad_norm": 4.9284076730976185, "learning_rate": 0.0001, "loss": 0.0114, "num_input_tokens_seen": 261889328, "step": 1776 }, { "epoch": 0.5119631017584318, "loss": 0.0062470389530062675, "loss_ce": 0.00025796430418267846, "loss_xval": 0.0059814453125, "num_input_tokens_seen": 261889328, "step": 1776 }, { "epoch": 0.5122513692706832, "grad_norm": 8.774623567615105, "learning_rate": 0.0001, "loss": 0.0281, "num_input_tokens_seen": 262024152, "step": 1777 }, { "epoch": 0.5122513692706832, "loss": 0.03278300166130066, "loss_ce": 0.00882670283317566, "loss_xval": 0.02392578125, "num_input_tokens_seen": 262024152, "step": 1777 }, { "epoch": 0.5125396367829346, "grad_norm": 10.381916712519102, "learning_rate": 0.0001, "loss": 0.0332, "num_input_tokens_seen": 262159096, "step": 1778 }, { "epoch": 0.5125396367829346, "loss": 0.030212176963686943, "loss_ce": 0.0003202082007192075, "loss_xval": 0.0299072265625, "num_input_tokens_seen": 262159096, "step": 1778 }, { "epoch": 0.512827904295186, "grad_norm": 1.1520748376796106, "learning_rate": 0.0001, "loss": 0.0058, "num_input_tokens_seen": 262331560, "step": 1779 }, { "epoch": 0.512827904295186, "loss": 0.001818220131099224, "loss_ce": 0.00024370371829718351, "loss_xval": 0.0015716552734375, "num_input_tokens_seen": 262331560, "step": 1779 }, { "epoch": 0.5131161718074373, "grad_norm": 10.169366738375263, "learning_rate": 0.0001, "loss": 0.0374, "num_input_tokens_seen": 262466256, "step": 1780 }, { "epoch": 0.5131161718074373, "loss": 0.0423487164080143, "loss_ce": 0.010671471245586872, "loss_xval": 0.03173828125, "num_input_tokens_seen": 262466256, "step": 1780 }, { "epoch": 0.5134044393196887, "grad_norm": 5.603413067960133, "learning_rate": 0.0001, "loss": 0.0103, "num_input_tokens_seen": 262601296, "step": 1781 }, { "epoch": 0.5134044393196887, "loss": 0.009353495202958584, "loss_ce": 0.0002134803362423554, "loss_xval": 0.0091552734375, "num_input_tokens_seen": 262601296, "step": 1781 }, { "epoch": 0.51369270683194, "grad_norm": 5.697274476706217, "learning_rate": 0.0001, "loss": 0.0151, "num_input_tokens_seen": 262773776, "step": 1782 }, { "epoch": 0.51369270683194, "loss": 0.01040747668594122, "loss_ce": 0.0002374936593696475, "loss_xval": 0.01019287109375, "num_input_tokens_seen": 262773776, "step": 1782 }, { "epoch": 0.5139809743441914, "grad_norm": 9.185995857737648, "learning_rate": 0.0001, "loss": 0.0307, "num_input_tokens_seen": 262908520, "step": 1783 }, { "epoch": 0.5139809743441914, "loss": 0.03518703579902649, "loss_ce": 0.008499410934746265, "loss_xval": 0.0267333984375, "num_input_tokens_seen": 262908520, "step": 1783 }, { "epoch": 0.5142692418564427, "grad_norm": 0.980429739430197, "learning_rate": 0.0001, "loss": 0.0013, "num_input_tokens_seen": 263043456, "step": 1784 }, { "epoch": 0.5142692418564427, "loss": 0.0013816391583532095, "loss_ce": 0.00020671238598879427, "loss_xval": 0.0011749267578125, "num_input_tokens_seen": 263043456, "step": 1784 }, { "epoch": 0.5145575093686942, "grad_norm": 7.476174623428016, "learning_rate": 0.0001, "loss": 0.0216, "num_input_tokens_seen": 263216024, "step": 1785 }, { "epoch": 0.5145575093686942, "loss": 0.01764541119337082, "loss_ce": 0.00025039329193532467, "loss_xval": 0.017333984375, "num_input_tokens_seen": 263216024, "step": 1785 }, { "epoch": 0.5148457768809456, "grad_norm": 5.115796542754154, "learning_rate": 0.0001, "loss": 0.0133, "num_input_tokens_seen": 263350784, "step": 1786 }, { "epoch": 0.5148457768809456, "loss": 0.018444210290908813, "loss_ce": 0.008831174112856388, "loss_xval": 0.0096435546875, "num_input_tokens_seen": 263350784, "step": 1786 }, { "epoch": 0.5151340443931969, "grad_norm": 3.797885601558983, "learning_rate": 0.0001, "loss": 0.0056, "num_input_tokens_seen": 263485776, "step": 1787 }, { "epoch": 0.5151340443931969, "loss": 0.004911388270556927, "loss_ce": 0.00021740313968621194, "loss_xval": 0.00469970703125, "num_input_tokens_seen": 263485776, "step": 1787 }, { "epoch": 0.5154223119054483, "grad_norm": 6.4852572950830485, "learning_rate": 0.0001, "loss": 0.0177, "num_input_tokens_seen": 263658240, "step": 1788 }, { "epoch": 0.5154223119054483, "loss": 0.01347002387046814, "loss_ce": 0.0003093187406193465, "loss_xval": 0.01318359375, "num_input_tokens_seen": 263658240, "step": 1788 }, { "epoch": 0.5157105794176996, "grad_norm": 0.36006105128606547, "learning_rate": 0.0001, "loss": 0.0046, "num_input_tokens_seen": 263793016, "step": 1789 }, { "epoch": 0.5157105794176996, "loss": 0.008257750421762466, "loss_ce": 0.007303599268198013, "loss_xval": 0.00095367431640625, "num_input_tokens_seen": 263793016, "step": 1789 }, { "epoch": 0.515998846929951, "grad_norm": 6.51905227217711, "learning_rate": 0.0001, "loss": 0.0139, "num_input_tokens_seen": 263928216, "step": 1790 }, { "epoch": 0.515998846929951, "loss": 0.012620228342711926, "loss_ce": 0.00019957451149821281, "loss_xval": 0.012451171875, "num_input_tokens_seen": 263928216, "step": 1790 }, { "epoch": 0.5162871144422023, "grad_norm": 3.3272530057976772, "learning_rate": 0.0001, "loss": 0.0085, "num_input_tokens_seen": 264100696, "step": 1791 }, { "epoch": 0.5162871144422023, "loss": 0.005464356858283281, "loss_ce": 0.0015428480692207813, "loss_xval": 0.00390625, "num_input_tokens_seen": 264100696, "step": 1791 }, { "epoch": 0.5165753819544537, "grad_norm": 4.536962600288751, "learning_rate": 0.0001, "loss": 0.0141, "num_input_tokens_seen": 264235528, "step": 1792 }, { "epoch": 0.5165753819544537, "loss": 0.01935257576406002, "loss_ce": 0.01350846141576767, "loss_xval": 0.005859375, "num_input_tokens_seen": 264235528, "step": 1792 }, { "epoch": 0.5168636494667052, "grad_norm": 6.180124664142906, "learning_rate": 0.0001, "loss": 0.0128, "num_input_tokens_seen": 264370616, "step": 1793 }, { "epoch": 0.5168636494667052, "loss": 0.01103239506483078, "loss_ce": 0.0001910249120555818, "loss_xval": 0.0108642578125, "num_input_tokens_seen": 264370616, "step": 1793 }, { "epoch": 0.5171519169789565, "grad_norm": 0.7206910189063704, "learning_rate": 0.0001, "loss": 0.0053, "num_input_tokens_seen": 264543200, "step": 1794 }, { "epoch": 0.5171519169789565, "loss": 0.001006332109682262, "loss_ce": 0.00018807948799803853, "loss_xval": 0.00081634521484375, "num_input_tokens_seen": 264543200, "step": 1794 }, { "epoch": 0.5174401844912079, "grad_norm": 7.206321887425217, "learning_rate": 0.0001, "loss": 0.0217, "num_input_tokens_seen": 264678144, "step": 1795 }, { "epoch": 0.5174401844912079, "loss": 0.02243190072476864, "loss_ce": 0.007913162931799889, "loss_xval": 0.0145263671875, "num_input_tokens_seen": 264678144, "step": 1795 }, { "epoch": 0.5177284520034592, "grad_norm": 4.848419346001407, "learning_rate": 0.0001, "loss": 0.0082, "num_input_tokens_seen": 264813280, "step": 1796 }, { "epoch": 0.5177284520034592, "loss": 0.007033906411379576, "loss_ce": 0.0008197647985070944, "loss_xval": 0.0062255859375, "num_input_tokens_seen": 264813280, "step": 1796 }, { "epoch": 0.5180167195157106, "grad_norm": 3.7155723030374883, "learning_rate": 0.0001, "loss": 0.0092, "num_input_tokens_seen": 264985632, "step": 1797 }, { "epoch": 0.5180167195157106, "loss": 0.004432442598044872, "loss_ce": 0.00022864610946271569, "loss_xval": 0.00421142578125, "num_input_tokens_seen": 264985632, "step": 1797 }, { "epoch": 0.5183049870279619, "grad_norm": 7.999785529658378, "learning_rate": 0.0001, "loss": 0.0261, "num_input_tokens_seen": 265120480, "step": 1798 }, { "epoch": 0.5183049870279619, "loss": 0.029360122978687286, "loss_ce": 0.011392900720238686, "loss_xval": 0.0179443359375, "num_input_tokens_seen": 265120480, "step": 1798 }, { "epoch": 0.5185932545402133, "grad_norm": 2.7260965333996285, "learning_rate": 0.0001, "loss": 0.0039, "num_input_tokens_seen": 265255616, "step": 1799 }, { "epoch": 0.5185932545402133, "loss": 0.0042513152584433556, "loss_ce": 0.0022514602169394493, "loss_xval": 0.0019989013671875, "num_input_tokens_seen": 265255616, "step": 1799 }, { "epoch": 0.5188815220524647, "grad_norm": 5.770778443890454, "learning_rate": 0.0001, "loss": 0.0149, "num_input_tokens_seen": 265428080, "step": 1800 }, { "epoch": 0.5188815220524647, "loss": 0.009834205731749535, "loss_ce": 0.00018302083481103182, "loss_xval": 0.0096435546875, "num_input_tokens_seen": 265428080, "step": 1800 }, { "epoch": 0.519169789564716, "grad_norm": 7.326870513573077, "learning_rate": 0.0001, "loss": 0.0205, "num_input_tokens_seen": 265562848, "step": 1801 }, { "epoch": 0.519169789564716, "loss": 0.021131791174411774, "loss_ce": 0.006170548032969236, "loss_xval": 0.01495361328125, "num_input_tokens_seen": 265562848, "step": 1801 }, { "epoch": 0.5194580570769675, "grad_norm": 0.2707278759222347, "learning_rate": 0.0001, "loss": 0.0006, "num_input_tokens_seen": 265697912, "step": 1802 }, { "epoch": 0.5194580570769675, "loss": 0.0003889158251695335, "loss_ce": 0.00016402750043198466, "loss_xval": 0.000225067138671875, "num_input_tokens_seen": 265697912, "step": 1802 }, { "epoch": 0.5197463245892188, "grad_norm": 7.100925150836463, "learning_rate": 0.0001, "loss": 0.0215, "num_input_tokens_seen": 265870368, "step": 1803 }, { "epoch": 0.5197463245892188, "loss": 0.014973396435379982, "loss_ce": 0.00018763082334771752, "loss_xval": 0.0147705078125, "num_input_tokens_seen": 265870368, "step": 1803 }, { "epoch": 0.5200345921014702, "grad_norm": 6.053470067030953, "learning_rate": 0.0001, "loss": 0.0168, "num_input_tokens_seen": 266005056, "step": 1804 }, { "epoch": 0.5200345921014702, "loss": 0.019502371549606323, "loss_ce": 0.008950917981564999, "loss_xval": 0.01055908203125, "num_input_tokens_seen": 266005056, "step": 1804 }, { "epoch": 0.5203228596137215, "grad_norm": 2.057397577661355, "learning_rate": 0.0001, "loss": 0.0019, "num_input_tokens_seen": 266140120, "step": 1805 }, { "epoch": 0.5203228596137215, "loss": 0.001837803632952273, "loss_ce": 0.00018031768559012562, "loss_xval": 0.00165557861328125, "num_input_tokens_seen": 266140120, "step": 1805 }, { "epoch": 0.5206111271259729, "grad_norm": 7.649635337597759, "learning_rate": 0.0001, "loss": 0.0226, "num_input_tokens_seen": 266312648, "step": 1806 }, { "epoch": 0.5206111271259729, "loss": 0.017032187432050705, "loss_ce": 0.00018648413242772222, "loss_xval": 0.016845703125, "num_input_tokens_seen": 266312648, "step": 1806 }, { "epoch": 0.5208993946382243, "grad_norm": 4.2351822034209405, "learning_rate": 0.0001, "loss": 0.0102, "num_input_tokens_seen": 266447576, "step": 1807 }, { "epoch": 0.5208993946382243, "loss": 0.013426562771201134, "loss_ce": 0.007937213405966759, "loss_xval": 0.0054931640625, "num_input_tokens_seen": 266447576, "step": 1807 }, { "epoch": 0.5211876621504756, "grad_norm": 4.046343404175255, "learning_rate": 0.0001, "loss": 0.0057, "num_input_tokens_seen": 266582712, "step": 1808 }, { "epoch": 0.5211876621504756, "loss": 0.005520365200936794, "loss_ce": 0.00015308592992369086, "loss_xval": 0.00537109375, "num_input_tokens_seen": 266582712, "step": 1808 }, { "epoch": 0.521475929662727, "grad_norm": 7.701796112055166, "learning_rate": 0.0001, "loss": 0.0244, "num_input_tokens_seen": 266755160, "step": 1809 }, { "epoch": 0.521475929662727, "loss": 0.018067223951220512, "loss_ce": 0.00024495949037373066, "loss_xval": 0.017822265625, "num_input_tokens_seen": 266755160, "step": 1809 }, { "epoch": 0.5217641971749784, "grad_norm": 2.614532008892686, "learning_rate": 0.0001, "loss": 0.0074, "num_input_tokens_seen": 266889984, "step": 1810 }, { "epoch": 0.5217641971749784, "loss": 0.011933187022805214, "loss_ce": 0.009657721035182476, "loss_xval": 0.0022735595703125, "num_input_tokens_seen": 266889984, "step": 1810 }, { "epoch": 0.5220524646872298, "grad_norm": 5.425181300265865, "learning_rate": 0.0001, "loss": 0.0106, "num_input_tokens_seen": 267025000, "step": 1811 }, { "epoch": 0.5220524646872298, "loss": 0.009021684527397156, "loss_ce": 0.00024025217862799764, "loss_xval": 0.0087890625, "num_input_tokens_seen": 267025000, "step": 1811 }, { "epoch": 0.5223407321994811, "grad_norm": 7.369734476588075, "learning_rate": 0.0001, "loss": 0.0236, "num_input_tokens_seen": 267197600, "step": 1812 }, { "epoch": 0.5223407321994811, "loss": 0.01665113866329193, "loss_ce": 0.00018690709839574993, "loss_xval": 0.0164794921875, "num_input_tokens_seen": 267197600, "step": 1812 }, { "epoch": 0.5226289997117325, "grad_norm": 1.0926084372171434, "learning_rate": 0.0001, "loss": 0.0045, "num_input_tokens_seen": 267332288, "step": 1813 }, { "epoch": 0.5226289997117325, "loss": 0.008218837901949883, "loss_ce": 0.007448745891451836, "loss_xval": 0.00077056884765625, "num_input_tokens_seen": 267332288, "step": 1813 }, { "epoch": 0.5229172672239839, "grad_norm": 6.322536422783003, "learning_rate": 0.0001, "loss": 0.0137, "num_input_tokens_seen": 267467416, "step": 1814 }, { "epoch": 0.5229172672239839, "loss": 0.011986615136265755, "loss_ce": 0.0001534248294774443, "loss_xval": 0.0118408203125, "num_input_tokens_seen": 267467416, "step": 1814 }, { "epoch": 0.5232055347362352, "grad_norm": 6.580074951937422, "learning_rate": 0.0001, "loss": 0.0192, "num_input_tokens_seen": 267640160, "step": 1815 }, { "epoch": 0.5232055347362352, "loss": 0.013704424723982811, "loss_ce": 0.00020039669470861554, "loss_xval": 0.01348876953125, "num_input_tokens_seen": 267640160, "step": 1815 }, { "epoch": 0.5234938022484866, "grad_norm": 0.5802635231365518, "learning_rate": 0.0001, "loss": 0.0048, "num_input_tokens_seen": 267774944, "step": 1816 }, { "epoch": 0.5234938022484866, "loss": 0.008905167691409588, "loss_ce": 0.007684941403567791, "loss_xval": 0.001220703125, "num_input_tokens_seen": 267774944, "step": 1816 }, { "epoch": 0.5237820697607379, "grad_norm": 6.992165248213569, "learning_rate": 0.0001, "loss": 0.017, "num_input_tokens_seen": 267909920, "step": 1817 }, { "epoch": 0.5237820697607379, "loss": 0.014009732753038406, "loss_ce": 0.00015475264808628708, "loss_xval": 0.01385498046875, "num_input_tokens_seen": 267909920, "step": 1817 }, { "epoch": 0.5240703372729894, "grad_norm": 5.715831527574991, "learning_rate": 0.0001, "loss": 0.0151, "num_input_tokens_seen": 268082416, "step": 1818 }, { "epoch": 0.5240703372729894, "loss": 0.010570320300757885, "loss_ce": 0.0001714556710794568, "loss_xval": 0.0103759765625, "num_input_tokens_seen": 268082416, "step": 1818 }, { "epoch": 0.5243586047852407, "grad_norm": 2.103081581176938, "learning_rate": 0.0001, "loss": 0.0061, "num_input_tokens_seen": 268217104, "step": 1819 }, { "epoch": 0.5243586047852407, "loss": 0.009669620543718338, "loss_ce": 0.008071262389421463, "loss_xval": 0.0016021728515625, "num_input_tokens_seen": 268217104, "step": 1819 }, { "epoch": 0.5246468722974921, "grad_norm": 7.97819003535755, "learning_rate": 0.0001, "loss": 0.0222, "num_input_tokens_seen": 268352232, "step": 1820 }, { "epoch": 0.5246468722974921, "loss": 0.019121186807751656, "loss_ce": 0.00018502911552786827, "loss_xval": 0.0189208984375, "num_input_tokens_seen": 268352232, "step": 1820 }, { "epoch": 0.5249351398097435, "grad_norm": 5.277971127917007, "learning_rate": 0.0001, "loss": 0.0138, "num_input_tokens_seen": 268524752, "step": 1821 }, { "epoch": 0.5249351398097435, "loss": 0.009244978427886963, "loss_ce": 0.00015836962847970426, "loss_xval": 0.00909423828125, "num_input_tokens_seen": 268524752, "step": 1821 }, { "epoch": 0.5252234073219948, "grad_norm": 3.166177126377048, "learning_rate": 0.0001, "loss": 0.0085, "num_input_tokens_seen": 268659552, "step": 1822 }, { "epoch": 0.5252234073219948, "loss": 0.0116716418415308, "loss_ce": 0.008358577266335487, "loss_xval": 0.0033111572265625, "num_input_tokens_seen": 268659552, "step": 1822 }, { "epoch": 0.5255116748342462, "grad_norm": 8.588310727501373, "learning_rate": 0.0001, "loss": 0.0261, "num_input_tokens_seen": 268794736, "step": 1823 }, { "epoch": 0.5255116748342462, "loss": 0.0217589084059, "loss_ce": 0.00013720503193326294, "loss_xval": 0.0216064453125, "num_input_tokens_seen": 268794736, "step": 1823 }, { "epoch": 0.5257999423464975, "grad_norm": 5.4901507624366195, "learning_rate": 0.0001, "loss": 0.0146, "num_input_tokens_seen": 268967424, "step": 1824 }, { "epoch": 0.5257999423464975, "loss": 0.010655020363628864, "loss_ce": 0.0001798617740860209, "loss_xval": 0.010498046875, "num_input_tokens_seen": 268967424, "step": 1824 }, { "epoch": 0.5260882098587489, "grad_norm": 3.5760439539186324, "learning_rate": 0.0001, "loss": 0.0088, "num_input_tokens_seen": 269102152, "step": 1825 }, { "epoch": 0.5260882098587489, "loss": 0.011267166584730148, "loss_ce": 0.007673721294850111, "loss_xval": 0.00360107421875, "num_input_tokens_seen": 269102152, "step": 1825 }, { "epoch": 0.5263764773710002, "grad_norm": 8.878070935563086, "learning_rate": 0.0001, "loss": 0.0276, "num_input_tokens_seen": 269237248, "step": 1826 }, { "epoch": 0.5263764773710002, "loss": 0.022971712052822113, "loss_ce": 0.000144564313814044, "loss_xval": 0.0228271484375, "num_input_tokens_seen": 269237248, "step": 1826 }, { "epoch": 0.5266647448832517, "grad_norm": 4.414978131465267, "learning_rate": 0.0001, "loss": 0.0112, "num_input_tokens_seen": 269409816, "step": 1827 }, { "epoch": 0.5266647448832517, "loss": 0.007491076365113258, "loss_ce": 0.0002088190958602354, "loss_xval": 0.007293701171875, "num_input_tokens_seen": 269409816, "step": 1827 }, { "epoch": 0.5269530123955031, "grad_norm": 5.027340329287891, "learning_rate": 0.0001, "loss": 0.0123, "num_input_tokens_seen": 269544552, "step": 1828 }, { "epoch": 0.5269530123955031, "loss": 0.012437459081411362, "loss_ce": 0.005471821874380112, "loss_xval": 0.0069580078125, "num_input_tokens_seen": 269544552, "step": 1828 }, { "epoch": 0.5272412799077544, "grad_norm": 9.30005855051124, "learning_rate": 0.0001, "loss": 0.0303, "num_input_tokens_seen": 269679608, "step": 1829 }, { "epoch": 0.5272412799077544, "loss": 0.026024669408798218, "loss_ce": 0.0001457621401641518, "loss_xval": 0.02587890625, "num_input_tokens_seen": 269679608, "step": 1829 }, { "epoch": 0.5275295474200058, "grad_norm": 3.385395007624453, "learning_rate": 0.0001, "loss": 0.0085, "num_input_tokens_seen": 269852144, "step": 1830 }, { "epoch": 0.5275295474200058, "loss": 0.005543527193367481, "loss_ce": 0.00035553891211748123, "loss_xval": 0.00518798828125, "num_input_tokens_seen": 269852144, "step": 1830 }, { "epoch": 0.5278178149322571, "grad_norm": 6.672355098963897, "learning_rate": 0.0001, "loss": 0.0213, "num_input_tokens_seen": 269987008, "step": 1831 }, { "epoch": 0.5278178149322571, "loss": 0.02160380780696869, "loss_ce": 0.009183155372738838, "loss_xval": 0.012451171875, "num_input_tokens_seen": 269987008, "step": 1831 }, { "epoch": 0.5281060824445085, "grad_norm": 9.189129921555727, "learning_rate": 0.0001, "loss": 0.0302, "num_input_tokens_seen": 270122144, "step": 1832 }, { "epoch": 0.5281060824445085, "loss": 0.027190502732992172, "loss_ce": 0.00018244492821395397, "loss_xval": 0.0269775390625, "num_input_tokens_seen": 270122144, "step": 1832 }, { "epoch": 0.5283943499567598, "grad_norm": 0.8676142019276292, "learning_rate": 0.0001, "loss": 0.0057, "num_input_tokens_seen": 270294576, "step": 1833 }, { "epoch": 0.5283943499567598, "loss": 0.0018882546573877335, "loss_ce": 0.00017068708257284015, "loss_xval": 0.00171661376953125, "num_input_tokens_seen": 270294576, "step": 1833 }, { "epoch": 0.5286826174690112, "grad_norm": 8.93930668699493, "learning_rate": 0.0001, "loss": 0.0326, "num_input_tokens_seen": 270429344, "step": 1834 }, { "epoch": 0.5286826174690112, "loss": 0.02938574180006981, "loss_ce": 0.006665404886007309, "loss_xval": 0.022705078125, "num_input_tokens_seen": 270429344, "step": 1834 }, { "epoch": 0.5289708849812627, "grad_norm": 8.682313899729719, "learning_rate": 0.0001, "loss": 0.0271, "num_input_tokens_seen": 270564392, "step": 1835 }, { "epoch": 0.5289708849812627, "loss": 0.025439847260713577, "loss_ce": 0.00014077400555834174, "loss_xval": 0.0252685546875, "num_input_tokens_seen": 270564392, "step": 1835 }, { "epoch": 0.529259152493514, "grad_norm": 1.9778568650894202, "learning_rate": 0.0001, "loss": 0.0084, "num_input_tokens_seen": 270736976, "step": 1836 }, { "epoch": 0.529259152493514, "loss": 0.0010150633752346039, "loss_ce": 0.00016915427113417536, "loss_xval": 0.00084686279296875, "num_input_tokens_seen": 270736976, "step": 1836 }, { "epoch": 0.5295474200057654, "grad_norm": 11.147218175352355, "learning_rate": 0.0001, "loss": 0.0491, "num_input_tokens_seen": 270871816, "step": 1837 }, { "epoch": 0.5295474200057654, "loss": 0.04592829942703247, "loss_ce": 0.008117021061480045, "loss_xval": 0.037841796875, "num_input_tokens_seen": 270871816, "step": 1837 }, { "epoch": 0.5298356875180167, "grad_norm": 7.234562197358699, "learning_rate": 0.0001, "loss": 0.0202, "num_input_tokens_seen": 271006896, "step": 1838 }, { "epoch": 0.5298356875180167, "loss": 0.019972706213593483, "loss_ce": 0.001356984837912023, "loss_xval": 0.0185546875, "num_input_tokens_seen": 271006896, "step": 1838 }, { "epoch": 0.5301239550302681, "grad_norm": 6.134931735856935, "learning_rate": 0.0001, "loss": 0.0202, "num_input_tokens_seen": 271179408, "step": 1839 }, { "epoch": 0.5301239550302681, "loss": 0.008071623742580414, "loss_ce": 0.00021334676421247423, "loss_xval": 0.00787353515625, "num_input_tokens_seen": 271179408, "step": 1839 }, { "epoch": 0.5304122225425194, "grad_norm": 13.488095571657878, "learning_rate": 0.0001, "loss": 0.0714, "num_input_tokens_seen": 271314296, "step": 1840 }, { "epoch": 0.5304122225425194, "loss": 0.0717129334807396, "loss_ce": 0.008877238258719444, "loss_xval": 0.06298828125, "num_input_tokens_seen": 271314296, "step": 1840 }, { "epoch": 0.5307004900547708, "grad_norm": 4.802872837119547, "learning_rate": 0.0001, "loss": 0.0123, "num_input_tokens_seen": 271449488, "step": 1841 }, { "epoch": 0.5307004900547708, "loss": 0.012417205609381199, "loss_ce": 0.00016439828323200345, "loss_xval": 0.01226806640625, "num_input_tokens_seen": 271449488, "step": 1841 }, { "epoch": 0.5309887575670222, "grad_norm": 9.425082060032546, "learning_rate": 0.0001, "loss": 0.0396, "num_input_tokens_seen": 271621952, "step": 1842 }, { "epoch": 0.5309887575670222, "loss": 0.03309786319732666, "loss_ce": 0.00019991426961496472, "loss_xval": 0.032958984375, "num_input_tokens_seen": 271621952, "step": 1842 }, { "epoch": 0.5312770250792735, "grad_norm": 10.929547089579462, "learning_rate": 0.0001, "loss": 0.0485, "num_input_tokens_seen": 271756720, "step": 1843 }, { "epoch": 0.5312770250792735, "loss": 0.052716903388500214, "loss_ce": 0.008374866098165512, "loss_xval": 0.04443359375, "num_input_tokens_seen": 271756720, "step": 1843 }, { "epoch": 0.531565292591525, "grad_norm": 1.3531818049696558, "learning_rate": 0.0001, "loss": 0.002, "num_input_tokens_seen": 271891840, "step": 1844 }, { "epoch": 0.531565292591525, "loss": 0.0014669632073491812, "loss_ce": 0.00018427114991936833, "loss_xval": 0.00128173828125, "num_input_tokens_seen": 271891840, "step": 1844 }, { "epoch": 0.5318535601037763, "grad_norm": 12.025348100936883, "learning_rate": 0.0001, "loss": 0.0575, "num_input_tokens_seen": 272064320, "step": 1845 }, { "epoch": 0.5318535601037763, "loss": 0.04556536301970482, "loss_ce": 0.00021624250803142786, "loss_xval": 0.04541015625, "num_input_tokens_seen": 272064320, "step": 1845 }, { "epoch": 0.5321418276160277, "grad_norm": 8.611132853096239, "learning_rate": 0.0001, "loss": 0.0297, "num_input_tokens_seen": 272199200, "step": 1846 }, { "epoch": 0.5321418276160277, "loss": 0.032210852950811386, "loss_ce": 0.004851843696087599, "loss_xval": 0.02734375, "num_input_tokens_seen": 272199200, "step": 1846 }, { "epoch": 0.532430095128279, "grad_norm": 5.0086818591762965, "learning_rate": 0.0001, "loss": 0.0105, "num_input_tokens_seen": 272334240, "step": 1847 }, { "epoch": 0.532430095128279, "loss": 0.006865452043712139, "loss_ce": 0.00019736125250346959, "loss_xval": 0.00665283203125, "num_input_tokens_seen": 272334240, "step": 1847 }, { "epoch": 0.5327183626405304, "grad_norm": 11.640411204028348, "learning_rate": 0.0001, "loss": 0.0558, "num_input_tokens_seen": 272506904, "step": 1848 }, { "epoch": 0.5327183626405304, "loss": 0.04578360170125961, "loss_ce": 0.0002208554942626506, "loss_xval": 0.045654296875, "num_input_tokens_seen": 272506904, "step": 1848 }, { "epoch": 0.5330066301527818, "grad_norm": 2.439540588150794, "learning_rate": 0.0001, "loss": 0.0096, "num_input_tokens_seen": 272641704, "step": 1849 }, { "epoch": 0.5330066301527818, "loss": 0.016928596422076225, "loss_ce": 0.011700552888214588, "loss_xval": 0.005218505859375, "num_input_tokens_seen": 272641704, "step": 1849 }, { "epoch": 0.5332948976650331, "grad_norm": 10.359025160538259, "learning_rate": 0.0001, "loss": 0.0403, "num_input_tokens_seen": 272776880, "step": 1850 }, { "epoch": 0.5332948976650331, "loss": 0.030838388949632645, "loss_ce": 0.00021399807883426547, "loss_xval": 0.0306396484375, "num_input_tokens_seen": 272776880, "step": 1850 }, { "epoch": 0.5335831651772845, "grad_norm": 9.252019092841493, "learning_rate": 0.0001, "loss": 0.0363, "num_input_tokens_seen": 272949280, "step": 1851 }, { "epoch": 0.5335831651772845, "loss": 0.031272031366825104, "loss_ce": 0.0002966886095236987, "loss_xval": 0.031005859375, "num_input_tokens_seen": 272949280, "step": 1851 }, { "epoch": 0.5338714326895359, "grad_norm": 4.503404550676964, "learning_rate": 0.0001, "loss": 0.0125, "num_input_tokens_seen": 273084072, "step": 1852 }, { "epoch": 0.5338714326895359, "loss": 0.014995909295976162, "loss_ce": 0.007736540865153074, "loss_xval": 0.00726318359375, "num_input_tokens_seen": 273084072, "step": 1852 }, { "epoch": 0.5341597002017873, "grad_norm": 13.679304448516996, "learning_rate": 0.0001, "loss": 0.069, "num_input_tokens_seen": 273219208, "step": 1853 }, { "epoch": 0.5341597002017873, "loss": 0.05733451992273331, "loss_ce": 0.000205614764126949, "loss_xval": 0.05712890625, "num_input_tokens_seen": 273219208, "step": 1853 }, { "epoch": 0.5344479677140386, "grad_norm": 6.295374009468746, "learning_rate": 0.0001, "loss": 0.0209, "num_input_tokens_seen": 273391976, "step": 1854 }, { "epoch": 0.5344479677140386, "loss": 0.017546821385622025, "loss_ce": 0.002890755422413349, "loss_xval": 0.0146484375, "num_input_tokens_seen": 273391976, "step": 1854 }, { "epoch": 0.53473623522629, "grad_norm": 8.300922492017126, "learning_rate": 0.0001, "loss": 0.032, "num_input_tokens_seen": 273526776, "step": 1855 }, { "epoch": 0.53473623522629, "loss": 0.03134995698928833, "loss_ce": 0.01007920689880848, "loss_xval": 0.021240234375, "num_input_tokens_seen": 273526776, "step": 1855 }, { "epoch": 0.5350245027385414, "grad_norm": 14.090258374182888, "learning_rate": 0.0001, "loss": 0.0721, "num_input_tokens_seen": 273661712, "step": 1856 }, { "epoch": 0.5350245027385414, "loss": 0.06754553318023682, "loss_ce": 0.0002542752190493047, "loss_xval": 0.0673828125, "num_input_tokens_seen": 273661712, "step": 1856 }, { "epoch": 0.5353127702507927, "grad_norm": 4.217604913128783, "learning_rate": 0.0001, "loss": 0.0139, "num_input_tokens_seen": 273834160, "step": 1857 }, { "epoch": 0.5353127702507927, "loss": 0.011369526386260986, "loss_ce": 0.0009859215933829546, "loss_xval": 0.0103759765625, "num_input_tokens_seen": 273834160, "step": 1857 }, { "epoch": 0.5356010377630441, "grad_norm": 10.066890717856893, "learning_rate": 0.0001, "loss": 0.0434, "num_input_tokens_seen": 273969016, "step": 1858 }, { "epoch": 0.5356010377630441, "loss": 0.034813039004802704, "loss_ce": 0.006157036870718002, "loss_xval": 0.0286865234375, "num_input_tokens_seen": 273969016, "step": 1858 }, { "epoch": 0.5358893052752954, "grad_norm": 9.636156631624512, "learning_rate": 0.0001, "loss": 0.0347, "num_input_tokens_seen": 274104152, "step": 1859 }, { "epoch": 0.5358893052752954, "loss": 0.029770299792289734, "loss_ce": 0.00016824921476654708, "loss_xval": 0.029541015625, "num_input_tokens_seen": 274104152, "step": 1859 }, { "epoch": 0.5361775727875469, "grad_norm": 6.198292477325853, "learning_rate": 0.0001, "loss": 0.0202, "num_input_tokens_seen": 274276760, "step": 1860 }, { "epoch": 0.5361775727875469, "loss": 0.012494480237364769, "loss_ce": 0.0002264140930492431, "loss_xval": 0.01226806640625, "num_input_tokens_seen": 274276760, "step": 1860 }, { "epoch": 0.5364658402997982, "grad_norm": 13.371852634114951, "learning_rate": 0.0001, "loss": 0.0723, "num_input_tokens_seen": 274411536, "step": 1861 }, { "epoch": 0.5364658402997982, "loss": 0.06802873313426971, "loss_ce": 0.008672050200402737, "loss_xval": 0.059326171875, "num_input_tokens_seen": 274411536, "step": 1861 }, { "epoch": 0.5367541078120496, "grad_norm": 0.6687132978930205, "learning_rate": 0.0001, "loss": 0.0013, "num_input_tokens_seen": 274546536, "step": 1862 }, { "epoch": 0.5367541078120496, "loss": 0.0009921127930283546, "loss_ce": 0.00022941174393054098, "loss_xval": 0.000762939453125, "num_input_tokens_seen": 274546536, "step": 1862 }, { "epoch": 0.537042375324301, "grad_norm": 12.9414839972833, "learning_rate": 0.0001, "loss": 0.0686, "num_input_tokens_seen": 274719008, "step": 1863 }, { "epoch": 0.537042375324301, "loss": 0.06119033694267273, "loss_ce": 0.0002772501902654767, "loss_xval": 0.06103515625, "num_input_tokens_seen": 274719008, "step": 1863 }, { "epoch": 0.5373306428365523, "grad_norm": 6.5873752554268545, "learning_rate": 0.0001, "loss": 0.0216, "num_input_tokens_seen": 274853752, "step": 1864 }, { "epoch": 0.5373306428365523, "loss": 0.023443717509508133, "loss_ce": 0.0076966481283307076, "loss_xval": 0.0157470703125, "num_input_tokens_seen": 274853752, "step": 1864 }, { "epoch": 0.5376189103488037, "grad_norm": 9.18362001601729, "learning_rate": 0.0001, "loss": 0.0335, "num_input_tokens_seen": 274988920, "step": 1865 }, { "epoch": 0.5376189103488037, "loss": 0.03302977979183197, "loss_ce": 0.0003149337135255337, "loss_xval": 0.03271484375, "num_input_tokens_seen": 274988920, "step": 1865 }, { "epoch": 0.537907177861055, "grad_norm": 9.78562809125422, "learning_rate": 0.0001, "loss": 0.0415, "num_input_tokens_seen": 275161448, "step": 1866 }, { "epoch": 0.537907177861055, "loss": 0.03606872633099556, "loss_ce": 0.0004241948190610856, "loss_xval": 0.03564453125, "num_input_tokens_seen": 275161448, "step": 1866 }, { "epoch": 0.5381954453733064, "grad_norm": 4.664040810762385, "learning_rate": 0.0001, "loss": 0.0146, "num_input_tokens_seen": 275296272, "step": 1867 }, { "epoch": 0.5381954453733064, "loss": 0.019615836441516876, "loss_ce": 0.010201163589954376, "loss_xval": 0.0093994140625, "num_input_tokens_seen": 275296272, "step": 1867 }, { "epoch": 0.5384837128855579, "grad_norm": 10.882200468816723, "learning_rate": 0.0001, "loss": 0.0486, "num_input_tokens_seen": 275431280, "step": 1868 }, { "epoch": 0.5384837128855579, "loss": 0.046006783843040466, "loss_ce": 0.0003524859785102308, "loss_xval": 0.045654296875, "num_input_tokens_seen": 275431280, "step": 1868 }, { "epoch": 0.5387719803978092, "grad_norm": 0.21921989623535026, "learning_rate": 0.0001, "loss": 0.005, "num_input_tokens_seen": 275603792, "step": 1869 }, { "epoch": 0.5387719803978092, "loss": 0.0015932258684188128, "loss_ce": 0.0003429587814025581, "loss_xval": 0.001251220703125, "num_input_tokens_seen": 275603792, "step": 1869 }, { "epoch": 0.5390602479100606, "grad_norm": 9.76831164206979, "learning_rate": 0.0001, "loss": 0.0434, "num_input_tokens_seen": 275738648, "step": 1870 }, { "epoch": 0.5390602479100606, "loss": 0.04737325385212898, "loss_ce": 0.0074562616646289825, "loss_xval": 0.0400390625, "num_input_tokens_seen": 275738648, "step": 1870 }, { "epoch": 0.5393485154223119, "grad_norm": 3.2196833292638347, "learning_rate": 0.0001, "loss": 0.0053, "num_input_tokens_seen": 275873688, "step": 1871 }, { "epoch": 0.5393485154223119, "loss": 0.00434330478310585, "loss_ce": 0.00030735484324395657, "loss_xval": 0.0040283203125, "num_input_tokens_seen": 275873688, "step": 1871 }, { "epoch": 0.5396367829345633, "grad_norm": 7.713986595540021, "learning_rate": 0.0001, "loss": 0.0286, "num_input_tokens_seen": 276046200, "step": 1872 }, { "epoch": 0.5396367829345633, "loss": 0.026184234768152237, "loss_ce": 0.00029007039847783744, "loss_xval": 0.02587890625, "num_input_tokens_seen": 276046200, "step": 1872 }, { "epoch": 0.5399250504468146, "grad_norm": 5.44441982546994, "learning_rate": 0.0001, "loss": 0.0178, "num_input_tokens_seen": 276181072, "step": 1873 }, { "epoch": 0.5399250504468146, "loss": 0.022309284657239914, "loss_ce": 0.010231953114271164, "loss_xval": 0.0120849609375, "num_input_tokens_seen": 276181072, "step": 1873 }, { "epoch": 0.540213317959066, "grad_norm": 4.975650159557211, "learning_rate": 0.0001, "loss": 0.0108, "num_input_tokens_seen": 276316168, "step": 1874 }, { "epoch": 0.540213317959066, "loss": 0.011838541366159916, "loss_ce": 0.0002571204677224159, "loss_xval": 0.0115966796875, "num_input_tokens_seen": 276316168, "step": 1874 }, { "epoch": 0.5405015854713174, "grad_norm": 6.232380580900199, "learning_rate": 0.0001, "loss": 0.0203, "num_input_tokens_seen": 276488528, "step": 1875 }, { "epoch": 0.5405015854713174, "loss": 0.016596008092164993, "loss_ce": 0.0003225088585168123, "loss_xval": 0.0162353515625, "num_input_tokens_seen": 276488528, "step": 1875 }, { "epoch": 0.5407898529835687, "grad_norm": 2.164403279556159, "learning_rate": 0.0001, "loss": 0.006, "num_input_tokens_seen": 276623368, "step": 1876 }, { "epoch": 0.5407898529835687, "loss": 0.009121393784880638, "loss_ce": 0.006442523095756769, "loss_xval": 0.002685546875, "num_input_tokens_seen": 276623368, "step": 1876 }, { "epoch": 0.5410781204958202, "grad_norm": 5.647682673041729, "learning_rate": 0.0001, "loss": 0.0141, "num_input_tokens_seen": 276758480, "step": 1877 }, { "epoch": 0.5410781204958202, "loss": 0.014722021296620369, "loss_ce": 0.0005389765137806535, "loss_xval": 0.01416015625, "num_input_tokens_seen": 276758480, "step": 1877 }, { "epoch": 0.5413663880080715, "grad_norm": 0.9526118525776436, "learning_rate": 0.0001, "loss": 0.0067, "num_input_tokens_seen": 276930976, "step": 1878 }, { "epoch": 0.5413663880080715, "loss": 0.0019027346279472113, "loss_ce": 0.00040117441676557064, "loss_xval": 0.00150299072265625, "num_input_tokens_seen": 276930976, "step": 1878 }, { "epoch": 0.5416546555203229, "grad_norm": 5.5350916551675615, "learning_rate": 0.0001, "loss": 0.0166, "num_input_tokens_seen": 277065800, "step": 1879 }, { "epoch": 0.5416546555203229, "loss": 0.019967705011367798, "loss_ce": 0.006677298806607723, "loss_xval": 0.0133056640625, "num_input_tokens_seen": 277065800, "step": 1879 }, { "epoch": 0.5419429230325742, "grad_norm": 0.4270329198572373, "learning_rate": 0.0001, "loss": 0.0009, "num_input_tokens_seen": 277200872, "step": 1880 }, { "epoch": 0.5419429230325742, "loss": 0.0007261130958795547, "loss_ce": 0.00027693252195604146, "loss_xval": 0.00045013427734375, "num_input_tokens_seen": 277200872, "step": 1880 }, { "epoch": 0.5422311905448256, "grad_norm": 5.050655608523017, "learning_rate": 0.0001, "loss": 0.0155, "num_input_tokens_seen": 277373448, "step": 1881 }, { "epoch": 0.5422311905448256, "loss": 0.010783690959215164, "loss_ce": 0.00045349146239459515, "loss_xval": 0.01031494140625, "num_input_tokens_seen": 277373448, "step": 1881 }, { "epoch": 0.542519458057077, "grad_norm": 1.734342283968104, "learning_rate": 0.0001, "loss": 0.0062, "num_input_tokens_seen": 277508216, "step": 1882 }, { "epoch": 0.542519458057077, "loss": 0.010175340808928013, "loss_ce": 0.007627123035490513, "loss_xval": 0.0025482177734375, "num_input_tokens_seen": 277508216, "step": 1882 }, { "epoch": 0.5428077255693283, "grad_norm": 4.093064236279448, "learning_rate": 0.0001, "loss": 0.0079, "num_input_tokens_seen": 277643312, "step": 1883 }, { "epoch": 0.5428077255693283, "loss": 0.007740652654320002, "loss_ce": 0.0003019930445589125, "loss_xval": 0.0074462890625, "num_input_tokens_seen": 277643312, "step": 1883 }, { "epoch": 0.5430959930815797, "grad_norm": 2.7869194069931886, "learning_rate": 0.0001, "loss": 0.0087, "num_input_tokens_seen": 277815768, "step": 1884 }, { "epoch": 0.5430959930815797, "loss": 0.003793244482949376, "loss_ce": 0.00024176128499675542, "loss_xval": 0.0035552978515625, "num_input_tokens_seen": 277815768, "step": 1884 }, { "epoch": 0.543384260593831, "grad_norm": 3.0191744495092117, "learning_rate": 0.0001, "loss": 0.0091, "num_input_tokens_seen": 277950600, "step": 1885 }, { "epoch": 0.543384260593831, "loss": 0.014148483984172344, "loss_ce": 0.009771118871867657, "loss_xval": 0.004364013671875, "num_input_tokens_seen": 277950600, "step": 1885 }, { "epoch": 0.5436725281060825, "grad_norm": 3.7978812660289627, "learning_rate": 0.0001, "loss": 0.0067, "num_input_tokens_seen": 278085648, "step": 1886 }, { "epoch": 0.5436725281060825, "loss": 0.006118366494774818, "loss_ce": 0.00019795639673247933, "loss_xval": 0.00592041015625, "num_input_tokens_seen": 278085648, "step": 1886 }, { "epoch": 0.5439607956183338, "grad_norm": 1.212810871733268, "learning_rate": 0.0001, "loss": 0.0054, "num_input_tokens_seen": 278258016, "step": 1887 }, { "epoch": 0.5439607956183338, "loss": 0.001508941175416112, "loss_ce": 0.00021194410510361195, "loss_xval": 0.0012969970703125, "num_input_tokens_seen": 278258016, "step": 1887 }, { "epoch": 0.5442490631305852, "grad_norm": 3.660909833810732, "learning_rate": 0.0001, "loss": 0.0091, "num_input_tokens_seen": 278392800, "step": 1888 }, { "epoch": 0.5442490631305852, "loss": 0.012451540678739548, "loss_ce": 0.006084810942411423, "loss_xval": 0.006378173828125, "num_input_tokens_seen": 278392800, "step": 1888 }, { "epoch": 0.5445373306428366, "grad_norm": 0.2878029698109918, "learning_rate": 0.0001, "loss": 0.0005, "num_input_tokens_seen": 278527768, "step": 1889 }, { "epoch": 0.5445373306428366, "loss": 0.0005817347555421293, "loss_ce": 0.00019430456450209022, "loss_xval": 0.0003871917724609375, "num_input_tokens_seen": 278527768, "step": 1889 }, { "epoch": 0.5448255981550879, "grad_norm": 3.167419414539796, "learning_rate": 0.0001, "loss": 0.0072, "num_input_tokens_seen": 278700264, "step": 1890 }, { "epoch": 0.5448255981550879, "loss": 0.0045308684930205345, "loss_ce": 0.0001973719772649929, "loss_xval": 0.00433349609375, "num_input_tokens_seen": 278700264, "step": 1890 }, { "epoch": 0.5451138656673393, "grad_norm": 1.49057272645955, "learning_rate": 0.0001, "loss": 0.0053, "num_input_tokens_seen": 278835072, "step": 1891 }, { "epoch": 0.5451138656673393, "loss": 0.009009573608636856, "loss_ce": 0.007309172302484512, "loss_xval": 0.00170135498046875, "num_input_tokens_seen": 278835072, "step": 1891 }, { "epoch": 0.5454021331795906, "grad_norm": 2.4543003292367245, "learning_rate": 0.0001, "loss": 0.0031, "num_input_tokens_seen": 278970104, "step": 1892 }, { "epoch": 0.5454021331795906, "loss": 0.0029379362240433693, "loss_ce": 0.00036873770295642316, "loss_xval": 0.0025634765625, "num_input_tokens_seen": 278970104, "step": 1892 }, { "epoch": 0.545690400691842, "grad_norm": 2.4981723443169925, "learning_rate": 0.0001, "loss": 0.0082, "num_input_tokens_seen": 279142720, "step": 1893 }, { "epoch": 0.545690400691842, "loss": 0.0029300451278686523, "loss_ce": 0.00019872195844072849, "loss_xval": 0.0027313232421875, "num_input_tokens_seen": 279142720, "step": 1893 }, { "epoch": 0.5459786682040934, "grad_norm": 1.2333966495433941, "learning_rate": 0.0001, "loss": 0.005, "num_input_tokens_seen": 279277584, "step": 1894 }, { "epoch": 0.5459786682040934, "loss": 0.008969184011220932, "loss_ce": 0.008108016103506088, "loss_xval": 0.00086212158203125, "num_input_tokens_seen": 279277584, "step": 1894 }, { "epoch": 0.5462669357163448, "grad_norm": 3.0088690684316033, "learning_rate": 0.0001, "loss": 0.0045, "num_input_tokens_seen": 279412664, "step": 1895 }, { "epoch": 0.5462669357163448, "loss": 0.004141474142670631, "loss_ce": 0.0001284123572986573, "loss_xval": 0.0040283203125, "num_input_tokens_seen": 279412664, "step": 1895 }, { "epoch": 0.5465552032285962, "grad_norm": 0.3048082625937777, "learning_rate": 0.0001, "loss": 0.0047, "num_input_tokens_seen": 279585048, "step": 1896 }, { "epoch": 0.5465552032285962, "loss": 0.000647731008939445, "loss_ce": 0.00013370058150030673, "loss_xval": 0.000514984130859375, "num_input_tokens_seen": 279585048, "step": 1896 }, { "epoch": 0.5468434707408475, "grad_norm": 2.8539549965482647, "learning_rate": 0.0001, "loss": 0.007, "num_input_tokens_seen": 279719824, "step": 1897 }, { "epoch": 0.5468434707408475, "loss": 0.010153815150260925, "loss_ce": 0.006661459803581238, "loss_xval": 0.0034942626953125, "num_input_tokens_seen": 279719824, "step": 1897 }, { "epoch": 0.5471317382530989, "grad_norm": 1.8076048875343496, "learning_rate": 0.0001, "loss": 0.0026, "num_input_tokens_seen": 279854936, "step": 1898 }, { "epoch": 0.5471317382530989, "loss": 0.0016184869455173612, "loss_ce": 0.0001383843191433698, "loss_xval": 0.0014801025390625, "num_input_tokens_seen": 279854936, "step": 1898 }, { "epoch": 0.5474200057653502, "grad_norm": 1.9186558700507859, "learning_rate": 0.0001, "loss": 0.0071, "num_input_tokens_seen": 280027392, "step": 1899 }, { "epoch": 0.5474200057653502, "loss": 0.001938400324434042, "loss_ce": 0.0002294160076417029, "loss_xval": 0.001708984375, "num_input_tokens_seen": 280027392, "step": 1899 }, { "epoch": 0.5477082732776016, "grad_norm": 2.81423870825424, "learning_rate": 0.0001, "loss": 0.0088, "num_input_tokens_seen": 280162184, "step": 1900 }, { "epoch": 0.5477082732776016, "loss": 0.013608219102025032, "loss_ce": 0.010356190614402294, "loss_xval": 0.0032501220703125, "num_input_tokens_seen": 280162184, "step": 1900 }, { "epoch": 0.5479965407898529, "grad_norm": 0.24242660438385522, "learning_rate": 0.0001, "loss": 0.0004, "num_input_tokens_seen": 280297216, "step": 1901 }, { "epoch": 0.5479965407898529, "loss": 0.0003102511982433498, "loss_ce": 9.877391858026385e-05, "loss_xval": 0.0002117156982421875, "num_input_tokens_seen": 280297216, "step": 1901 }, { "epoch": 0.5482848083021044, "grad_norm": 3.045715546334397, "learning_rate": 0.0001, "loss": 0.0089, "num_input_tokens_seen": 280469696, "step": 1902 }, { "epoch": 0.5482848083021044, "loss": 0.004016020335257053, "loss_ce": 0.00010786311759147793, "loss_xval": 0.00390625, "num_input_tokens_seen": 280469696, "step": 1902 }, { "epoch": 0.5485730758143558, "grad_norm": 1.9843275332507238, "learning_rate": 0.0001, "loss": 0.0062, "num_input_tokens_seen": 280604424, "step": 1903 }, { "epoch": 0.5485730758143558, "loss": 0.01020785328000784, "loss_ce": 0.008550367318093777, "loss_xval": 0.00165557861328125, "num_input_tokens_seen": 280604424, "step": 1903 }, { "epoch": 0.5488613433266071, "grad_norm": 1.5118493970879978, "learning_rate": 0.0001, "loss": 0.0012, "num_input_tokens_seen": 280739472, "step": 1904 }, { "epoch": 0.5488613433266071, "loss": 0.0010831006802618504, "loss_ce": 7.172907498897985e-05, "loss_xval": 0.00101470947265625, "num_input_tokens_seen": 280739472, "step": 1904 }, { "epoch": 0.5491496108388585, "grad_norm": 3.1270755163143464, "learning_rate": 0.0001, "loss": 0.0082, "num_input_tokens_seen": 280912072, "step": 1905 }, { "epoch": 0.5491496108388585, "loss": 0.004354151897132397, "loss_ce": 0.00036397893563844264, "loss_xval": 0.003997802734375, "num_input_tokens_seen": 280912072, "step": 1905 }, { "epoch": 0.5494378783511098, "grad_norm": 1.1936946363165124, "learning_rate": 0.0001, "loss": 0.0055, "num_input_tokens_seen": 281046904, "step": 1906 }, { "epoch": 0.5494378783511098, "loss": 0.010138033889234066, "loss_ce": 0.009292601607739925, "loss_xval": 0.00084686279296875, "num_input_tokens_seen": 281046904, "step": 1906 }, { "epoch": 0.5497261458633612, "grad_norm": 1.7984578180285498, "learning_rate": 0.0001, "loss": 0.0017, "num_input_tokens_seen": 281181984, "step": 1907 }, { "epoch": 0.5497261458633612, "loss": 0.0018197790486738086, "loss_ce": 0.00028245607973076403, "loss_xval": 0.0015411376953125, "num_input_tokens_seen": 281181984, "step": 1907 }, { "epoch": 0.5500144133756125, "grad_norm": 2.5448443442932898, "learning_rate": 0.0001, "loss": 0.0057, "num_input_tokens_seen": 281354480, "step": 1908 }, { "epoch": 0.5500144133756125, "loss": 0.002872986253350973, "loss_ce": 7.490596908610314e-05, "loss_xval": 0.0027923583984375, "num_input_tokens_seen": 281354480, "step": 1908 }, { "epoch": 0.5503026808878639, "grad_norm": 0.48281903020689454, "learning_rate": 0.0001, "loss": 0.0062, "num_input_tokens_seen": 281489320, "step": 1909 }, { "epoch": 0.5503026808878639, "loss": 0.012108450755476952, "loss_ce": 0.011801367625594139, "loss_xval": 0.0003070831298828125, "num_input_tokens_seen": 281489320, "step": 1909 }, { "epoch": 0.5505909484001154, "grad_norm": 2.0299099430728376, "learning_rate": 0.0001, "loss": 0.002, "num_input_tokens_seen": 281624280, "step": 1910 }, { "epoch": 0.5505909484001154, "loss": 0.001749855582602322, "loss_ce": 6.185208621900529e-05, "loss_xval": 0.00168609619140625, "num_input_tokens_seen": 281624280, "step": 1910 }, { "epoch": 0.5508792159123667, "grad_norm": 2.240271411263848, "learning_rate": 0.0001, "loss": 0.0056, "num_input_tokens_seen": 281796704, "step": 1911 }, { "epoch": 0.5508792159123667, "loss": 0.0022659413516521454, "loss_ce": 7.153685146477073e-05, "loss_xval": 0.002197265625, "num_input_tokens_seen": 281796704, "step": 1911 }, { "epoch": 0.5511674834246181, "grad_norm": 0.1330004944553925, "learning_rate": 0.0001, "loss": 0.0029, "num_input_tokens_seen": 281931464, "step": 1912 }, { "epoch": 0.5511674834246181, "loss": 0.005604051053524017, "loss_ce": 0.005324862897396088, "loss_xval": 0.000278472900390625, "num_input_tokens_seen": 281931464, "step": 1912 }, { "epoch": 0.5514557509368694, "grad_norm": 2.229550340309766, "learning_rate": 0.0001, "loss": 0.0024, "num_input_tokens_seen": 282066520, "step": 1913 }, { "epoch": 0.5514557509368694, "loss": 0.0020462670363485813, "loss_ce": 6.262438546400517e-05, "loss_xval": 0.001983642578125, "num_input_tokens_seen": 282066520, "step": 1913 }, { "epoch": 0.5517440184491208, "grad_norm": 2.3761471501605627, "learning_rate": 0.0001, "loss": 0.0065, "num_input_tokens_seen": 282239008, "step": 1914 }, { "epoch": 0.5517440184491208, "loss": 0.002356612589210272, "loss_ce": 0.0001564860576763749, "loss_xval": 0.002197265625, "num_input_tokens_seen": 282239008, "step": 1914 }, { "epoch": 0.5520322859613721, "grad_norm": 0.21665882302988462, "learning_rate": 0.0001, "loss": 0.0054, "num_input_tokens_seen": 282373776, "step": 1915 }, { "epoch": 0.5520322859613721, "loss": 0.010722009465098381, "loss_ce": 0.010517684742808342, "loss_xval": 0.0002040863037109375, "num_input_tokens_seen": 282373776, "step": 1915 }, { "epoch": 0.5523205534736235, "grad_norm": 2.1117128245187184, "learning_rate": 0.0001, "loss": 0.0022, "num_input_tokens_seen": 282508896, "step": 1916 }, { "epoch": 0.5523205534736235, "loss": 0.0018395634833723307, "loss_ce": 6.668292917311192e-05, "loss_xval": 0.00177001953125, "num_input_tokens_seen": 282508896, "step": 1916 }, { "epoch": 0.5526088209858749, "grad_norm": 2.2123876638762874, "learning_rate": 0.0001, "loss": 0.0048, "num_input_tokens_seen": 282681552, "step": 1917 }, { "epoch": 0.5526088209858749, "loss": 0.0022970284335315228, "loss_ce": 0.00014172433293424547, "loss_xval": 0.0021514892578125, "num_input_tokens_seen": 282681552, "step": 1917 }, { "epoch": 0.5528970884981262, "grad_norm": 0.1690305145590322, "learning_rate": 0.0001, "loss": 0.0039, "num_input_tokens_seen": 282816320, "step": 1918 }, { "epoch": 0.5528970884981262, "loss": 0.0076433876529335976, "loss_ce": 0.007427559234201908, "loss_xval": 0.0002155303955078125, "num_input_tokens_seen": 282816320, "step": 1918 }, { "epoch": 0.5531853560103777, "grad_norm": 1.9800549126606803, "learning_rate": 0.0001, "loss": 0.0019, "num_input_tokens_seen": 282951264, "step": 1919 }, { "epoch": 0.5531853560103777, "loss": 0.001511584734544158, "loss_ce": 5.5323973356280476e-05, "loss_xval": 0.00145721435546875, "num_input_tokens_seen": 282951264, "step": 1919 }, { "epoch": 0.553473623522629, "grad_norm": 2.107008911020967, "learning_rate": 0.0001, "loss": 0.0044, "num_input_tokens_seen": 283123816, "step": 1920 }, { "epoch": 0.553473623522629, "loss": 0.002012682380154729, "loss_ce": 6.337207014439628e-05, "loss_xval": 0.001953125, "num_input_tokens_seen": 283123816, "step": 1920 }, { "epoch": 0.5537618910348804, "grad_norm": 0.16338842906769652, "learning_rate": 0.0001, "loss": 0.0048, "num_input_tokens_seen": 283258736, "step": 1921 }, { "epoch": 0.5537618910348804, "loss": 0.009544363245368004, "loss_ce": 0.009310713037848473, "loss_xval": 0.00023365020751953125, "num_input_tokens_seen": 283258736, "step": 1921 }, { "epoch": 0.5540501585471317, "grad_norm": 1.9086123569129076, "learning_rate": 0.0001, "loss": 0.002, "num_input_tokens_seen": 283394088, "step": 1922 }, { "epoch": 0.5540501585471317, "loss": 0.001516035059466958, "loss_ce": 0.00015323443221859634, "loss_xval": 0.00136566162109375, "num_input_tokens_seen": 283394088, "step": 1922 }, { "epoch": 0.5543384260593831, "grad_norm": 1.9160847816709878, "learning_rate": 0.0001, "loss": 0.0068, "num_input_tokens_seen": 283566624, "step": 1923 }, { "epoch": 0.5543384260593831, "loss": 0.0018914049724116921, "loss_ce": 8.896048530004919e-05, "loss_xval": 0.001800537109375, "num_input_tokens_seen": 283566624, "step": 1923 }, { "epoch": 0.5546266935716345, "grad_norm": 0.051652544586469715, "learning_rate": 0.0001, "loss": 0.0036, "num_input_tokens_seen": 283701376, "step": 1924 }, { "epoch": 0.5546266935716345, "loss": 0.007118854206055403, "loss_ce": 0.006899747531861067, "loss_xval": 0.0002193450927734375, "num_input_tokens_seen": 283701376, "step": 1924 }, { "epoch": 0.5549149610838858, "grad_norm": 1.980739323892179, "learning_rate": 0.0001, "loss": 0.0019, "num_input_tokens_seen": 283836400, "step": 1925 }, { "epoch": 0.5549149610838858, "loss": 0.0013998394133523107, "loss_ce": 5.5158656323328614e-05, "loss_xval": 0.0013427734375, "num_input_tokens_seen": 283836400, "step": 1925 }, { "epoch": 0.5552032285961372, "grad_norm": 2.1895260282147717, "learning_rate": 0.0001, "loss": 0.0065, "num_input_tokens_seen": 284008896, "step": 1926 }, { "epoch": 0.5552032285961372, "loss": 0.0025523242074996233, "loss_ce": 5.751205389969982e-05, "loss_xval": 0.00250244140625, "num_input_tokens_seen": 284008896, "step": 1926 }, { "epoch": 0.5554914961083885, "grad_norm": 0.5431506925705571, "learning_rate": 0.0001, "loss": 0.0059, "num_input_tokens_seen": 284143592, "step": 1927 }, { "epoch": 0.5554914961083885, "loss": 0.011616826057434082, "loss_ce": 0.01074516773223877, "loss_xval": 0.0008697509765625, "num_input_tokens_seen": 284143592, "step": 1927 }, { "epoch": 0.55577976362064, "grad_norm": 1.1463366590129798, "learning_rate": 0.0001, "loss": 0.0008, "num_input_tokens_seen": 284278800, "step": 1928 }, { "epoch": 0.55577976362064, "loss": 0.0005050962208770216, "loss_ce": 6.75981500535272e-05, "loss_xval": 0.0004367828369140625, "num_input_tokens_seen": 284278800, "step": 1928 }, { "epoch": 0.5560680311328913, "grad_norm": 1.4869739800687145, "learning_rate": 0.0001, "loss": 0.0041, "num_input_tokens_seen": 284451328, "step": 1929 }, { "epoch": 0.5560680311328913, "loss": 0.0015234043821692467, "loss_ce": 5.665321441483684e-05, "loss_xval": 0.00146484375, "num_input_tokens_seen": 284451328, "step": 1929 }, { "epoch": 0.5563562986451427, "grad_norm": 0.4843964706871106, "learning_rate": 0.0001, "loss": 0.0038, "num_input_tokens_seen": 284586216, "step": 1930 }, { "epoch": 0.5563562986451427, "loss": 0.007321392651647329, "loss_ce": 0.006811653729528189, "loss_xval": 0.00051116943359375, "num_input_tokens_seen": 284586216, "step": 1930 }, { "epoch": 0.5566445661573941, "grad_norm": 0.958553380827203, "learning_rate": 0.0001, "loss": 0.0013, "num_input_tokens_seen": 284721176, "step": 1931 }, { "epoch": 0.5566445661573941, "loss": 0.00031205732375383377, "loss_ce": 4.8127942136488855e-05, "loss_xval": 0.000263214111328125, "num_input_tokens_seen": 284721176, "step": 1931 }, { "epoch": 0.5569328336696454, "grad_norm": 1.4978071655368779, "learning_rate": 0.0001, "loss": 0.0044, "num_input_tokens_seen": 284893792, "step": 1932 }, { "epoch": 0.5569328336696454, "loss": 0.0014730643015354872, "loss_ce": 5.208961374592036e-05, "loss_xval": 0.0014190673828125, "num_input_tokens_seen": 284893792, "step": 1932 }, { "epoch": 0.5572211011818968, "grad_norm": 0.8566778501180569, "learning_rate": 0.0001, "loss": 0.0065, "num_input_tokens_seen": 285028528, "step": 1933 }, { "epoch": 0.5572211011818968, "loss": 0.0127099072560668, "loss_ce": 0.01148157473653555, "loss_xval": 0.00122833251953125, "num_input_tokens_seen": 285028528, "step": 1933 }, { "epoch": 0.5575093686941481, "grad_norm": 0.044511992402982, "learning_rate": 0.0001, "loss": 0.0002, "num_input_tokens_seen": 285163648, "step": 1934 }, { "epoch": 0.5575093686941481, "loss": 0.0001727144990582019, "loss_ce": 4.5816203055437654e-05, "loss_xval": 0.00012683868408203125, "num_input_tokens_seen": 285163648, "step": 1934 }, { "epoch": 0.5577976362063995, "grad_norm": 0.2350786549418237, "learning_rate": 0.0001, "loss": 0.004, "num_input_tokens_seen": 285336112, "step": 1935 }, { "epoch": 0.5577976362063995, "loss": 0.0003692085447255522, "loss_ce": 6.03372827754356e-05, "loss_xval": 0.000308990478515625, "num_input_tokens_seen": 285336112, "step": 1935 }, { "epoch": 0.5580859037186509, "grad_norm": 0.19000274994071045, "learning_rate": 0.0001, "loss": 0.0041, "num_input_tokens_seen": 285470816, "step": 1936 }, { "epoch": 0.5580859037186509, "loss": 0.007709057070314884, "loss_ce": 0.007516414858400822, "loss_xval": 0.0001926422119140625, "num_input_tokens_seen": 285470816, "step": 1936 }, { "epoch": 0.5583741712309023, "grad_norm": 0.7432285967026308, "learning_rate": 0.0001, "loss": 0.0005, "num_input_tokens_seen": 285605896, "step": 1937 }, { "epoch": 0.5583741712309023, "loss": 0.00025071113486774266, "loss_ce": 7.595031638629735e-05, "loss_xval": 0.00017452239990234375, "num_input_tokens_seen": 285605896, "step": 1937 }, { "epoch": 0.5586624387431537, "grad_norm": 0.6835868718300878, "learning_rate": 0.0001, "loss": 0.0051, "num_input_tokens_seen": 285778400, "step": 1938 }, { "epoch": 0.5586624387431537, "loss": 0.000513557402882725, "loss_ce": 8.464240818284452e-05, "loss_xval": 0.0004291534423828125, "num_input_tokens_seen": 285778400, "step": 1938 }, { "epoch": 0.558950706255405, "grad_norm": 0.21160262356402706, "learning_rate": 0.0001, "loss": 0.0039, "num_input_tokens_seen": 285913136, "step": 1939 }, { "epoch": 0.558950706255405, "loss": 0.007476561237126589, "loss_ce": 0.007180326152592897, "loss_xval": 0.0002956390380859375, "num_input_tokens_seen": 285913136, "step": 1939 }, { "epoch": 0.5592389737676564, "grad_norm": 0.9580294627468874, "learning_rate": 0.0001, "loss": 0.0006, "num_input_tokens_seen": 286048264, "step": 1940 }, { "epoch": 0.5592389737676564, "loss": 0.0003514891432132572, "loss_ce": 7.075125904520974e-05, "loss_xval": 0.0002803802490234375, "num_input_tokens_seen": 286048264, "step": 1940 }, { "epoch": 0.5595272412799077, "grad_norm": 1.1122987198740457, "learning_rate": 0.0001, "loss": 0.005, "num_input_tokens_seen": 286220784, "step": 1941 }, { "epoch": 0.5595272412799077, "loss": 0.000870700110681355, "loss_ce": 8.201141463359818e-05, "loss_xval": 0.000789642333984375, "num_input_tokens_seen": 286220784, "step": 1941 }, { "epoch": 0.5598155087921591, "grad_norm": 0.654693458550546, "learning_rate": 0.0001, "loss": 0.0042, "num_input_tokens_seen": 286355584, "step": 1942 }, { "epoch": 0.5598155087921591, "loss": 0.008090196177363396, "loss_ce": 0.0076016769744455814, "loss_xval": 0.00048828125, "num_input_tokens_seen": 286355584, "step": 1942 }, { "epoch": 0.5601037763044104, "grad_norm": 0.06036064124455231, "learning_rate": 0.0001, "loss": 0.0002, "num_input_tokens_seen": 286490616, "step": 1943 }, { "epoch": 0.5601037763044104, "loss": 0.00018729933071881533, "loss_ce": 9.282596147386357e-05, "loss_xval": 9.441375732421875e-05, "num_input_tokens_seen": 286490616, "step": 1943 }, { "epoch": 0.5603920438166619, "grad_norm": 0.14486727422936166, "learning_rate": 0.0001, "loss": 0.0034, "num_input_tokens_seen": 286663224, "step": 1944 }, { "epoch": 0.5603920438166619, "loss": 0.00021684798412024975, "loss_ce": 5.150470678927377e-05, "loss_xval": 0.00016498565673828125, "num_input_tokens_seen": 286663224, "step": 1944 }, { "epoch": 0.5606803113289133, "grad_norm": 0.4009978276180538, "learning_rate": 0.0001, "loss": 0.0049, "num_input_tokens_seen": 286797968, "step": 1945 }, { "epoch": 0.5606803113289133, "loss": 0.009587855078279972, "loss_ce": 0.009425134398043156, "loss_xval": 0.00016307830810546875, "num_input_tokens_seen": 286797968, "step": 1945 }, { "epoch": 0.5609685788411646, "grad_norm": 1.2402087854563595, "learning_rate": 0.0001, "loss": 0.0012, "num_input_tokens_seen": 286933040, "step": 1946 }, { "epoch": 0.5609685788411646, "loss": 0.0003389227786101401, "loss_ce": 3.267412830609828e-05, "loss_xval": 0.0003070831298828125, "num_input_tokens_seen": 286933040, "step": 1946 }, { "epoch": 0.561256846353416, "grad_norm": 1.2725344281821624, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 287105616, "step": 1947 }, { "epoch": 0.561256846353416, "loss": 0.0011517228558659554, "loss_ce": 0.00024334801128134131, "loss_xval": 0.00090789794921875, "num_input_tokens_seen": 287105616, "step": 1947 }, { "epoch": 0.5615451138656673, "grad_norm": 1.1853218638615954, "learning_rate": 0.0001, "loss": 0.0049, "num_input_tokens_seen": 287240328, "step": 1948 }, { "epoch": 0.5615451138656673, "loss": 0.009418094530701637, "loss_ce": 0.008385742083191872, "loss_xval": 0.00102996826171875, "num_input_tokens_seen": 287240328, "step": 1948 }, { "epoch": 0.5618333813779187, "grad_norm": 1.1085830242231645, "learning_rate": 0.0001, "loss": 0.0007, "num_input_tokens_seen": 287375576, "step": 1949 }, { "epoch": 0.5618333813779187, "loss": 0.0008955195662565529, "loss_ce": 3.3397995139239356e-05, "loss_xval": 0.00086212158203125, "num_input_tokens_seen": 287375576, "step": 1949 }, { "epoch": 0.56212164889017, "grad_norm": 1.5125347817040997, "learning_rate": 0.0001, "loss": 0.0056, "num_input_tokens_seen": 287548080, "step": 1950 }, { "epoch": 0.56212164889017, "loss": 0.0017731888219714165, "loss_ce": 0.0006511909887194633, "loss_xval": 0.00112152099609375, "num_input_tokens_seen": 287548080, "step": 1950 }, { "epoch": 0.5624099164024214, "grad_norm": 2.704419859032451, "learning_rate": 0.0001, "loss": 0.0075, "num_input_tokens_seen": 287682832, "step": 1951 }, { "epoch": 0.5624099164024214, "loss": 0.011266487650573254, "loss_ce": 0.008250969462096691, "loss_xval": 0.003021240234375, "num_input_tokens_seen": 287682832, "step": 1951 }, { "epoch": 0.5626981839146729, "grad_norm": 4.881457725475292, "learning_rate": 0.0001, "loss": 0.0117, "num_input_tokens_seen": 287817840, "step": 1952 }, { "epoch": 0.5626981839146729, "loss": 0.009428413584828377, "loss_ce": 6.714653864037246e-05, "loss_xval": 0.00933837890625, "num_input_tokens_seen": 287817840, "step": 1952 }, { "epoch": 0.5629864514269242, "grad_norm": 8.142683543107829, "learning_rate": 0.0001, "loss": 0.0319, "num_input_tokens_seen": 287990176, "step": 1953 }, { "epoch": 0.5629864514269242, "loss": 0.028103932738304138, "loss_ce": 5.8278888900531456e-05, "loss_xval": 0.028076171875, "num_input_tokens_seen": 287990176, "step": 1953 }, { "epoch": 0.5632747189391756, "grad_norm": 12.683316002509716, "learning_rate": 0.0001, "loss": 0.0747, "num_input_tokens_seen": 288124992, "step": 1954 }, { "epoch": 0.5632747189391756, "loss": 0.0827786922454834, "loss_ce": 0.00831579975783825, "loss_xval": 0.07421875, "num_input_tokens_seen": 288124992, "step": 1954 }, { "epoch": 0.5635629864514269, "grad_norm": 16.530990540062938, "learning_rate": 0.0001, "loss": 0.1233, "num_input_tokens_seen": 288260168, "step": 1955 }, { "epoch": 0.5635629864514269, "loss": 0.12602970004081726, "loss_ce": 5.31386467628181e-05, "loss_xval": 0.1259765625, "num_input_tokens_seen": 288260168, "step": 1955 }, { "epoch": 0.5638512539636783, "grad_norm": 13.143860245392554, "learning_rate": 0.0001, "loss": 0.0874, "num_input_tokens_seen": 288432592, "step": 1956 }, { "epoch": 0.5638512539636783, "loss": 0.09267762303352356, "loss_ce": 8.729415276320651e-05, "loss_xval": 0.0927734375, "num_input_tokens_seen": 288432592, "step": 1956 }, { "epoch": 0.5641395214759297, "grad_norm": 2.6561824504781044, "learning_rate": 0.0001, "loss": 0.0156, "num_input_tokens_seen": 288567392, "step": 1957 }, { "epoch": 0.5641395214759297, "loss": 0.024685639888048172, "loss_ce": 0.008061189204454422, "loss_xval": 0.0166015625, "num_input_tokens_seen": 288567392, "step": 1957 }, { "epoch": 0.564427788988181, "grad_norm": 6.84049611185504, "learning_rate": 0.0001, "loss": 0.0231, "num_input_tokens_seen": 288702376, "step": 1958 }, { "epoch": 0.564427788988181, "loss": 0.024861274287104607, "loss_ce": 0.00012677747872658074, "loss_xval": 0.0247802734375, "num_input_tokens_seen": 288702376, "step": 1958 }, { "epoch": 0.5647160565004324, "grad_norm": 10.650617694885725, "learning_rate": 0.0001, "loss": 0.055, "num_input_tokens_seen": 288875056, "step": 1959 }, { "epoch": 0.5647160565004324, "loss": 0.0596018023788929, "loss_ce": 0.00021459662821143866, "loss_xval": 0.059326171875, "num_input_tokens_seen": 288875056, "step": 1959 }, { "epoch": 0.5650043240126837, "grad_norm": 8.25795596922048, "learning_rate": 0.0001, "loss": 0.0363, "num_input_tokens_seen": 289009872, "step": 1960 }, { "epoch": 0.5650043240126837, "loss": 0.048701632767915726, "loss_ce": 0.008052217774093151, "loss_xval": 0.04052734375, "num_input_tokens_seen": 289009872, "step": 1960 }, { "epoch": 0.5652925915249352, "grad_norm": 2.4798138856675305, "learning_rate": 0.0001, "loss": 0.0046, "num_input_tokens_seen": 289144976, "step": 1961 }, { "epoch": 0.5652925915249352, "loss": 0.005279832519590855, "loss_ce": 0.00028448639204725623, "loss_xval": 0.0050048828125, "num_input_tokens_seen": 289144976, "step": 1961 }, { "epoch": 0.5655808590371865, "grad_norm": 1.561935056225871, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 289317568, "step": 1962 }, { "epoch": 0.5655808590371865, "loss": 0.0034121754579246044, "loss_ce": 0.0003909351071342826, "loss_xval": 0.003021240234375, "num_input_tokens_seen": 289317568, "step": 1962 }, { "epoch": 0.5658691265494379, "grad_norm": 2.9034700037078753, "learning_rate": 0.0001, "loss": 0.0106, "num_input_tokens_seen": 289452392, "step": 1963 }, { "epoch": 0.5658691265494379, "loss": 0.018562277778983116, "loss_ce": 0.008354147896170616, "loss_xval": 0.01019287109375, "num_input_tokens_seen": 289452392, "step": 1963 }, { "epoch": 0.5661573940616893, "grad_norm": 3.2432257799309285, "learning_rate": 0.0001, "loss": 0.0059, "num_input_tokens_seen": 289587400, "step": 1964 }, { "epoch": 0.5661573940616893, "loss": 0.007287283428013325, "loss_ce": 0.0004742340533994138, "loss_xval": 0.006805419921875, "num_input_tokens_seen": 289587400, "step": 1964 }, { "epoch": 0.5664456615739406, "grad_norm": 4.038297008837272, "learning_rate": 0.0001, "loss": 0.0129, "num_input_tokens_seen": 289759840, "step": 1965 }, { "epoch": 0.5664456615739406, "loss": 0.0066073378548026085, "loss_ce": 0.0006297073559835553, "loss_xval": 0.0059814453125, "num_input_tokens_seen": 289759840, "step": 1965 }, { "epoch": 0.566733929086192, "grad_norm": 5.565205838171376, "learning_rate": 0.0001, "loss": 0.0184, "num_input_tokens_seen": 289894704, "step": 1966 }, { "epoch": 0.566733929086192, "loss": 0.018734395503997803, "loss_ce": 0.007687033619731665, "loss_xval": 0.01104736328125, "num_input_tokens_seen": 289894704, "step": 1966 }, { "epoch": 0.5670221965984433, "grad_norm": 5.467379562241336, "learning_rate": 0.0001, "loss": 0.0144, "num_input_tokens_seen": 290029936, "step": 1967 }, { "epoch": 0.5670221965984433, "loss": 0.012764690443873405, "loss_ce": 0.0004889954579994082, "loss_xval": 0.01226806640625, "num_input_tokens_seen": 290029936, "step": 1967 }, { "epoch": 0.5673104641106947, "grad_norm": 2.99187780606278, "learning_rate": 0.0001, "loss": 0.0084, "num_input_tokens_seen": 290202328, "step": 1968 }, { "epoch": 0.5673104641106947, "loss": 0.00564954150468111, "loss_ce": 0.00032040924998000264, "loss_xval": 0.005340576171875, "num_input_tokens_seen": 290202328, "step": 1968 }, { "epoch": 0.567598731622946, "grad_norm": 0.48645630930362904, "learning_rate": 0.0001, "loss": 0.0053, "num_input_tokens_seen": 290337224, "step": 1969 }, { "epoch": 0.567598731622946, "loss": 0.009277354925870895, "loss_ce": 0.008250724524259567, "loss_xval": 0.00102996826171875, "num_input_tokens_seen": 290337224, "step": 1969 }, { "epoch": 0.5678869991351975, "grad_norm": 2.6207641878735055, "learning_rate": 0.0001, "loss": 0.004, "num_input_tokens_seen": 290472392, "step": 1970 }, { "epoch": 0.5678869991351975, "loss": 0.003790928516536951, "loss_ce": 0.0001993910118471831, "loss_xval": 0.0035858154296875, "num_input_tokens_seen": 290472392, "step": 1970 }, { "epoch": 0.5681752666474489, "grad_norm": 1.7294043568848192, "learning_rate": 0.0001, "loss": 0.0064, "num_input_tokens_seen": 290644992, "step": 1971 }, { "epoch": 0.5681752666474489, "loss": 0.0025520434137433767, "loss_ce": 0.00023270744713954628, "loss_xval": 0.0023193359375, "num_input_tokens_seen": 290644992, "step": 1971 }, { "epoch": 0.5684635341597002, "grad_norm": 0.5996015104252971, "learning_rate": 0.0001, "loss": 0.0045, "num_input_tokens_seen": 290779760, "step": 1972 }, { "epoch": 0.5684635341597002, "loss": 0.007963595911860466, "loss_ce": 0.007266698870807886, "loss_xval": 0.000698089599609375, "num_input_tokens_seen": 290779760, "step": 1972 }, { "epoch": 0.5687518016719516, "grad_norm": 2.5431713335506845, "learning_rate": 0.0001, "loss": 0.0035, "num_input_tokens_seen": 290914776, "step": 1973 }, { "epoch": 0.5687518016719516, "loss": 0.0034647989086806774, "loss_ce": 8.306984091177583e-05, "loss_xval": 0.003387451171875, "num_input_tokens_seen": 290914776, "step": 1973 }, { "epoch": 0.5690400691842029, "grad_norm": 3.0238650907286813, "learning_rate": 0.0001, "loss": 0.0083, "num_input_tokens_seen": 291087272, "step": 1974 }, { "epoch": 0.5690400691842029, "loss": 0.005696265026926994, "loss_ce": 0.00045487057650461793, "loss_xval": 0.0052490234375, "num_input_tokens_seen": 291087272, "step": 1974 }, { "epoch": 0.5693283366964543, "grad_norm": 2.6482166457440774, "learning_rate": 0.0001, "loss": 0.0081, "num_input_tokens_seen": 291222040, "step": 1975 }, { "epoch": 0.5693283366964543, "loss": 0.01379525288939476, "loss_ce": 0.009589549154043198, "loss_xval": 0.00421142578125, "num_input_tokens_seen": 291222040, "step": 1975 }, { "epoch": 0.5696166042087056, "grad_norm": 2.197137165884688, "learning_rate": 0.0001, "loss": 0.0026, "num_input_tokens_seen": 291356984, "step": 1976 }, { "epoch": 0.5696166042087056, "loss": 0.0022935783490538597, "loss_ce": 5.339750714483671e-05, "loss_xval": 0.0022430419921875, "num_input_tokens_seen": 291356984, "step": 1976 }, { "epoch": 0.569904871720957, "grad_norm": 1.9490519769257666, "learning_rate": 0.0001, "loss": 0.0074, "num_input_tokens_seen": 291529568, "step": 1977 }, { "epoch": 0.569904871720957, "loss": 0.0018777311779558659, "loss_ce": 7.337937859119847e-05, "loss_xval": 0.001800537109375, "num_input_tokens_seen": 291529568, "step": 1977 }, { "epoch": 0.5701931392332085, "grad_norm": 1.9049005772357561, "learning_rate": 0.0001, "loss": 0.0069, "num_input_tokens_seen": 291664280, "step": 1978 }, { "epoch": 0.5701931392332085, "loss": 0.011196194216609001, "loss_ce": 0.009730396792292595, "loss_xval": 0.00146484375, "num_input_tokens_seen": 291664280, "step": 1978 }, { "epoch": 0.5704814067454598, "grad_norm": 1.4667761838514597, "learning_rate": 0.0001, "loss": 0.0014, "num_input_tokens_seen": 291799408, "step": 1979 }, { "epoch": 0.5704814067454598, "loss": 0.0012744119158014655, "loss_ce": 3.8450009014923126e-05, "loss_xval": 0.0012359619140625, "num_input_tokens_seen": 291799408, "step": 1979 }, { "epoch": 0.5707696742577112, "grad_norm": 0.5428364480716036, "learning_rate": 0.0001, "loss": 0.0058, "num_input_tokens_seen": 291972168, "step": 1980 }, { "epoch": 0.5707696742577112, "loss": 0.0022662151604890823, "loss_ce": 0.0017080771503970027, "loss_xval": 0.00055694580078125, "num_input_tokens_seen": 291972168, "step": 1980 }, { "epoch": 0.5710579417699625, "grad_norm": 1.0430200816446416, "learning_rate": 0.0001, "loss": 0.012, "num_input_tokens_seen": 292106936, "step": 1981 }, { "epoch": 0.5710579417699625, "loss": 0.02298683673143387, "loss_ce": 0.02252192050218582, "loss_xval": 0.00046539306640625, "num_input_tokens_seen": 292106936, "step": 1981 }, { "epoch": 0.5713462092822139, "grad_norm": 1.6942707055207802, "learning_rate": 0.0001, "loss": 0.0015, "num_input_tokens_seen": 292242048, "step": 1982 }, { "epoch": 0.5713462092822139, "loss": 0.0013274959055706859, "loss_ce": 7.341417949646711e-05, "loss_xval": 0.001251220703125, "num_input_tokens_seen": 292242048, "step": 1982 }, { "epoch": 0.5716344767944652, "grad_norm": 1.6282262101978988, "learning_rate": 0.0001, "loss": 0.0086, "num_input_tokens_seen": 292414584, "step": 1983 }, { "epoch": 0.5716344767944652, "loss": 0.005388371646404266, "loss_ce": 0.0033703967928886414, "loss_xval": 0.00201416015625, "num_input_tokens_seen": 292414584, "step": 1983 }, { "epoch": 0.5719227443067166, "grad_norm": 0.6386831560604371, "learning_rate": 0.0001, "loss": 0.0049, "num_input_tokens_seen": 292549432, "step": 1984 }, { "epoch": 0.5719227443067166, "loss": 0.00935543142259121, "loss_ce": 0.008496647700667381, "loss_xval": 0.000858306884765625, "num_input_tokens_seen": 292549432, "step": 1984 }, { "epoch": 0.572211011818968, "grad_norm": 0.552995258421988, "learning_rate": 0.0001, "loss": 0.0078, "num_input_tokens_seen": 292684424, "step": 1985 }, { "epoch": 0.572211011818968, "loss": 0.0003244928375352174, "loss_ce": 9.286919521400705e-05, "loss_xval": 0.00023174285888671875, "num_input_tokens_seen": 292684424, "step": 1985 }, { "epoch": 0.5724992793312194, "grad_norm": 0.3768425457267613, "learning_rate": 0.0001, "loss": 0.0042, "num_input_tokens_seen": 292856968, "step": 1986 }, { "epoch": 0.5724992793312194, "loss": 0.0005440630484372377, "loss_ce": 9.464404865866527e-05, "loss_xval": 0.00045013427734375, "num_input_tokens_seen": 292856968, "step": 1986 }, { "epoch": 0.5727875468434708, "grad_norm": 0.7890485953633781, "learning_rate": 0.0001, "loss": 0.0069, "num_input_tokens_seen": 292991680, "step": 1987 }, { "epoch": 0.5727875468434708, "loss": 0.012787751853466034, "loss_ce": 0.012313298881053925, "loss_xval": 0.0004749298095703125, "num_input_tokens_seen": 292991680, "step": 1987 }, { "epoch": 0.5730758143557221, "grad_norm": 1.4873338901823483, "learning_rate": 0.0001, "loss": 0.0013, "num_input_tokens_seen": 293126728, "step": 1988 }, { "epoch": 0.5730758143557221, "loss": 0.000927687855437398, "loss_ce": 3.838654811261222e-05, "loss_xval": 0.000888824462890625, "num_input_tokens_seen": 293126728, "step": 1988 }, { "epoch": 0.5733640818679735, "grad_norm": 1.6546880055852196, "learning_rate": 0.0001, "loss": 0.005, "num_input_tokens_seen": 293299200, "step": 1989 }, { "epoch": 0.5733640818679735, "loss": 0.0014969916082918644, "loss_ce": 5.4082367569208145e-05, "loss_xval": 0.00144195556640625, "num_input_tokens_seen": 293299200, "step": 1989 }, { "epoch": 0.5736523493802248, "grad_norm": 1.2679658205061728, "learning_rate": 0.0001, "loss": 0.0062, "num_input_tokens_seen": 293434000, "step": 1990 }, { "epoch": 0.5736523493802248, "loss": 0.012005031108856201, "loss_ce": 0.009815394878387451, "loss_xval": 0.002197265625, "num_input_tokens_seen": 293434000, "step": 1990 }, { "epoch": 0.5739406168924762, "grad_norm": 1.1819654147990233, "learning_rate": 0.0001, "loss": 0.0017, "num_input_tokens_seen": 293569008, "step": 1991 }, { "epoch": 0.5739406168924762, "loss": 0.0011650105006992817, "loss_ce": 5.397993663791567e-05, "loss_xval": 0.0011138916015625, "num_input_tokens_seen": 293569008, "step": 1991 }, { "epoch": 0.5742288844047276, "grad_norm": 1.6972687442414842, "learning_rate": 0.0001, "loss": 0.0065, "num_input_tokens_seen": 293741424, "step": 1992 }, { "epoch": 0.5742288844047276, "loss": 0.0014634766848757863, "loss_ce": 5.966809840174392e-05, "loss_xval": 0.00140380859375, "num_input_tokens_seen": 293741424, "step": 1992 }, { "epoch": 0.5745171519169789, "grad_norm": 2.852577005265175, "learning_rate": 0.0001, "loss": 0.0088, "num_input_tokens_seen": 293876288, "step": 1993 }, { "epoch": 0.5745171519169789, "loss": 0.012815849855542183, "loss_ce": 0.009848015382885933, "loss_xval": 0.002960205078125, "num_input_tokens_seen": 293876288, "step": 1993 }, { "epoch": 0.5748054194292304, "grad_norm": 4.215657705576143, "learning_rate": 0.0001, "loss": 0.0081, "num_input_tokens_seen": 294011360, "step": 1994 }, { "epoch": 0.5748054194292304, "loss": 0.007397228851914406, "loss_ce": 4.24925638071727e-05, "loss_xval": 0.007354736328125, "num_input_tokens_seen": 294011360, "step": 1994 }, { "epoch": 0.5750936869414817, "grad_norm": 5.016030693031347, "learning_rate": 0.0001, "loss": 0.0149, "num_input_tokens_seen": 294183848, "step": 1995 }, { "epoch": 0.5750936869414817, "loss": 0.01234062947332859, "loss_ce": 4.204545257380232e-05, "loss_xval": 0.0123291015625, "num_input_tokens_seen": 294183848, "step": 1995 }, { "epoch": 0.5753819544537331, "grad_norm": 4.917456119960526, "learning_rate": 0.0001, "loss": 0.0151, "num_input_tokens_seen": 294318704, "step": 1996 }, { "epoch": 0.5753819544537331, "loss": 0.020639140158891678, "loss_ce": 0.007631022948771715, "loss_xval": 0.01300048828125, "num_input_tokens_seen": 294318704, "step": 1996 }, { "epoch": 0.5756702219659844, "grad_norm": 4.2676973987215705, "learning_rate": 0.0001, "loss": 0.0085, "num_input_tokens_seen": 294453744, "step": 1997 }, { "epoch": 0.5756702219659844, "loss": 0.009153537452220917, "loss_ce": 0.00017374010349158198, "loss_xval": 0.00897216796875, "num_input_tokens_seen": 294453744, "step": 1997 }, { "epoch": 0.5759584894782358, "grad_norm": 3.510931597546058, "learning_rate": 0.0001, "loss": 0.0107, "num_input_tokens_seen": 294626248, "step": 1998 }, { "epoch": 0.5759584894782358, "loss": 0.0059237708337605, "loss_ce": 0.00030472176149487495, "loss_xval": 0.005615234375, "num_input_tokens_seen": 294626248, "step": 1998 }, { "epoch": 0.5762467569904872, "grad_norm": 2.5517557847208376, "learning_rate": 0.0001, "loss": 0.0095, "num_input_tokens_seen": 294761032, "step": 1999 }, { "epoch": 0.5762467569904872, "loss": 0.015102425590157509, "loss_ce": 0.012073555961251259, "loss_xval": 0.003021240234375, "num_input_tokens_seen": 294761032, "step": 1999 }, { "epoch": 0.5765350245027385, "grad_norm": 1.4836132785368261, "learning_rate": 0.0001, "loss": 0.0013, "num_input_tokens_seen": 294896120, "step": 2000 }, { "epoch": 0.5765350245027385, "eval_websight_new_IoU": 0.4954599142074585, "eval_websight_new_MAE_x": 0.01227654330432415, "eval_websight_new_MAE_y": 0.012751261238008738, "eval_websight_new_NUM_probability": 0.9992389678955078, "eval_websight_new_inside_bbox": 0.7829861044883728, "eval_websight_new_loss": 0.0003192399744875729, "eval_websight_new_loss_ce": 0.00010440748155815527, "eval_websight_new_loss_xval": 0.00020268559455871582, "eval_websight_new_runtime": 35.7316, "eval_websight_new_samples_per_second": 1.399, "eval_websight_new_steps_per_second": 0.056, "num_input_tokens_seen": 294896120, "step": 2000 }, { "epoch": 0.5765350245027385, "eval_seeclick_IoU": 0.5345141887664795, "eval_seeclick_MAE_x": 0.015516493003815413, "eval_seeclick_MAE_y": 0.013822767417877913, "eval_seeclick_NUM_probability": 0.9992666840553284, "eval_seeclick_inside_bbox": 0.7829861044883728, "eval_seeclick_loss": 0.007619759999215603, "eval_seeclick_loss_ce": 0.008350821677595377, "eval_seeclick_loss_xval": 0.0003256499767303467, "eval_seeclick_runtime": 64.0935, "eval_seeclick_samples_per_second": 0.78, "eval_seeclick_steps_per_second": 0.031, "num_input_tokens_seen": 294896120, "step": 2000 }, { "epoch": 0.5765350245027385, "eval_icons_IoU": 0.18187464028596878, "eval_icons_MAE_x": 0.0158988106995821, "eval_icons_MAE_y": 0.017659504897892475, "eval_icons_NUM_probability": 0.9992129504680634, "eval_icons_inside_bbox": 0.2361111119389534, "eval_icons_loss": 0.007123068440705538, "eval_icons_loss_ce": 0.005604170728474855, "eval_icons_loss_xval": 0.00037682056427001953, "eval_icons_runtime": 69.6929, "eval_icons_samples_per_second": 0.717, "eval_icons_steps_per_second": 0.029, "num_input_tokens_seen": 294896120, "step": 2000 } ], "logging_steps": 1.0, "max_steps": 10000, "num_input_tokens_seen": 294896120, "num_train_epochs": 3, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2411691198119936.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }